fixed stupid bug

master
Avril 4 years ago
parent a9e9e48633
commit 40d9335718
Signed by: flanchan
GPG Key ID: 284488987C31F630

2
Cargo.lock generated

@ -466,7 +466,7 @@ checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
[[package]] [[package]]
name = "rmdupe" name = "rmdupe"
version = "0.1.0" version = "1.0.0"
dependencies = [ dependencies = [
"chrono", "chrono",
"futures", "futures",

@ -1,6 +1,6 @@
[package] [package]
name = "rmdupe" name = "rmdupe"
version = "0.1.0" version = "1.0.0"
authors = ["Avril <flanchan@cumallover.me>"] authors = ["Avril <flanchan@cumallover.me>"]
edition = "2018" edition = "2018"

@ -12,3 +12,20 @@ T: Clone
} }
sz sz
} }
/// View any value's underlying bytes as a `&[u8]` slice.
///
/// Works for unsized values too (`str`, slices), since `size_of_val`
/// measures the actual pointee.
#[inline]
pub fn reinterpret<T>(src: &T) -> &[u8]
    where T: ?Sized
{
    let len = std::mem::size_of_val(src);
    let ptr = src as *const T as *const u8;
    // SAFETY: `src` is a valid reference covering `len` initialized bytes,
    // and `u8` has alignment 1, so a byte-slice view is always valid.
    unsafe { std::slice::from_raw_parts(ptr, len) }
}
/// Reinterpret the leading bytes of `src` as a reference to a `T`.
///
/// Inverse of [`reinterpret`] for `Copy` types.
///
/// # Panics
///
/// Panics if `src` holds fewer than `size_of::<T>()` bytes (previously this
/// was a silent out-of-bounds read — undefined behaviour).
///
/// NOTE(review): this is still unsound when `src` is not aligned for `T`
/// (e.g. a `usize` taken from an arbitrary offset into a byte buffer, as the
/// load-from-disk callers in this file appear to do). Callers must guarantee
/// alignment; longer term this should return `T` by value via
/// `std::ptr::read_unaligned` — TODO confirm and fix callers.
pub fn reinterpret_back<T>(src: &[u8]) -> &T
    where T: ?Sized + Copy
{
    // `Copy` implies `Sized` (via `Clone: Sized`), so `size_of::<T>()` is well-formed.
    assert!(src.len() >= std::mem::size_of::<T>(),
            "reinterpret_back: need {} bytes, got {}",
            std::mem::size_of::<T>(), src.len());
    unsafe {
        // SAFETY: length checked above; caller must uphold alignment (see NOTE).
        &*(src.as_ptr() as *const T)
    }
}

@ -1,42 +1,83 @@
use super::*; use super::*;
use std::{ use std::{
collections::HashSet, collections::{HashMap, HashSet},
io::{ io::{
self, self,
Write, Write,
Read, Read,
}, },
path::{
Path,
PathBuf
},
}; };
#[derive(Clone, PartialEq, Eq, Debug)] #[derive(Clone, PartialEq, Eq, Debug)]
pub struct DupeMap(HashSet<hash::Sha256Hash>); pub struct DupeMap
{
iteration: HashSet<hash::Sha256Hash>, // What we calculate
table: HashMap<PathBuf, hash::Sha256Hash>, // What we save and load
}
/// Borrow a path's raw bytes (Unix-only; Windows paths are deliberately unsupported).
#[inline]
fn path_bytes(path: &Path) -> &[u8]
{
    use std::os::unix::ffi::OsStrExt;
    let os_str = path.as_os_str();
    os_str.as_bytes()
}
/// Reconstruct a borrowed `Path` from raw bytes (inverse of `path_bytes`, Unix-only).
#[inline]
fn bytes_path(bytes: &[u8]) -> &Path
{
    use std::os::unix::ffi::OsStrExt;
    Path::new(std::ffi::OsStr::from_bytes(bytes))
}
/// Magic byte sequence written before every serialized (path, hash) table entry;
/// checked on load to validate entry boundaries in the on-disk cache format.
const ENTRY_HEADER: &[u8] = &[0x00, 0xde, 0xad];
impl DupeMap impl DupeMap
{ {
/// Create a new empty dupe map /// Create a new empty dupe map
pub fn new() -> Self pub fn new() -> Self
{ {
Self(HashSet::new()) Self{iteration: HashSet::new(), table: HashMap::new()}
} }
/// Iterator over all added keys /// Iterator over all hashes
pub fn iter(&self) -> std::collections::hash_set::Iter<hash::Sha256Hash> pub fn iter(&self) -> std::collections::hash_set::Iter<hash::Sha256Hash>
{ {
self.0.iter() self.iteration.iter()
}
/// Cache this path's hash
///
/// # Returns
///
/// True if caching was okay, false if key already added.
pub fn cache<T: AsRef<Path>>(&mut self, id: T, hash: hash::Sha256Hash) -> bool
{
if self.table.contains_key(id.as_ref()) {
false
} else {
self.table.insert(id.as_ref().to_owned(), hash);
true
}
} }
/// Is this hash in the set? /// Look for path `id` in cache.
pub fn peek(&self, hash: &hash::Sha256Hash) -> bool { pub fn get_cache<T: AsRef<Path>>(&self, id: T) -> Option<&hash::Sha256Hash>
self.0.contains(hash) {
self.table.get(id.as_ref())
} }
/// Try to add an entry, returns true if was not a dupe, false if it was. /// Try to add to store. True if adding was oke, false if already exists.
pub fn try_add(&mut self, hash: hash::Sha256Hash) -> bool pub fn try_add(&mut self, hash: hash::Sha256Hash) -> bool
{ {
if self.0.contains(&hash) { if self.iteration.contains(&hash) {
false false
} else { } else {
self.0.insert(hash); self.iteration.insert(hash);
true true
} }
} }
@ -45,9 +86,15 @@ impl DupeMap
pub fn save<W: Write>(&self, to: &mut W) -> io::Result<usize> pub fn save<W: Write>(&self, to: &mut W) -> io::Result<usize>
{ {
let mut done=0; let mut done=0;
for x in self.0.iter() for (path, hash) in self.table.iter()
{ {
to.write(x.as_ref())?; let path = path_bytes(path.as_ref());
let hash: &[u8] = hash.as_ref();
to.write(ENTRY_HEADER)?;
to.write(bytes::reinterpret(&path.len()))?;
to.write(path)?;
to.write(hash)?;
done+=1; done+=1;
} }
Ok(done) Ok(done)
@ -60,9 +107,15 @@ impl DupeMap
use tokio::prelude::*; use tokio::prelude::*;
let mut done=0; let mut done=0;
for x in self.0.iter() for (path, hash) in self.table.iter()
{ {
to.write(x.as_ref()).await?; let path = path_bytes(path.as_ref());
let hash: &[u8] = hash.as_ref();
to.write(ENTRY_HEADER).await?;
to.write(bytes::reinterpret(&path.len())).await?;
to.write(path).await?;
to.write(hash).await?;
done+=1; done+=1;
} }
Ok(done) Ok(done)
@ -73,15 +126,26 @@ impl DupeMap
{ {
let mut done=0; let mut done=0;
let mut read; let mut read;
let mut buffer = [0u8; hash::SHA256_SIZE]; let mut header_buffer = [0u8; ENTRY_HEADER.len() + std::mem::size_of::<usize>()];
let mut hash_buffer = [0u8; hash::SHA256_SIZE];
while {read = from.read(&mut buffer[..])?; read==hash::SHA256_SIZE} { while {read = from.read(&mut header_buffer[..])?; read == header_buffer.len() && &header_buffer[..ENTRY_HEADER.len()] == ENTRY_HEADER}
done += if self.try_add(hash::Sha256Hash::new(buffer)) { {
1 let sz = *bytes::reinterpret_back(&header_buffer[ENTRY_HEADER.len()..]);
} else { if sz > 0 {
0 let mut path = vec![0u8; sz];
}; if from.read(&mut path[..])? == sz {
let path = bytes_path(&path[..]);
if from.read(&mut hash_buffer[..])? == hash::SHA256_SIZE
{
if self.cache(path, hash::Sha256Hash::new(hash_buffer)) {
done +=1;
}
}
}
}
} }
Ok(done) Ok(done)
} }
@ -94,15 +158,26 @@ impl DupeMap
let mut done=0; let mut done=0;
let mut read; let mut read;
let mut buffer = [0u8; hash::SHA256_SIZE]; let mut header_buffer = [0u8; ENTRY_HEADER.len() + std::mem::size_of::<usize>()];
let mut hash_buffer = [0u8; hash::SHA256_SIZE];
while {read = from.read(&mut buffer[..]).await?; read==hash::SHA256_SIZE} { while {read = from.read(&mut header_buffer[..]).await?; read == header_buffer.len() && &header_buffer[..ENTRY_HEADER.len()] == ENTRY_HEADER}
done += if self.try_add(hash::Sha256Hash::new(buffer)) { {
1 let sz = *bytes::reinterpret_back(&header_buffer[ENTRY_HEADER.len()..]);
} else { if sz > 0 {
0 let mut path = vec![0u8; sz];
}; if from.read(&mut path[..]).await? == sz {
let path = bytes_path(&path[..]);
if from.read(&mut hash_buffer[..]).await? == hash::SHA256_SIZE
{
if self.cache(path, hash::Sha256Hash::new(hash_buffer)) {
done +=1;
}
}
}
}
} }
Ok(done) Ok(done)
} }
} }

@ -96,39 +96,54 @@ impl Default for DupeCount
} }
/// Process a file and add it to the table, returns true if is not a dupe. /// Process a file and add it to the table, returns true if is not a dupe.
pub fn process_file<P: AsRef<Path>>(file: P, set: &mut container::DupeMap) -> Result<bool, error::Error> pub fn process_file<P: AsRef<Path>>(path: P, set: &mut container::DupeMap) -> Result<bool, error::Error>
{ {
let mut file = OpenOptions::new() let path = path.as_ref();
.read(true) if let Some(&hash) = set.get_cache(path) {
.open(file)?; Ok(set.try_add(hash))
let sz: usize = file.metadata()?.len().try_into().or(Err(error::Error::Arch(Some("Filesize is too large to be known. you have likely compiled the binary for 32-bit architecture or less. This shouldn't happen on 64-bit systems."))))?; } else {
let mut file = OpenOptions::new()
let mut result = hash::Sha256Hash::default(); .read(true)
error::check_size(sz, hash::compute(&mut file, &mut result)?)?; .open(path)?;
let sz: usize = file.metadata()?.len().try_into().or(Err(error::Error::Arch(Some("Filesize is too large to be known. you have likely compiled the binary for 32-bit architecture or less. This shouldn't happen on 64-bit systems."))))?;
Ok(set.try_add(result))
let mut result = hash::Sha256Hash::default();
error::check_size(sz, hash::compute(&mut file, &mut result)?)?;
set.cache(path, result);
Ok(set.try_add(result))
}
} }
/// Process a file and add it to the table, returns true if is not a dupe. /// Process a file and add it to the table, returns true if is not a dupe.
#[cfg(feature="threads")] #[cfg(feature="threads")]
pub async fn process_file_async<P: AsRef<Path>>(file: P, set: &std::sync::Arc<tokio::sync::Mutex<container::DupeMap>>) -> Result<bool, error::Error> pub async fn process_file_async<P: AsRef<Path>>(path: P, set: &std::sync::Arc<tokio::sync::Mutex<container::DupeMap>>) -> Result<bool, error::Error>
{ {
use tokio::{ use tokio::{
fs::{ fs::{
OpenOptions, OpenOptions,
}, },
}; };
let mut file = OpenOptions::new() let path = path.as_ref();
.read(true) if let Some(hash) = {
.open(file).await?; let set = set.lock().await;
let sz: usize = file.metadata().await?.len().try_into().or(Err(error::Error::Arch(Some("Filesize is too large to be known. you have likely compiled the binary for 32-bit architecture or less. This shouldn't happen on 64-bit systems."))))?; set.get_cache(path).and_then(|&h| Some(h))
} {
let mut result = hash::Sha256Hash::default(); let mut set = set.lock().await;
error::check_size(sz, hash::compute_async(&mut file, &mut result).await?)?; Ok(set.try_add(hash))
} else {
let mut file = OpenOptions::new()
.read(true)
.open(path).await?;
let sz: usize = file.metadata().await?.len().try_into().or(Err(error::Error::Arch(Some("Filesize is too large to be known. you have likely compiled the binary for 32-bit architecture or less. This shouldn't happen on 64-bit systems."))))?;
let mut result = hash::Sha256Hash::default();
error::check_size(sz, hash::compute_async(&mut file, &mut result).await?)?;
let mut set = set.lock().await; let mut set = set.lock().await;
Ok(set.try_add(result)) set.cache(path, result);
Ok(set.try_add(result))
}
} }
/// Walk a dir structure and remove all dupes in it /// Walk a dir structure and remove all dupes in it

Loading…
Cancel
Save