From 40d9335718c7ffa008f1be721add09ab18aec4ef Mon Sep 17 00:00:00 2001 From: Avril Date: Fri, 10 Jul 2020 17:19:28 +0100 Subject: [PATCH] fixed stupid bug --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/bytes.rs | 17 ++++++ src/container.rs | 137 ++++++++++++++++++++++++++++++++++++----------- src/proc.rs | 55 ++++++++++++------- 5 files changed, 160 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c450af3..a922a08 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -466,7 +466,7 @@ checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" [[package]] name = "rmdupe" -version = "0.1.0" +version = "1.0.0" dependencies = [ "chrono", "futures", diff --git a/Cargo.toml b/Cargo.toml index 344bdbb..47243a6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rmdupe" -version = "0.1.0" +version = "1.0.0" authors = ["Avril "] edition = "2018" diff --git a/src/bytes.rs b/src/bytes.rs index 4d8335a..f7d4216 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -12,3 +12,20 @@ T: Clone } sz } + +#[inline] +pub fn reinterpret(src: &T) -> &[u8] + where T: ?Sized +{ + unsafe { + std::slice::from_raw_parts(src as *const T as *const u8, std::mem::size_of_val(src)) + } +} + +pub fn reinterpret_back(src: &[u8]) -> &T + where T: ?Sized + Copy +{ + unsafe { + &*(&src[0] as *const u8 as *const T) + } +} diff --git a/src/container.rs b/src/container.rs index 2181a29..49b2000 100644 --- a/src/container.rs +++ b/src/container.rs @@ -1,42 +1,83 @@ use super::*; use std::{ - collections::HashSet, + collections::{HashMap, HashSet}, io::{ self, Write, Read, }, + path::{ + Path, + PathBuf + }, }; #[derive(Clone, PartialEq, Eq, Debug)] -pub struct DupeMap(HashSet); +pub struct DupeMap +{ + iteration: HashSet, // What we calculate + table: HashMap, // What we save and load +} + +/// Do we care about windows? nah +#[inline] +fn path_bytes(path: &Path) -> &[u8] +{ + use std::os::unix::ffi::OsStrExt; + path.as_os_str().as_bytes() +} + +#[inline] +fn bytes_path(bytes: &[u8]) -> &Path +{ + use std::os::unix::ffi::OsStrExt; + std::ffi::OsStr::from_bytes(bytes).as_ref() +} + +const ENTRY_HEADER: &[u8] = &[0x00, 0xde, 0xad]; impl DupeMap { /// Create a new empty dupe map pub fn new() -> Self { - Self(HashSet::new()) + Self{iteration: HashSet::new(), table: HashMap::new()} } - /// Iterator over all added keys + /// Iterator over all hashes pub fn iter(&self) -> std::collections::hash_set::Iter { - self.0.iter() + self.iteration.iter() + } + + /// Cache this path's hash + /// + /// # Returns + /// + /// True if caching was okay, false if key already added. + pub fn cache>(&mut self, id: T, hash: hash::Sha256Hash) -> bool + { + if self.table.contains_key(id.as_ref()) { + false + } else { + self.table.insert(id.as_ref().to_owned(), hash); + true + } } - - /// Is this hash in the set? - pub fn peek(&self, hash: &hash::Sha256Hash) -> bool { - self.0.contains(hash) + + /// Look for path `id` in cache. + pub fn get_cache>(&self, id: T) -> Option<&hash::Sha256Hash> + { + self.table.get(id.as_ref()) } - - /// Try to add an entry, returns true if was not a dupe, false if it was. + + /// Try to add to store. True if adding was oke, false if already exists. pub fn try_add(&mut self, hash: hash::Sha256Hash) -> bool { - if self.0.contains(&hash) { + if self.iteration.contains(&hash) { false } else { - self.0.insert(hash); + self.iteration.insert(hash); true } } @@ -45,9 +86,15 @@ impl DupeMap pub fn save(&self, to: &mut W) -> io::Result { let mut done=0; - for x in self.0.iter() + for (path, hash) in self.table.iter() { - to.write(x.as_ref())?; + let path = path_bytes(path.as_ref()); + let hash: &[u8] = hash.as_ref(); + + to.write(ENTRY_HEADER)?; + to.write(bytes::reinterpret(&path.len()))?; + to.write(path)?; + to.write(hash)?; done+=1; } Ok(done) @@ -60,9 +107,15 @@ impl DupeMap use tokio::prelude::*; let mut done=0; - for x in self.0.iter() + for (path, hash) in self.table.iter() { - to.write(x.as_ref()).await?; + let path = path_bytes(path.as_ref()); + let hash: &[u8] = hash.as_ref(); + + to.write(ENTRY_HEADER).await?; + to.write(bytes::reinterpret(&path.len())).await?; + to.write(path).await?; + to.write(hash).await?; done+=1; } Ok(done) @@ -73,15 +126,26 @@ impl DupeMap { let mut done=0; let mut read; - let mut buffer = [0u8; hash::SHA256_SIZE]; + let mut header_buffer = [0u8; ENTRY_HEADER.len() + std::mem::size_of::()]; + let mut hash_buffer = [0u8; hash::SHA256_SIZE]; - while {read = from.read(&mut buffer[..])?; read==hash::SHA256_SIZE} { - done += if self.try_add(hash::Sha256Hash::new(buffer)) { - 1 - } else { - 0 - }; + while {read = from.read(&mut header_buffer[..])?; read == header_buffer.len() && &header_buffer[..ENTRY_HEADER.len()] == ENTRY_HEADER} + { + let sz = *bytes::reinterpret_back(&header_buffer[ENTRY_HEADER.len()..]); + if sz > 0 { + let mut path = vec![0u8; sz]; + if from.read(&mut path[..])? == sz { + let path = bytes_path(&path[..]); + if from.read(&mut hash_buffer[..])? == hash::SHA256_SIZE + { + if self.cache(path, hash::Sha256Hash::new(hash_buffer)) { + done +=1; + } + } + } + } } + Ok(done) } @@ -94,15 +158,26 @@ impl DupeMap let mut done=0; let mut read; - let mut buffer = [0u8; hash::SHA256_SIZE]; + let mut header_buffer = [0u8; ENTRY_HEADER.len() + std::mem::size_of::()]; + let mut hash_buffer = [0u8; hash::SHA256_SIZE]; - while {read = from.read(&mut buffer[..]).await?; read==hash::SHA256_SIZE} { - done += if self.try_add(hash::Sha256Hash::new(buffer)) { - 1 - } else { - 0 - }; + while {read = from.read(&mut header_buffer[..]).await?; read == header_buffer.len() && &header_buffer[..ENTRY_HEADER.len()] == ENTRY_HEADER} + { + let sz = *bytes::reinterpret_back(&header_buffer[ENTRY_HEADER.len()..]); + if sz > 0 { + let mut path = vec![0u8; sz]; + if from.read(&mut path[..]).await? == sz { + let path = bytes_path(&path[..]); + if from.read(&mut hash_buffer[..]).await? == hash::SHA256_SIZE + { + if self.cache(path, hash::Sha256Hash::new(hash_buffer)) { + done +=1; + } + } + } + } } + Ok(done) } } diff --git a/src/proc.rs b/src/proc.rs index 2069d08..8f114f2 100644 --- a/src/proc.rs +++ b/src/proc.rs @@ -96,39 +96,54 @@ impl Default for DupeCount } /// Process a file and add it to the table, returns true if is not a dupe. -pub fn process_file>(file: P, set: &mut container::DupeMap) -> Result +pub fn process_file>(path: P, set: &mut container::DupeMap) -> Result { - let mut file = OpenOptions::new() - .read(true) - .open(file)?; - let sz: usize = file.metadata()?.len().try_into().or(Err(error::Error::Arch(Some("Filesize is too large to be known. you have likely compiled the binary for 32-bit architecture or less. This shouldn't happen on 64-bit systems."))))?; - - let mut result = hash::Sha256Hash::default(); - error::check_size(sz, hash::compute(&mut file, &mut result)?)?; - - Ok(set.try_add(result)) + let path = path.as_ref(); + if let Some(&hash) = set.get_cache(path) { + Ok(set.try_add(hash)) + } else { + let mut file = OpenOptions::new() + .read(true) + .open(path)?; + let sz: usize = file.metadata()?.len().try_into().or(Err(error::Error::Arch(Some("Filesize is too large to be known. you have likely compiled the binary for 32-bit architecture or less. This shouldn't happen on 64-bit systems."))))?; + + let mut result = hash::Sha256Hash::default(); + error::check_size(sz, hash::compute(&mut file, &mut result)?)?; + set.cache(path, result); + Ok(set.try_add(result)) + } } /// Process a file and add it to the table, returns true if is not a dupe. #[cfg(feature="threads")] -pub async fn process_file_async>(file: P, set: &std::sync::Arc>) -> Result +pub async fn process_file_async>(path: P, set: &std::sync::Arc>) -> Result { use tokio::{ fs::{ OpenOptions, }, }; - let mut file = OpenOptions::new() - .read(true) - .open(file).await?; - let sz: usize = file.metadata().await?.len().try_into().or(Err(error::Error::Arch(Some("Filesize is too large to be known. you have likely compiled the binary for 32-bit architecture or less. This shouldn't happen on 64-bit systems."))))?; - - let mut result = hash::Sha256Hash::default(); - error::check_size(sz, hash::compute_async(&mut file, &mut result).await?)?; + let path = path.as_ref(); + if let Some(hash) = { + let set = set.lock().await; + set.get_cache(path).and_then(|&h| Some(h)) + } { + let mut set = set.lock().await; + Ok(set.try_add(hash)) + } else { + let mut file = OpenOptions::new() + .read(true) + .open(path).await?; + let sz: usize = file.metadata().await?.len().try_into().or(Err(error::Error::Arch(Some("Filesize is too large to be known. you have likely compiled the binary for 32-bit architecture or less. This shouldn't happen on 64-bit systems."))))?; + + let mut result = hash::Sha256Hash::default(); + error::check_size(sz, hash::compute_async(&mut file, &mut result).await?)?; - let mut set = set.lock().await; - Ok(set.try_add(result)) + let mut set = set.lock().await; + set.cache(path, result); + Ok(set.try_add(result)) + } } /// Walk a dir structure and remove all dupes in it