use super::*;
use std::{
    collections::{HashMap, HashSet},
    io::{
        self,
        Write,
        Read,
    },
    path::{
        Path,
        PathBuf,
    },
    fmt,
};

/// Map of collisions
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CollisionMap<'a>(HashMap<hash::Sha256Hash, Vec<&'a Path>>);

impl<'a> CollisionMap<'a> {
    /// Number of distinct hashes with at least one colliding path
    #[inline]
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Total number of paths across all hashes
    #[inline]
    pub fn full_len(&self) -> usize {
        self.0.iter().map(|(_, v)| v.len()).sum()
    }

    /// Iterate over each hash and its colliding paths
    #[inline]
    pub fn iter(&self) -> impl Iterator<Item = (&hash::Sha256Hash, &[&'a Path])> + '_ {
        self.0.iter().map(|(k, v)| (k, v.as_slice()))
    }

    /// All paths recorded for this hash (empty if the hash is unknown)
    #[inline]
    pub fn of_hash(&self, name: &hash::Sha256Hash) -> &[&'a Path] {
        if let Some(vec) = self.0.get(name) {
            &vec[..]
        } else {
            &[]
        }
    }

    /// Iterate over the hashes only
    #[inline]
    pub fn hashes(&self) -> impl Iterator<Item = &hash::Sha256Hash> + '_ {
        self.0.iter().map(|(k, _)| k)
    }

    /// Consume the map, yielding each hash and its paths
    #[inline]
    pub fn into_iter(self) -> impl Iterator<Item = (hash::Sha256Hash, Vec<&'a Path>)> {
        self.0.into_iter()
    }
}

#[derive(Clone, PartialEq, Eq, Debug)]
pub struct DupeMap {
    names: HashMap<PathBuf, hash::Sha256Hash>,
    iteration: HashSet<hash::Sha256Hash>, // What we calculate
    table: HashMap<PathBuf, (hash::Sha256Hash, bool)>, // What we save and load, and whether it's transient (ignored in calculate)
}

impl fmt::Display for DupeMap {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "[DupeMap: Iteration: {} unique hashes computed. ", self.iteration.len())?;
        let len = self.table.len();
        let tlen = self.table.iter().filter(|(_, (_, x))| *x).count();
        write!(f, "Table: {} cached ({} trans, {} real).]", len, tlen, len - tlen)
    }
}

/// Do we care about Windows? Nah.
#[inline]
fn path_bytes(path: &Path) -> &[u8] {
    use std::os::unix::ffi::OsStrExt;
    path.as_os_str().as_bytes()
}

#[inline]
fn bytes_path(bytes: &[u8]) -> &Path {
    use std::os::unix::ffi::OsStrExt;
    std::ffi::OsStr::from_bytes(bytes).as_ref()
}

/// Magic bytes prefixing each serialised cache entry
const ENTRY_HEADER: &[u8] = &[0x00, 0xde, 0xad];

impl DupeMap {
    /// Create a new empty dupe map
    pub fn new() -> Self {
        Self {
            iteration: HashSet::new(),
            table: HashMap::new(),
            names: HashMap::new(),
        }
    }

    /// Iterator over all hashes
    pub fn iter(&self) -> std::collections::hash_set::Iter<'_, hash::Sha256Hash> {
        self.iteration.iter()
    }

    /// Forcefully update the cache
    pub fn cache_force(&mut self, id: impl AsRef<Path>, hash: hash::Sha256Hash, trans: bool) {
        if let Some((h, t)) = self.table.get_mut(id.as_ref()) {
            *h = hash;
            *t = trans;
        } else {
            self.table.insert(id.as_ref().to_owned(), (hash, trans));
        }
    }

    /// Remove from the cache if it exists
    pub fn uncache(&mut self, id: impl AsRef<Path>) -> Option<(hash::Sha256Hash, bool)> {
        self.table.remove(id.as_ref())
    }

    /// The number of cached items (incl. transient ones)
    pub fn cache_len(&self) -> usize {
        self.table.len()
    }

    /// Iterate through the cache
    pub fn cache_iter(&self) -> std::collections::hash_map::Iter<'_, PathBuf, (hash::Sha256Hash, bool)> {
        self.table.iter()
    }

    /// Iterate through the cache mutably
    pub fn cache_iter_mut(&mut self) -> std::collections::hash_map::IterMut<'_, PathBuf, (hash::Sha256Hash, bool)> {
        self.table.iter_mut()
    }

    /// Cache this path's hash
    ///
    /// # Returns
    ///
    /// True if caching was okay, false if the key was already added.
    ///
    /// # Notes
    ///
    /// If a value is already present but transient, it is counted as not existing:
    /// it gets overwritten here and marked non-transient.
    pub fn cache<T: AsRef<Path>>(&mut self, id: T, hash: hash::Sha256Hash) -> bool {
        if self.table.contains_key(id.as_ref()) {
            if let Some((got_hash, trans @ true)) = self.table.get_mut(id.as_ref()) {
                // Existing entry is transient: take it over as a real entry.
                *trans = false;
                *got_hash = hash;
                true
            } else {
                false
            }
        } else {
            self.table.insert(id.as_ref().to_owned(), (hash, false));
            true
        }
    }

    /// Cache this path's hash as transient.
    /// Transient means it is ignored in calculations but is still saved.
    ///
    /// # Returns
    ///
    /// True if caching was okay, false if already added (transient or not).
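    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`; the path is illustrative, and
    /// `hash::Sha256Hash::new` is assumed to take a `[u8; hash::SHA256_SIZE]`,
    /// as it is used in `DupeMap::load` below):
    ///
    /// ```ignore
    /// let mut map = DupeMap::new();
    /// let hash = hash::Sha256Hash::new([0u8; hash::SHA256_SIZE]);
    ///
    /// assert!(map.cache_trans("some/file", hash));  // first insertion is accepted
    /// assert!(!map.cache_trans("some/file", hash)); // the key is already present
    /// assert_eq!(map.transience("some/file"), Some(true));
    ///
    /// // `cache` takes over a transient entry and marks it as real:
    /// assert!(map.cache("some/file", hash));
    /// assert_eq!(map.transience("some/file"), Some(false));
    /// ```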
    pub fn cache_trans<T: AsRef<Path>>(&mut self, id: T, hash: hash::Sha256Hash) -> bool {
        if self.table.contains_key(id.as_ref()) {
            false
        } else {
            self.table.insert(id.as_ref().to_owned(), (hash, true));
            true
        }
    }

    /// Get a mutable reference to the transience of this path, if it is added
    pub fn transience_mut<T: AsRef<Path>>(&mut self, id: T) -> Option<&mut bool> {
        match self.table.get_mut(id.as_ref()) {
            Some((_, trans)) => Some(trans),
            _ => None,
        }
    }

    /// Get the transience of this path, if it is added
    pub fn transience<T: AsRef<Path>>(&self, id: T) -> Option<bool> {
        if let Some((_, trans)) = self.table.get(id.as_ref()) {
            Some(*trans)
        } else {
            None
        }
    }

    /// Look for path `id` in the cache. Transient entries are ignored.
    pub fn get_cache<T: AsRef<Path>>(&self, id: T) -> Option<&hash::Sha256Hash> {
        match self.table.get(id.as_ref()) {
            Some((hash, false)) => Some(hash),
            _ => None,
        }
    }

    /// Try to add to the store. True if adding was okay, false if it already exists.
    pub fn try_add(&mut self, hash: hash::Sha256Hash, name: impl Into<PathBuf>) -> bool {
        if self.iteration.insert(hash) {
            self.names.insert(name.into(), hash);
            true
        } else {
            false
        }
    }

    /// Create a map of all collisions
    pub fn get_collision_map(&self) -> CollisionMap<'_> {
        let mut cm = CollisionMap(HashMap::new());
        for (name, hash) in self.names.iter() {
            if let Some(vec) = cm.0.get_mut(hash) {
                vec.push(name);
            } else {
                cm.0.insert(*hash, vec![name]);
            }
        }
        cm
    }

    /// Save this list to a file
    pub fn save<W: Write>(&self, to: &mut W) -> io::Result<usize> {
        use lzzzz::lz4f::{
            PreferencesBuilder,
            WriteCompressor,
            CLEVEL_MAX,
        };
        let mut to = WriteCompressor::new(to, PreferencesBuilder::new().compression_level(CLEVEL_MAX).build())?;
        let mut done = 0;
        for (path, (hash, _)) in self.table.iter() {
            // Entry layout: header magic, native-endian path length, path bytes, raw hash.
            let path = path_bytes(path.as_ref());
            let hash: &[u8] = hash.as_ref();
            to.write(ENTRY_HEADER)?;
            to.write(bytes::reinterpret(&path.len()))?;
            to.write(path)?;
            to.write(hash)?;
            done += 1;
        }
        Ok(done)
    }

    /// Save this list to a file, asynchronously
    #[cfg(feature="threads")]
    pub async fn save_async<W>(&self, to: &mut W) -> io::Result<usize>
    where W: tokio::io::AsyncWrite + std::marker::Send + std::marker::Sync + std::marker::Unpin
    {
        use tokio::prelude::*;
        use lzzzz::lz4f::{
            PreferencesBuilder,
            AsyncWriteCompressor,
            CLEVEL_MAX,
        };
        let mut done = 0usize;
        let mut to = AsyncWriteCompressor::new(to, PreferencesBuilder::new().compression_level(CLEVEL_MAX).build())?;
        for (path, (hash, _)) in self.table.iter() {
            let path = path_bytes(path.as_ref());
            let hash: &[u8] = hash.as_ref();
            to.write_all(ENTRY_HEADER).await?;
            to.write_all(bytes::reinterpret(&path.len())).await?;
            to.write_all(path).await?;
            to.write_all(hash).await?;
            done += 1;
        }
        to.flush().await?;
        to.shutdown().await?;
        Ok(done)
    }

    /// Load from file.
    pub fn load<R: Read>(&mut self, from: &mut R, trans: bool) -> io::Result<usize> {
        let mut done = 0;
        let mut read;
        let mut header_buffer = [0u8; ENTRY_HEADER.len() + std::mem::size_of::<usize>()];
        let mut hash_buffer = [0u8; hash::SHA256_SIZE];
        let mut from = lzzzz::lz4f::ReadDecompressor::new(from)?;
        //XXX: Change to read_exact
        while {
            read = from.read(&mut header_buffer[..])?;
            read == header_buffer.len() && &header_buffer[..ENTRY_HEADER.len()] == ENTRY_HEADER
        } {
            let sz = *bytes::reinterpret_back(&header_buffer[ENTRY_HEADER.len()..]);
            if sz > 0 {
                let mut path = vec![0u8; sz];
                if from.read(&mut path[..])? == sz {
                    let path = bytes_path(&path[..]);
                    if from.read(&mut hash_buffer[..])? == hash::SHA256_SIZE {
                        if !trans && self.cache(path, hash::Sha256Hash::new(hash_buffer)) {
                            done += 1;
                        } else if trans && self.cache_trans(path, hash::Sha256Hash::new(hash_buffer)) {
                            done += 1;
                        }
                    }
                }
            }
        }
        Ok(done)
    }

    /// Load from file, asynchronously.
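    ///
    /// Running out of input ("early eof" from `read_exact`) is treated as a clean
    /// end of the stream, and the number of entries loaded so far is returned.
    ///
    /// # Example
    ///
    /// A minimal round-trip sketch (marked `ignore`; the async runtime setup and the
    /// `cache.bin` path are illustrative, not part of this module):
    ///
    /// ```ignore
    /// let mut file = tokio::fs::File::create("cache.bin").await?;
    /// map.save_async(&mut file).await?;
    ///
    /// let mut file = tokio::fs::File::open("cache.bin").await?;
    /// let mut loaded = DupeMap::new();
    /// // `true` loads the entries as transient: they will be saved again later,
    /// // but ignored when computing duplicates.
    /// loaded.load_async(&mut file, true).await?;
    /// ```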
#[cfg(feature="threads")] pub async fn load_async(&mut self, from: &mut R, trans: bool) -> io::Result where R: tokio::io::AsyncRead + std::marker::Send + std::marker::Sync + std::marker::Unpin { use tokio::prelude::*; let mut done=0; let mut read; let mut header_buffer = [0u8; ENTRY_HEADER.len() + std::mem::size_of::()]; let mut hash_buffer = [0u8; hash::SHA256_SIZE]; let mut from = lzzzz::lz4f::AsyncReadDecompressor::new(from)?; while {read = match from.read_exact(&mut header_buffer[..]).await { Ok(v) => Ok(v), Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => { if let Some(re) = e.get_ref() { if format!("{}", re) == "early eof" { // Is there a better way to compare these? `Any` trait? Is it worth it? Don't care, it's an error anyway. return Ok(done); // This is fine } } Err(e) }, v => v, }?; read == header_buffer.len() && &header_buffer[..ENTRY_HEADER.len()] == ENTRY_HEADER} { let sz = *bytes::reinterpret_back(&header_buffer[ENTRY_HEADER.len()..]); if sz > 0 { let mut path = vec![0u8; sz]; if from.read_exact(&mut path[..]).await? == sz { let path = bytes_path(&path[..]); if from.read_exact(&mut hash_buffer[..]).await? == hash::SHA256_SIZE { if !trans && self.cache(path, hash::Sha256Hash::new(hash_buffer)) { done +=1; } else if trans && self.cache_trans(path, hash::Sha256Hash::new(hash_buffer)) { done +=1; } } } } } Ok(done) } }