You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
372 lines
9.6 KiB
372 lines
9.6 KiB
use super::*;
|
|
use std::{
|
|
collections::{HashMap, HashSet},
|
|
io::{
|
|
self,
|
|
Write,
|
|
Read,
|
|
},
|
|
path::{
|
|
Path,
|
|
PathBuf
|
|
},
|
|
fmt,
|
|
};
|
|
|
|
/// Map of collisions
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
pub struct CollisionMap<'a>(HashMap<hash::Sha256Hash, Vec<&'a Path>>);
|
|
|
|
impl<'a> CollisionMap<'a>
|
|
{
|
|
#[inline] pub fn len(&self) -> usize
|
|
{
|
|
self.0.len()
|
|
}
|
|
#[inline] pub fn full_len(&self) -> usize
|
|
{
|
|
self.0.iter().map(|(_, v)| v.len()).sum()
|
|
}
|
|
#[inline] pub fn iter(&self) -> impl Iterator<Item = (&hash::Sha256Hash, &[&'a Path])>
|
|
{
|
|
self.0.iter().map(|(k, v)| (k, v.as_slice()))
|
|
}
|
|
#[inline] pub fn of_hash(&self, name: &hash::Sha256Hash) -> &[&'a Path]
|
|
{
|
|
if let Some(vec) = self.0.get(name)
|
|
{
|
|
&vec[..]
|
|
} else {
|
|
&[]
|
|
}
|
|
}
|
|
#[inline] pub fn hashes(&self) -> impl Iterator<Item = &hash::Sha256Hash>
|
|
{
|
|
self.0.iter().map(|(k, _)| k)
|
|
}
|
|
#[inline] pub fn into_iter(self) -> impl Iterator<Item = (hash::Sha256Hash, Vec<&'a Path>)>
|
|
{
|
|
self.0.into_iter()
|
|
}
|
|
}
|
|
|
|
#[derive(Clone, PartialEq, Eq, Debug)]
|
|
pub struct DupeMap
|
|
{
|
|
names: HashMap<PathBuf, hash::Sha256Hash>,
|
|
iteration: HashSet<hash::Sha256Hash>, // What we calculate
|
|
table: HashMap<PathBuf, (hash::Sha256Hash, bool)>, // What we save and load, and if it's transient (ignored in calculate)
|
|
}
|
|
|
|
impl fmt::Display for DupeMap
|
|
{
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
|
|
{
|
|
write!(f, "[DupeMap: Iteration: {} unique hashes computed. ", self.iteration.len())?;
|
|
let len = self.table.len();
|
|
let tlen = self.table.iter().filter(|(_, (_, x))| *x).count();
|
|
write!(f, "Table: {} cached ({} trans. {} real.]", len, tlen, len-tlen)
|
|
}
|
|
}
|
|
|
|
/// View `path` as its raw bytes without copying.
///
/// Uses `std::os::unix::ffi::OsStrExt`, so this is Unix-only (Windows is deliberately
/// not supported here).
#[inline]
fn path_bytes(path: &Path) -> &[u8]
{
    let os_str = path.as_os_str();
    std::os::unix::ffi::OsStrExt::as_bytes(os_str)
}
|
|
|
|
/// View raw bytes as a `Path` without copying (the inverse of `path_bytes`).
///
/// Uses `std::os::unix::ffi::OsStrExt`, so this is Unix-only.
#[inline]
fn bytes_path(bytes: &[u8]) -> &Path
{
    let os_str = <std::ffi::OsStr as std::os::unix::ffi::OsStrExt>::from_bytes(bytes);
    Path::new(os_str)
}
|
|
|
|
/// Marker bytes that `save`/`save_async` write at the start of every entry and that
/// `load`/`load_async` require before they accept an entry.
const ENTRY_HEADER: &[u8] = &[0x00, 0xde, 0xad];
|
|
|
|
impl DupeMap
|
|
{
|
|
/// Create a new empty dupe map
|
|
pub fn new() -> Self
|
|
{
|
|
Self{iteration: HashSet::new(), table: HashMap::new(), names: HashMap::new()}
|
|
}
|
|
|
|
/// Iterator over all hashes
|
|
pub fn iter(&self) -> std::collections::hash_set::Iter<hash::Sha256Hash>
|
|
{
|
|
self.iteration.iter()
|
|
}
|
|
|
|
/// Forcefully update the cache
|
|
pub fn cache_force(&mut self, id: impl AsRef<Path>, hash: hash::Sha256Hash, trans: bool)
|
|
{
|
|
if let Some((h,t)) = self.table.get_mut(id.as_ref()) {
|
|
*h = hash;
|
|
*t = trans;
|
|
} else {
|
|
self.table.insert(id.as_ref().to_owned(), (hash,true));
|
|
}
|
|
}
|
|
|
|
/// Remove from the cache if it exists
|
|
pub fn uncache(&mut self, id: impl AsRef<Path>) -> Option<(hash::Sha256Hash, bool)>
|
|
{
|
|
self.table.remove(id.as_ref())
|
|
}
|
|
|
|
/// The amount of cached items (inc. transient ones)
|
|
pub fn cache_len(&self) -> usize
|
|
{
|
|
self.table.len()
|
|
}
|
|
|
|
/// Iterate through the cache
|
|
pub fn cache_iter(&self) -> std::collections::hash_map::Iter<PathBuf, (hash::Sha256Hash, bool)>
|
|
{
|
|
self.table.iter()
|
|
}
|
|
|
|
/// Iterate through the cache
|
|
pub fn cache_iter_mut(&mut self) -> std::collections::hash_map::IterMut<PathBuf, (hash::Sha256Hash, bool)>
|
|
{
|
|
self.table.iter_mut()
|
|
}
|
|
|
|
/// Cache this path's hash
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// True if caching was okay, false if key already added.
|
|
///
|
|
/// # Notes
|
|
///
|
|
/// If value is added and is transient, it is counted as not existing.
|
|
pub fn cache<T: AsRef<Path>>(&mut self, id: T, hash: hash::Sha256Hash) -> bool
|
|
{
|
|
if self.table.contains_key(id.as_ref()) {
|
|
if let Some((got_hash, trans @ true)) = self.table.get_mut(id.as_ref()) {
|
|
*trans = false;
|
|
*got_hash = hash;
|
|
true
|
|
} else {
|
|
false
|
|
}
|
|
} else {
|
|
self.table.insert(id.as_ref().to_owned(), (hash, false));
|
|
true
|
|
}
|
|
}
|
|
|
|
/// Cache this path's hash as transient.
|
|
/// Transient means it is ignored in calculations but is still saved.
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// True if caching was okay, false if already added (transient or not).
|
|
pub fn cache_trans<T: AsRef<Path>>(&mut self, id: T, hash: hash::Sha256Hash) -> bool
|
|
{
|
|
if self.table.contains_key(id.as_ref()) {
|
|
false
|
|
} else {
|
|
self.table.insert(id.as_ref().to_owned(), (hash,true));
|
|
true
|
|
}
|
|
}
|
|
|
|
/// Get a mutable reference to the transience of this path, if it is added
|
|
pub fn transience_mut<T: AsRef<Path>>(&mut self, id: T) -> Option<&mut bool>
|
|
{
|
|
match self.table.get_mut(id.as_ref()) {
|
|
Some((_, trans)) => Some(trans),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
/// Get the transience of this path, if it is added
|
|
pub fn transience<T: AsRef<Path>>(&self, id: T) -> Option<bool>
|
|
{
|
|
if let Some((_, trans)) = self.table.get(id.as_ref()) {
|
|
Some(*trans)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Look for path `id` in cache.
|
|
pub fn get_cache<T: AsRef<Path>>(&self, id: T) -> Option<&hash::Sha256Hash>
|
|
{
|
|
match self.table.get(id.as_ref()) {
|
|
Some((hash, false)) => Some(hash),
|
|
_ => None
|
|
}
|
|
}
|
|
|
|
/// Try to add to store. True if adding was oke, false if already exists.
|
|
pub fn try_add(&mut self, hash: hash::Sha256Hash, name: impl Into<PathBuf>) -> bool
|
|
{
|
|
if self.iteration.insert(hash) {
|
|
self.names.insert(name.into(), hash);
|
|
true
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
/// Create a map of all collisions
|
|
pub fn get_collision_map(&self) -> CollisionMap<'_>
|
|
{
|
|
let mut cm = CollisionMap(HashMap::new());
|
|
|
|
for (name, hash) in self.names.iter()
|
|
{
|
|
if let Some(vec) = cm.0.get_mut(hash)
|
|
{
|
|
vec.push(name);
|
|
} else {
|
|
cm.0.insert(*hash, vec![name]);
|
|
}
|
|
}
|
|
|
|
cm
|
|
}
|
|
|
|
/// Save this list to a file
|
|
pub fn save<W: Write>(&self, to: &mut W) -> io::Result<usize>
|
|
{
|
|
use lzzzz::{
|
|
lz4f::{
|
|
PreferencesBuilder,
|
|
WriteCompressor,
|
|
CLEVEL_MAX,
|
|
},
|
|
};
|
|
let mut to = WriteCompressor::new(to, PreferencesBuilder::new().compression_level(CLEVEL_MAX).build())?;
|
|
let mut done=0;
|
|
for (path, (hash, _)) in self.table.iter()
|
|
{
|
|
let path = path_bytes(path.as_ref());
|
|
let hash: &[u8] = hash.as_ref();
|
|
|
|
to.write(ENTRY_HEADER)?;
|
|
to.write(bytes::reinterpret(&path.len()))?;
|
|
to.write(path)?;
|
|
to.write(hash)?;
|
|
done+=1;
|
|
}
|
|
Ok(done)
|
|
}
|
|
/// Save the cache table to `to` asynchronously as an LZ4-frame-compressed stream.
///
/// Writes the same per-entry layout as `save` (header, path length, path bytes, hash bytes),
/// then flushes and shuts down the compressor. Returns the number of entries written.
#[cfg(feature="threads")]
pub async fn save_async<W>(&self, to: &mut W) -> io::Result<usize>
where W: tokio::io::AsyncWrite + std::marker::Send + std::marker::Sync + std::marker::Unpin
{
    use tokio::prelude::*;
    use lzzzz::lz4f::{AsyncWriteCompressor, PreferencesBuilder, CLEVEL_MAX};

    let prefs = PreferencesBuilder::new().compression_level(CLEVEL_MAX).build();
    let mut sink = AsyncWriteCompressor::new(to, prefs)?;

    let mut written = 0usize;
    for (entry_path, (digest, _)) in &self.table
    {
        let raw_path = path_bytes(entry_path.as_ref());
        let raw_hash: &[u8] = digest.as_ref();

        sink.write_all(ENTRY_HEADER).await?;
        sink.write_all(bytes::reinterpret(&raw_path.len())).await?;
        sink.write_all(raw_path).await?;
        sink.write_all(raw_hash).await?;
        written += 1;
    }

    sink.flush().await?;
    sink.shutdown().await?;
    Ok(written)
}
|
|
|
|
/// Load from file.
|
|
pub fn load<R: Read>(&mut self, from: &mut R, trans: bool) -> io::Result<usize>
|
|
{
|
|
let mut done=0;
|
|
let mut read;
|
|
let mut header_buffer = [0u8; ENTRY_HEADER.len() + std::mem::size_of::<usize>()];
|
|
let mut hash_buffer = [0u8; hash::SHA256_SIZE];
|
|
let mut from = lzzzz::lz4f::ReadDecompressor::new(from)?;
|
|
|
|
//XXX: Change to read_exact
|
|
while {read = from.read(&mut header_buffer[..])?; read == header_buffer.len() && &header_buffer[..ENTRY_HEADER.len()] == ENTRY_HEADER}
|
|
{
|
|
let sz = *bytes::reinterpret_back(&header_buffer[ENTRY_HEADER.len()..]);
|
|
if sz > 0 {
|
|
let mut path = vec![0u8; sz];
|
|
if from.read(&mut path[..])? == sz {
|
|
let path = bytes_path(&path[..]);
|
|
if from.read(&mut hash_buffer[..])? == hash::SHA256_SIZE
|
|
{
|
|
if !trans && self.cache(path, hash::Sha256Hash::new(hash_buffer)) {
|
|
done +=1;
|
|
} else if trans && self.cache_trans(path, hash::Sha256Hash::new(hash_buffer)) {
|
|
done +=1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(done)
|
|
}
|
|
|
|
/// Load cache entries asynchronously from an LZ4-frame-compressed stream produced by
/// `save`/`save_async`.
///
/// Entries are added with `cache_trans` when `trans` is true and with `cache` otherwise.
/// Reading stops at the first mismatching `ENTRY_HEADER`, or when the header read fails
/// with an `UnexpectedEof` error whose inner message is `"early eof"` (treated as a normal
/// end of stream).
///
/// # Returns
///
/// The number of entries actually added to the cache (entries rejected by
/// `cache`/`cache_trans` and entries with an empty path are not counted).
///
/// # Errors
///
/// Any other read error, including an end of stream in the middle of an entry.
#[cfg(feature="threads")]
pub async fn load_async<R>(&mut self, from: &mut R, trans: bool) -> io::Result<usize>
where R: tokio::io::AsyncRead + std::marker::Send + std::marker::Sync + std::marker::Unpin
{
    use tokio::prelude::*;

    let mut done = 0;
    let mut header_buffer = [0u8; ENTRY_HEADER.len() + std::mem::size_of::<usize>()];
    let mut hash_buffer = [0u8; hash::SHA256_SIZE];

    let mut from = lzzzz::lz4f::AsyncReadDecompressor::new(from)?;
    loop {
        let read = match from.read_exact(&mut header_buffer[..]).await {
            Ok(n) => n,
            Err(e) => {
                // An "early eof" while reading a header just means there are no more entries.
                // Comparing the message is crude, but it is the only handle on this case here.
                let ran_dry = e.kind() == io::ErrorKind::UnexpectedEof
                    && e.get_ref().map_or(false, |inner| inner.to_string() == "early eof");
                return if ran_dry { Ok(done) } else { Err(e) };
            },
        };
        if read != header_buffer.len() || &header_buffer[..ENTRY_HEADER.len()] != ENTRY_HEADER {
            break;
        }

        // NOTE(review): the length field starts at byte offset ENTRY_HEADER.len() (3), which is
        // not usize-aligned; confirm `bytes::reinterpret_back` tolerates unaligned input.
        let sz = *bytes::reinterpret_back(&header_buffer[ENTRY_HEADER.len()..]);
        let mut path = vec![0u8; sz];
        if sz > 0 && from.read_exact(&mut path[..]).await? != sz {
            continue;
        }
        // Always consume the hash, even for an empty path, so the stream stays aligned
        // on the next entry.
        if from.read_exact(&mut hash_buffer[..]).await? != hash::SHA256_SIZE {
            continue;
        }
        if sz == 0 {
            continue; // Empty paths are never cached.
        }

        let path = bytes_path(&path[..]);
        let hash = hash::Sha256Hash::new(hash_buffer);
        let added = if trans {
            self.cache_trans(path, hash)
        } else {
            self.cache(path, hash)
        };
        if added {
            done += 1;
        }
    }

    Ok(done)
}
|
|
}
|