You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
videl/src/dedup.rs

128 lines
2.7 KiB

//! De-duplicating functionality
use std::{
marker::PhantomData,
hash::{
Hash,
},
iter,
};
use smallmap::Map;
#[cfg(not(feature="low-prec-arg-dedup"))]
mod paranoid
{
    use sha2::{Digest, Sha256};
    use std::hash::Hasher;
    use crate::util::bytes;

    /// A complete 32-byte SHA-256 digest.
    #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
    pub struct Sha256Hash([u8; 32]);

    /// A [`Hasher`] that feeds every byte written to it into a SHA-256 state.
    pub struct Sha256Hasher(Sha256);

    impl Sha256Hasher
    {
        /// Create a hasher with a fresh SHA-256 state.
        pub fn new() -> Self
        {
            Sha256Hasher(Sha256::new())
        }

        /// Consume the hasher and return the full 256-bit digest.
        ///
        /// Being an inherent method, this is what a plain `hasher.finish()`
        /// call resolves to, rather than the `u64`-returning `Hasher::finish`.
        pub fn finish(self) -> Sha256Hash
        {
            let Sha256Hasher(state) = self;
            Sha256Hash(state.finalize().into())
        }
    }

    impl Hasher for Sha256Hasher
    {
        fn write(&mut self, chunk: &[u8]) {
            self.0.update(chunk);
        }

        /// Reduce the digest of everything written so far to a `u64`.
        ///
        /// The state is cloned before finalising, so the hasher itself is
        /// left untouched and can keep accepting input.
        fn finish(&self) -> u64 {
            let digest = self.0.clone().finalize();
            let mut prefix = [0u8; std::mem::size_of::<u64>()];
            // NOTE(review): presumably `copy_slice` copies only as many bytes as
            // fit in `prefix` (the leading 8 digest bytes) -- confirm against
            // `crate::util::bytes`.
            bytes::copy_slice(&mut prefix[..], &digest[..]);
            u64::from_le_bytes(prefix)
        }
    }
}
// Select, per build configuration, the key type recorded for each item
// (`HashType`) and the hasher that produces it (`DefaultHasher`).
cfg_if::cfg_if! {
if #[cfg(feature="low-prec-arg-dedup")] {
// Low precision: 64-bit keys from the standard library's default hasher.
// `Hasher` must be in scope here so that `.finish()` can be called on it.
use std::hash::Hasher;
type HashType = u64;
type DefaultHasher = std::collections::hash_map::DefaultHasher;
} else {
// Default: full SHA-256 digests from the `paranoid` module.
type HashType = paranoid::Sha256Hash;
type DefaultHasher = paranoid::Sha256Hasher;
}
}
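//TODO: Use SHA256 or 512 when not using feature flag `low-prec-arg-dedup`.
//This will produce more false-positives as it is now.
//NOTE(review): `HashType`/`DefaultHasher` already resolve to the SHA-256 types in `paranoid` when the feature is off, so this TODO looks resolved -- confirm and remove.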
/// Hash a single value with the configured `DefaultHasher` and return the
/// resulting `HashType`.
fn compute_hash_single<T: Hash>(value: &T) -> HashType
{
    let mut state = DefaultHasher::new();
    Hash::hash(value, &mut state);
    state.finish()
}
/// De-duplicating iterator adaptor.
///
/// Wraps an inner iterator and yields an item only the first time its hash is
/// seen. Only hashes are compared, never the items themselves, so two distinct
/// items whose hashes collide are treated as duplicates.
///
/// The hash precision is that of `HashType`: 64 bits when the
/// `low-prec-arg-dedup` feature is enabled, a full SHA-256 digest otherwise.
#[derive(Debug, Clone)]
pub struct DedupIter<I, T>
where T: Hash,
{
// The wrapped iterator that items are pulled from.
iter: I,
// Hashes of the items pulled so far; used as a set (the values are `()`).
hashes: Map<HashType, ()>,
// Zero-sized marker tying the item type `T` to the struct; no `T` is stored.
_output: PhantomData<Map<T, ()>>,
}
impl<I, T> Iterator for DedupIter<I, T>
where
    I: Iterator<Item = T>,
    T: Hash,
{
    type Item = T;

    /// Yield the next item whose hash has not been recorded yet.
    ///
    /// Items are pulled from the inner iterator, and each one's hash is
    /// recorded, until one is found whose hash was not already present; that
    /// item is returned. Returns `None` once the inner iterator returns `None`.
    fn next(&mut self) -> Option<Self::Item>
    {
        // Borrow the field on its own so the closure does not need to capture
        // all of `self` while `self.iter` is mutably borrowed by `find`.
        let seen = &mut self.hashes;
        self.iter
            .find(|value| seen.insert(compute_hash_single(value), ()).is_none())
    }

    /// Bounds on the number of items still to be yielded.
    ///
    /// The lower bound is always 0: every remaining inner item may hash to a
    /// value that has already been recorded, in which case nothing further is
    /// yielded. The upper bound is the inner iterator's, because this adaptor
    /// only ever drops items.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.iter.size_hint();
        (0, upper)
    }
}
// `next` returns `None` only when the inner iterator does, so a fused inner
// iterator makes this adaptor fused as well.
impl<I, T> iter::FusedIterator for DedupIter<I, T>
where
    T: Hash,
    I: Iterator<Item = T> + iter::FusedIterator,
{
}
impl<I, T> DedupIter<I, T>
where I: Iterator<Item = T>,
T: Hash
{
pub fn into_inner(self) -> I
{
self.iter
}
}
/// Extension trait providing the `dedup()` adaptor.
///
/// `T` is the item type that gets hashed to detect duplicates.
pub trait DedupIterExt<T: Hash>: Sized
{
/// Wrap `self` in a [`DedupIter`], which yields an item only the first time
/// its hash is seen.
fn dedup(self) -> DedupIter<Self, T>;
}
impl<I, T: Hash> DedupIterExt<T> for I
where
    I: Iterator<Item = T>,
{
    /// Wrap this iterator in a [`DedupIter`] that starts with no recorded hashes.
    fn dedup(self) -> DedupIter<Self, T> {
        // Number of map pages allocated up front. The original rationale was
        // one page per byte of a `u64` key (estimated by the author at ~16kb).
        // NOTE(review): that reasoning is specific to 64-bit keys -- confirm it
        // is still the intended size when `HashType` is the SHA-256 digest.
        const PREALLOCATED_PAGES: usize = 8;

        let hashes = Map::with_capacity(PREALLOCATED_PAGES);
        DedupIter {
            iter: self,
            hashes,
            _output: PhantomData,
        }
    }
}