//! De-duplicating functionality use std::{ marker::PhantomData, hash::{ Hash, }, iter, }; use smallmap::Map; #[cfg(not(feature="low-prec-arg-dedup"))] mod paranoid { use sha2::{Digest, Sha256}; use std::hash::Hasher; use crate::util::bytes; #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct Sha256Hash([u8; 32]); pub struct Sha256Hasher(Sha256); impl Hasher for Sha256Hasher { fn write(&mut self, bytes: &[u8]) { self.0.update(bytes); } fn finish(&self) -> u64 { let mut out = [0u8; std::mem::size_of::()]; let def = self.0.clone().finalize(); bytes::copy_slice(&mut out[..], &def[..]); u64::from_le_bytes(out) } } impl Sha256Hasher { pub fn new() -> Self { Self(Sha256::new()) } pub fn finish(self) -> Sha256Hash { Sha256Hash(self.0.finalize().into()) } } } cfg_if::cfg_if! { if #[cfg(feature="low-prec-arg-dedup")] { use std::hash::Hasher; type HashType = u64; type DefaultHasher = std::collections::hash_map::DefaultHasher; } else { type HashType = paranoid::Sha256Hash; type DefaultHasher = paranoid::Sha256Hasher; } } //TODO: Use SHA256 or 512 when not using feature flag `low-prec-arg-dedup`. //This will produce more false-positives as it it now. fn compute_hash_single(value: &T) -> HashType { let mut hasher = DefaultHasher::new(); value.hash(&mut hasher); hasher.finish() } /// De-duplicating iterator with low hashing precision (64 bits) #[derive(Debug, Clone)] pub struct DedupIter where T: Hash, { iter: I, hashes: Map, _output: PhantomData>, } impl Iterator for DedupIter where I: Iterator, T: Hash { type Item = T; fn next(&mut self) -> Option { while let Some(value) = self.iter.next() { if self.hashes.insert(compute_hash_single(&value), ()).is_none() { // Is unique hash return Some(value); } } None } fn size_hint(&self) -> (usize, Option) { let (min, max) = self.iter.size_hint(); (std::cmp::min(1, min), max) } } impl iter::FusedIterator for DedupIter where I: iter::FusedIterator + Iterator, T: Hash{} impl DedupIter where I: Iterator, T: Hash { pub fn into_inner(self) -> I { self.iter } } pub trait DedupIterExt: Sized { fn dedup(self) -> DedupIter; } impl DedupIterExt for I where I: Iterator { fn dedup(self) -> DedupIter { DedupIter{ iter: self, hashes: Map::with_capacity(8), // there are 8 bytes in u64, so preallocate pages to hold all possible key values. This is 16kb, I think. _output: PhantomData } } }