|
|
|
//! De-duplicating functionality
|
|
|
|
use std::{
|
|
|
|
marker::PhantomData,
|
|
|
|
hash::{
|
|
|
|
Hash,
|
|
|
|
Hasher,
|
|
|
|
},
|
|
|
|
iter::{
|
|
|
|
self,
|
|
|
|
|
|
|
|
},
|
|
|
|
};
|
|
|
|
use smallmap::Map;
|
|
|
|
|
|
|
|
//TODO: Use SHA256 or 512 when not using feature flag `low-prec-arg-dedup`.
|
|
|
|
//This will produce more false-positives as it is now.
|
|
|
|
/// Computes a 64-bit hash of `value` with the standard library's `DefaultHasher`.
///
/// A fresh hasher is created on every call, so the result depends only on
/// what `value` feeds into the hasher through its `Hash` implementation.
fn compute_hash_single<T: Hash>(value: &T) -> u64 {
    let mut state = std::collections::hash_map::DefaultHasher::new();
    Hash::hash(value, &mut state);
    Hasher::finish(&state)
}
|
|
|
|
|
|
|
|
/// De-duplicating iterator with low hashing precision (64 bits)
|
|
|
|
#[derive(Debug, Clone)]
|
|
|
|
pub struct DedupIter<I, T>
|
|
|
|
where T: Hash,
|
|
|
|
{
|
|
|
|
iter: I,
|
|
|
|
hashes: Map<u64, ()>,
|
|
|
|
_output: PhantomData<Map<T, ()>>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<I, T> Iterator for DedupIter<I, T>
|
|
|
|
where I: Iterator<Item = T>,
|
|
|
|
T: Hash
|
|
|
|
{
|
|
|
|
type Item = T;
|
|
|
|
fn next(&mut self) -> Option<Self::Item>
|
|
|
|
{
|
|
|
|
while let Some(value) = self.iter.next()
|
|
|
|
{
|
|
|
|
if self.hashes.insert(compute_hash_single(&value), ()).is_none() {
|
|
|
|
// Is unique hash
|
|
|
|
return Some(value);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
None
|
|
|
|
}
|
|
|
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
|
|
|
let (min, max) = self.iter.size_hint();
|
|
|
|
(std::cmp::min(1, min), max)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
impl<I, T> iter::FusedIterator for DedupIter<I, T>
|
|
|
|
where I: iter::FusedIterator + Iterator<Item= T>,
|
|
|
|
T: Hash{}
|
|
|
|
|
|
|
|
impl<I, T> DedupIter<I, T>
|
|
|
|
where I: Iterator<Item = T>,
|
|
|
|
T: Hash
|
|
|
|
{
|
|
|
|
pub fn into_inner(self) -> I
|
|
|
|
{
|
|
|
|
self.iter
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub trait DedupIterExt<T: Hash>: Sized
|
|
|
|
{
|
|
|
|
fn dedup(self) -> DedupIter<Self, T>;
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<I, T: Hash> DedupIterExt<T> for I
|
|
|
|
where I: Iterator<Item = T>
|
|
|
|
{
|
|
|
|
fn dedup(self) -> DedupIter<Self, T> {
|
|
|
|
DedupIter{
|
|
|
|
iter: self,
|
|
|
|
hashes: Map::with_capacity(8), // there are 8 bytes in u64, so preallocate pages to hold all possible key values. This is 16kb, I think.
|
|
|
|
_output: PhantomData
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|