//! De-duplicating functionality use std::{ marker::PhantomData, hash::{ Hash, Hasher, }, iter::{ self, }, }; use smallmap::Map; fn compute_hash_single(value: &T) -> u64 { let mut hasher = std::collections::hash_map::DefaultHasher::new(); value.hash(&mut hasher); hasher.finish() } /// De-duplicating iterator with low hashing precision (64 bits) #[derive(Debug, Clone)] pub struct DedupIter where T: Hash, { iter: I, hashes: Map, _output: PhantomData>, } impl Iterator for DedupIter where I: Iterator, T: Hash { type Item = T; fn next(&mut self) -> Option { while let Some(value) = self.iter.next() { if self.hashes.insert(compute_hash_single(&value), ()).is_none() { // Is unique hash return Some(value); } } None } fn size_hint(&self) -> (usize, Option) { let (min, max) = self.iter.size_hint(); (std::cmp::min(1, min), max) } } impl iter::FusedIterator for DedupIter where I: iter::FusedIterator + Iterator, T: Hash{} impl DedupIter where I: Iterator, T: Hash { pub fn into_inner(self) -> I { self.iter } } pub trait DedupIterExt: Sized { fn dedup(self) -> DedupIter; } impl DedupIterExt for I where I: Iterator { fn dedup(self) -> DedupIter { DedupIter{ iter: self, hashes: Map::with_capacity(8), // there are 8 bytes in u64, so preallocate pages to hold all possible key values. This is 16kb, I think. _output: PhantomData } } }