//! A hash-set analogue that does not own its data. //! //! It can be used to "mark" items without the need to transfer ownership to the map //! //! # Example use case //! ``` //! # use refset::HashRefSet; //! /// Process arguments while ignoring duplicates //! fn process_args(args: impl IntoIterator) { //! let mut same= HashRefSet::new(); //! for argument in args.into_iter() //! { //! if !same.insert(argument.as_str()) { //! // Already processed this input, ignore //! continue; //! } //! //do work... //! } //! } //! ``` //! # Serialisation support with `serde` crate //! `HashRefSet` and `HashType` both implement `Serialize` and `Deserialize` from the `serde` crate if the `serde` feature is enabled. By default it is not. //! # Drawbacks //! Since the item is not inserted itself, we cannot use `Eq` to double check there was not a hash collision. //! While the hashing algorithm used (Sha512) is extremely unlikely to produce collisions, especially for small data types, keep in mind that it is not infallible. use std::{ collections::{ hash_set, HashSet, }, marker::{ PhantomData, Send, Sync, }, hash::Hash, borrow::Borrow, }; mod hashing; /// The type used to store the hash of each item. /// /// It is a result of the `SHA512` algorithm as a newtype 64 byte array marked with `#[repr(transparent)]`. /// If you want to get the bytes from it, you can transmute safely. /// ``` /// # use refset::HashType; /// fn hash_bytes(hash: HashType) -> [u8; 64] /// { /// unsafe { /// std::mem::transmute(hash) /// } /// } /// /// fn hash_bytes_assert() /// { /// assert_eq!(hash_bytes(Default::default()), [0u8; 64]); /// } /// ``` pub type HashType = hashing::Sha512Hash; /// Compute the `HashType` value for this `T`. fn compute_hash_for(value: &T) -> HashType { let mut hasher = hashing::Sha512Hasher::new(); value.hash(&mut hasher); hasher.finalize() } #[allow(dead_code)] #[cold] fn compute_both_hash_for(value: &T) -> (u64, HashType) { use sha2::{ Digest, digest::generic_array::sequence::Split, }; let mut hasher = hashing::Sha512Hasher::new(); value.hash(&mut hasher); let sha512 = hasher.into_inner(); let full = sha512.finalize(); let mut arr = [0u8; hashing::HASH_SIZE]; debug_assert_eq!(arr.len(), full.len()); unsafe { std::ptr::copy_nonoverlapping(&full[0] as *const u8, &mut arr[0] as *mut u8, hashing::HASH_SIZE); } (u64::from_ne_bytes(full.split().0.into()), HashType::from_bytes(arr)) } /// A hash-set of references to an item. /// /// Instead of inserting the item into the set, the set is "marked" with the item. /// Think of this as inserting a reference into the set with no lifetime. /// /// Any type that can borrow to `T` can be used to insert, and neither type needs to be `Sized`. /// `T` need only implement `Hash`. /// /// # Hashing algorithm /// The hasing algorithm used is `Sha512`, which is rather large (64 bytes). /// At present there is no way to change the hasher used, I might implement that functionality in the future. #[derive(Debug, Clone, PartialEq, Eq, Default)] #[cfg_attr(feature="serde", derive(serde::Serialize, serde::Deserialize))] pub struct HashRefSet(HashSet, PhantomData>); unsafe impl Send for HashRefSet{} unsafe impl Sync for HashRefSet{} impl HashRefSet { /// Create a new empty `HashRefSet` pub fn new() -> Self { Self( HashSet::new(), PhantomData ) } /// Create a new `HashRefSet` with a capacity pub fn with_capacity(cap: usize) -> Self { Self(HashSet::with_capacity(cap), PhantomData) } /// Insert a reference into the set. The reference can be any type that borrows to `T`. /// /// Returns `true` if there was no previous item, `false` if there was. pub fn insert(&mut self, value: &Q) -> bool where Q: ?Sized + Borrow { self.0.insert(compute_hash_for(value.borrow())) } /// Remove a reference from the set. /// /// Returns `true` if it existed. pub fn remove(&mut self, value: &Q) -> bool where Q: ?Sized + Borrow { self.0.remove(&compute_hash_for(value.borrow())) } /// Check if this value has been inserted into the set. pub fn contains(&mut self, value: &Q) -> bool where Q: ?Sized + Borrow { self.0.contains(&compute_hash_for(value.borrow())) } /// The number of items stored in the set pub fn len(&self) -> usize { self.0.len() } /// Is the set empty pub fn is_empty(&self) -> bool { self.0.is_empty() } /// An iterator over the hashes stored in the set. pub fn hashes_iter(&self) -> hash_set::Iter<'_, HashType> { self.0.iter() } #[inline] fn into_hashes_iter(self) -> hash_set::IntoIter { self.0.into_iter() } } impl IntoIterator for HashRefSet { type Item= HashType; type IntoIter = hash_set::IntoIter; #[inline] fn into_iter(self) -> Self::IntoIter { self.into_hashes_iter() } } #[cfg(test)] mod tests { use super::*; #[test] fn insert() { let mut refset = HashRefSet::new(); let values= vec![ "hi", "hello", "one", "two", ]; for &string in values.iter() { refset.insert(string); } for string in values { assert!(refset.contains(string)); } assert!(refset.insert("none")); assert!(!refset.insert("two")); } }