You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
215 lines
5.5 KiB
215 lines
5.5 KiB
//! A hash-set analogue that does not own its data.
|
|
//!
|
|
//! It can be used to "mark" items without the need to transfer ownership to the set.
|
|
//!
|
|
//! # Example use case
|
|
//! ```
|
|
//! # use refset::HashRefSet;
|
|
//! /// Process arguments while ignoring duplicates
|
|
//! fn process_args(args: impl IntoIterator<Item=String>) {
|
|
//! let mut same= HashRefSet::new();
|
|
//! for argument in args.into_iter()
|
|
//! {
|
|
//! if !same.insert(argument.as_str()) {
|
|
//! // Already processed this input, ignore
|
|
//! continue;
|
|
//! }
|
|
//! //do work...
|
|
//! }
|
|
//! }
|
|
//! ```
|
|
//! # Serialisation support with `serde` crate
|
|
//! `HashRefSet` and `HashType` both implement `Serialize` and `Deserialize` from the `serde` crate if the `serde` feature is enabled. By default it is not.
|
|
//! # Drawbacks
|
|
//! Since the item is not inserted itself, we cannot use `Eq` to double check there was not a hash collision.
|
|
//! While the hashing algorithm used (Sha512) is extremely unlikely to produce collisions, especially for small data types, keep in mind that it is not infallible.
|
|
use std::{
|
|
collections::{
|
|
hash_set,
|
|
HashSet,
|
|
},
|
|
marker::{
|
|
PhantomData,
|
|
Send,
|
|
Sync,
|
|
},
|
|
hash::Hash,
|
|
borrow::Borrow,
|
|
};
|
|
|
|
mod hashing;
|
|
|
|
/// The type used to store the hash of each item.
|
|
///
|
|
/// It is a result of the `SHA512` algorithm as a newtype 64 byte array marked with `#[repr(transparent)]`.
|
|
/// If you want to get the bytes from it, you can transmute safely.
|
|
/// ```
|
|
/// # use refset::HashType;
|
|
/// fn hash_bytes(hash: HashType) -> [u8; 64]
|
|
/// {
|
|
/// unsafe {
|
|
/// std::mem::transmute(hash)
|
|
/// }
|
|
/// }
|
|
///
|
|
/// fn hash_bytes_assert()
|
|
/// {
|
|
/// assert_eq!(hash_bytes(Default::default()), [0u8; 64]);
|
|
/// }
|
|
/// ```
|
|
pub type HashType = hashing::Sha512Hash;
|
|
|
|
/// Compute the `HashType` value for this `T`.
|
|
fn compute_hash_for<T: ?Sized + Hash>(value: &T) -> HashType
|
|
{
|
|
let mut hasher = hashing::Sha512Hasher::new();
|
|
value.hash(&mut hasher);
|
|
hasher.finalize()
|
|
}
|
|
|
|
#[allow(dead_code)]
|
|
#[cold] fn compute_both_hash_for<T: ?Sized + Hash>(value: &T) -> (u64, HashType)
|
|
{
|
|
use sha2::{
|
|
Digest,
|
|
digest::generic_array::sequence::Split,
|
|
};
|
|
let mut hasher = hashing::Sha512Hasher::new();
|
|
value.hash(&mut hasher);
|
|
let sha512 = hasher.into_inner();
|
|
|
|
let full = sha512.finalize();
|
|
|
|
let mut arr = [0u8; hashing::HASH_SIZE];
|
|
debug_assert_eq!(arr.len(), full.len());
|
|
unsafe {
|
|
std::ptr::copy_nonoverlapping(&full[0] as *const u8, &mut arr[0] as *mut u8, hashing::HASH_SIZE);
|
|
}
|
|
(u64::from_ne_bytes(full.split().0.into()), HashType::from_bytes(arr))
|
|
}
|
|
|
|
/// A hash-set of references to an item.
|
|
///
|
|
/// Instead of inserting the item into the set, the set is "marked" with the item.
|
|
/// Think of this as inserting a reference into the set with no lifetime.
|
|
///
|
|
/// Any type that can borrow to `T` can be used to insert, and neither type needs to be `Sized`.
|
|
/// `T` need only implement `Hash`.
|
|
///
|
|
/// # Hashing algorithm
|
|
/// The hasing algorithm used is `Sha512`, which is rather large (64 bytes).
|
|
/// At present there is no way to change the hasher used, I might implement that functionality in the future.
|
|
#[derive(Debug, Clone, PartialEq, Eq, Default)]
|
|
#[cfg_attr(feature="serde", derive(serde::Serialize, serde::Deserialize))]
|
|
pub struct HashRefSet<T: ?Sized>(HashSet<HashType>, PhantomData<HashSet<*const T>>);
|
|
|
|
unsafe impl<T: ?Sized + Send> Send for HashRefSet<T>{}
|
|
unsafe impl<T: ?Sized + Send + Sync> Sync for HashRefSet<T>{}
|
|
|
|
impl<T:?Sized + Hash> HashRefSet<T>
|
|
{
|
|
/// Create a new empty `HashRefSet`
|
|
pub fn new() -> Self
|
|
{
|
|
Self(
|
|
HashSet::new(),
|
|
PhantomData
|
|
)
|
|
}
|
|
/// Create a new `HashRefSet` with a capacity
|
|
pub fn with_capacity(cap: usize) -> Self
|
|
{
|
|
Self(HashSet::with_capacity(cap), PhantomData)
|
|
}
|
|
|
|
/// Insert a reference into the set. The reference can be any type that borrows to `T`.
|
|
///
|
|
/// Returns `true` if there was no previous item, `false` if there was.
|
|
pub fn insert<Q>(&mut self, value: &Q) -> bool
|
|
where Q: ?Sized + Borrow<T>
|
|
{
|
|
self.0.insert(compute_hash_for(value.borrow()))
|
|
}
|
|
|
|
/// Remove a reference from the set.
|
|
///
|
|
/// Returns `true` if it existed.
|
|
pub fn remove<Q>(&mut self, value: &Q) -> bool
|
|
where Q: ?Sized + Borrow<T>
|
|
{
|
|
self.0.remove(&compute_hash_for(value.borrow()))
|
|
}
|
|
|
|
/// Check if this value has been inserted into the set.
|
|
pub fn contains<Q>(&mut self, value: &Q) -> bool
|
|
where Q: ?Sized + Borrow<T>
|
|
{
|
|
self.0.contains(&compute_hash_for(value.borrow()))
|
|
}
|
|
|
|
/// The number of items stored in the set
|
|
pub fn len(&self) -> usize
|
|
{
|
|
self.0.len()
|
|
}
|
|
|
|
/// Is the set empty
|
|
pub fn is_empty(&self) -> bool
|
|
{
|
|
self.0.is_empty()
|
|
}
|
|
|
|
/// An iterator over the hashes stored in the set.
|
|
pub fn hashes_iter(&self) -> hash_set::Iter<'_, HashType>
|
|
{
|
|
self.0.iter()
|
|
}
|
|
|
|
#[inline] fn into_hashes_iter(self) -> hash_set::IntoIter<HashType>
|
|
{
|
|
self.0.into_iter()
|
|
}
|
|
}
|
|
|
|
impl<T: ?Sized + Hash> IntoIterator for HashRefSet<T>
|
|
{
|
|
type Item= HashType;
|
|
type IntoIter = hash_set::IntoIter<HashType>;
|
|
|
|
#[inline] fn into_iter(self) -> Self::IntoIter
|
|
{
|
|
self.into_hashes_iter()
|
|
}
|
|
}
|
|
|
|
|
|
#[cfg(test)]
mod tests
{
    use super::*;

    /// Marks a handful of strings, checks each is then reported as contained,
    /// and checks the return value of `insert` for a fresh and a repeated item.
    #[test]
    fn insert()
    {
        let words = ["hi", "hello", "one", "two"];
        let mut marked = HashRefSet::new();

        for word in words.iter()
        {
            marked.insert(*word);
        }

        for word in words.iter()
        {
            assert!(marked.contains(*word));
        }

        assert!(marked.insert("none"));
        assert!(!marked.insert("two"));
    }
}
|