You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

215 lines
5.5 KiB

//! A hash-set analogue that does not own its data.
//!
//! It can be used to "mark" items without the need to transfer ownership to the set.
//!
//! # Example use case
//! ```
//! # use refset::HashRefSet;
//! /// Process arguments while ignoring duplicates
//! fn process_args(args: impl IntoIterator<Item=String>) {
//!     let mut same= HashRefSet::new();
//!     for argument in args.into_iter()
//!     {
//!         if !same.insert(argument.as_str()) {
//!             // Already processed this input, ignore
//!             continue;
//!         }
//!         //do work...
//!     }
//! }
//! ```
//! # Serialisation support with `serde` crate
//! `HashRefSet` and `HashType` both implement `Serialize` and `Deserialize` from the `serde` crate if the `serde` feature is enabled. By default it is not.
//! # Drawbacks
//! Since the item is not inserted itself, we cannot use `Eq` to double check there was not a hash collision.
//! While the hashing algorithm used (Sha512) is extremely unlikely to produce collisions, especially for small data types, keep in mind that it is not infallible.
use std::{
collections::{
hash_set,
HashSet,
},
marker::{
PhantomData,
Send,
Sync,
},
hash::Hash,
borrow::Borrow,
};
mod hashing;
/// The type used to store the hash of each item.
///
/// This is an alias for the crate-internal `hashing::Sha512Hash`.
/// It is a result of the `SHA512` algorithm as a newtype 64 byte array marked with `#[repr(transparent)]`.
///
/// # Examples
/// If you want to get the bytes from it, you can transmute safely.
/// ```
/// # use refset::HashType;
/// fn hash_bytes(hash: HashType) -> [u8; 64]
/// {
///     // SAFETY: `HashType` is documented as a `#[repr(transparent)]` newtype over a 64 byte array.
///     unsafe {
///         std::mem::transmute(hash)
///     }
/// }
///
/// fn hash_bytes_assert()
/// {
///     assert_eq!(hash_bytes(Default::default()), [0u8; 64]);
/// }
/// ```
pub type HashType = hashing::Sha512Hash;
/// Hash `value` with the crate's `Sha512Hasher` and return the resulting `HashType`.
///
/// The data fed to the hasher is whatever `T`'s `Hash` implementation writes.
fn compute_hash_for<T: ?Sized + Hash>(value: &T) -> HashType {
    let mut state = hashing::Sha512Hasher::new();
    Hash::hash(value, &mut state);
    state.finalize()
}
/// Compute both a 64-bit hash and the full `HashType` for `value` from one SHA-512 pass.
///
/// The `u64` is built from the leading 8 bytes of the digest using native byte order
/// (`u64::from_ne_bytes`); the `HashType` wraps a copy of the complete digest.
///
/// # Panics
/// Panics if the digest length differs from `hashing::HASH_SIZE`.
// No caller is visible in this file, hence the `dead_code` allowance.
#[allow(dead_code)]
#[cold]
fn compute_both_hash_for<T: ?Sized + Hash>(value: &T) -> (u64, HashType) {
    use sha2::{
        digest::generic_array::sequence::Split,
        Digest,
    };

    let mut hasher = hashing::Sha512Hasher::new();
    value.hash(&mut hasher);
    let full = hasher.into_inner().finalize();

    // `copy_from_slice` verifies that both lengths match in every build profile, so the
    // copy needs neither `unsafe` nor a debug-only length assertion.
    let mut arr = [0u8; hashing::HASH_SIZE];
    arr.copy_from_slice(&full[..]);

    // `split()` consumes the digest; the size of its first half is inferred from the
    // `[u8; 8]` that `u64::from_ne_bytes` requires.
    (u64::from_ne_bytes(full.split().0.into()), HashType::from_bytes(arr))
}
/// A set that remembers which items it has seen without holding on to them.
///
/// Inserting neither moves nor keeps a borrow of the item: only its hash is recorded, so the set is merely "marked" with it.
/// Conceptually this is like storing a reference that carries no lifetime.
///
/// Anything that can borrow as `T` may be used for insertion and lookup, and neither that type nor `T` has to be `Sized`.
/// The only requirement on `T` is `Hash`.
///
/// # Hashing algorithm
/// The hashing algorithm used is `Sha512`, which yields a rather large (64 byte) value per entry.
/// There is currently no way to swap the hasher; that may be added in the future.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct HashRefSet<T: ?Sized>(
    // Hashes of every item the set has been marked with.
    HashSet<HashType>,
    // Ties the set to `T` without storing any `T`.
    PhantomData<HashSet<*const T>>,
);
// SAFETY: the only data stored is the `HashSet<HashType>`; the `*const T` occurs solely inside
// `PhantomData`, so no pointer to a `T` is ever held. This assumes `HashType` itself is `Send`
// (it is documented as a plain 64 byte array newtype). The `T: Send` bound is deliberately kept
// so the set is never more permissive than `T`.
unsafe impl<T> Send for HashRefSet<T> where T: ?Sized + Send {}
// SAFETY: same reasoning as for `Send`, assuming `HashType` is `Sync`; sharing the set only
// exposes the stored hashes, never a `T`.
unsafe impl<T> Sync for HashRefSet<T> where T: ?Sized + Send + Sync {}
impl<T: ?Sized + Hash> HashRefSet<T> {
    /// Create a new empty `HashRefSet`.
    pub fn new() -> Self {
        Self(HashSet::new(), PhantomData)
    }

    /// Create a new empty `HashRefSet` whose backing `HashSet` is created with capacity `cap`.
    pub fn with_capacity(cap: usize) -> Self {
        Self(HashSet::with_capacity(cap), PhantomData)
    }

    /// Insert a reference into the set. The reference can be any type that borrows to `T`.
    ///
    /// Only the hash of the borrowed `T` is stored; `value` itself is neither moved nor kept.
    ///
    /// Returns `true` if there was no previous item, `false` if there was.
    pub fn insert<Q>(&mut self, value: &Q) -> bool
    where
        Q: ?Sized + Borrow<T>,
    {
        self.0.insert(compute_hash_for(value.borrow()))
    }

    /// Remove a reference from the set.
    ///
    /// Returns `true` if it existed.
    pub fn remove<Q>(&mut self, value: &Q) -> bool
    where
        Q: ?Sized + Borrow<T>,
    {
        self.0.remove(&compute_hash_for(value.borrow()))
    }

    /// Check if this value has been inserted into the set.
    ///
    /// This is a read-only lookup, so it only needs `&self` and can be used through a
    /// shared reference to the set.
    pub fn contains<Q>(&self, value: &Q) -> bool
    where
        Q: ?Sized + Borrow<T>,
    {
        self.0.contains(&compute_hash_for(value.borrow()))
    }

    /// The number of items stored in the set.
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Is the set empty?
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// An iterator over the hashes stored in the set.
    pub fn hashes_iter(&self) -> hash_set::Iter<'_, HashType> {
        self.0.iter()
    }

    /// Consume the set and iterate over its stored hashes; backs the `IntoIterator` impl.
    #[inline]
    fn into_hashes_iter(self) -> hash_set::IntoIter<HashType> {
        self.0.into_iter()
    }
}
/// Consuming a `HashRefSet` yields the hashes it stored, not the original items.
impl<T> IntoIterator for HashRefSet<T>
where
    T: ?Sized + Hash,
{
    type Item = HashType;
    type IntoIter = hash_set::IntoIter<HashType>;

    /// Turn the set into an iterator over its stored hashes.
    #[inline]
    fn into_iter(self) -> hash_set::IntoIter<HashType> {
        Self::into_hashes_iter(self)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every inserted string is reported as contained, and `insert` returns `true` only for
    /// a string that has not been seen before.
    #[test]
    fn insert() {
        let words = ["hi", "hello", "one", "two"];
        let mut marked = HashRefSet::<str>::new();

        for &word in words.iter() {
            marked.insert(word);
        }
        for &word in words.iter() {
            assert!(marked.contains(word));
        }

        assert!(marked.insert("none"));
        assert!(!marked.insert("two"));
    }
}