From 9e7586021c86af4e5698f810b614fc59b4d481a8 Mon Sep 17 00:00:00 2001 From: Avril Date: Sun, 18 Oct 2020 21:38:34 +0100 Subject: [PATCH] added resolve --- Cargo.lock | 10 +++ Cargo.toml | 3 +- src/args.rs | 22 +++--- src/config.rs | 24 +++++++ src/dedup.rs | 82 +++++++++++++++++++++++ src/ext.rs | 177 +++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 29 ++++++-- src/resolve.rs | 84 +++++++++++++++++++++++ 8 files changed, 416 insertions(+), 15 deletions(-) create mode 100644 src/config.rs create mode 100644 src/dedup.rs create mode 100644 src/ext.rs create mode 100644 src/resolve.rs diff --git a/Cargo.lock b/Cargo.lock index 6b5edf1..716592e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -615,6 +615,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" +[[package]] +name = "smallmap" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97ce78b988fb0df3b438d106942c0c2438849ecf40e3418af55044f96d27514d" +dependencies = [ + "rustc_version", +] + [[package]] name = "socket2" version = "0.3.12" @@ -774,6 +783,7 @@ dependencies = [ "lazy_static", "rustc_version", "sha2", + "smallmap", "tokio", ] diff --git a/Cargo.toml b/Cargo.toml index 9d7c422..195f7cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ default = ["threads"] threads = ["tokio/rt-threaded"] # Use base64 encoding of pathnames instead of SHA256. This can increase speed of database rebuilding, but can also cause files with large pathnames to fail. -fast_pathnames = ["base64"] +fast-pathnames = ["base64"] [dependencies] @@ -25,6 +25,7 @@ chrono = "0.4.13" color-eyre = "0.5.1" lazy_static = "1.4.0" futures = "0.3.6" +smallmap = "1.1.5" [build-dependencies] rustc_version = "0.2" diff --git a/src/args.rs b/src/args.rs index 42c2232..61090ce 100644 --- a/src/args.rs +++ b/src/args.rs @@ -21,21 +21,27 @@ pub fn program_name() -> &'static str &PROGRAM[..] } -pub fn process(mut callback: F) -> impl Future> +/// Process program args in parallel spawning the `callback` closure of the argument in a new task for each. +/// +/// The returned future can be awaited to wait for all tasks to complete. If one or more tasks are cancelled or panic, this future will immediately output `Err()`, if they all complete successfully, it will output an aggregate `Vec` of the output of each argument in order. +pub fn process(mut callback: F) -> impl Future>> where F: FnMut(String) -> T, T: Future + Send + 'static, - T::Output: Send + T::Output: Send, { let args = std::env::args(); - let output: Vec<_> = args.skip(1).map(|arg| tokio::spawn(callback(arg))).collect(); + let output: Vec<_> = args.skip(1).dedup().map(|arg| tokio::spawn(callback(arg))).collect(); + let mut real_output = Vec::with_capacity(output.len()); async move { let mut j=0; - for (i, x) in (0..).zip(futures::future::join_all(output).await) { - x - .wrap_err(eyre!("Child panic or cancel")) - .with_note(|| format!("Child {}", i).header("While processing"))?; + for x in futures::future::try_join_all(output).await + .wrap_err(eyre!("Child panic or cancel.")) + .with_note(|| format!("Child for argument {}", j).header("While processing")) + .with_section(|| format!("{:?}", std::env::args().skip(1).nth(j)).header("Argument was"))? + { + real_output.push(x); j+=1; } - Ok(j) + Ok(real_output) } } diff --git a/src/config.rs b/src/config.rs new file mode 100644 index 0000000..f3bb122 --- /dev/null +++ b/src/config.rs @@ -0,0 +1,24 @@ +//! Videl configuration +use std::{ + path::{ + PathBuf, + }, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Config +{ + pub base_dir: PathBuf, +} + +impl Default for Config +{ + #[inline] + fn default() -> Self + { + Self { + base_dir: PathBuf::default(), + } + } +} + diff --git a/src/dedup.rs b/src/dedup.rs new file mode 100644 index 0000000..01bbd94 --- /dev/null +++ b/src/dedup.rs @@ -0,0 +1,82 @@ +//! De-duplicating functionality +use std::{ + marker::PhantomData, + hash::{ + Hash, + Hasher, + }, + iter::{ + self, + + }, +}; +use smallmap::Map; + +fn compute_hash_single(value: &T) -> u64 +{ + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + value.hash(&mut hasher); + hasher.finish() +} + +/// De-duplicating iterator +#[derive(Debug, Clone)] +pub struct DedupIter +where T: Hash, +{ + iter: I, + hashes: Map, + _output: PhantomData>, +} + +impl Iterator for DedupIter +where I: Iterator, + T: Hash +{ + type Item = T; + fn next(&mut self) -> Option + { + while let Some(value) = self.iter.next() + { + if self.hashes.insert(compute_hash_single(&value), ()).is_none() { + // Is unique hash + return Some(value); + } + } + None + } + fn size_hint(&self) -> (usize, Option) { + let (min, max) = self.iter.size_hint(); + (std::cmp::min(1, min), max) + } +} +impl iter::FusedIterator for DedupIter +where I: iter::FusedIterator + Iterator, + T: Hash{} + +impl DedupIter +where I: Iterator, + T: Hash +{ + pub fn into_inner(self) -> I + { + self.iter + } +} + +pub trait DedupIterExt: Sized +{ + fn dedup(self) -> DedupIter; +} + +impl DedupIterExt for I +where I: Iterator +{ + fn dedup(self) -> DedupIter { + DedupIter{ + iter: self, + hashes: Map::with_capacity(8), // there are 8 bytes in u64, so preallocate pages to hold all possible key values. This is 16kb, I think. + _output: PhantomData + } + } +} diff --git a/src/ext.rs b/src/ext.rs new file mode 100644 index 0000000..016aaeb --- /dev/null +++ b/src/ext.rs @@ -0,0 +1,177 @@ +//! Extensions +use super::*; + +use std::{ + collections::HashMap, + hash::Hash, + borrow::{ + Borrow, + ToOwned, + }, + num::NonZeroU8, +}; + +pub use dedup::DedupIterExt; + +/// Iterator that maps `T` -> `U` +pub struct ReplacingIter<'a, I,T, U=T> +{ + iter: I, + table: &'a HashMap, +} + +impl<'a, I,T,U> Iterator for ReplacingIter<'a, I,T,U> +where I: Iterator, + T: Hash+ Eq + ToOwned, + U: Borrow + Clone, +{ + type Item = U; + fn next(&mut self) -> Option { + if let Some(item) = self.iter.next() + { + Some(self.table.get(&item) + .map(Clone::clone) + .unwrap_or(item.to_owned())) + } else { + None + } + } + + #[inline] fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +impl<'a, I,T,U> ExactSizeIterator for ReplacingIter<'a, I,T,U> +where I: Iterator + ExactSizeIterator, + T: Hash+ Eq + ToOwned, + U: Borrow + Clone{} + +impl<'a, I,T,U> std::iter::FusedIterator for ReplacingIter<'a, I,T,U> +where I: Iterator + std::iter::FusedIterator, + T: Hash+ Eq + ToOwned, + U: Borrow + Clone{} + +impl<'a, I,T,U> std::iter::DoubleEndedIterator for ReplacingIter<'a, I,T,U> +where I: Iterator + std::iter::DoubleEndedIterator, + T: Hash+ Eq + ToOwned, + U: Borrow + Clone +{ + fn next_back(&mut self) -> Option { + if let Some(item) = self.iter.next_back() + { + Some(self.table.get(&item) + .map(Clone::clone) + .unwrap_or(item.to_owned())) + } else { + None + } + } +} + +impl<'a ,I,T,U> ReplacingIter<'a, I,T,U> +{ + pub fn into_inner(self) -> I + { + self.iter + } + pub fn table(&self) -> &'a HashMap + { + self.table + } +} + +pub trait ReplacingIterExt: Sized +{ + fn replace_with<'a>(self, table: &'a HashMap) -> ReplacingIter<'a, Self, T, U>; +} + +impl ReplacingIterExt for I +where I: Iterator, + T: Hash+ Eq + ToOwned, + U: Borrow + Clone, +{ + fn replace_with<'a>(self, table: &'a HashMap) -> ReplacingIter<'a, Self, T, U> { + ReplacingIter { + iter: self, + table, + } + } +} + + +const fn create_hex_map() -> [(u8, u8); 256] +{ + let mut out = [(0, 0); 256]; + const HEX: &[u8; 16] = b"0123456789abcdef"; + let mut i = 0usize; + while i <= 255 + { + out[i] = ( + HEX[i >> 4], + HEX[i & 0xf] + ); + i+=1; + } + out +} +const HEX_MAP: [(u8, u8); 256] = create_hex_map(); + +pub struct HexStrIterator +{ + iter: I, + buf: Option, //we don't need full `char` here, since we can only have 0-9a-f anyway +} + +impl Iterator for HexStrIterator +where I: Iterator +{ + type Item = char; + fn next(&mut self) -> Option + { + if let Some(buf) = self.buf.take() + { + return Some(u8::from(buf) as char); + } + + if let Some(next) = self.iter.next() { + let buf = HEX_MAP[next as usize]; + debug_assert_ne!(buf.1, 0); + //SAFETY: We know `HEX_MAP` contains only non-zero bytes. + unsafe { + self.buf = Some(NonZeroU8::new_unchecked(buf.1)); + } + Some(buf.0 as char) + } else { + None + } + } + fn size_hint(&self) -> (usize, Option) { + let (min, max) = self.iter.size_hint(); + + (min * 2, max.map(|x| x * 2)) + } +} + +impl ExactSizeIterator for HexStrIterator +where I: Iterator + ExactSizeIterator{} + +impl std::iter::FusedIterator for HexStrIterator +where I: Iterator + std::iter::FusedIterator{} + +pub trait HexStrIterExt: Sized +{ + fn hex(self) -> HexStrIterator; +} + +impl HexStrIterExt for I +where I: Iterator +{ + fn hex(self) -> HexStrIterator + { + HexStrIterator{ + iter: self, + buf: None, + } + } +} diff --git a/src/main.rs b/src/main.rs index 290b5a5..58d4ce7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,9 +15,18 @@ use color_eyre::{ }, SectionExt as _, Help as _, }; +use std::{ + sync::Arc, +}; + +mod ext; +use ext::*; mod util; mod args; +mod config; +mod resolve; +mod dedup; cfg_if!{ if #[cfg(nightly)] { @@ -36,20 +45,28 @@ fn install() -> eyre::Result<()> Ok(()) } -async fn process(file: String) +async fn process(config: Arc, file: String) { - println!(" -> {}", file); - //TODO: Process this file + println!(" -> {:?}", file); + let dbdir = resolve::mangle_path(&config, &file); + println!("Database path for this file {:?}", dbdir); + println!("Demangle: {:?}", resolve::demangle_path(&dbdir).await); } async fn begin() -> eyre::Result { install()?; - - if args::process(process).await - .wrap_err(eyre!("One or more child workers failed to complete successfully"))? == 0 { + let config = Arc::new(config::Config::default()); + if args::process(|file| { + let config = Arc::clone(&config); + process(config, file) + }).await + .wrap_err(eyre!("One or more child workers failed to complete successfully"))? + .len() == 0 + { args::usage(); } + Ok(0) } diff --git a/src/resolve.rs b/src/resolve.rs new file mode 100644 index 0000000..5506b43 --- /dev/null +++ b/src/resolve.rs @@ -0,0 +1,84 @@ +//! Videl path resolution +use super::*; +use std::{ + path::{ + Path, + PathBuf, + }, + collections::HashMap, + fmt, + error, +}; +use std::os::unix::ffi::{OsStrExt, OsStringExt}; + +#[cfg(not(feature="fast-pathnames"))] +fn compute_hash_string(from: impl AsRef<[u8]>) -> String +{ + use sha2::{Digest, Sha256}; + let mut sha2 = Sha256::new(); + sha2.update(from.as_ref()); + let output = sha2.finalize(); + output.into_iter().hex().collect() +} + +lazy_static!{ + static ref B64_TO: HashMap = { + let mut table = HashMap::new(); + table.insert('/', '-'); //cannot appear in file paths, to + table + }; + static ref B64_FROM: HashMap = { + B64_TO.iter().map(|(&x,&y)| (y,x)).collect() + }; +} + +fn replace_base64_to(string: impl AsRef) -> String +{ + string.as_ref().chars().replace_with(&B64_TO).collect() +} + +fn replace_base64_from(string: impl AsRef) -> String +{ + string.as_ref().chars().replace_with(&B64_FROM).collect() +} + +/// Resolve the database path for a certain file +pub fn mangle_path(config: &config::Config, path: impl AsRef) -> PathBuf +{ + cfg_if!{ + if #[cfg(feature="fast-pathnames")] { + config.base_dir.join(replace_base64_to(base64::encode(path.as_ref().as_os_str().as_bytes()))) + } else { + config.base_dir.join(compute_hash_string(path.as_ref().as_os_str().as_bytes())) + } + } +} + +#[derive(Debug)] +pub struct ResolutionError; + +impl error::Error for ResolutionError{} +impl fmt::Display for ResolutionError +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result + { + write!(f, "database path was in an invalid format") + } +} + +/// Find the original path from a database one +pub async fn demangle_path(path: impl AsRef) -> Result +{ + cfg_if! { + if #[cfg(feature="fast-pathnames")] { + let part = path.as_ref().file_name().ok_or(ResolutionError)?; //get the base64 encoded part + let part = replace_base64_from(part.to_str().ok_or(ResolutionError)?); //replace characters back + let bytes = base64::decode(part).map_err(|_| ResolutionError)?; + + Ok(std::ffi::OsString::from_vec(bytes).into()) + } else { + //TODO: Look up in `path/metadata` file + todo!() + } + } +}