added resolve

work
Avril 4 years ago
parent c5dea71892
commit 9e7586021c
Signed by: flanchan
GPG Key ID: 284488987C31F630

10
Cargo.lock generated

@ -615,6 +615,15 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8"
[[package]]
name = "smallmap"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97ce78b988fb0df3b438d106942c0c2438849ecf40e3418af55044f96d27514d"
dependencies = [
"rustc_version",
]
[[package]] [[package]]
name = "socket2" name = "socket2"
version = "0.3.12" version = "0.3.12"
@ -774,6 +783,7 @@ dependencies = [
"lazy_static", "lazy_static",
"rustc_version", "rustc_version",
"sha2", "sha2",
"smallmap",
"tokio", "tokio",
] ]

@ -13,7 +13,7 @@ default = ["threads"]
threads = ["tokio/rt-threaded"] threads = ["tokio/rt-threaded"]
# Use base64 encoding of pathnames instead of SHA256. This can increase speed of database rebuilding, but can also cause files with large pathnames to fail. # Use base64 encoding of pathnames instead of SHA256. This can increase speed of database rebuilding, but can also cause files with large pathnames to fail.
fast_pathnames = ["base64"] fast-pathnames = ["base64"]
[dependencies] [dependencies]
@ -25,6 +25,7 @@ chrono = "0.4.13"
color-eyre = "0.5.1" color-eyre = "0.5.1"
lazy_static = "1.4.0" lazy_static = "1.4.0"
futures = "0.3.6" futures = "0.3.6"
smallmap = "1.1.5"
[build-dependencies] [build-dependencies]
rustc_version = "0.2" rustc_version = "0.2"

@ -21,21 +21,27 @@ pub fn program_name() -> &'static str
&PROGRAM[..] &PROGRAM[..]
} }
pub fn process<F, T>(mut callback: F) -> impl Future<Output= eyre::Result<usize>> /// Process program args in parallel spawning the `callback` closure of the argument in a new task for each.
///
/// The returned future can be awaited to wait for all tasks to complete. If one or more tasks are cancelled or panic, this future will immediately output `Err()`, if they all complete successfully, it will output an aggregate `Vec` of the output of each argument in order.
pub fn process<F, T>(mut callback: F) -> impl Future<Output= eyre::Result<Vec<T::Output>>>
where F: FnMut(String) -> T, where F: FnMut(String) -> T,
T: Future + Send + 'static, T: Future + Send + 'static,
T::Output: Send T::Output: Send,
{ {
let args = std::env::args(); let args = std::env::args();
let output: Vec<_> = args.skip(1).map(|arg| tokio::spawn(callback(arg))).collect(); let output: Vec<_> = args.skip(1).dedup().map(|arg| tokio::spawn(callback(arg))).collect();
let mut real_output = Vec::with_capacity(output.len());
async move { async move {
let mut j=0; let mut j=0;
for (i, x) in (0..).zip(futures::future::join_all(output).await) { for x in futures::future::try_join_all(output).await
x .wrap_err(eyre!("Child panic or cancel."))
.wrap_err(eyre!("Child panic or cancel")) .with_note(|| format!("Child for argument {}", j).header("While processing"))
.with_note(|| format!("Child {}", i).header("While processing"))?; .with_section(|| format!("{:?}", std::env::args().skip(1).nth(j)).header("Argument was"))?
{
real_output.push(x);
j+=1; j+=1;
} }
Ok(j) Ok(real_output)
} }
} }

@ -0,0 +1,24 @@
//! Videl configuration
use std::{
path::{
PathBuf,
},
};
/// Runtime configuration for videl.
///
/// Currently only carries the base directory under which per-file database
/// entries are created (see `resolve::mangle_path`).
//
// The hand-written `Default` impl only delegated to `PathBuf::default()`,
// which is exactly what `#[derive(Default)]` produces — so derive it.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct Config
{
    /// Base directory the database entries live under.
    pub base_dir: PathBuf,
}

@ -0,0 +1,82 @@
//! De-duplicating functionality
use std::{
marker::PhantomData,
hash::{
Hash,
Hasher,
},
iter::{
self,
},
};
use smallmap::Map;
/// Hash one value with the standard library's default (SipHash) hasher.
fn compute_hash_single<T: Hash>(value: &T) -> u64
{
    use std::collections::hash_map::DefaultHasher;
    let mut state = DefaultHasher::new();
    value.hash(&mut state);
    state.finish()
}
/// De-duplicating iterator
///
/// Skips items whose hash (via `compute_hash_single`) has already been yielded.
/// NOTE(review): uniqueness is judged by 64-bit hash alone, so two distinct
/// values that collide would be treated as duplicates — confirm this is acceptable.
#[derive(Debug, Clone)]
pub struct DedupIter<I, T>
where T: Hash,
{
// The underlying iterator being filtered.
iter: I,
// Hashes seen so far; the `()` value makes this a set.
hashes: Map<u64, ()>,
// Ties `T` to the type without storing one.
_output: PhantomData<Map<T, ()>>,
}
impl<I, T> Iterator for DedupIter<I, T>
where I: Iterator<Item = T>,
      T: Hash
{
    type Item = T;

    /// Yield the next element whose hash has not been seen before.
    fn next(&mut self) -> Option<Self::Item>
    {
        let seen = &mut self.hashes;
        // A freshly-inserted hash returns `None` from `insert`, i.e. the value is new.
        self.iter.by_ref().find(|value| seen.insert(compute_hash_single(value), ()).is_none())
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // A non-empty input yields at least one item; duplicates may remove the rest.
        let (lower, upper) = self.iter.size_hint();
        (lower.min(1), upper)
    }
}
// Once the underlying fused iterator is exhausted, de-duplication stays exhausted too.
impl<I, T> iter::FusedIterator for DedupIter<I, T>
where I: iter::FusedIterator + Iterator<Item= T>,
T: Hash{}
impl<I, T> DedupIter<I, T>
where I: Iterator<Item = T>,
T: Hash
{
pub fn into_inner(self) -> I
{
self.iter
}
}
/// Extension trait adding [`DedupIter`] construction to any iterator.
pub trait DedupIterExt<T: Hash>: Sized
{
    /// Adapt this iterator to skip elements whose hash was already seen.
    fn dedup(self) -> DedupIter<Self, T>;
}

impl<I, T: Hash> DedupIterExt<T> for I
where I: Iterator<Item = T>
{
    fn dedup(self) -> DedupIter<Self, T> {
        DedupIter {
            iter: self,
            // NOTE(review): `smallmap::Map::with_capacity` takes a page count,
            // not a byte size — confirm 8 preallocated pages is the intent here.
            hashes: Map::with_capacity(8),
            _output: PhantomData,
        }
    }
}

@ -0,0 +1,177 @@
//! Extensions
use super::*;
use std::{
collections::HashMap,
hash::Hash,
borrow::{
Borrow,
ToOwned,
},
num::NonZeroU8,
};
pub use dedup::DedupIterExt;
/// Iterator that maps `T` -> `U`
///
/// Each item is looked up in a borrowed replacement `table`; a hit yields a
/// clone of the mapped value, a miss falls back to `item.to_owned()`.
pub struct ReplacingIter<'a, I,T, U=T>
{
// Underlying iterator.
iter: I,
// Replacement lookup table, borrowed for 'a.
table: &'a HashMap<T, U>,
}
impl<'a, I,T,U> Iterator for ReplacingIter<'a, I,T,U>
where I: Iterator<Item=T>,
      T: Hash + Eq + ToOwned<Owned=U>,
      U: Borrow<T> + Clone,
{
    type Item = U;

    /// Map the next item through the table, or convert it with `to_owned`
    /// when no replacement exists.
    fn next(&mut self) -> Option<Self::Item> {
        self.iter.next().map(|item| {
            // `unwrap_or_else` (not `unwrap_or`) so `item.to_owned()` is only
            // evaluated on a table miss; the original converted every item
            // eagerly even when the table already supplied a replacement.
            self.table.get(&item)
                .cloned()
                .unwrap_or_else(|| item.to_owned())
        })
    }

    #[inline] fn size_hint(&self) -> (usize, Option<usize>) {
        // One output item per input item.
        self.iter.size_hint()
    }
}
// Exact length is preserved: every input item maps to exactly one output item.
impl<'a, I,T,U> ExactSizeIterator for ReplacingIter<'a, I,T,U>
where I: Iterator<Item=T> + ExactSizeIterator,
T: Hash+ Eq + ToOwned<Owned=U>,
U: Borrow<T> + Clone{}
// A fused inner iterator keeps the adaptor fused as well.
impl<'a, I,T,U> std::iter::FusedIterator for ReplacingIter<'a, I,T,U>
where I: Iterator<Item=T> + std::iter::FusedIterator,
T: Hash+ Eq + ToOwned<Owned=U>,
U: Borrow<T> + Clone{}
impl<'a, I,T,U> std::iter::DoubleEndedIterator for ReplacingIter<'a, I,T,U>
where I: Iterator<Item=T> + std::iter::DoubleEndedIterator,
      T: Hash + Eq + ToOwned<Owned=U>,
      U: Borrow<T> + Clone
{
    /// Same replacement logic as `next`, applied from the back of the iterator.
    fn next_back(&mut self) -> Option<Self::Item> {
        self.iter.next_back().map(|item| {
            // Lazy fallback: only call `to_owned()` when the table has no entry,
            // instead of eagerly converting every item as the original did.
            self.table.get(&item)
                .cloned()
                .unwrap_or_else(|| item.to_owned())
        })
    }
}
impl<'a ,I,T,U> ReplacingIter<'a, I,T,U>
{
    /// Consume the adaptor and return the wrapped iterator.
    pub fn into_inner(self) -> I
    {
        let Self { iter, .. } = self;
        iter
    }

    /// The replacement table this adaptor consults.
    pub fn table(&self) -> &'a HashMap<T,U>
    {
        self.table
    }
}
/// Extension trait adding [`ReplacingIter`] construction to any iterator.
pub trait ReplacingIterExt<T, U>: Sized
{
    /// Adapt this iterator to substitute items through `table`.
    fn replace_with<'a>(self, table: &'a HashMap<T,U>) -> ReplacingIter<'a, Self, T, U>;
}

impl<I,T,U> ReplacingIterExt<T,U> for I
where I: Iterator<Item=T>,
      T: Hash + Eq + ToOwned<Owned=U>,
      U: Borrow<T> + Clone,
{
    fn replace_with<'a>(self, table: &'a HashMap<T,U>) -> ReplacingIter<'a, Self, T, U> {
        ReplacingIter { iter: self, table }
    }
}
/// Build the byte → (high digit, low digit) lowercase-hex lookup table at compile time.
const fn create_hex_map() -> [(u8, u8); 256]
{
    const HEX: &[u8; 16] = b"0123456789abcdef";
    let mut out = [(0, 0); 256];
    let mut i = 0usize;
    while i < 256
    {
        out[i] = (
            HEX[i >> 4],
            HEX[i & 0xf]
        );
        i += 1;
    }
    out
}

/// Precomputed table mapping every byte value to its two hex digits.
const HEX_MAP: [(u8, u8); 256] = create_hex_map();

/// Iterator adaptor turning a stream of bytes into lowercase hex characters.
pub struct HexStrIterator<I>
{
    iter: I,
    /// Low hex digit of the current byte, pending output. `NonZeroU8` is enough
    /// because hex digits are never NUL, and it keeps `Option<NonZeroU8>`
    /// byte-sized (niche-filled).
    buf: Option<NonZeroU8>,
}

impl<I> Iterator for HexStrIterator<I>
where I: Iterator<Item = u8>
{
    type Item = char;

    /// Emit the pending low digit if one is buffered; otherwise pull the next
    /// byte, emit its high digit and stash the low digit for the next call.
    fn next(&mut self) -> Option<Self::Item>
    {
        if let Some(low) = self.buf.take()
        {
            return Some(low.get() as char);
        }
        let byte = self.iter.next()?;
        let (hi, lo) = HEX_MAP[byte as usize];
        // Hex digits are non-zero ASCII, so the safe `new` always succeeds —
        // no need for `new_unchecked`/`unsafe` here.
        self.buf = NonZeroU8::new(lo);
        debug_assert!(self.buf.is_some());
        Some(hi as char)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // Two characters per remaining input byte, plus one if a low digit is
        // still buffered. The original omitted the buffered digit, which
        // under-reported the upper bound and made `ExactSizeIterator::len`
        // wrong after an odd number of `next()` calls.
        let pending = self.buf.is_some() as usize;
        let (min, max) = self.iter.size_hint();
        (min * 2 + pending, max.map(|x| x * 2 + pending))
    }
}

// Length is exact whenever the underlying iterator's length is.
impl<I> ExactSizeIterator for HexStrIterator<I>
where I: Iterator<Item = u8> + ExactSizeIterator{}

// A fused byte source keeps the adaptor fused.
impl<I> std::iter::FusedIterator for HexStrIterator<I>
where I: Iterator<Item = u8> + std::iter::FusedIterator{}

/// Extension trait adding `.hex()` to any byte iterator.
pub trait HexStrIterExt: Sized
{
    /// Adapt this byte iterator into a lowercase hex character iterator.
    fn hex(self) -> HexStrIterator<Self>;
}

impl<I> HexStrIterExt for I
where I: Iterator<Item = u8>
{
    fn hex(self) -> HexStrIterator<Self>
    {
        HexStrIterator{
            iter: self,
            buf: None,
        }
    }
}

@ -15,9 +15,18 @@ use color_eyre::{
}, },
SectionExt as _, Help as _, SectionExt as _, Help as _,
}; };
use std::{
sync::Arc,
};
mod ext;
use ext::*;
mod util; mod util;
mod args; mod args;
mod config;
mod resolve;
mod dedup;
cfg_if!{ cfg_if!{
if #[cfg(nightly)] { if #[cfg(nightly)] {
@ -36,20 +45,28 @@ fn install() -> eyre::Result<()>
Ok(()) Ok(())
} }
async fn process(file: String) async fn process(config: Arc<config::Config>, file: String)
{ {
println!(" -> {}", file); println!(" -> {:?}", file);
//TODO: Process this file let dbdir = resolve::mangle_path(&config, &file);
println!("Database path for this file {:?}", dbdir);
println!("Demangle: {:?}", resolve::demangle_path(&dbdir).await);
} }
async fn begin() -> eyre::Result<i32> async fn begin() -> eyre::Result<i32>
{ {
install()?; install()?;
let config = Arc::new(config::Config::default());
if args::process(process).await if args::process(|file| {
.wrap_err(eyre!("One or more child workers failed to complete successfully"))? == 0 { let config = Arc::clone(&config);
process(config, file)
}).await
.wrap_err(eyre!("One or more child workers failed to complete successfully"))?
.len() == 0
{
args::usage(); args::usage();
} }
Ok(0) Ok(0)
} }

@ -0,0 +1,84 @@
//! Videl path resolution
use super::*;
use std::{
path::{
Path,
PathBuf,
},
collections::HashMap,
fmt,
error,
};
use std::os::unix::ffi::{OsStrExt, OsStringExt};
#[cfg(not(feature="fast-pathnames"))]
/// Hash the raw path bytes with SHA-256 and render the digest as a lowercase
/// hex string (the default, non-`fast-pathnames` mangling scheme).
fn compute_hash_string(from: impl AsRef<[u8]>) -> String
{
use sha2::{Digest, Sha256};
let mut sha2 = Sha256::new();
sha2.update(from.as_ref());
let output = sha2.finalize();
// `hex()` (from `ext`) expands each digest byte into two hex characters.
output.into_iter().hex().collect()
}
lazy_static!{
/// Forward substitution table applied to base64 output before it is used as a file name.
static ref B64_TO: HashMap<char, char> = {
let mut table = HashMap::new();
table.insert('/', '-'); // '/' cannot appear in file names, so substitute '-'
// NOTE(review): standard base64 also emits '+' and '=' — those are legal in
// unix file names, but confirm no other characters need mapping here.
table
};
/// Inverse of `B64_TO`, used when demangling a database path back to the original.
static ref B64_FROM: HashMap<char, char> = {
B64_TO.iter().map(|(&x,&y)| (y,x)).collect()
};
}
/// Apply the forward (`B64_TO`) character substitutions to `string`.
fn replace_base64_to(string: impl AsRef<str>) -> String
{
    let src = string.as_ref();
    src.chars().replace_with(&B64_TO).collect()
}

/// Apply the reverse (`B64_FROM`) character substitutions to `string`.
fn replace_base64_from(string: impl AsRef<str>) -> String
{
    let src = string.as_ref();
    src.chars().replace_with(&B64_FROM).collect()
}
/// Resolve the database path for a certain file
///
/// With the `fast-pathnames` feature the path bytes are base64-encoded (with
/// filename-unsafe characters substituted via `replace_base64_to`); otherwise
/// the file name is the SHA-256 hex digest of the path bytes. Either way the
/// result is joined onto `config.base_dir`.
pub fn mangle_path(config: &config::Config, path: impl AsRef<Path>) -> PathBuf
{
cfg_if!{
if #[cfg(feature="fast-pathnames")] {
config.base_dir.join(replace_base64_to(base64::encode(path.as_ref().as_os_str().as_bytes())))
} else {
config.base_dir.join(compute_hash_string(path.as_ref().as_os_str().as_bytes()))
}
}
}
/// Error returned when a database path cannot be demangled back to an original path.
#[derive(Debug)]
pub struct ResolutionError;

impl error::Error for ResolutionError {}

impl fmt::Display for ResolutionError
{
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
    {
        f.write_str("database path was in an invalid format")
    }
}
/// Find the original path from a database one
///
/// Only implemented for the `fast-pathnames` feature so far: the base64-encoded
/// file name is substituted back (`replace_base64_from`) and decoded into the
/// original path bytes.
///
/// # Errors
/// Returns [`ResolutionError`] when the path has no file name component, the
/// file name is not valid UTF-8, or base64 decoding fails.
pub async fn demangle_path(path: impl AsRef<Path>) -> Result<PathBuf, ResolutionError>
{
cfg_if! {
if #[cfg(feature="fast-pathnames")] {
let part = path.as_ref().file_name().ok_or(ResolutionError)?; //get the base64 encoded part
let part = replace_base64_from(part.to_str().ok_or(ResolutionError)?); //replace characters back
let bytes = base64::decode(part).map_err(|_| ResolutionError)?;
Ok(std::ffi::OsString::from_vec(bytes).into())
} else {
//TODO: Look up in `path/metadata` file
// NOTE(review): without `fast-pathnames` this panics via `todo!()` — confirm
// callers never reach this path until the metadata lookup is implemented.
todo!()
}
}
}
Loading…
Cancel
Save