diff --git a/src/args.rs b/src/args.rs index 3199edf..f86428c 100644 --- a/src/args.rs +++ b/src/args.rs @@ -187,7 +187,9 @@ where T: str::FromStr { #[derive(Debug, Args)] pub struct Config { - /// Use the PCRE (JS-like) extended regular expression compiler + /// Use the PCRE (JS-like) extended regular expression compiler. + /// + /// __NOTE__: The binary must have been compiled with build feature `perl` to use this option. /// /// # Feature difference /// By default, the expression syntax does not support things like negative lookahead and other backtrack-requiring regex features. @@ -197,8 +199,8 @@ pub struct Config /// /// It is ill-advised to enable PCRE on large inputs unless those features are required. //TODO: Should we have PCRE on by default or not...? I think we should maybe have it on by default if the feature is enabled... But that will mess with input parallelism... XXX: Perhaps we can auto-detect if to use PCRE or not (e.g. try compiling to regex first, then PCRE if that fails?) - #[arg(short, long)] - #[cfg(feature="perl")] //XXX: Do we want this option to be feature-gated? Or should we fail with error `if (! cfg!(feature="perl")) && self.extended)`? I think the latter would make things more easily (since the Regex engine gates PCRE-compilation transparently to the API user [see `crate::re::Regex`], we don't need to gate it this way outside of `re`, if we remove this gate we can just use `cfg!()` everywhere here which makes things **MUCH** cleaner..) It also means the user of a non-PCRE build will at least know why their PCRE flag is failing and that it can be built with the "perl" feature, instead of it being *totally* invisible to the user if the feature is off. + #[arg(short, long)] // XXX: Can we add a clap `value_parser!(FeatureOnBool<"perl">)` which fails to parse its `from_str()` impl if the feature is not enabled. Is this possible with what we currently have? 
We may be able to with macros, e.g expand a macro to `FeatureOnBool<"perl", const { cfg!(feature="perl") }>` or something similar? (NOTE: If `clap` has a better mechanism for this, use that instead of re-inventing it tho.) +// #[cfg(feature="perl")] //XXX: Do we want this option to be feature-gated? Or should we fail with error `if (! cfg!(feature="perl")) && self.extended)`? I think the latter would make things more easily (since the Regex engine gates PCRE-compilation transparently to the API user [see `crate::re::Regex`], we don't need to gate it this way outside of `re`, if we remove this gate we can just use `cfg!()` everywhere here which makes things **MUCH** cleaner..) It also means the user of a non-PCRE build will at least know why their PCRE flag is failing and that it can be built with the "perl" feature, instead of it being *totally* invisible to the user if the feature is off. extended: bool, /// Delimit read input/output strings from/to `stdin`/`stdout` by NUL ('\0') characters instead of newlines. @@ -215,12 +217,14 @@ impl Config /// # Interaction with feature gating of ~actual~ PCRE support via `feature="perl"` /// Note that if the "perl" feature is not enabled, this may still return `true`. /// If the user requests PCRE where it is not available, the caller should return an error/panic to the user telling her that. - #[inline] + #[inline(always)] + //TODO: Make `extended` public and remove this accessor? 
pub fn use_pcre(&self) -> bool { - #![allow(unreachable_code)] - #[cfg(feature="perl")] return self.extended; //TODO: See above comment on un-gating `self.extended` - false + //#![allow(unreachable_code)] + //#[cfg(feature="perl")] return self.extended; //TODO: See above comment on un-gating `self.extended` + //false + self.extended } } diff --git a/src/main.rs b/src/main.rs index 4fad2dc..c32a321 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +#![cfg_attr(feature="unstable", feature(impl_trait_in_assoc_type))] // XXX: Re-work `re::RegexEngine` to be able to remove this if we can, so we can use non-allocating `try_exec()` on stable... mod re; mod text; diff --git a/src/re.rs b/src/re.rs index f0fa46e..b30857f 100644 --- a/src/re.rs +++ b/src/re.rs @@ -9,19 +9,316 @@ use std::{ sync::{ Arc, Mutex, - } + }, + num::NonZeroUsize, + convert::Infallible, + borrow::{ + Cow, + Borrow, + }, }; -pub type Groups = Vec; +pub type FrozenVector = Box<[T]>; +pub type FrozenString = Box; + +//TODO: Re-work this to allow non-matched groups (i.e. `Option>` or something...) to be communicated without `"".into()`. +pub type Groups = FrozenVector; + +//TODO: We need to provide a `NonPCRERegex` that we can runtime-polymorphicly use in the case PCRE is disabled/enabled by the user's Cli options (see `args::Config::extended`.) +// This `NonPCRERegex` can be written agnostic to the `perl` feature being enabled, as `Regex` below will use the optionally-included package `pcre` when the feature is enabled, but the `regex` package is *always* available. +//compile_error!("TODO: Remove this trait and refactor this shit. XXX: We don't need all this dynamic dispatch shit, we can just have an `enum` of `regex::Regex` & `Regex` if we need to, dispatching the `exec` call through that; as the compile error type differs & there is no exec error for non-PCRE regex exec. 
"); +//compile_error!("XXX: TODO: (I don't think we'll even need to do that though, just a helper ext-trait with the same types as the below trait and non-dyn methods -- mostly just `exec() -> Result, Self::ExecError>` -- is good enough.)") + +pub trait RegexMatcher +{ + /// Attempt to match this regular expression against `string`, and if successful, pass each to callback `result` while `result` returns `Ok(true)`. + /// + /// # Callback feeding from match `try_exec()` as an iterator. + /// Once `result(i, n)` -- where `i` is the index of the group returned from the iterator of `try_exec()`, and `n` is the borrowed string of item -- returns a result other than `Ok(true)`, the function will short-circuit in the following way: + /// + /// * `Err(e)` - `Err(e.into())` will be returned. + /// * `Ok(false)` - `Ok(Some(()))` will be returned (a *successful* result, despite the rest of the iterator being ignored.) + /// And if the iterator completes before either of the first two are returned from `result`, `Ok(Some(()))` will be returned as well. + /// + /// The short-circuit will happen before the callback is invoked at all if `RegexEngine::try_exec()` returns the following: + /// - `Err(e)` will short-circuit to `return Err(e)`. + /// - `Ok(None)` will short-circuit to `return Ok(None)`. + /// + /// Note that the case that `Output<'_>` is a lazy iterator works best when working through this dynamic interface. + /// + /// # Return + /// The only time `Ok(None)` is returned is if `result` is never executed because the returned value of `try_exec()` is `None`. + /// An empty iterator wrapped in a `Some(_)` will still be returned as `Ok(Some(()))` from this function. + /// + /// Any `Err(_)` result will be propagated from this function (from `try_exec()` or any call to `result(i, n)`) to the caller via `Err(e.into())` whenever it may appear. 
+ fn try_exec_into<'s>(&self, string: &'s str, result: &mut (dyn FnMut((usize, &str)) -> crate::eyre::Result)) -> crate::eyre::Result> + where Self: 's; + + /// Same as `try_exec_into()`, but can rely on being the *sole owner of* self *while invoked*. + /// + /// __NOTE__: The generic implementation of this function does not distinguish ownership, and thus `try_exec_into()` should be preferred unless an explicit owning version has been implemented. + // (__XXX__: Can we impl this for `Regex` when using PCRE to bypass need to lock mutex?) + #[inline(always)] + fn try_owned_exec_into<'s>(&mut self, string: &'s str, result: &mut (dyn FnMut((usize, &str)) -> crate::eyre::Result)) -> crate::eyre::Result> + where Self: 's { + self.try_exec_into(string, result) + } + + /// Same as `try_exec_into()`, but can rely on `self` outliving all references within the call. + /// + /// Whether `Ok(_)` is returned or not, this `Arc` ref of `self` is consumed after this call. + /// + /// __NOTE__: In the generic implementation of this function, if `self` is the only owner of the `Arc`, it *may* try to dispatch to the owning `try_owned_exec_into()` instead. + /// But **also note that** the generic implementation of `try_owned_exec_into()` defers to `try_exec_into()` anyway. + #[inline] + fn try_shared_exec_into<'s>(self: Arc, string: &'s str, result: &mut (dyn FnMut((usize, &str)) -> crate::eyre::Result)) -> crate::eyre::Result> + where Self: Sized + 's { + match Arc::try_unwrap(self) { // Unfortunately, we can't go from `Arc<_>` -> `Box<_>` via `try_from()` or `into()`... + Err(this) => this.try_exec_into(string, result), + Ok(mut this) => this.try_owned_exec_into(string, result), + } + } + + /// Identical purpose to `RegexEngine::prepare_regex()`, provided for parallel dynamic dispatch over `self`. + fn do_prepare_regex(&mut self, num: Option); + + /// Identical value to `RegexEngine::should_prefer_run_in_parallel()`, provided for parallel dynamic dispatch over `self`.
+ fn prefer_run_in_parallel_p(&self, num: Option) -> Option; +} + +impl RegexMatcher for T +where T: RegexEngine, +for <'a> T::ExecError<'a>: Send + Sync + 'static +{ + #[inline] + fn try_exec_into<'s>(&self, string: &'s str, result: &mut (dyn FnMut((usize, &str)) -> crate::eyre::Result)) -> crate::eyre::Result> where Self: 's { + //use crate::*; + + // Try to match on `string`. + let Some(res) = self.try_exec(string)? else { + return Ok(None); + }; + + // Call `result` callback on each item with its index. + for (i, x) in res.into_iter().enumerate() { + match result((i, x.borrow()))? { + false => break, + _ => (), + } + } + + Ok(Some(())) + } + + #[inline] + fn prefer_run_in_parallel_p(&self, num: Option) -> Option + { + RegexEngine::should_prefer_run_in_parallel(self, num) + } + + #[inline(always)] + fn do_prepare_regex(&mut self, num: Option) { + RegexEngine::prepare_regex(self, num); + } +} + +impl<'a, T: Send + Sync + 'a> From> for Box + where T: RegexMatcher + RegexEngine +{ + #[inline] + fn from(from: Box) -> Self + { + from + } +} + +/// Trait represents a regular-expression object that can be compiled from a string and can match on any number of strings from a shared-reference (possibly in parallel, see below.) +/// +/// The output of the match operation is a generic iterator over the match groups that matched (__XXX__: with empty strings denoting non-matches for now to keep the indices valid. __TODO__: I-it does keep them valid, right??) wrapped in an `Option<_>`, which will return `None` if the string provided does not match the whole regular expression. +pub trait RegexEngine +{ + type Output<'string>: IntoIterator> + 'string + where Self: 'string; + + type CompileError<'s>: error::Error; + type ExecError<'s>: error::Error; + + /// Attempt to compile `string` into a new boxed instance of `Self`. + /// + /// Useful for dispatching with a dynamic `RegexMatcher` instead of `RegexEngine`.
+ fn try_compile_boxed<'s>(string: &'s str) -> Result, Self::CompileError<'s>>; + + /// Attempt to compile `string` into a new instance of `Self`. + #[inline(always)] + fn try_compile<'s>(string: &'s str) -> Result> + where Self: Sized { + Self::try_compile_boxed(string).map(|x| *x) + } + + /// Attempt to run match groups on `string`, returning them as `Self::Output`. + /// If there are no matches, `Ok(None)` should be returned. + fn try_exec<'s>(&self, string: &'s str) -> Result>, Self::ExecError<'s>>; + + /// Should `try_exec()` be run over an iterator of `string`s in parallel or sequence? Or, does it not matter? + /// Where `num` is the number of `string`s (if known by caller.) + /// + /// We assume 0 `string`s will not cause any execution. + /// + /// # Returns + /// - `Some(true)` - Yes, do prefer run in parallel. + /// - `Some(false)` - No, do **not** run in parallel if possible. + /// - ~default~ `None` - Unknown. It is possible to run in parallel, but it either does not matter, or may not cause tangible performance benefits over running in sequence. + #[inline(always)] + fn should_prefer_run_in_parallel(&self, _num: Option) -> Option { None } + /// Prepare for the regex to be used. If it is to be used on a known number of `string`s, then that count is passed. + /// + /// This allows things like re-compilation / optimisation where the regex may be used more than once that could speed up matching. + /// It doesn't have to do anything though. + #[inline(always)] + fn prepare_regex(&mut self, _num: Option) {} +} + +// compile_error!("XXX: Can we do this GAT-generically...?"); +// pub trait RegexEngineDetachedOutput: RegexEngine +// { +// fn try_exec_detached<'s>(&self, string: &'s str) -> Result, Self::ExecError<'s>>; +// } + +impl RegexEngine for NonPCRERegex +{ + #[cfg(feature="unstable")] + type Output<'string> = impl Iterator> /*+ impl std::iter::ExactSizeIterator*/ + 'string; //XXX: TODO: No, ehhh..... Rework this shite.... 
.//impl Iterator> + 'this; + +#[cfg(not(feature="unstable"))] + type Output<'string> = Box<[Cow<'string, str>]>;//impl Iterator> + 'string; //XXX: TODO: No, ehhh..... Rework this shite.... .//impl Iterator> + 'this; + + type CompileError<'s> = regex::Error; + type ExecError<'s> = Infallible; + + #[inline] + fn try_compile<'s>(string: &'s str) -> Result> { + Self::new(string) + } + + #[inline(always)] + fn try_compile_boxed<'s>(string: &'s str) -> Result, Self::CompileError<'s>> { + Self::new(string).map(Box::new) + } + + #[inline] + fn try_exec<'s>(&self, string: &'s str) -> Result>, Self::ExecError<'s>> { + Ok(match self.captures(string) { + Some(m) => { + + let res = (0..m.len()).map(move |i| match m.get(i) { Some(ma) => Cow::Borrowed(ma.as_str()), None => Cow::Borrowed("") }); + + // If `unstable` is not enabled, we cannot skip this allocation (for now...) + #[cfg(not(feature="unstable"))] + let res = Some(res.collect()); + + // Otherwise, use ITiAT to return the iterator. + #[cfg(feature="unstable")] + let res = Some(res); + + res + }, + None => None, + }) + } + + #[inline(always)] + fn should_prefer_run_in_parallel(&self, num: Option) -> Option { + Some(match num.map(NonZeroUsize::get) { + Some(1) => return None, + _ => true, + }) + } +} + +impl RegexEngine for Regex +{ + type Output<'string> = Groups; // XXX: Can we have a ref-only output here...? Maybe for non-PCRE... So keep as this for now. + + type CompileError<'s> = Error; + type ExecError<'s> = Infallible; + + #[inline] + fn try_compile<'s>(string: &'s str) -> Result> { + Self::compile(string) + } + + #[inline(always)] + fn try_compile_boxed<'s>(string: &'s str) -> Result, Self::CompileError<'s>> { + Self::compile(string).map(Box::new) + } + + #[inline(always)] + fn try_exec<'t, 's>(&'t self, string: &'s str) -> Result>, Self::ExecError<'s>> { + // SAFETY: The implementation of `Regex::exec()` has no path that can return an error (XXX: Why does it even return `Result` anyway...?) NOTE(review): calling `unwrap_unchecked()` on an `Err` is undefined behaviour, and only part of `Regex::exec()` is visible in this hunk -- confirm that neither the `perl` nor the non-`perl` build of `exec()` can return `Err` (TODO confirm), or replace this with a checked conversion.
+ Ok(unsafe { + Self::exec(&self, string).unwrap_unchecked() + }) + } + + /// PCRE supports `study()`ing the regular expression, which we might want to do if we have more than a few strings to match on. + /// + /// If PCRE is not enabled, and we use the Rust regex `regex::Regex`; it does not require/support additional optimisations, so keep the default noop-impl from the trait if this feature is not enabled. + #[cfg(feature="perl")] + fn prepare_regex(&mut self, num: Option) { + match num.map(NonZeroUsize::get) { + Some(1..=2) | None => return, + _ => (), + } + // XXX: Eh.. The `Arc` means we gotta lock here... + // match (&mut self.internal).get_mut() { + // Ok(v) => v.study(), + // Err(mut v) => v.get_mut().study(), + // }; + + // NOTE: If there is another lock held while *this* method is being invoked, it can *only* make logical sense that it is calling the same method on a different thread. So do not block to call this. (XXX: This is only required because of the silly locking shit we gotta do here...) + match self.internal.try_lock() { + Ok(mut re) => { + re.study(); + + self.internal.clear_poison(); + drop(re); + }, + _ => (), + }; + + } + + #[inline(always)] + fn should_prefer_run_in_parallel(&self, num: Option) -> Option { + match num.map(NonZeroUsize::get) { + Some(1) => return Some(false), + _ => (), + } + + Some(! Self::IS_EXTENDED) + } +} + +/// Non-PCRE / non-extended regex (regardless of if the `perl` feature is enabled.) +pub type NonPCRERegex = regex::Regex; + +/// PCRE-enabled (if feature is enabled, see [`IS_EXTENDED`]) regex. +#[derive(Debug, Clone)] pub struct Regex { #[cfg(feature="perl")] - internal: Arc>, + internal: Arc>, // XXX: Can we make parallel usage a bit less... expensive? TODO: How expensive is it to clone these into a thread-local cache, for instance? #[cfg(not(feature = "perl"))] internal: regex::Regex, } +impl Regex +{ + /// If the implementation uses PCRE instead of default regex. 
+ pub const IS_EXTENDED: bool = cfg!(feature="perl"); +} + #[derive(Debug)] pub enum Error { @@ -65,9 +362,9 @@ impl Regex { let len = m.string_count(); let mut output = Vec::with_capacity(len); for i in 0..len { - output.push(m.group(i).to_owned()); + output.push(m.group(i).into()); } - Some(output) + Some(output.into_boxed_slice()) }, None => None, }) @@ -76,14 +373,16 @@ impl Regex { return { Ok(match self.internal.captures(string.as_ref()) { Some(m) => { - let mut output = Vec::with_capacity(m.len()); - for i in 0..m.len() { - let ma = m.get(i).unwrap(); - let mut op = String::with_capacity(ma.range().len()); - write!(op, "{}", ma.as_str())?; - output.push(op); - } - Some(output) + Some((0..m.len()).map(move |i| match m.get(i) { Some(ma) => ma.as_str().into(), None => "".into()} ).collect()) + + // let mut output = Vec::with_capacity(m.len()); + // for i in 0..m.len() { + // let ma = m.get(i).unwrap(); + // //let mut op = String::with_capacity(ma.range().len()); + // //let op = format!("{}", ma.as_str().into()) + // output.push(ma.as_str().into()); + // } + // Some(output.into_boxed_slice()) }, None => None, }) @@ -99,7 +398,7 @@ impl From for Error } } -#[cfg(not(feature = "perl"))] +//#[cfg(not(feature = "perl"))] impl From for Error { fn from(er: regex::Error) -> Self