re: Started generic (not dispatched) interface `RegexEngine` for `NonPCRERegex` & `Regex`.

re: Added `RegexMatcher`, a dynamic-dispatch-friendly version of the trait that provides execution-only methods. The value returned by `RegexEngine::try_compile_boxed()` can be converted into a `Box<dyn RegexMatcher + ...>` via an `Into::into()` call if dynamic dispatch over the compiled regular expression is needed in the future. (TODO: `RegexMatcher` also has extension methods for `&mut self` & `Arc<Self>` that are currently unused, but they may be useful for the PCRE `Regex`.)

NOTE: The purpose of this is to allow something like: `let compiled_regex: Box<dyn re::RegexMatcher + Send + Sync + 'static> = if cli.use_pcre() { re::Regex::try_compile_boxed(cli.regex)?.into() } else { re::NonPCRERegex::try_compile_boxed(cli.regex)[.inspect_mut(|re| re.prepare_regex())]?.into() };`

Fortune for rematch's current commit: Half blessing − 半吉
cli-refactor
Avril 2 days ago
parent 9fdf0817ae
commit 5def7d668c
Signed by: flanchan
GPG Key ID: 284488987C31F630

@ -187,7 +187,9 @@ where T: str::FromStr {
#[derive(Debug, Args)]
pub struct Config
{
/// Use the PCRE (JS-like) extended regular expression compiler
/// Use the PCRE (JS-like) extended regular expression compiler.
///
/// __NOTE__: The binary must have been compiled with build feature `perl` to use this option.
///
/// # Feature difference
/// By default, the expression syntax does not support things like negative lookahead and other backtrack-requiring regex features.
@ -197,8 +199,8 @@ pub struct Config
///
/// It is ill-advised to enable PCRE on large inputs unless those features are required.
//TODO: Should we have PCRE on by default or not...? I think we should maybe have it on by default if the feature is enabled... But that will mess with input parallelism... XXX: Perhaps we can auto-detect if to use PCRE or not (e.g. try compiling to regex first, then PCRE if that fails?)
#[arg(short, long)]
#[cfg(feature="perl")] //XXX: Do we want this option to be feature-gated? Or should we fail with error `if (! cfg!(feature="perl")) && self.extended)`? I think the latter would make things more easily (since the Regex engine gates PCRE-compilation transparently to the API user [see `crate::re::Regex`], we don't need to gate it this way outside of `re`, if we remove this gate we can just use `cfg!()` everywhere here which makes things **MUCH** cleaner..) It also means the user of a non-PCRE build will at least know why their PCRE flag is failing and that it can be built with the "perl" feature, instead of it being *totally* invisible to the user if the feature is off.
#[arg(short, long)] // XXX: Can we add a clap `value_parser!(FeatureOnBool<"perl">)` which fails to parse its `from_str()` impl if the feature is not enabled. Is this possible with what we currently have? We may be able to with macros, e.g expand a macro to `FeatureOnBool<"perl", const { cfg!(feature="perl") }>` or something similar? (NOTE: If `clap` has a better mechanism for this, use that instead of re-inventing it tho.)
// #[cfg(feature="perl")] //XXX: Do we want this option to be feature-gated? Or should we fail with error `if (! cfg!(feature="perl")) && self.extended)`? I think the latter would make things more easily (since the Regex engine gates PCRE-compilation transparently to the API user [see `crate::re::Regex`], we don't need to gate it this way outside of `re`, if we remove this gate we can just use `cfg!()` everywhere here which makes things **MUCH** cleaner..) It also means the user of a non-PCRE build will at least know why their PCRE flag is failing and that it can be built with the "perl" feature, instead of it being *totally* invisible to the user if the feature is off.
extended: bool,
/// Delimit read input/output strings from/to `stdin`/`stdout` by NUL ('\0') characters instead of newlines.
@ -215,12 +217,14 @@ impl Config
/// # Interaction with feature gating of ~actual~ PCRE support via `feature="perl"`
/// Note that if the "perl" feature is not enabled, this may still return `true`.
/// If the user requests PCRE where it is not available, the caller should return an error/panic to the user telling her that.
#[inline]
#[inline(always)]
//TODO: Make `extended` public and remove this accessor?
pub fn use_pcre(&self) -> bool
{
#![allow(unreachable_code)]
#[cfg(feature="perl")] return self.extended; //TODO: See above comment on un-gating `self.extended`
false
//#![allow(unreachable_code)]
//#[cfg(feature="perl")] return self.extended; //TODO: See above comment on un-gating `self.extended`
//false
self.extended
}
}

@ -1,3 +1,4 @@
#![cfg_attr(feature="unstable", feature(impl_trait_in_assoc_type))] // XXX: Re-work `re::RegexEngine` to be able to remove this if we can, so we can use non-allocating `try_exec()` on stable...
mod re;
mod text;

@ -9,19 +9,316 @@ use std::{
sync::{
Arc,
Mutex,
}
},
num::NonZeroUsize,
convert::Infallible,
borrow::{
Cow,
Borrow,
},
};
pub type Groups = Vec<String>;
pub type FrozenVector<T> = Box<[T]>;
pub type FrozenString = Box<str>;
//TODO: Re-work this to allow non-matched groups (i.e. `Option<Cow<'static, str>>` or something...) to be communicated without `"".into()`.
pub type Groups = FrozenVector<FrozenString>;
//TODO: We need to provide a `NonPCRERegex` that we can use runtime-polymorphically in the case PCRE is disabled/enabled by the user's Cli options (see `args::Config::extended`.)
// This `NonPCRERegex` can be written agnostic to the `perl` feature being enabled, as `Regex` below will use the optionally-included package `pcre` when the feature is enabled, but the `regex` package is *always* available.
//compile_error!("TODO: Remove this trait and refactor this shit. XXX: We don't need all this dynamic dispatch shit, we can just have an `enum` of `regex::Regex` & `Regex` if we need to, dispatching the `exec` call through that; as the compile error type differs & there is no exec error for non-PCRE regex exec. ");
//compile_error!("XXX: TODO: (I don't think we'll even need to do that though, just a helper ext-trait with the same types as the below trait and non-dyn methods -- mostly just `exec() -> Result<Option<Groups>, Self::ExecError>` -- is good enough.)")
pub trait RegexMatcher
{
    /// Match this regular expression against `string` and hand every resulting group to the callback `result`.
    ///
    /// # Callback protocol
    /// `result` is invoked as `result((i, n))`, where `i` is the position of the group in the iterator produced by `RegexEngine::try_exec()` and `n` is that group borrowed as a `&str`.
    /// Feeding continues for as long as the callback answers `Ok(true)`; any other answer stops it:
    ///
    /// * `Err(e)` - the error is propagated to the caller as `Err(e.into())`.
    /// * `Ok(false)` - the remaining groups are ignored and `Ok(Some(()))` is returned (this is still a *successful* result.)
    ///
    /// Running out of groups without the callback stopping early also yields `Ok(Some(()))`.
    ///
    /// The callback is never invoked at all when `RegexEngine::try_exec()` itself produces:
    /// - `Err(e)`, which is returned to the caller as an error.
    /// - `Ok(None)`, which is returned to the caller as `Ok(None)`.
    ///
    /// This dynamic interface works best when `Output<'_>` is a lazy iterator.
    ///
    /// # Return
    /// `Ok(None)` is returned only when `try_exec()` reported no match (so `result` never ran.)
    /// A match that yields an empty iterator is still reported as `Ok(Some(()))`.
    ///
    /// Every `Err(_)`, whether from `try_exec()` or from a call to `result`, is propagated to the caller.
    fn try_exec_into<'s>(&self, string: &'s str, result: &mut (dyn FnMut((usize, &str)) -> crate::eyre::Result<bool>)) -> crate::eyre::Result<Option<()>>
    where Self: 's;

    /// Same contract as `try_exec_into()`, but the implementation may rely on being the *sole owner* of `self` *for the duration of the call*.
    ///
    /// __NOTE__: This default implementation makes no use of the exclusive access and simply forwards to `try_exec_into()`; prefer `try_exec_into()` unless an implementor provides a dedicated owning version.
    // (__XXX__: Can we impl this for `Regex` when using PCRE to bypass need to lock mutex?)
    #[inline(always)]
    fn try_owned_exec_into<'s>(&mut self, string: &'s str, result: &mut (dyn FnMut((usize, &str)) -> crate::eyre::Result<bool>)) -> crate::eyre::Result<Option<()>>
    where Self: 's
    {
        // The exclusive borrow is only used as a shared one here.
        Self::try_exec_into(self, string, result)
    }

    /// Same contract as `try_exec_into()`, but `self` is guaranteed to outlive every reference used within the call.
    ///
    /// This `Arc` handle is consumed by the call regardless of whether `Ok(_)` or `Err(_)` is returned.
    ///
    /// __NOTE__: This default implementation dispatches to `try_owned_exec_into()` when the handle turns out to be the only owner of the value, and to `try_exec_into()` otherwise.
    /// **Also note that** the default `try_owned_exec_into()` forwards to `try_exec_into()` anyway.
    #[inline]
    fn try_shared_exec_into<'s>(self: Arc<Self>, string: &'s str, result: &mut (dyn FnMut((usize, &str)) -> crate::eyre::Result<bool>)) -> crate::eyre::Result<Option<()>>
    where Self: Sized + 's
    {
        // Unfortunately, there is no `Arc<_>` -> `Box<_>` conversion via `try_from()` or `into()`, so unwrap by value instead.
        match Arc::try_unwrap(self) {
            // Last handle: we hold the value itself and may use it exclusively.
            Ok(mut sole) => sole.try_owned_exec_into(string, result),
            // Other handles still exist: fall back to the shared path.
            Err(shared) => shared.try_exec_into(string, result),
        }
    }

    /// Same purpose as `RegexEngine::prepare_regex()`, exposed here so it can be reached through dynamic dispatch over `self`.
    fn do_prepare_regex(&mut self, num: Option<NonZeroUsize>);

    /// Same value as `RegexEngine::should_prefer_run_in_parallel()`, exposed here so it can be reached through dynamic dispatch over `self`.
    fn prefer_run_in_parallel_p(&self, num: Option<NonZeroUsize>) -> Option<bool>;
}
/// Every `RegexEngine` whose execution error can be carried by `eyre` is automatically a `RegexMatcher`.
impl<T: ?Sized> RegexMatcher for T
where T: RegexEngine,
      for <'a> T::ExecError<'a>: Send + Sync + 'static
{
    /// Runs `RegexEngine::try_exec()` and forwards each yielded group, paired with its index, to `result`.
    #[inline]
    fn try_exec_into<'s>(&self, string: &'s str, result: &mut (dyn FnMut((usize, &str)) -> crate::eyre::Result<bool>)) -> crate::eyre::Result<Option<()>>
    where Self: 's
    {
        match self.try_exec(string)? {
            // No match: the callback is never invoked.
            None => Ok(None),
            Some(groups) => {
                for (index, group) in groups.into_iter().enumerate() {
                    // `Ok(false)` from the callback stops feeding early; an `Err(_)` is propagated.
                    if !result((index, group.borrow()))? {
                        break;
                    }
                }
                Ok(Some(()))
            },
        }
    }

    /// Forwards to `RegexEngine::prepare_regex()`.
    #[inline(always)]
    fn do_prepare_regex(&mut self, num: Option<NonZeroUsize>) {
        RegexEngine::prepare_regex(self, num);
    }

    /// Forwards to `RegexEngine::should_prefer_run_in_parallel()`.
    #[inline]
    fn prefer_run_in_parallel_p(&self, num: Option<NonZeroUsize>) -> Option<bool>
    {
        RegexEngine::should_prefer_run_in_parallel(self, num)
    }
}
/// Erase a concrete, boxed engine into a boxed `dyn RegexMatcher` trait object.
impl<'a, T> From<Box<T>> for Box<dyn RegexMatcher + Send + Sync + 'a>
where T: RegexMatcher + RegexEngine + Send + Sync + 'a
{
    #[inline]
    fn from(engine: Box<T>) -> Self
    {
        // The box is returned as-is; the coercion to the trait object happens at the return site.
        engine
    }
}
/// A regular-expression object that is compiled from a string and can then be matched against any number of strings through a shared reference (possibly in parallel, see `should_prefer_run_in_parallel()`.)
///
/// A match produces a generic iterator over the capture groups, wrapped in an `Option<_>`; `None` means the string did not match the regular expression.
/// (__XXX__: The implementations in this file currently report a group that did not participate as an empty string so that group indices stay valid. __TODO__: Confirm that this really does keep them valid.)
pub trait RegexEngine
{
    /// The groups produced by a successful match on a string borrowed for `'string`.
    type Output<'string>: IntoIterator<Item: Borrow<str>> + 'string
    where Self: 'string;
    /// Error produced when compiling the expression fails.
    type CompileError<'s>: error::Error;
    /// Error produced when executing the expression fails.
    type ExecError<'s>: error::Error;

    /// Attempt to compile `string` into a new boxed instance of `Self`.
    ///
    /// Handy when the result is going to be dispatched through a dynamic `RegexMatcher` rather than through `RegexEngine`.
    fn try_compile_boxed<'s>(string: &'s str) -> Result<Box<Self>, Self::CompileError<'s>>;

    /// Attempt to compile `string` into a new instance of `Self`.
    ///
    /// The default implementation compiles through `try_compile_boxed()` and moves the value out of the box.
    #[inline(always)]
    fn try_compile<'s>(string: &'s str) -> Result<Self, Self::CompileError<'s>>
    where Self: Sized
    {
        match Self::try_compile_boxed(string) {
            Ok(boxed) => Ok(*boxed),
            Err(err) => Err(err),
        }
    }

    /// Attempt to match `string`, returning the groups as `Self::Output`.
    /// Implementations should return `Ok(None)` when there is no match.
    fn try_exec<'s>(&self, string: &'s str) -> Result<Option<Self::Output<'s>>, Self::ExecError<'s>>;

    /// Should `try_exec()` be run over an iterator of `string`s in parallel or in sequence? Or does it not matter?
    /// `num` is the number of `string`s, if the caller knows it.
    ///
    /// Zero `string`s is assumed to cause no execution at all.
    ///
    /// # Returns
    /// - `Some(true)` - Yes, prefer running in parallel.
    /// - `Some(false)` - No, do **not** run in parallel if possible.
    /// - ~default~ `None` - Unknown. Running in parallel is possible, but it either does not matter or may bring no tangible performance benefit over running in sequence.
    #[inline(always)]
    fn should_prefer_run_in_parallel(&self, _num: Option<NonZeroUsize>) -> Option<bool>
    {
        None
    }

    /// Prepare the regex for use. When it will be used on a known number of `string`s, that count is passed as `_num`.
    ///
    /// This is the hook for work such as re-compilation / optimisation that could speed up matching when the regex is used more than once.
    /// The default implementation does nothing.
    #[inline(always)]
    fn prepare_regex(&mut self, _num: Option<NonZeroUsize>)
    {
    }
}
// compile_error!("XXX: Can we do this GAT-generically...?");
// pub trait RegexEngineDetachedOutput: RegexEngine
// {
// fn try_exec_detached<'s>(&self, string: &'s str) -> Result<Self::Output<'static>, Self::ExecError<'s>>;
// }
/// `RegexEngine` backed directly by the `regex` crate; groups borrow from the matched string.
impl RegexEngine for NonPCRERegex
{
    // With `unstable`, the groups are handed out as a lazy iterator (no allocation.)
    // XXX: TODO: Rework this so the non-allocating form does not need an unstable feature.
    #[cfg(feature="unstable")]
    type Output<'string> = impl Iterator<Item = Cow<'string, str>> + 'string;
    // Without `unstable`, the groups are collected into a boxed slice.
    #[cfg(not(feature="unstable"))]
    type Output<'string> = Box<[Cow<'string, str>]>;
    type CompileError<'s> = regex::Error;
    type ExecError<'s> = Infallible;

    /// Compile `string` directly, without the intermediate box the trait default would use.
    #[inline]
    fn try_compile<'s>(string: &'s str) -> Result<Self, Self::CompileError<'s>> {
        Self::new(string)
    }

    /// Compile `string` and box the result.
    #[inline(always)]
    fn try_compile_boxed<'s>(string: &'s str) -> Result<Box<Self>, Self::CompileError<'s>> {
        match Self::new(string) {
            Ok(compiled) => Ok(Box::new(compiled)),
            Err(err) => Err(err),
        }
    }

    /// Match `string`, yielding one entry per capture group (index 0 included.)
    ///
    /// A group that did not participate in the match is yielded as a borrowed empty string.
    #[inline]
    fn try_exec<'s>(&self, string: &'s str) -> Result<Option<Self::Output<'s>>, Self::ExecError<'s>> {
        let Some(caps) = self.captures(string) else {
            return Ok(None);
        };
        // The captures are moved into the closure so the iterator can outlive this call.
        let groups = (0..caps.len()).map(move |idx| Cow::Borrowed(caps.get(idx).map_or("", |ma| ma.as_str())));
        // If `unstable` is not enabled, we cannot skip this allocation (for now...)
        #[cfg(not(feature="unstable"))]
        let groups = groups.collect();
        Ok(Some(groups))
    }

    /// A single known string gives no preference (`None`); anything else prefers parallel execution.
    #[inline(always)]
    fn should_prefer_run_in_parallel(&self, num: Option<NonZeroUsize>) -> Option<bool> {
        match num.map(NonZeroUsize::get) {
            Some(1) => None,
            _ => Some(true),
        }
    }
}
/// `RegexEngine` for the (optionally PCRE-backed) `Regex`; groups are returned as owned strings.
impl RegexEngine for Regex
{
    type Output<'string> = Groups; // XXX: Can we have a ref-only output here...? Maybe for non-PCRE... So keep as this for now.
    type CompileError<'s> = Error;
    type ExecError<'s> = Infallible;

    /// Compile `string` via `Regex::compile()`, without the intermediate box the trait default would use.
    #[inline]
    fn try_compile<'s>(string: &'s str) -> Result<Self, Self::CompileError<'s>> {
        Self::compile(string)
    }

    /// Compile `string` via `Regex::compile()` and box the result.
    #[inline(always)]
    fn try_compile_boxed<'s>(string: &'s str) -> Result<Box<Self>, Self::CompileError<'s>> {
        Self::compile(string).map(Box::new)
    }

    /// Match `string` via `Regex::exec()`.
    ///
    /// # Panics
    /// Panics if `Regex::exec()` returns an error. `ExecError` is `Infallible`, so an error cannot be reported to the caller;
    /// `exec()` is expected to have no failing path (XXX: Why does it even return `Result` anyway...?)
    #[inline(always)]
    fn try_exec<'s>(&self, string: &'s str) -> Result<Option<Self::Output<'s>>, Self::ExecError<'s>> {
        // Deliberately checked: if `exec()` ever grows an error path again, this is a loud panic
        // rather than the undefined behaviour an unchecked unwrap would cause.
        Ok(Self::exec(self, string).expect("`Regex::exec()` returned an error, but `ExecError` is `Infallible`"))
    }

    /// PCRE supports `study()`ing the regular expression, which we might want to do if we have more than a few strings to match on.
    ///
    /// If PCRE is not enabled, and we use the Rust regex `regex::Regex`; it does not require/support additional optimisations, so keep the default noop-impl from the trait if this feature is not enabled.
    #[cfg(feature="perl")]
    fn prepare_regex(&mut self, num: Option<NonZeroUsize>) {
        // Skip studying for one or two strings, and when the count is unknown.
        if matches!(num.map(NonZeroUsize::get), Some(1..=2) | None) {
            return;
        }
        // NOTE: If another lock is held while *this* method is being invoked, it can *only* make logical sense
        // that the same method is running on a different thread, so never block here.
        // `try_lock()` also fails on a poisoned mutex, which is skipped the same way as a contended one.
        // NOTE(review): with `&mut self`, `Arc::get_mut()` + `Mutex::get_mut()` might avoid locking altogether -- confirm.
        if let Ok(mut re) = self.internal.try_lock() {
            re.study();
        }
    }

    /// Never prefer parallel execution for a single string; otherwise prefer it only when PCRE is *not* in use.
    #[inline(always)]
    fn should_prefer_run_in_parallel(&self, num: Option<NonZeroUsize>) -> Option<bool> {
        match num.map(NonZeroUsize::get) {
            Some(1) => Some(false),
            _ => Some(! Self::IS_EXTENDED),
        }
    }
}
/// Non-PCRE / non-extended regex (regardless of if the `perl` feature is enabled.)
///
/// This is a plain alias for `regex::Regex`, so it is available in every build configuration.
pub type NonPCRERegex = regex::Regex;
/// PCRE-enabled (if the `perl` feature is enabled, see [`Regex::IS_EXTENDED`]) regex.
///
/// The backing engine is chosen at compile time: a shared, mutex-guarded `pcre::Pcre` with the `perl` feature, or a plain `regex::Regex` without it.
/// In a `perl` build, cloning shares the same underlying `pcre::Pcre` through the `Arc`.
#[derive(Debug, Clone)]
pub struct Regex
{
#[cfg(feature="perl")]
internal: Arc<Mutex<pcre::Pcre>>,
internal: Arc<Mutex<pcre::Pcre>>, // XXX: Can we make parallel usage a bit less... expensive? TODO: How expensive is it to clone these into a thread-local cache, for instance?
#[cfg(not(feature = "perl"))]
internal: regex::Regex,
}
impl Regex
{
/// Whether this build's `Regex` uses PCRE instead of the default `regex` crate.
///
/// Evaluates to `cfg!(feature="perl")`, i.e. it is `true` only when the binary is compiled with the `perl` feature.
pub const IS_EXTENDED: bool = cfg!(feature="perl");
}
#[derive(Debug)]
pub enum Error
{
@ -65,9 +362,9 @@ impl Regex {
let len = m.string_count();
let mut output = Vec::with_capacity(len);
for i in 0..len {
output.push(m.group(i).to_owned());
output.push(m.group(i).into());
}
Some(output)
Some(output.into_boxed_slice())
},
None => None,
})
@ -76,14 +373,16 @@ impl Regex {
return {
Ok(match self.internal.captures(string.as_ref()) {
Some(m) => {
let mut output = Vec::with_capacity(m.len());
for i in 0..m.len() {
let ma = m.get(i).unwrap();
let mut op = String::with_capacity(ma.range().len());
write!(op, "{}", ma.as_str())?;
output.push(op);
}
Some(output)
Some((0..m.len()).map(move |i| match m.get(i) { Some(ma) => ma.as_str().into(), None => "".into()} ).collect())
// let mut output = Vec::with_capacity(m.len());
// for i in 0..m.len() {
// let ma = m.get(i).unwrap();
// //let mut op = String::with_capacity(ma.range().len());
// //let op = format!("{}", ma.as_str().into())
// output.push(ma.as_str().into());
// }
// Some(output.into_boxed_slice())
},
None => None,
})
@ -99,7 +398,7 @@ impl From<fmt::Error> for Error
}
}
#[cfg(not(feature = "perl"))]
//#[cfg(not(feature = "perl"))]
impl From<regex::Error> for Error
{
fn from(er: regex::Error) -> Self

Loading…
Cancel
Save