From 6a74f54a264e75321508bb8f3e405e26279637bb Mon Sep 17 00:00:00 2001 From: Avril Date: Tue, 1 Apr 2025 20:21:12 +0100 Subject: [PATCH] Imported cli-refactor's `args.rs` (for future CLI refactor) with PCRE2 dependency update. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fortune for rematch's current commit: Small curse − 小凶 --- Cargo.toml | 17 ++- src/args.rs | 294 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 27 ++++- 3 files changed, 332 insertions(+), 6 deletions(-) create mode 100644 src/args.rs diff --git a/Cargo.toml b/Cargo.toml index 0a4cb86..5729d56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "rematch" -version = "0.1.0+1" +version = "0.2.0" authors = ["Avril "] -edition = "2018" +edition = "2024" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -13,10 +13,19 @@ codegen-units = 1 panic = "unwind" strip = true +[profile.symbols] +inherits = "release" +strip = false + [features] -default= ["perl"] +default= ["perl", "unstable"] + perl = ["dep:pcre2"] +unstable = ["regex/unstable"] [dependencies] -regex = "1" pcre2 = { version = "0.2.9", optional = true } +clap = { version = "4.5.35", features = ["derive", "env", "string"] } +regex = { version = "1.11.1", features = ["use_std"] } +color-eyre = { version = "0.6.3", default-features = false, features = ["track-caller"] } +rayon = "1.10.0" diff --git a/src/args.rs b/src/args.rs new file mode 100644 index 0000000..f86428c --- /dev/null +++ b/src/args.rs @@ -0,0 +1,294 @@ +//! Arguments and Cli-parsing +use super::*; +use std::{ + str, + error, fmt, + borrow::{ + Borrow, Cow, ToOwned, + }, + path::{ + Path, PathBuf, + }, + //collections::BTreeSet as Set, +}; +use clap::{ + Parser, + Args, + Subcommand, + ValueEnum, +}; + +/// A value that may be provided, or may be deferred to be provided by `stdin` (/ written to `stdout`.) 
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)] +pub enum MaybeValue +{ + Stdio, + Value(T), +} + +impl MaybeValue +{ + pub const STDIO_SYMBOL: &'static str = "-"; + + #[inline] + pub const fn is_stdio(&self) -> bool + { + match self { + Self::Stdio => true, + _ => false, + } + } + #[inline] + pub const fn value(&self) -> Option<&T> + { + match self { + Self::Value(v) => Some(&v), + _ => None + } + } + #[inline(always)] + pub const fn has_value(&self) -> bool + { + self.value().is_some() + } + + /// Convert the value type to `U` (if there is one.) + /// + /// e.g. to convert `let _: MaybeValue = MaybeString::map_into();` + #[inline] + pub fn map_into>(self) -> MaybeValue + { + match self { + Self::Value(v) => MaybeValue::Value(v.into()), + Self::Stdio => MaybeValue::Stdio, + } + } + + /// Consume into the `Value(T)` if possible, if not, return `Err(Self)`. + #[inline] + #[must_use] + pub fn try_into_value(self) -> Result + { + match self { + x @ Self::Stdio => Err(x), + Self::Value(v) => Ok(v), + } + } +} + +impl> AsRef for MaybeValue +{ + #[inline] + fn as_ref(&self) -> &str + { + match self { + Self::Stdio => Self::STDIO_SYMBOL, + Self::Value(v) => v.as_ref(), + } + } +} + +impl> Borrow for MaybeValue +{ + #[inline] + fn borrow(&self) -> &str + { + match self { + Self::Stdio => Self::STDIO_SYMBOL, + Self::Value(v) => v.borrow(), + } + } +} + + +impl Default for MaybeValue +{ + #[inline] + fn default() -> Self + { + Self::Stdio + } +} + +impl> MaybeValue +{ + #[inline] + pub fn into_string(self) -> Cow<'static, str> + { + match self { + Self::Value(v) => Cow::Owned(v.into()), + Self::Stdio => Cow::Borrowed(Self::STDIO_SYMBOL), + } + } +} + +impl> MaybeValue +{ + #[inline] + pub fn into_path(self) -> Cow<'static, Path> + { + match self { + Self::Value(v) => Cow::Owned(v.into()), + Self::Stdio => Cow::Borrowed(Path::new(Self::STDIO_SYMBOL)), + } + } +} + +impl> From> for Box +{ + #[inline] + fn from(value: MaybeValue) -> Self { + 
value.into_path().into_owned().into_boxed_path() + } +} +impl> From> for Box +{ + #[inline] + fn from(value: MaybeValue) -> Self { + value.into_string().into_owned().into_boxed_str() + } +} + +impl> From> for Cow<'static, str> +{ + fn from(from: MaybeValue) -> Self + { + from.into_string() + } +} + + +impl> From for MaybeValue +{ + #[inline] + fn from(from: String) -> Self + { + match &from[..] { + Self::STDIO_SYMBOL => Self::Stdio, + _ => Self::Value(from.into()), + } + } +} + +impl str::FromStr for MaybeValue +where T: str::FromStr { + type Err = T::Err; + + #[inline] + fn from_str(s: &str) -> Result { + match s { + Self::STDIO_SYMBOL => Ok(Self::Stdio), + s => T::from_str(s).map(Self::Value) + } + } +} + +/// User-provided configuration of how the program should behave here +#[derive(Debug, Args)] +pub struct Config +{ + /// Use the PCRE (JS-like) extended regular expression compiler. + /// + /// __NOTE__: The binary must have been compiled with build feature `perl` to use this option. + /// + /// # Feature difference + /// By default, the expression syntax does not support things like negative lookahead and other backtrack-requiring regex features. + /// + /// ## Efficiency + /// Note that non-PCRE expressions are more efficient in general, and can also enable parallel processing of strings where there are many (e.g. a long list of lines from `stdin` can be matched against in parallel.) + /// + /// It is ill-advised to enable PCRE on large inputs unless those features are required. + //TODO: Should we have PCRE on by default or not...? I think we should maybe have it on by default if the feature is enabled... But that will mess with input parallelism... XXX: Perhaps we can auto-detect if to use PCRE or not (e.g. try compiling to regex first, then PCRE if that fails?) + #[arg(short, long)] // XXX: Can we add a clap `value_parser!(FeatureOnBool<"perl">)` which fails to parse its `from_str()` impl if the feature is not enabled.
Is this possible with what we currently have? We may be able to with macros, e.g. expand a macro to `FeatureOnBool<"perl", const { cfg!(feature="perl") }>` or something similar? (NOTE: If `clap` has a better mechanism for this, use that instead of re-inventing it tho.) +// #[cfg(feature="perl")] //XXX: Do we want this option to be feature-gated? Or should we fail with error `if (! cfg!(feature="perl") && self.extended)`? I think the latter would make things easier (since the Regex engine gates PCRE-compilation transparently to the API user [see `crate::re::Regex`], we don't need to gate it this way outside of `re`, if we remove this gate we can just use `cfg!()` everywhere here which makes things **MUCH** cleaner..) It also means the user of a non-PCRE build will at least know why their PCRE flag is failing and that it can be built with the "perl" feature, instead of it being *totally* invisible to the user if the feature is off. + extended: bool, + + /// Delimit read input/output strings from/to `stdin`/`stdout` by NUL ('\0') characters instead of newlines. + /// + /// This only affects the output of each string's match groups, not the groups themselves, those will still be delimited by TAB literals in the output. + #[arg(short='0', long)] + pub zero: bool, //XXX: Add `--field=`/`--ifs` option, put these in same group. Maybe add `--delimit-groups=` to change the group delimiter from `\t` to user-specified value. +} + +impl Config +{ + /// Whether it is requested to use PCRE regex instead of regular regex. + /// + /// # Interaction with feature gating of ~actual~ PCRE support via `feature="perl"` + /// Note that if the "perl" feature is not enabled, this may still return `true`. + /// If the user requests PCRE where it is not available, the caller should return an error/panic to the user telling her that. + #[inline(always)] + //TODO: Make `extended` public and remove this accessor?
+ pub fn use_pcre(&self) -> bool + { + //#![allow(unreachable_code)] + //#[cfg(feature="perl")] return self.extended; //TODO: See above comment on un-gating `self.extended` + //false + self.extended + } +} + +/// A string value that may be provided to the CLI, or delegated to `stdio`. +pub type MaybeString = MaybeValue>; +/// A path that may represent an `stdio` file-descriptor instead of a named file. +pub type MaybePath = MaybeValue>; + +/// `rematch` is a simple command-line tool for matching & printing capture groups of an input string(s) against a regular expression. +/// +/// The input string(s) can be provided in the command-line, or they can be provided as a line-delimited (by default) stream from `stdin`. +#[derive(Debug, Parser)] +#[command(name = env!("CARGO_PKG_NAME"), version, about, long_about)] +pub struct Cli +{ + /// Configuration of the execution + #[command(flatten)] + pub config: Config, + + //XXX: Should we make these fields public? + /// The input string to use, or `-` to read from stdin. + //TODO: Support multiple input strings in non-`stdin` case too. (XXX: How should this be handled...?) + string: MaybeString, + /// The regular expression to match `string` on. + regex: String, + /// The regex capture group indices to print when matching on `string`. + #[arg(required= true, trailing_var_arg = true, allow_hyphen_values = false, num_args=1..)] + //TODO: Allow ranges & fallible captures, so lines that match group 1 but not 2 will not cause output failure if given `1 2?` but will if given `1 2` (XXX: Is this actually meaningful/possible? Can we do this at all? I'm pretty sure `/(?:(.))?/` still creates an (empty) group? So perhaps, syntax for failing on *empty* group matches...? like, `1! 2` for "group #1 *required*, group #2 is not requested?") + groups: Vec, // TODO: How to dedup (XXX: Do we want to de-dup? Maybe the user wants group `1` twice?
I think it's fine (also we need to preserve user ordering of group indices)) +} + +impl Cli { + /// Get the input string to match on + /// + /// If the requested input is `stdin`, `None` is returned. + #[inline] + pub fn input_string(&self) -> Option<&str> + { + self.string.value().map(AsRef::as_ref) + } + + /// Get the string to build the regular expression from + pub fn regex_string(&self) -> &str + { + &self.regex[..] + } + + /// Get the match group(s) to print in the output + #[inline] + pub fn groups(&self) -> &[usize] + { + &self.groups[..] + } + + /// Get the number of match groups requested. + #[inline] + pub fn num_groups(&self) -> usize + { + self.groups.len() + } +} + +/// Parse the command-line arguments passed to the program +pub fn parse_cli() -> Cli +{ + clap::Parser::parse() +} diff --git a/src/main.rs b/src/main.rs index c35845d..e54d328 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,22 @@ mod re; mod text; +mod args; + +use color_eyre::{ + eyre::{ + self, + eyre, + WrapErr as _, + }, + SectionExt as _, Help as _, +}; + +fn initialise() -> eyre::Result<()> +{ + color_eyre::install()?; + Ok(()) +} fn print_group(to: &mut S, g: G, group: usize) -> std::io::Result<()> where S: std::io::Write, @@ -15,8 +31,15 @@ where S: std::io::Write, } } -fn main() -> Result<(), Box> +fn main() -> eyre::Result<()> { + initialise().wrap_err("Fatal: Failed to install panic handle")?; + + // let cli = args::parse_cli();//.wrap_err("Error parsing command-line arguments")?; + // + // eprintln!("{:#?}", cli); + // return Ok(()); + let args: Vec = std::env::args().collect(); if args.len() < 4 { @@ -32,7 +55,7 @@ fn main() -> Result<(), Box> let mut stdout = std::io::stdout(); if text == "-" { - text::stdin_lines(|text| -> Result> { + text::stdin_lines(|text| -> eyre::Result { let mut stdout = stdout.lock(); match re.exec(&text)? { Some(g) if g.len() > group => print_group(&mut stdout, g, group)?, //println!("{}", &g[group]),