Compare commits

..

11 Commits
master ... cli

Author SHA1 Message Date
Avril b8723cc8d0
Merged old-interface colourised extra help + feature info.
2 days ago
Avril e2358483c9
Fixed colour output being printed to non-TTY outputs & to `NO_COLOR` environments.
5 days ago
Avril 003986677f
Finished pretty-formatting of extra info in help message (about version & enabled features.)
5 days ago
Avril c830b1728b
Deferred comptime string concatenation in cases where near-duplicate strings would end up in the binary with no performance benefit from not having the extra indirection in printing. (See `ext::disjoint!` macro.)
6 days ago
Avril d31d09366f
Added extra info in old interface help message (about version & enabled features.)
1 week ago
Avril 8d7f9399d2
`GroupConfig`: Added some documentation explaining fields & defaults.
1 week ago
Avril 3b1299c176
old interface: Added more efficient flow of output `stdout` object(s) through main branches. Internal buffer is flushed into lock before it is dropped in the `-` case, & only the non-locked handle is flushed in the cli-string case.
1 week ago
Avril 765270feaf
Remove unnecessary multi-`stdout` locking. Added internal buffering.
1 week ago
Avril 4539f87528
Added `GroupConfig` argument to `print_groups()`: Configurable group printing delimiters (default: Field delim (specified groups) <TAB>-literal, Line delim (input strings) <LF>-literal.)
1 week ago
Avril e6c0714575
Added multi-group printing (to old interface.)
2 weeks ago
Avril bcdbec60ca
Started PCRE2 dep-updated Cli refactor
2 weeks ago

@ -1,6 +1,6 @@
[package] [package]
name = "rematch" name = "rematch"
version = "0.3.2" version = "1.1.0+1"
authors = ["Avril <flanchan@cumallover.me>"] authors = ["Avril <flanchan@cumallover.me>"]
edition = "2024" edition = "2024"
@ -25,6 +25,8 @@ unstable = ["regex/unstable"]
[dependencies] [dependencies]
pcre2 = { version = "0.2.9", optional = true } pcre2 = { version = "0.2.9", optional = true }
clap = { version = "4.5.35", features = ["derive", "env", "string"] }
regex = { version = "1.11.1", features = ["use_std"] } regex = { version = "1.11.1", features = ["use_std"] }
color-eyre = { version = "0.6.3", default-features = false, features = ["track-caller"] } color-eyre = { version = "0.6.3", default-features = false, features = ["track-caller"] }
rayon = "1.10.0"
owo-colors = { version = "3.5.0", features = ["alloc", "supports-colors"] } owo-colors = { version = "3.5.0", features = ["alloc", "supports-colors"] }

@ -0,0 +1,295 @@
//! Arguments and Cli-parsing
use super::*;
use std::{
str,
error, fmt,
borrow::{
Borrow, Cow, ToOwned,
},
path::{
Path, PathBuf,
},
//collections::BTreeSet as Set,
};
use clap::{
Parser,
Args,
Subcommand,
ValueEnum,
};
/// A value that is either supplied directly, or deferred to standard I/O (read from `stdin` / written to `stdout`.)
///
/// Wherever a `MaybeValue` is converted from or into text, the deferred case is spelled [`MaybeValue::STDIO_SYMBOL`] (`-`).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)]
pub enum MaybeValue<T = String> {
    /// No explicit value was given; standard I/O stands in for it.
    Stdio,
    /// An explicitly supplied value.
    Value(T),
}

impl<T> MaybeValue<T> {
    /// The textual token that selects [`MaybeValue::Stdio`].
    pub const STDIO_SYMBOL: &'static str = "-";

    /// `true` when this is the [`MaybeValue::Stdio`] variant.
    #[inline]
    pub const fn is_stdio(&self) -> bool {
        matches!(self, Self::Stdio)
    }

    /// Borrow the explicit value, or `None` for [`MaybeValue::Stdio`].
    #[inline]
    pub const fn value(&self) -> Option<&T> {
        if let Self::Value(inner) = self {
            Some(inner)
        } else {
            None
        }
    }

    /// `true` when an explicit value is held (the negation of [`is_stdio`](Self::is_stdio).)
    #[inline(always)]
    pub const fn has_value(&self) -> bool {
        matches!(self, Self::Value(_))
    }

    /// Convert the held value (if any) into `U`; `Stdio` stays `Stdio`.
    ///
    /// e.g. `MaybeValue::<String>::Value(s).map_into::<PathBuf>()` yields a `MaybeValue<PathBuf>`.
    #[inline]
    pub fn map_into<U: From<T>>(self) -> MaybeValue<U> {
        match self {
            Self::Stdio => MaybeValue::Stdio,
            Self::Value(inner) => MaybeValue::Value(U::from(inner)),
        }
    }

    /// Consume `self`, yielding the explicit value, or handing `self` back as `Err` when it is `Stdio`.
    #[inline]
    #[must_use]
    pub fn try_into_value(self) -> Result<T, Self> {
        match self {
            Self::Value(inner) => Ok(inner),
            Self::Stdio => Err(Self::Stdio),
        }
    }
}

/// Views the value as text; `Stdio` is viewed as [`MaybeValue::STDIO_SYMBOL`].
impl<T: AsRef<str>> AsRef<str> for MaybeValue<T> {
    #[inline]
    fn as_ref(&self) -> &str {
        if let Self::Value(inner) = self {
            inner.as_ref()
        } else {
            Self::STDIO_SYMBOL
        }
    }
}

/// Borrows the value as text; `Stdio` is borrowed as [`MaybeValue::STDIO_SYMBOL`].
// NOTE(review): `Stdio` and `Value("-")` both borrow as `"-"` yet compare unequal, and the derived
// `Hash`/`Ord` are not those of `str`. `Borrow` expects these to agree, so `&str` lookups into a
// map/set keyed by `MaybeValue` would be unreliable -- confirm no caller depends on that.
impl<T: Borrow<str>> Borrow<str> for MaybeValue<T> {
    #[inline]
    fn borrow(&self) -> &str {
        if let Self::Value(inner) = self {
            <T as Borrow<str>>::borrow(inner)
        } else {
            Self::STDIO_SYMBOL
        }
    }
}

/// The default is to defer to standard I/O.
impl<T> Default for MaybeValue<T> {
    #[inline]
    fn default() -> Self {
        MaybeValue::Stdio
    }
}

impl<T: Into<String>> MaybeValue<T> {
    /// Consume into text: the owned value, or the borrowed [`MaybeValue::STDIO_SYMBOL`] for `Stdio`.
    #[inline]
    pub fn into_string(self) -> Cow<'static, str> {
        match self.try_into_value() {
            Ok(inner) => Cow::Owned(inner.into()),
            Err(_) => Cow::Borrowed(Self::STDIO_SYMBOL),
        }
    }
}

impl<T: Into<PathBuf>> MaybeValue<T> {
    /// Consume into a path: the owned value, or a borrowed path made of [`MaybeValue::STDIO_SYMBOL`] for `Stdio`.
    #[inline]
    pub fn into_path(self) -> Cow<'static, Path> {
        match self.try_into_value() {
            Ok(inner) => Cow::Owned(inner.into()),
            Err(_) => Cow::Borrowed(Path::new(Self::STDIO_SYMBOL)),
        }
    }
}

impl<T: Into<PathBuf>> From<MaybeValue<T>> for Box<Path> {
    #[inline]
    fn from(value: MaybeValue<T>) -> Self {
        let owned: PathBuf = value.into_path().into_owned();
        owned.into_boxed_path()
    }
}

impl<T: Into<String>> From<MaybeValue<T>> for Box<str> {
    #[inline]
    fn from(value: MaybeValue<T>) -> Self {
        let owned: String = value.into_string().into_owned();
        owned.into_boxed_str()
    }
}

impl<T: Into<String>> From<MaybeValue<T>> for Cow<'static, str> {
    fn from(from: MaybeValue<T>) -> Self {
        MaybeValue::into_string(from)
    }
}

/// An owned string equal to [`MaybeValue::STDIO_SYMBOL`] becomes `Stdio`; anything else is converted into `T`.
impl<T: From<String>> From<String> for MaybeValue<T> {
    #[inline]
    fn from(from: String) -> Self {
        if from == Self::STDIO_SYMBOL {
            Self::Stdio
        } else {
            Self::Value(T::from(from))
        }
    }
}

/// [`MaybeValue::STDIO_SYMBOL`] parses to `Stdio`; any other text is handed to `T`'s own parser.
impl<T> str::FromStr for MaybeValue<T>
where
    T: str::FromStr,
{
    type Err = T::Err;

    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s == Self::STDIO_SYMBOL {
            Ok(Self::Stdio)
        } else {
            s.parse::<T>().map(Self::Value)
        }
    }
}
// NOTE(review): the `///` text on this `clap`-derived struct and its fields is presumably consumed by `clap` to build the `--help` output; treat wording changes to it as user-visible changes -- confirm against the rendered help.
/// User-provied configuration of how the program should behave here
#[derive(Debug, Args)]
pub struct Config
{
/// Use the PCRE (JS-like) extended regular expression compiler.
///
/// __NOTE__: The binary must have been compiled with build feature `perl` to use this option.
///
/// # Feature difference
/// By default, the expression syntax does not support things like negative lookahead and other backtrack-requiring regex features.
///
/// ## Efficiency
/// Note that non-PCRE expressions are more efficient in general, and can also enable parallel processing of strings where there are many (e.g. a long list of lines from `stdin` can be matched against in parallel.)
///
/// It is ill-advised to enable PCRE on large inputs unless those features are required.
//TODO: Should we have PCRE on by default or not...? I think we should maybe have it on by default if the feature is enabled... But that will mess with input parallelism... XXX: Perhaps we can auto-detect if to use PCRE or not (e.g. try compiling to regex first, then PCRE if that fails?)
#[arg(short, long)] // XXX: Can we add a clap `value_parser!(FeatureOnBool<"perl">)` which fails to parse its `from_str()` impl if the feature is not enabled. Is this possible with what we currently have? We may be able to with macros, e.g expand a macro to `FeatureOnBool<"perl", const { cfg!(feature="perl") }>` or something similar? (NOTE: If `clap` has a better mechanism for this, use that instead of re-inventing it tho.)
// #[cfg(feature="perl")] //XXX: Do we want this option to be feature-gated? Or should we fail with error `if (! cfg!(feature="perl")) && self.extended)`? I think the latter would make things more easily (since the Regex engine gates PCRE-compilation transparently to the API user [see `crate::re::Regex`], we don't need to gate it this way outside of `re`, if we remove this gate we can just use `cfg!()` everywhere here which makes things **MUCH** cleaner..) It also means the user of a non-PCRE build will at least know why their PCRE flag is failing and that it can be built with the "perl" feature, instead of it being *totally* invisible to the user if the feature is off.
pub extended: bool,
/// Delimit read input/output strings from/to `stdin`/`stdout` by NUL ('\0') characters instead of newlines.
///
/// This only affects the output of each string's match groups, not the groups themselves, those will still be delimited by TAB literals in the output.
#[arg(short='0', long)]
pub zero: bool, //XXX: Add `--field=`/`--ifs` option, put these in same group. Maybe add `--delimit-groups=` to change the group delimiter from `\t` to user-specified value.
}
impl Config
{
/// Whether it is requested to use PCRE regex instead of regular regex.
///
/// Returns the `extended` field unchanged; no feature check is performed here.
///
/// # Interaction with feature gating of ~actual~ PCRE support via `feature="perl"`
/// Note that if the "perl" feature is not enabled, this may still return `true`.
/// If the user requests PCRE where it is not available, the caller should return an error/panic to the user telling her that.
#[inline(always)]
#[deprecated(note = "Access field `extended` instead.")]
//TODO: `extended` is already a `pub` field, so this accessor is redundant; remove it once nothing calls it any more?
pub fn use_pcre(&self) -> bool
{
// Earlier, feature-gated variant kept for reference (see the comment on un-gating `self.extended` in the struct):
//#![allow(unreachable_code)]
//#[cfg(feature="perl")] return self.extended; //TODO: See above comment on un-gating `self.extended`
//false
self.extended
}
}
/// A string value that may be provided to the CLI, or delegated to `stdio`.
///
/// Alias of [`MaybeValue`] holding a boxed string slice.
pub type MaybeString = MaybeValue<Box<str>>;
/// A path that may represent an `stdio` file-descriptor instead of a named file.
///
/// Alias of [`MaybeValue`] holding a boxed path.
pub type MaybePath = MaybeValue<Box<Path>>;
/// `rematch` is a simple command-line tool for matching & printing capture groups of an input string(s) against a regular expression.
///
/// The input string(s) can be provided in the command-line, or they can be provided as line delimited (by default) stream from `stdin`.
#[derive(Debug, Parser)]
#[command(name = env!("CARGO_PKG_NAME"), version, about, long_about)]
pub struct Cli {
    /// Configuration of the execution
    #[command(flatten)]
    pub config: Config,
    //XXX: Should we make these fields public?
    /// The input string to use, or `-` to read from stdin.
    //TODO: Support multiple input strings in non-`stdin` case too. (XXX: How should this be handled...?)
    string: MaybeString,
    /// The regular expression to match `string` on.
    regex: String,
    /// The regex capture group indices to print when `string` matches.
    #[arg(required= true, trailing_var_arg = true, allow_hyphen_values = false, num_args=1..)]
    //TODO: Allow ranges & fallible captures, so lines that match group 1 but not 2 will not cause output failure if given `1 2?` but will if given `1 2` (XXX: Is this actually meaningful/possible? Can we do this at all? I'm pretty sure `/(?:(.))?/` still creates an (empty) group? So perhaps, syntax for failing on *empty* group matches...? like, `1! 2` for "group #1 *required*, group #2 is not requested?")
    // Kept as a `Vec` in argument order: duplicates are not removed and the user's ordering is preserved.
    groups: Vec<usize>, // TODO: How to dedup (XXX: Do we want to de-dup? Maybe the user wants group `1` twice? I think it's fine (also we need to preserve user ordering of group indices))
}
impl Cli {
/// Get the input string to match on
///
/// If the requested input is `stdin`, `None` is returned.
#[inline]
pub fn input_string(&self) -> Option<&str>
{
self.string.value().map(AsRef::as_ref)
}
/// Get the string to build the regular expression from
pub fn regex_string(&self) -> &str
{
&self.regex[..]
}
/// Get the match group(s) to print in the output
#[inline]
pub fn groups(&self) -> &[usize]
{
&self.groups[..]
}
/// Get the number of match groups requested.
#[inline]
pub fn num_groups(&self) -> usize
{
self.groups.len()
}
}
/// Parse the command-line arguments passed to the program
pub fn parse_cli() -> Cli
{
clap::Parser::parse()
}

@ -2,6 +2,7 @@
mod re; mod re;
mod text; mod text;
mod args;
mod ext; use ext::*; mod ext; use ext::*;
use color_eyre::{ use color_eyre::{
@ -19,15 +20,100 @@ fn initialise() -> eyre::Result<()>
Ok(()) Ok(())
} }
/// Byte-string delimiters handed to the group output writer: one placed between the requested groups of a single input string, and one placed after each input string (or line)'s output.
///
/// The defaults are `\n` (LF-literal) for [`line_delimiter`](GroupConfig::line_delimiter) and `\t` (TAB-literal) for [`field_delimiter`](GroupConfig::field_delimiter); see the fields for details.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord/*, Copy*/)]
pub struct GroupConfig<'a> {
    /// Terminates the output produced for each individual input string.
    ///
    /// Defaults to a single newline (`\n`) byte.
    pub line_delimiter: &'a [u8],
    /// Separates the output of each requested capture group within one input string's output.
    /// When one (or fewer) groups are requested (groups that may be *empty/unmatched*, but are **not** *invalid*,)
    /// no field delimiter appears in that output line.
    ///
    /// Defaults to a single tab (`\t`) byte.
    pub field_delimiter: &'a [u8],
}

impl<'a> Default for GroupConfig<'a> {
    /// Same as [`GroupConfig::new()`].
    #[inline]
    fn default() -> Self {
        GroupConfig::new()
    }
}

impl<'a> GroupConfig<'a> {
    /// The default configuration: LF line delimiter, TAB field delimiter.
    pub const fn new() -> Self {
        Self {
            field_delimiter: b"\t",
            line_delimiter: b"\n",
        }
    }

    /// `true` if the line delimiter is non-empty.
    pub const fn has_line_delimiter(&self) -> bool {
        !self.line_delimiter.is_empty()
    }

    /// `true` if the field delimiter is non-empty.
    pub const fn has_field_delimiter(&self) -> bool {
        !self.field_delimiter.is_empty()
    }

    /// Replace the field delimiter, keeping the current line delimiter.
    #[inline]
    pub const fn with_field_delimiter<'b>(self, field_delimiter: &'b [u8]) -> GroupConfig<'b>
    where
        'a: 'b,
    {
        GroupConfig {
            line_delimiter: self.line_delimiter,
            field_delimiter,
        }
    }

    /// Replace the field delimiter with the bytes of `field`, keeping the current line delimiter.
    #[inline]
    pub const fn with_field_delimiter_str<'b>(self, field: &'b str) -> GroupConfig<'b>
    where
        'a: 'b,
    {
        self.with_field_delimiter(field.as_bytes())
    }

    /// Replace the line delimiter, keeping the current field delimiter.
    #[inline]
    pub const fn with_line_delimiter<'b>(self, line_delimiter: &'b [u8]) -> GroupConfig<'b>
    where
        'a: 'b,
    {
        GroupConfig {
            line_delimiter,
            field_delimiter: self.field_delimiter,
        }
    }

    /// Replace the line delimiter with the bytes of `line`, keeping the current field delimiter.
    #[inline]
    pub const fn with_line_delimiter_str<'b>(self, line: &'b str) -> GroupConfig<'b>
    where
        'a: 'b,
    {
        self.with_line_delimiter(line.as_bytes())
    }
}
#[inline] #[inline]
fn print_groups<'a, S: ?Sized, G, T: 'a, I>(to: &mut S, g: G, groups: I) -> std::io::Result<()> fn print_groups<'c, 'a, S: ?Sized, G, T: 'a, I>(to: &mut S, g: G, groups: I, how: impl std::borrow::Borrow<GroupConfig<'c>>) -> std::io::Result<()>
where S: std::io::Write, where S: std::io::Write + 'c, // NOTE: This lifetime bound is not yet used, as it is just `Write`, but if we change this to a context wrapper, then we can copy the `how`'s `'c` references into the context object without direct write/format/cloning.
G: IntoIterator<Item = &'a Option<T>> + Clone + Copy, // NOTE: Copy bound to ensure we're not accidentally doing deep clones of `g`. G: IntoIterator<Item = &'a Option<T>> + Clone + Copy, // NOTE: Copy bound to ensure we're not accidentally doing deep clones of `g`.
//G: std::ops::Index<usize>, G::Output: std::borrow::Borrow<Option<T>>, //G: std::ops::Index<usize>, G::Output: std::borrow::Borrow<Option<T>>,
T: std::borrow::Borrow<str>, T: std::borrow::Borrow<str>,
I: IntoIterator<Item: std::borrow::Borrow<usize>/*, IntoIter: ExactSizeIterator*/>, I: IntoIterator<Item: std::borrow::Borrow<usize>/*, IntoIter: ExactSizeIterator*/>,
{ {
use std::borrow::Borrow; use std::{
borrow::Borrow,
io::Write,
};
let how = how.borrow(); //std::borrow::ToOwned::clone_into(&self, target);
let mut first = true; let mut first = true;
for group in groups.into_iter() { for group in groups.into_iter() {
let group = group.borrow(); let group = group.borrow();
@ -35,10 +121,13 @@ where S: std::io::Write,
// if !first { // if !first {
// write!(to, "\t")?; // write!(to, "\t")?;
// } // }
let print_delim = || first.then_some("").unwrap_or("\t"); // If it's not the first iteration, print `\t`. let print_delim = move |to: &mut S| to.write_all(first.then_some(&[][..]).unwrap_or(&how.field_delimiter[..]).as_ref()); // If it's not the first iteration, print `\t`.
match g.into_iter().nth(*group) { match g.into_iter().nth(*group) {
Some(None) => write!(to, "{}", print_delim()), Some(None) => print_delim(to),
Some(Some(g)) => write!(to, "{}{}", print_delim(), g.borrow()), Some(Some(g)) => {
print_delim(to)?;
write!(to, "{}", g.borrow())
},
//TODO: What should be the behaviour of a non-existent group index here? (NOTE: This now corresponds to the previous `g.len() > group` check in caller.) // (NOTE: The original behaviour is to just ignore groups that are out of range entirely (i.e. no printing, no delimit char, no error,) maybe treat non-existent groups as non-matched groups and *just* print the delim char?) //TODO: What should be the behaviour of a non-existent group index here? (NOTE: This now corresponds to the previous `g.len() > group` check in caller.) // (NOTE: The original behaviour is to just ignore groups that are out of range entirely (i.e. no printing, no delimit char, no error,) maybe treat non-existent groups as non-matched groups and *just* print the delim char?)
// (NOTE: Moved out of branch, see above ^) // None if !first => write!(to, "\t"), // (NOTE: Moved out of branch, see above ^) // None if !first => write!(to, "\t"),
@ -52,8 +141,8 @@ where S: std::io::Write,
first = false; first = false;
} }
// If `first == true`, no groups were printed, so we do not print the new-line. // If `first == true`, no groups were printed, so we do not print the new-line.
if !first { if !first && how.has_line_delimiter() {
to.write_all(b"\n") to.write_all(how.line_delimiter.as_ref())
} else { } else {
Ok(()) Ok(())
} }
@ -63,9 +152,9 @@ fn main() -> eyre::Result<()>
{ {
initialise().wrap_err("Fatal: Failed to install panic handle")?; initialise().wrap_err("Fatal: Failed to install panic handle")?;
//let cli = args::parse_cli();//.wrap_err("Error parsing command-line arguments")?; let cli = args::parse_cli();//.wrap_err("Error parsing command-line arguments")?;
//eprintln!("{:#?}", cli); eprintln!("{:#?}", cli);
// return Ok(()); // return Ok(());
let args: re::FrozenVec<re::FrozenString> = std::env::args().map(String::into_boxed_str).collect(); let args: re::FrozenVec<re::FrozenString> = std::env::args().map(String::into_boxed_str).collect();
@ -104,49 +193,57 @@ fn main() -> eyre::Result<()>
let re = re::Regex::compile(&args[2])?; let re = re::Regex::compile(&args[2])?;
let text = &args[1]; let text = &args[1];
let groups = &args[3..]; let print_cfg = GroupConfig::new();
let groups = {
let groups = &args[3..];
if groups.len() < 1 { if groups.len() < 1 {
eprintln!("Warning: No capture groups requested."); eprintln!("Warning: No capture groups requested.");
// NOTE: Unexpected branch... // NOTE: Unexpected branch...
return Ok(()); return Ok(());
} }
let groups = groups.iter().enumerate()
.map(|(i, x)| x.parse()
.with_section(|| format!("{:?}", groups).header("Groups specified were"))
.with_section(|| x.clone().header("Specified capture group index was"))
.with_section(move || i.header("Argument index in provided groups")))
.collect::<Result<Box<[usize]>, _>>()
.wrap_err("Invalid group index specified")?;
//TODO: XXX: How to handle multiple groups in `stdin_lines()` case? // Parse each group index into `groups`.
//let group = groups[0]; //args[3].parse().expect("Invalid group number."); groups.iter().enumerate()
.map(|(i, x)| x.parse()
.with_section(|| format!("{:?}", groups).header("Groups specified were"))
.with_section(|| x.clone().header("Specified capture group index was"))
.with_section(move || i.header("Argument index in provided groups")))
.collect::<Result<Box<[usize]>, _>>()
.wrap_err("Invalid group index specified")?
};
use std::io::Write; use std::io::Write;
let mut stdout = std::io::stdout(); let mut stdout = std::io::stdout();
// Take the kind of `stdout` used (locked & buffered, or not locked & buffered) ..
let stdout = if &text[..] == "-" { let stdout = if &text[..] == "-" {
// Lock the output for the duration of the read lines.
// Buffer the output in program memory to make processing a bit faster (i.e. the segmented 'write' operations in `print_groups()`, which may be called many times here) & not have to wait on write lines for no reason (since we're already waiting on read lines.)
let mut stdout = std::io::BufWriter::new(stdout.lock()); let mut stdout = std::io::BufWriter::new(stdout.lock());
text::stdin_lines(|text| -> eyre::Result<bool> { text::stdin_lines(|text| -> eyre::Result<bool> {
match re.exec(&text)? { match re.exec(&text)? {
Some(g) /*if g.len() > group*/ => // NOTE: This check branch has now been moved into `print_groups()` Some(g) /*if g.len() > group*/ => // NOTE: This check branch has now been moved into `print_groups()`
print_groups(&mut stdout, &g, &groups)?, //println!("{}", &g[group]), print_groups(&mut stdout, &g, &groups, &print_cfg)?, //println!("{}", &g[group]),
_ => (), _ => (),
} }
Ok(true) Ok(true)
})?; })?;
Some(stdout) // Return the buffer to the main block to be flushed to output (see above & below.)
Some(stdout)
} else { } else {
match re.exec(&text)? { match re.exec(&text)? {
Some(g) /*if g.len() > group*/ => print_groups(&mut stdout, &g[..], &groups)?,//println!("{}", &g.nth(group).unwrap().map(|x| x.as_ref()).unwrap_or("")), Some(g) /*if g.len() > group*/ => print_groups(&mut stdout, &g[..], &groups, print_cfg)?,//println!("{}", &g.nth(group).unwrap().map(|x| x.as_ref()).unwrap_or("")),
_ => (), _ => (),
} }
None // As there is no internal buffer used, there is no reason to pass it to be flushed by the program when finished like the `-` case above.
}.ok_or_else(move || stdout); None
}.ok_or_else(move || stdout); // NOTE ^: Instead, we have it flush the non-memory-buffered output handle before exiting the program.
// and .. Ensure the stream (and buffer, if used) is flushed (then dropped.)
unwrap_either!(mut stdout => stdout.flush()).unwrap(); unwrap_either!(mut stdout => stdout.flush()).unwrap();
} }
Ok(()) Ok(())

@ -11,7 +11,7 @@ pub type FrozenString = Box<str>;
// TODO: to return some kind of `Either<&'s str, impl bytes::Buf + 's>` type, which would use `str` on non-PCRE, but opaque `bytes::Buf` on PCRE?) // TODO: to return some kind of `Either<&'s str, impl bytes::Buf + 's>` type, which would use `str` on non-PCRE, but opaque `bytes::Buf` on PCRE?)
pub type FrozenBytes = FrozenVec<u8>; pub type FrozenBytes = FrozenVec<u8>;
pub type Groups<String = FrozenString> = FrozenVec<Option<String>>; pub type Groups<String = FrozenString> = FrozenVec<Option<String>>; // TODO: See `exec()` comment below about named groups and switching to `BTreeMap<{enum : CaptureGroupIdent::Index, Name}, Option<Cow<'s str>>>`
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct Regex pub struct Regex
@ -55,7 +55,7 @@ impl Regex {
return Ok(Self{internal: regex::Regex::new(string.as_ref())?}); return Ok(Self{internal: regex::Regex::new(string.as_ref())?});
} }
pub fn exec<'s>(&self, string: &'s str) -> Result<Option<Groups<Cow<'s, str>>>, Error> pub fn exec<'s>(&self, string: &'s str) -> Result<Option<Groups<Cow<'s, str>>>, Error> //TODO: Can we also add named groups with a `BTreeMap<{CG::Index(usize) | CG:Name(String)}, Option<Groups<Cow<'s, str>>>>` (XXX: And maybe also be able to simplefy `V` to just `Option<&'s str>` / `Option<Cow<'s, str>>`, since the group index is already encoded in `K` (group index / group name mapped to potential match of associated group).)
{ {
#[cfg(feature = "perl")] #[cfg(feature = "perl")]
return { return {

Loading…
Cancel
Save