Compare commits

...

10 Commits

Author SHA1 Message Date
Avril e2358483c9
Fixed colour output being printed to non-TTY outputs & to `NO_COLOR` environments.
5 days ago
Avril 003986677f
Finished pretty-formatting of extra info in help message (about version & enabled features.)
5 days ago
Avril c830b1728b
Deferred comptime string concatenation in cases where near-duplicate strings would end up in the binary with no performance benefit from not having the extra indirection in printing. (See `ext::disjoint!` macro.)
6 days ago
Avril d31d09366f
Added extra info in old interface help message (about version & enabled features.)
1 week ago
Avril 8d7f9399d2
`GroupConfig`: Added some documentation explaining fields & defaults.
1 week ago
Avril 3b1299c176
old interface: Added more efficient flow of output `stdout` object(s) through main branches. Internal buffer is flushed into lock before it is dropped in the `-` case, & only the non-locked handle is flushed in the cli-string case.
1 week ago
Avril 765270feaf
Remove unnecessary multi-`stdout` locking. Added internal buffering.
1 week ago
Avril 4539f87528
Added `GroupConfig` argument to `print_groups()`: Configurable group printing delimiters (default: Field delim (specified groups) <TAB>-literal, Line delim (input strings) <LF>-literal.)
1 week ago
Avril e6c0714575
Added multi-group printing (to old interface.)
2 weeks ago
Avril bcdbec60ca
Started PCRE2 dep-updated Cli refactor
2 weeks ago

@ -1,6 +1,6 @@
[package] [package]
name = "rematch" name = "rematch"
version = "0.2.0" version = "1.1.0+1"
authors = ["Avril <flanchan@cumallover.me>"] authors = ["Avril <flanchan@cumallover.me>"]
edition = "2024" edition = "2024"
@ -29,3 +29,4 @@ clap = { version = "4.5.35", features = ["derive", "env", "string"] }
regex = { version = "1.11.1", features = ["use_std"] } regex = { version = "1.11.1", features = ["use_std"] }
color-eyre = { version = "0.6.3", default-features = false, features = ["track-caller"] } color-eyre = { version = "0.6.3", default-features = false, features = ["track-caller"] }
rayon = "1.10.0" rayon = "1.10.0"
owo-colors = { version = "3.5.0", features = ["alloc", "supports-colors"] }

@ -201,7 +201,7 @@ pub struct Config
//TODO: Should we have PCRE on by default or not...? I think we should maybe have it on by default if the feature is enabled... But that will mess with input parallelism... XXX: Perhaps we can auto-detect if to use PCRE or not (e.g. try compiling to regex first, then PCRE if that fails?) //TODO: Should we have PCRE on by default or not...? I think we should maybe have it on by default if the feature is enabled... But that will mess with input parallelism... XXX: Perhaps we can auto-detect if to use PCRE or not (e.g. try compiling to regex first, then PCRE if that fails?)
#[arg(short, long)] // XXX: Can we add a clap `value_parser!(FeatureOnBool<"perl">)` which fails to parse its `from_str()` impl if the feature is not enabled. Is this possible with what we currently have? We may be able to with macros, e.g expand a macro to `FeatureOnBool<"perl", const { cfg!(feature="perl") }>` or something similar? (NOTE: If `clap` has a better mechanism for this, use that instead of re-inventing it tho.) #[arg(short, long)] // XXX: Can we add a clap `value_parser!(FeatureOnBool<"perl">)` which fails to parse its `from_str()` impl if the feature is not enabled. Is this possible with what we currently have? We may be able to with macros, e.g expand a macro to `FeatureOnBool<"perl", const { cfg!(feature="perl") }>` or something similar? (NOTE: If `clap` has a better mechanism for this, use that instead of re-inventing it tho.)
// #[cfg(feature="perl")] //XXX: Do we want this option to be feature-gated? Or should we fail with error `if (! cfg!(feature="perl")) && self.extended)`? I think the latter would make things more easily (since the Regex engine gates PCRE-compilation transparently to the API user [see `crate::re::Regex`], we don't need to gate it this way outside of `re`, if we remove this gate we can just use `cfg!()` everywhere here which makes things **MUCH** cleaner..) It also means the user of a non-PCRE build will at least know why their PCRE flag is failing and that it can be built with the "perl" feature, instead of it being *totally* invisible to the user if the feature is off. // #[cfg(feature="perl")] //XXX: Do we want this option to be feature-gated? Or should we fail with error `if (! cfg!(feature="perl")) && self.extended)`? I think the latter would make things more easily (since the Regex engine gates PCRE-compilation transparently to the API user [see `crate::re::Regex`], we don't need to gate it this way outside of `re`, if we remove this gate we can just use `cfg!()` everywhere here which makes things **MUCH** cleaner..) It also means the user of a non-PCRE build will at least know why their PCRE flag is failing and that it can be built with the "perl" feature, instead of it being *totally* invisible to the user if the feature is off.
extended: bool, pub extended: bool,
/// Delimit read input/output strings from/to `stdin`/`stdout` by NUL ('\0') characters instead of newlines. /// Delimit read input/output strings from/to `stdin`/`stdout` by NUL ('\0') characters instead of newlines.
/// ///
@ -218,6 +218,7 @@ impl Config
/// Note that if the "perl" feature is not enabled, this may still return `true`. /// Note that if the "perl" feature is not enabled, this may still return `true`.
/// If the user requests PCRE where it is not available, the caller should return an error/panic to the user telling her that. /// If the user requests PCRE where it is not available, the caller should return an error/panic to the user telling her that.
#[inline(always)] #[inline(always)]
#[deprecated(note = "Access field `extended` instead.")]
//TODO: Make `extended` public and remove this accessor? //TODO: Make `extended` public and remove this accessor?
pub fn use_pcre(&self) -> bool pub fn use_pcre(&self) -> bool
{ {

@ -0,0 +1,127 @@
//! Extensions
use super::*;
use std::{
fmt,
};
/// Run an expression on a named value of type `Result<T, U>`,
/// where `T` and `U` have *the same API surface* for the duration of the provided expression.
///
/// The identifier is re-bound to the payload of whichever variant is present (`Ok` or `Err`),
/// and the expression is expanded once per variant; both expansions must have the same type.
///
/// # Example
/// If there is a value `let mut value: Result<T, U>`, where both `T` and `U` implement `Write`;
/// the expression `value.flush()` is valid for both `T` and `U`.
/// Therefore, it can be simplified to be called as so: `unwrap_either!(mut value => value.flush())`.
///
/// # Reference capture vs. `move` capture.
/// Note that by default, the identified value is **moved** *into* the expression.
/// The kind of capture can be controlled by prepending `mut`, `ref`, or `ref mut` to the ident.
///
/// Identifier capture table:
/// - **none** (default) - Capture by move, value is immutable in expression.
/// - `mut` - Capture by move, value is mutable in expression.
/// - `ref` - Capture by ref, value is immutable (`&value`) in expression.
/// - `ref mut` - Capture by mutable ref, value is mutable (`&mut value`) in expression. (__NOTE__: `value` must be defined as mutable to take a mutable reference of it.)
///
/// Essentially the same rules as any `match` branch pattern.
macro_rules! unwrap_either {
    // Default: capture by move, immutable binding (as documented in the table above).
    ($res:ident => $($rest:tt)+) => {
        match $res {
            Ok($res) => $($rest)+,
            Err($res) => $($rest)+,
        }
    };
    // Capture by mutable reference; `$res` must be a mutable place.
    (ref mut $res:ident => $($rest:tt)+) => {
        match $res {
            Ok(ref mut $res) => $($rest)+,
            Err(ref mut $res) => $($rest)+,
        }
    };
    // Capture by shared reference; `$res` is still usable afterwards.
    (ref $res:ident => $($rest:tt)+) => {
        match $res {
            Ok(ref $res) => $($rest)+,
            Err(ref $res) => $($rest)+,
        }
    };
    // Capture by move, with a mutable binding inside the expression.
    (mut $res:ident => $($rest:tt)+) => {
        match $res {
            Ok(mut $res) => $($rest)+,
            Err(mut $res) => $($rest)+,
        }
    };
}
pub(crate) use unwrap_either;
/// A string made of several separate pieces that are only joined when it is displayed.
///
/// This is a `#[repr(transparent)]` view over a slice of references (`[&'a T]`), so it is unsized
/// and only ever handled behind a reference. Its `Display` impl writes every piece in order with
/// nothing in between.
#[derive(Debug, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct DisjointString<'a, T: ?Sized>([&'a T]);

/// Build a `&DisjointString` from a non-empty, comma-separated list of expressions.
///
/// The expressions are placed in an array and handed to `DisjointString::from_array`,
/// so each one must evaluate to a `&T` of the same `T`.
macro_rules! disjoint {
    [$($piece:expr),+] => {
        $crate::ext::DisjointString::from_array(& [$($piece),+])
    };
}

impl<'a, T: ?Sized> DisjointString<'a, T>
where
    T: fmt::Display,
{
    /// View a fixed-size array of pieces as a `DisjointString`.
    #[inline]
    pub const fn from_array<'o: 'a, const N: usize>(strings: &'o [&'a T; N]) -> &'o Self {
        let pieces: &'o [&'a T] = strings;
        Self::new(pieces)
    }

    /// View a slice of pieces as a `DisjointString` without copying anything.
    #[inline]
    pub const fn new<'o: 'a>(strings: &'o [&'a T]) -> &'o Self {
        let raw = strings as *const [&'a T] as *const Self;
        // SAFETY: `Self` is a `#[repr(transparent)]` newtype over `[&'a T]`, so the two pointers
        // share layout and slice-length metadata, and `raw` was derived from a reference that is
        // valid for `'o`.
        unsafe { &*raw }
    }
}

impl<'a, T: ?Sized> DisjointString<'a, T> {
    /// Number of pieces (not the length of the joined text).
    #[inline]
    pub const fn len(&self) -> usize {
        self.0.len()
    }

    /// Iterate over the pieces, borrowed for as long as `self` is borrowed.
    #[inline]
    pub fn iter(&self) -> impl Iterator<Item = &T> + ExactSizeIterator + std::iter::FusedIterator + std::iter::DoubleEndedIterator {
        self.0.iter().copied()
    }

    /// Iterate over the pieces with their original `'a` lifetime.
    #[inline]
    pub fn into_iter<'o: 'a>(&'o self) -> impl Iterator<Item = &'a T> + ExactSizeIterator + std::iter::FusedIterator + std::iter::DoubleEndedIterator + 'o {
        self.0.iter().copied()
    }
}

impl<'a, T: ?Sized> AsRef<[&'a T]> for DisjointString<'a, T> {
    /// Expose the underlying slice of pieces.
    #[inline]
    fn as_ref(&self) -> &[&'a T] {
        &self.0
    }
}

impl<'a, T: ?Sized> fmt::Display for DisjointString<'a, T>
where
    T: fmt::Display,
{
    /// Write every piece back-to-back, stopping at the first formatting error.
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.0.iter().try_for_each(|piece| piece.fmt(f))
    }
}
pub(crate) use disjoint;

@ -3,6 +3,7 @@
mod re; mod re;
mod text; mod text;
mod args; mod args;
mod ext; use ext::*;
use color_eyre::{ use color_eyre::{
eyre::{ eyre::{
@ -19,15 +20,131 @@ fn initialise() -> eyre::Result<()>
Ok(()) Ok(())
} }
fn print_group<S: ?Sized, G, T>(to: &mut S, g: G, group: usize) -> std::io::Result<()> /// Provides the group output writer with references to the byte-strings to be used to delimit single groups' output & to delimit each input string (or line)'s output.
where S: std::io::Write, ///
G: IntoIterator<Item = Option<T>>, /// The defaults are `\n` (LF-literal) for [line_delimiter], and `\t` for [field_delimiter], see the fields for more information.
T: std::borrow::Borrow<str> #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord/*, Copy*/)]
pub struct GroupConfig<'a>
{ {
match g.into_iter().nth(group) { /// Delimits the output of each individual input string.
Some(None) => writeln!(to, ""), ///
Some(Some(g)) => writeln!(to, "{}", g.borrow()), /// By default, this is a newline `\n` character.
None => Ok(()), pub line_delimiter: &'a [u8],
/// Delimits the output of each requested capture group (per [line_delimiter].)
/// If there is only 1 (or less) requested groups (that may be *empty/unmatched*, are **not** *invalid*,)
/// then there is no field delimit for that individual output string line.
///
/// By default, this is a tab literal `\t` character.
pub field_delimiter: &'a [u8],
}
impl<'a> Default for GroupConfig<'a>
{
#[inline]
fn default() -> Self
{
Self::new()
}
}
impl<'a> GroupConfig<'a>
{
pub const fn new() -> Self {
Self {
line_delimiter: b"\n",
field_delimiter: b"\t",
}
}
pub const fn has_line_delimiter(&self) -> bool {
! self.line_delimiter.is_empty()
}
pub const fn has_field_delimiter(&self) -> bool {
! self.field_delimiter.is_empty()
}
#[inline]
pub const fn with_field_delimiter<'b>(self, field_delimiter: &'b [u8]) -> GroupConfig<'b>
where 'a: 'b
{
GroupConfig {
field_delimiter,
..self
}
}
#[inline]
pub const fn with_field_delimiter_str<'b>(self, field: &'b str) -> GroupConfig<'b>
where 'a: 'b
{
GroupConfig {
field_delimiter: field.as_bytes(),
..self
}
}
#[inline]
pub const fn with_line_delimiter<'b>(self, line_delimiter: &'b [u8]) -> GroupConfig<'b>
where 'a: 'b
{
GroupConfig {
line_delimiter,
..self
}
}
#[inline]
pub const fn with_line_delimiter_str<'b>(self, line: &'b str) -> GroupConfig<'b>
where 'a: 'b
{
GroupConfig {
line_delimiter: line.as_bytes(),
..self
}
}
}
/// Write the requested capture groups of a single match to `to`.
///
/// For every index yielded by `groups`, the `nth` item of `g` is looked up:
/// - `Some(Some(text))`: the field delimiter is written (nothing before the first printed field), then `text`.
/// - `Some(None)`: only the field delimiter is written (again nothing for the first printed field), leaving an empty field.
/// - `None` (the index is past the end of `g`): a warning is printed to `stderr` and the index is skipped; it does not count as a printed field.
///
/// Once at least one field has been printed, `how.line_delimiter` is written if it is non-empty.
///
/// # Parameters
/// - `to`: The output writer.
/// - `g`: The groups of the match; iterated afresh for every requested index.
/// - `groups`: The requested group indices, in output order.
/// - `how`: The delimiters to use; anything that borrows as a `GroupConfig`.
///
/// # Errors
/// Returns the first I/O error reported by `to`.
#[inline]
fn print_groups<'c, 'a, S: ?Sized, G, T: 'a, I>(to: &mut S, g: G, groups: I, how: impl std::borrow::Borrow<GroupConfig<'c>>) -> std::io::Result<()>
where S: std::io::Write + 'c, // NOTE: This lifetime bound is not yet used, as it is just `Write`, but if we change this to a context wrapper, then we can copy the `how`'s `'c` references into the context object without direct write/format/cloning.
G: IntoIterator<Item = &'a Option<T>> + Clone + Copy, // NOTE: Copy bound to ensure we're not accidentally doing deep clones of `g`.
//G: std::ops::Index<usize>, G::Output: std::borrow::Borrow<Option<T>>,
T: std::borrow::Borrow<str>,
I: IntoIterator<Item: std::borrow::Borrow<usize>/*, IntoIter: ExactSizeIterator*/>,
{
use std::{
borrow::Borrow,
io::Write,
};
let how = how.borrow(); //std::borrow::ToOwned::clone_into(&self, target);
// `first` stays `true` until a field has actually been printed.
let mut first = true;
for group in groups.into_iter() {
let group = group.borrow();
// // Moved to into match group (skipping invalid groups.)
// if !first {
// write!(to, "\t")?;
// }
let print_delim = move |to: &mut S| to.write_all(first.then_some(&[][..]).unwrap_or(&how.field_delimiter[..]).as_ref()); // Writes nothing before the first printed field, otherwise writes `how.field_delimiter`.
match g.into_iter().nth(*group) {
Some(None) => print_delim(to),
Some(Some(g)) => {
print_delim(to)?;
write!(to, "{}", g.borrow())
},
//TODO: What should be the behaviour of a non-existent group index here? (NOTE: This now corresponds to the previous `g.len() > group` check in caller.) // (NOTE: The original behaviour is to just ignore groups that are out of range entirely (i.e. no printing, no delimit char, no error,) maybe treat non-existent groups as non-matched groups and *just* print the delim char?)
// (NOTE: Moved out of branch, see above ^) // None if !first => write!(to, "\t"),
// XXX: Should this do what it does now...? Or should it `break` to prevent the checking for more groups...? Print a warning maybe...?
None => {
eprintln!("Warning: Invalid group index {}!", group);
continue; // Do not set `first = false` if it was an invalid index.
//Ok(())
},
}?;
first = false;
}
// If `first == true`, no groups were printed, so we do not print the new-line.
if !first && how.has_line_delimiter() {
to.write_all(how.line_delimiter.as_ref())
} else {
Ok(())
} }
} }
@ -36,41 +153,98 @@ fn main() -> eyre::Result<()>
initialise().wrap_err("Fatal: Failed to install panic handle")?; initialise().wrap_err("Fatal: Failed to install panic handle")?;
//let cli = args::parse_cli();//.wrap_err("Error parsing command-line arguments")?; //let cli = args::parse_cli();//.wrap_err("Error parsing command-line arguments")?;
//
//eprintln!("{:#?}", cli); //eprintln!("{:#?}", cli);
// return Ok(()); // return Ok(());
let args: Vec<String> = std::env::args().collect(); let args: re::FrozenVec<re::FrozenString> = std::env::args().map(String::into_boxed_str).collect();
if args.len() < 4 { if args.len() < 4 {
println!("Usage: {} <str> <regex> <group>", args[0]); use owo_colors::OwoColorize;
use owo_colors::Stream;
macro_rules! colour {
(in $name:ident: $fmt:expr => $col:ident) => {
$fmt.if_supports_color(Stream::$name, |text| text.$col())
};
($fmt:expr => $col:ident) => {
colour!(in Stdout: $fmt => $col)
}
}
println!("rematch v{}: Regular-expression group matcher", env!("CARGO_PKG_VERSION"));
println!("");
println!("Usage: {} <str> <regex> <group>...", args[0]);
println!("Pass `-' as `<str>' to read lines from stdin"); println!("Pass `-' as `<str>' to read lines from stdin");
std::process::exit(1); println!("");
println!("Enabled Features:");
if cfg!(feature="perl") {
println!("{}\t\t\tEnable PCRE2 (extended) regular-expressions.\n\t\t\tNote that PCRE2 regex engine matches on *bytes*, not *characters*; meaning if a match cuts a vlid UTF8 codepoint into an invalid one, the output will replace the invalid characters with U+FFFD REPLACEMENT CHARACTER.", colour!(disjoint!["+", "perl"] => bright_red));
} else {
println!("{}\t\t\tPCRE2 (extended) features are disabled; a faster but less featureful regular expression engine (that matches on UTF8 strings instead of raw bytes) is used instead.", colour!(disjoint!["-", "perl"] => blue));
}
if cfg!(feature="unstable") {
println!("{}\t\tUnstable optimisations evailable & enabled for build.", colour!(disjoint!["+", "unstable"] => red));
} else {
println!("{}\t\tUnstable optimisations disabled / not available for build.", colour!(disjoint!["-", "unstable"] => bright_blue));
}
std::process::exit(1)
} else { } else {
let re = re::Regex::compile(&args[2])?; let re = re::Regex::compile(&args[2])?;
let text = &args[1]; let text = &args[1];
let group: usize = args[3].parse().expect("Invalid group number.");
let print_cfg = GroupConfig::new();
let groups = {
let groups = &args[3..];
if groups.len() < 1 {
eprintln!("Warning: No capture groups requested.");
// NOTE: Unexpected branch...
return Ok(());
}
// Parse each group index into `groups`.
groups.iter().enumerate()
.map(|(i, x)| x.parse()
.with_section(|| format!("{:?}", groups).header("Groups specified were"))
.with_section(|| x.clone().header("Specified capture group index was"))
.with_section(move || i.header("Argument index in provided groups")))
.collect::<Result<Box<[usize]>, _>>()
.wrap_err("Invalid group index specified")?
};
use std::io::Write; use std::io::Write;
let mut stdout = std::io::stdout(); let mut stdout = std::io::stdout();
if text == "-" { // Take the kind of `stdout` used (locked & buffered, or not locked & buffered) ..
let stdout = if &text[..] == "-" {
// Lock the output for the duration of the read lines.
// Buffer the output in program memory to make processing a bit faster (i.e. the segmented 'write' operations in `print_groups()`, which may be called many times here) & not have to wait on write lines for no reason (since we're already waiting on read lines.)
let mut stdout = std::io::BufWriter::new(stdout.lock());
text::stdin_lines(|text| -> eyre::Result<bool> { text::stdin_lines(|text| -> eyre::Result<bool> {
let mut stdout = stdout.lock();
match re.exec(&text)? { match re.exec(&text)? {
Some(g) if g.len() > group => print_group(&mut stdout, g, group)?, //println!("{}", &g[group]), Some(g) /*if g.len() > group*/ => // NOTE: This check branch has now been moved into `print_groups()`
print_groups(&mut stdout, &g, &groups, &print_cfg)?, //println!("{}", &g[group]),
_ => (), _ => (),
} }
Ok(true) Ok(true)
})?; })?;
} else {
// Return the buffer to the main block to be flushed to output (see above & below.)
Some(stdout)
} else {
match re.exec(&text)? { match re.exec(&text)? {
Some(g) if g.len() > group => print_group(&mut stdout, g, group)?,//println!("{}", &g.nth(group).unwrap().map(|x| x.as_ref()).unwrap_or("")), Some(g) /*if g.len() > group*/ => print_groups(&mut stdout, &g[..], &groups, print_cfg)?,//println!("{}", &g.nth(group).unwrap().map(|x| x.as_ref()).unwrap_or("")),
_ => (), _ => (),
} }
}
stdout.flush().unwrap(); // As there is no internal buffer used, there is no reason to pass it to be flushed by the program when finished like the `-` case above.
None
}.ok_or_else(move || stdout); // NOTE ^: Instead, we have it flush the non-memory-buffered output handle before exiting the program.
// and .. Ensure the stream (and buffer, if used) is flushed (then dropped.)
unwrap_either!(mut stdout => stdout.flush()).unwrap();
} }
Ok(()) Ok(())
} }

@ -11,7 +11,7 @@ pub type FrozenString = Box<str>;
// TODO: to return some kind of `Either<&'s str, impl bytes::Buf + 's>` type, which would use `str` on non-PCRE, but opaque `bytes::Buf` on PCRE?) // TODO: to return some kind of `Either<&'s str, impl bytes::Buf + 's>` type, which would use `str` on non-PCRE, but opaque `bytes::Buf` on PCRE?)
pub type FrozenBytes = FrozenVec<u8>; pub type FrozenBytes = FrozenVec<u8>;
pub type Groups<String = FrozenString> = FrozenVec<Option<String>>; pub type Groups<String = FrozenString> = FrozenVec<Option<String>>; // TODO: See `exec()` comment below about named groups and switching to `BTreeMap<{enum : CaptureGroupIdent::Index, Name}, Option<Cow<'s str>>>`
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct Regex pub struct Regex
@ -55,7 +55,7 @@ impl Regex {
return Ok(Self{internal: regex::Regex::new(string.as_ref())?}); return Ok(Self{internal: regex::Regex::new(string.as_ref())?});
} }
pub fn exec<'s>(&self, string: &'s str) -> Result<Option<Groups<Cow<'s, str>>>, Error> pub fn exec<'s>(&self, string: &'s str) -> Result<Option<Groups<Cow<'s, str>>>, Error> //TODO: Can we also add named groups with a `BTreeMap<{CG::Index(usize) | CG:Name(String)}, Option<Groups<Cow<'s, str>>>>` (XXX: And maybe also be able to simplefy `V` to just `Option<&'s str>` / `Option<Cow<'s, str>>`, since the group index is already encoded in `K` (group index / group name mapped to potential match of associated group).)
{ {
#[cfg(feature = "perl")] #[cfg(feature = "perl")]
return { return {

Loading…
Cancel
Save