Compare commits

...

6 Commits

Author SHA1 Message Date
Avril 8bb4062126
Added extra info in help message (about version & enabled features.) 5 hours ago
Avril 3bc8e9d214
Imported from cli: Added more efficient flow of output `stdout` object(s) through main branches. Internal buffer is flushed into lock before it is dropped in the `-` case, & only the non-locked handle is flushed in the single-string case. 5 hours ago
Avril 3d464fe72e
Version bump 0.3.0: Allow multiple groups specified, delimited per line by TAB-literal. Print warning when a non-existent group is requested instead of silently ignoring it. 1 day ago
Avril 007ba8781c
Version bump 0.2.0: Refactored use to PCRE2, added `unstable` flag, update deps & edition to 2024. 5 days ago
Avril 6a74f54a26
Imported cli-refactor's `args.rs` (for future CLI refactor) with PCRE2 dependency update. 5 days ago
Avril 100b3a9afd
Switched PCRE backend to **much** better PCRE2 library. (Allows for concurrency directly, etc.) 5 days ago

@ -1,20 +1,29 @@
[package]
name = "rematch"
version = "0.1.0"
version = "0.3.1"
authors = ["Avril <flanchan@cumallover.me>"]
edition = "2018"
edition = "2024"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[profile.release]
opt-level = 3
lto = "fat"
lto = true
codegen-units = 1
panic = "unwind"
strip = true
[profile.symbols]
inherits = "release"
strip = false
[features]
perl = ["pcre"]
default= ["perl", "unstable"]
perl = ["dep:pcre2"]
unstable = ["regex/unstable"]
[dependencies]
regex = "1"
pcre = { version = "0.2.3", optional = true }
pcre2 = { version = "0.2.9", optional = true }
regex = { version = "1.11.1", features = ["use_std"] }
color-eyre = { version = "0.6.3", default-features = false, features = ["track-caller"] }

@ -3,34 +3,184 @@
mod re;
mod text;
fn main() -> Result<(), Box<dyn std::error::Error>>
/// Run an expression on a named value of type `Result<T, U>`, whichever variant it holds.
///
/// `T` and `U` must have *the same API surface* for everything the expression does with the value,
/// because the expression is expanded twice: once in the `Ok` arm and once in the `Err` arm.
/// Both expansions must also evaluate to the same type.
///
/// # Example
/// If there is a value `let value: Result<T, U>`, where `T: Write` & `U: Write`;
/// the expression `value.flush()` is valid for both `T` and `U`.
/// Therefore, it can be simplified to be called as so: `unwrap_either!(mut value => value.flush())`.
///
/// # Reference capture vs. `move` capture.
/// Note that by default, the identified value is **moved** *into* the expression.
/// The kind of capture can be controlled by prefixing the ident with `ref`, `mut`, or `ref mut`.
///
/// Identifier capture table:
/// - **none** ~default~ - Capture by move, value is immutable in expression.
/// - `mut` - Capture by move, value is mutable in expression.
/// - `ref` - Capture by ref, value is immutable (`&value`) in expression.
/// - `ref mut` - Capture by mutable ref, value is mutable (`&mut value`) in expression. (__NOTE__: `value` must be defined as mutable to take a mutable reference of it.)
///
/// Essentially the same rules as any `match` branch pattern.
macro_rules! unwrap_either {
    // Default: capture by move, immutable binding (as documented in the table above).
    ($res:ident => $($rest:tt)+) => {
        match $res {
            Ok($res) => $($rest)+,
            Err($res) => $($rest)+,
        }
    };
    // `ref mut`: borrow the payload mutably; the matched value itself stays usable afterwards.
    (ref mut $res:ident => $($rest:tt)+) => {
        match $res {
            Ok(ref mut $res) => $($rest)+,
            Err(ref mut $res) => $($rest)+,
        }
    };
    // `ref`: borrow the payload immutably.
    (ref $res:ident => $($rest:tt)+) => {
        match $res {
            Ok(ref $res) => $($rest)+,
            Err(ref $res) => $($rest)+,
        }
    };
    // `mut`: capture by move into a mutable binding.
    (mut $res:ident => $($rest:tt)+) => {
        match $res {
            Ok(mut $res) => $($rest)+,
            Err(mut $res) => $($rest)+,
        }
    };
}
use color_eyre::{
eyre::{
self,
eyre,
WrapErr as _,
},
SectionExt as _, Help as _,
};
/// Perform one-time process setup by calling `color_eyre::install()`.
///
/// # Errors
/// Forwards the error returned by `color_eyre::install()` unchanged.
fn initialise() -> eyre::Result<()>
{
    // The install call already yields the `eyre::Result<()>` we return, so hand it straight back.
    color_eyre::install()
}
/// Write the requested capture `groups` of a single match to `to` as one TAB-delimited line.
///
/// # Parameters
/// - `to`: the sink the line is written to.
/// - `g`: the captures of one match, iterated from index 0. It is re-iterated once per requested
///   index, hence the `Copy` bound. A `None` entry is a group with no captured text.
/// - `groups`: the capture-group indices to print, in output order.
///
/// # Output
/// - A group with captured text is written verbatim.
/// - A `None` group is written as an empty field (its delimiter is still emitted).
/// - An index that is out of range for `g` produces a warning on stderr and is skipped entirely:
///   no field and no delimiter are written for it.
/// - The terminating `\n` is written only if at least one field was written.
///
/// # Errors
/// Returns the first I/O error reported by `to`.
#[inline]
fn print_groups<'a, S: ?Sized, G, T: 'a, I>(to: &mut S, g: G, groups: I) -> std::io::Result<()>
where S: std::io::Write,
      G: IntoIterator<Item = &'a Option<T>> + Clone + Copy, // NOTE: Copy bound to ensure we're not accidentally doing deep clones of `g`.
      T: std::borrow::Borrow<str>,
      I: IntoIterator<Item: std::borrow::Borrow<usize>>,
{
    use std::borrow::Borrow;

    // Stays `true` until a field has actually been written; skipped (invalid) indices do not count.
    let mut first = true;
    for group in groups {
        let group: &usize = group.borrow();
        // Every field after the first one is preceded by a TAB.
        let delim = if first { "" } else { "\t" };

        match g.into_iter().nth(*group) {
            Some(Some(text)) => write!(to, "{}{}", delim, text.borrow())?,
            Some(None) => write!(to, "{}", delim)?,
            // TODO: Decide on the final behaviour for a non-existent group index: keep skipping it
            // with a warning (current behaviour), `break`, or treat it as a non-matched group and
            // print only the delimiter?
            None => {
                eprintln!("Warning: Invalid group index {}!", group);
                continue; // Do not set `first = false` if it was an invalid index.
            },
        }
        first = false;
    }

    // If `first == true`, no groups were printed, so we do not print the new-line.
    if first {
        Ok(())
    } else {
        to.write_all(b"\n")
    }
}
/// Program entry point.
///
/// Runs `initialise()`, collects the process arguments, and then either prints the help text and
/// exits with status 1 (fewer than 4 arguments), or compiles `<regex>` (`args[2]`) and prints the
/// requested capture groups for `<str>` (`args[1]`), reading lines from stdin when `<str>` is `-`.
///
/// NOTE(review): this listing appears to interleave lines from before and after the change (two
/// `Usage:` lines, a single `group` next to the `groups` slice, two `stdin_lines` closure headers,
/// `println!`-based arms next to `print_groups` arms) — confirm which lines are live before editing.
fn main() -> eyre::Result<()>
{
initialise().wrap_err("Fatal: Failed to install panic handle")?;
//let cli = args::parse_cli();//.wrap_err("Error parsing command-line arguments")?;
//eprintln!("{:#?}", cli);
// return Ok(());
// `args[0]` is used as the program name in the usage text; `args[1]` is the text, `args[2]` the regex, `args[3..]` the groups.
let args: re::FrozenVec<re::FrozenString> = std::env::args().map(String::into_boxed_str).collect();
// Too few arguments: print version, usage and the compiled-in feature set, then exit with status 1.
if args.len() < 4 {
println!("Usage: {} <str> <regex> <group>", args[0]);
println!("rematch v{}: Regular-expression group matcher", env!("CARGO_PKG_VERSION"));
println!("");
println!("Usage: {} <str> <regex> <group>...", args[0]);
println!("Pass `-' as `<str>' to read lines from stdin");
std::process::exit(1);
println!("");
println!("Enabled Features:");
// NOTE(review): "vlid" (below) and "evailable" (in the `unstable` message) look like typos for "valid" / "available".
if cfg!(feature="perl") {
println!("+perl\t\t\tEnable PCRE2 (extended) regular-expressions.\n\t\t\tNote that PCRE2 regex engine matches on *bytes*, not *characters*; meaning if a match cuts a vlid UTF8 codepoint into an invalid one, the output will replace the invalid characters with U+FFFD REPLACEMENT CHARACTER.");
} else {
println!("-perl\t\tPCRE2 (extended) features are disabled; a faster but less featureful regular expression engine (that matches on UTF8 strings instead of raw bytes) is used instead.");
}
if cfg!(feature="unstable") {
println!("+unstable\t\tUnstable optimisations evailable & enabled for build.");
} else {
println!("-unstable\t\tUnstable optimisations disabled / not available for build.");
}
std::process::exit(1)
} else {
let re = re::Regex::compile(&args[2])?;
let text = &args[1];
let group: usize = args[3].parse().expect("Invalid group number.");
if text == "-" {
text::stdin_lines(|text| -> Result<bool, re::Error> {
// Every argument from index 3 onwards is a requested capture-group index.
let groups = &args[3..];
if groups.len() < 1 {
eprintln!("Warning: No capture groups requested.");
// NOTE: Unexpected branch...
return Ok(());
}
// Parse each requested index as `usize`. On failure the report carries the full group list,
// the offending argument, and its position within the group arguments.
let groups = groups.iter().enumerate()
.map(|(i, x)| x.parse()
.with_section(|| format!("{:?}", groups).header("Groups specified were"))
.with_section(|| x.clone().header("Specified capture group index was"))
.with_section(move || i.header("Argument index in provided groups")))
.collect::<Result<Box<[usize]>, _>>()
.wrap_err("Invalid group index specified")?;
//TODO: XXX: How to handle multiple groups in `stdin_lines()` case?
//let group = groups[0]; //args[3].parse().expect("Invalid group number.");
use std::io::Write;
let mut stdout = std::io::stdout();
// `-` mode: lock stdout once and write through a `BufWriter` for the whole run.
let stdout = if &text[..] == "-" {
let mut stdout = std::io::BufWriter::new(stdout.lock());
// `text::stdin_lines` is defined elsewhere; presumably it invokes the callback once per input line
// and the returned `bool` controls whether reading continues — TODO confirm.
text::stdin_lines(|text| -> eyre::Result<bool> {
match re.exec(&text)? {
Some(g) if g.len() > group => println!("{}", &g[group]),
Some(g) /*if g.len() > group*/ => // NOTE: This check branch has now been moved into `print_groups()`
print_groups(&mut stdout, &g, &groups)?, //println!("{}", &g[group]),
_ => (),
}
Ok(true)
})?;
Some(stdout)
} else {
// Single-string mode: match `text` once and write to the unlocked `stdout` handle.
match re.exec(&text)? {
Some(g) if g.len() > group => println!("{}", &g[group]),
Some(g) /*if g.len() > group*/ => print_groups(&mut stdout, &g[..], &groups)?,//println!("{}", &g.nth(group).unwrap().map(|x| x.as_ref()).unwrap_or("")),
_ => (),
}
}
None
// `Some(buffered)` becomes `Ok(buffered)` and `None` becomes `Err(stdout)`, so that
// `unwrap_either!` can flush whichever handle was actually written to.
}.ok_or_else(move || stdout);
// NOTE(review): `.unwrap()` panics if the flush fails; consider propagating the error with `?` instead.
unwrap_either!(mut stdout => stdout.flush()).unwrap();
}
Ok(())
}

@ -1,23 +1,23 @@
#![allow(unused_imports)]
use std::{
error,
fmt::{
self,
Write,
},
sync::{
Arc,
Mutex,
}
fmt,
borrow::Cow,
};
pub type Groups = Vec<String>;
/// A boxed slice: an owned sequence whose length is fixed once it has been built.
pub type FrozenVec<T> = Box<[T]>;
/// A boxed string slice: an owned string whose length is fixed once it has been built.
pub type FrozenString = Box<str>;
/// An owned, fixed-length byte buffer.
// NOTE: Currently unused, as we use `String::from_utf8_lossy()` for PCRE2 `byte`-matching (XXX: Should we change?)
// TODO: Return some kind of `Either<&'s str, impl bytes::Buf + 's>` type (which would use `str` on non-PCRE, but opaque `bytes::Buf` on PCRE)?
pub type FrozenBytes = FrozenVec<u8>;
/// The capture groups of one match, in group-index order; an entry is `None` when the engine
/// returned nothing for that group index.
///
/// NOTE: `String` here is the *name of the type parameter* (defaulting to `FrozenString`), not
/// `std::string::String`.
pub type Groups<String = FrozenString> = FrozenVec<Option<String>>;
/// A compiled regular expression.
///
/// The backing engine is chosen at build time by the `perl` feature: a `pcre2::bytes::Regex` when
/// the feature is enabled, a `regex::Regex` otherwise.
// NOTE(review): two `internal` fields are listed after the `perl` cfg attribute; the
// `Arc<Mutex<pcre::Pcre>>` one looks like the pre-change line of the diff — confirm.
#[derive(Debug, Clone)]
pub struct Regex
{
#[cfg(feature="perl")]
internal: Arc<Mutex<pcre::Pcre>>,
internal: pcre2::bytes::Regex,
#[cfg(not(feature = "perl"))]
internal: regex::Regex,
}
@ -50,24 +50,18 @@ impl Regex {
/// Compile the pattern `string` into a [`Regex`] using the engine selected by the `perl` feature.
///
/// # Errors
/// Returns the engine's build error, converted into `Error` by `?`.
// NOTE(review): two `return` lines follow the `perl` cfg attribute; the `pcre::Pcre::compile` one
// looks like the pre-change line of the diff — confirm.
pub fn compile(string: impl AsRef<str>) -> Result<Self, Error>
{
#[cfg(feature = "perl")]
return Ok(Self{internal: Arc::new(Mutex::new(pcre::Pcre::compile(string.as_ref())?))});
return Ok(Self{internal: pcre2::bytes::RegexBuilder::new().build(string.as_ref())?});
#[cfg(not(feature = "perl"))]
return Ok(Self{internal: regex::Regex::new(string.as_ref())?});
}
pub fn exec(&self, string: impl AsRef<str>) -> Result<Option<Groups>, Error>
pub fn exec<'s>(&self, string: &'s str) -> Result<Option<Groups<Cow<'s, str>>>, Error>
{
#[cfg(feature = "perl")]
return {
let mut re = self.internal.lock().unwrap();
Ok(match re.exec(string.as_ref()) {
Ok(match self.internal.captures(string.as_ref())? {
Some(m) => {
let len = m.string_count();
let mut output = Vec::with_capacity(len);
for i in 0..len {
output.push(m.group(i).to_owned());
}
Some(output)
Some((0..m.len()).map(move |i| m.get(i).map(|x| String::from_utf8_lossy(x.as_bytes()) )).collect())
},
None => None,
})
@ -76,14 +70,7 @@ impl Regex {
return {
Ok(match self.internal.captures(string.as_ref()) {
Some(m) => {
let mut output = Vec::with_capacity(m.len());
for i in 0..m.len() {
let ma = m.get(i).unwrap();
let mut op = String::with_capacity(ma.range().len());
write!(op, "{}", ma.as_str())?;
output.push(op);
}
Some(output)
Some((0..m.len()).map(move |i| m.get(i).map(|x| Cow::Borrowed(x.as_str()) )).collect())
},
None => None,
})
@ -99,7 +86,7 @@ impl From<fmt::Error> for Error
}
}
#[cfg(not(feature = "perl"))]
//#[cfg(not(feature = "perl"))]
impl From<regex::Error> for Error
{
fn from(er: regex::Error) -> Self
@ -109,9 +96,9 @@ impl From<regex::Error> for Error
}
#[cfg(feature = "perl")]
impl From<pcre::CompilationError> for Error
impl From<pcre2::Error> for Error
{
fn from(er: pcre::CompilationError) -> Self
fn from(er: pcre2::Error) -> Self
{
Self::Compile(format!("{}", er))
}

Loading…
Cancel
Save