part: started `partition_once_with()`: perform one nearest-needle-to-half partition on a buffer. The behaviour of the partition is controlled by its `method` parameter, of which there are currently two: `SearchSeq` - search forward then backward, in sequence. `SearchPar` - search forward and backward in parallel. (NOTE: `SearchPar` is heavy and not well optimised; it will *always* spawn at least one thread.)
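
For orientation, a minimal call sketch (hedged; the function body is still `todo!()` in this commit):

    // Split `buffer` at the newline nearest its midpoint, searching sequentially:
    let (front, back) = partition_once_with(&buffer[..], b'\n', SearchSeq);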

refactor-search-capext
Avril 9 months ago
parent ff898fc9b3
commit f9068beca1
Signed by: flanchan
GPG Key ID: 284488987C31F630

Cargo.lock generated

@@ -184,6 +184,28 @@ version = "0.1.108"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d68bc55329711cd719c2687bb147bc06211b0521f97ef398280108ccb23227e9"
[[package]]
name = "crossbeam"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8"
dependencies = [
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-epoch",
"crossbeam-queue",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab3db02a9c5b5121e1e42fbdb1aeb65f5e02624cc58c43f2884c6ccac0b82f95"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.5"
@@ -360,6 +382,9 @@ name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
dependencies = [
"spin",
]
[[package]]
name = "libc"
@@ -553,8 +578,9 @@ dependencies = [
"bytes",
"clap",
"color-eyre",
"crossbeam-queue",
"crossbeam",
"futures",
"lazy_static",
"mapped-file",
"memchr",
"num_cpus",
@@ -639,6 +665,12 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "spin"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "stable_deref_trait"
version = "1.2.0"

@@ -7,14 +7,14 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[features]
default = ["mapped-file"]
default = ["mapped-file", "threads"]
unstable = ["parking_lot?/nightly", "memchr/compiler_builtins", "crossbeam-queue?/nightly", "futures?/unstable", "color-eyre/track-caller"]
unstable = ["parking_lot?/nightly", "memchr/compiler_builtins", "crossbeam?/nightly", "futures?/unstable", "color-eyre/track-caller"]
# Enables parallelising the operation where possible.
#
# CLI config options can control this, but in the default auto setting the process will decide whether parallelisation is worth using based on the dimensions of the provided input and the system.
threads = ["dep:rayon", "dep:parking_lot", "dep:crossbeam-queue", "dep:num_cpus", "threads-async"]
threads = ["dep:rayon", "dep:parking_lot", "dep:crossbeam", "dep:num_cpus", "threads-async"]
# Attempt `mmap()`ping the input instead of reading it.
# If the output can be mapped, it may also be.
@@ -22,9 +22,9 @@ threads = ["dep:rayon", "dep:parking_lot", "dep:crossbeam-queue", "dep:num_cpus"
# Useable on unix-family systems. (TODO: Make this dependency target-dependent for unix instead of a manual feature addition like this.)
mapped-file = ["dep:mapped-file"]
# We are using `tokio_uring` & `tokio-stream` to allow parallel queued writes. (TODO: If we end up using (or needing) this feature for partitioning (e.g. we need a controlled IO thread that takes the data gathered by the parallelised partition system or we need a task pool (NOTE: the task pool will execute on tokio_uring's thread only; rayon's thread pool is being used for auto parallelisation currently) to spawn reverse-reader tasks per partition.) then: Remove the use of crossbeam (XXX: and maybe rayon?), and use Tokio runtime for manual parallelised partitioning/line-reversing.)
# TODO: Make this a non-dependent feature of `threads`: use `tokio_uring` tasks for I/O, and pull in `tokio` **instead of** rayon for operations. For now, we'll see if we can get away with single-threaded async for `tokio_uring` usage and `rayon`'s auto parallelisation for the chunk partitioning; if not, change this to not be internal.
#
# NOTE: **internal feature**
# We are using `tokio_uring` & `tokio-stream` to allow parallel queued writes. (TODO: If we end up using (or needing) this feature for partitioning (e.g. we need a controlled IO thread that takes the data gathered by the parallelised partition system or we need a task pool (NOTE: the task pool will execute on tokio_uring's thread only; rayon's thread pool is being used for auto parallelisation currently) to spawn reverse-reader tasks per partition.) then: Remove the use of crossbeam (XXX: and maybe rayon?), and use Tokio runtime for manual parallelised partitioning/line-reversing.)
threads-async = ["threads", "dep:futures", "dep:tokio-uring", "dep:tokio-stream"]
#rayon = ["dep:rayon"]
@@ -72,8 +72,10 @@ strip=false
bytes = "1.6.0"
clap = { version = "4.5.4", features = ["derive", "string", "unicode", "wrap_help"] }
color-eyre = { version = "0.6.3", default-features = false }
crossbeam-queue = { version = "0.3.11", optional = true }
crossbeam = { version = "0.8.4", optional = true, features = ["crossbeam-queue", "crossbeam-channel", "crossbeam-epoch"] }
#crossbeam-queue = { version = "0.3.11", optional = true }
futures = { version = "0.3.30", optional = true, default-features = false, features = ["alloc", "std", "async-await"] }
lazy_static = { version = "1.4.0", features = ["spin"] }
mapped-file = { version = "0.0.8", features = ["file"], optional = true }
memchr = "2.7.2"
num_cpus = { version = "1.16.0", optional = true }

@@ -1,3 +1,268 @@
//! Partitioning even areas by delimiter byte.
use super::*;
use std::{
num::NonZeroUsize,
};
/// Midpoint searcher (forwards & backwards)
trait MidpointFBSearcher<T=u8>
{
fn search_forward<'a>(&self, haystack: &'a [T], needle: T) -> Option<&'a T>;
fn search_backward<'a>(&self, haystack: &'a [T], needle: T) -> Option<&'a T>;
fn search_combined<'a>(&self, haystack: &'a [T], begin: usize, needle: T) -> Option<&'a T>;
}
/// Search the pivot for the needle sequentially.
///
/// The order of operations is: `search_forward()`, then `search_backward()` as a fallback if no match is found ahead of the pivot.
#[derive(Debug)]
struct SearchSeq;
impl MidpointFBSearcher<u8> for SearchSeq
{
#[inline(always)]
fn search_forward<'a>(&self, haystack: &'a [u8], needle: u8) -> Option<&'a u8> {
memchr::memchr(needle, haystack).map(move |i| &haystack[i])
}
#[inline(always)]
fn search_backward<'a>(&self, haystack: &'a [u8], needle: u8) -> Option<&'a u8> {
memchr::memrchr(needle, haystack).map(move |i| &haystack[i])
}
#[inline]
fn search_combined<'a>(&self, haystack: &'a [u8], begin: usize, needle: u8) -> Option<&'a u8> {
match haystack.split_at(begin) {
([], []) => None,
([], x) => self.search_forward(x, needle),
(x, []) => self.search_backward(x, needle),
(x, y) => {
    // Prefer a match ahead of the pivot; otherwise fall back to searching behind it.
    self.search_forward(y, needle)
        .or_else(|| self.search_backward(x, needle))
}
}
}
}
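
// A minimal usage sketch (added for illustration, not part of this commit): driving
// `SearchSeq::search_combined()` by hand. `begin` is the split point; the half ahead of it
// is searched first, then the half behind it.
#[cfg(test)]
mod search_seq_sketch {
    use super::*;

    #[test]
    fn finds_needle_nearest_the_midpoint() {
        let hay = b"ab\ncd\nef";
        let mid = hay.len() / 2; // == 4; newlines sit at indices 2 and 5
        let hit = SearchSeq.search_combined(&hay[..], mid, b'\n')
            .expect("buffer contains a newline");
        // The result is a reference into `hay`; recover its index by pointer offset.
        let idx = (hit as *const u8 as usize) - (hay.as_ptr() as usize);
        assert_eq!(idx, 5); // the newline just ahead of the midpoint wins
    }
}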
#[cfg(feature="async")]
const _TODO_FUTURES_JOIN2_ASYNC_SEARCH: () = {
#[derive(Debug, Clone)]
struct SearchAsync<F>
{
spawn_task: F,
result: oneshot::Receiver<usize>,
}
#[cfg(feature="threads-async")]
impl<F, Fu> MidpointFBSearcher<u8> for SearchAsync<F>
where F: Fn() -> Fu,
Fu: futures::Future + Send + Sync + 'static
{
}
};
/// Search in parallel.
///
/// # Warning
/// This search operation is heavy. It **always** spawns its own (up to) two threads when `search_combined()` is invoked. This may not be ideal...
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct SearchPar
{
cap_start: usize,
}
/// For f/b pivot-searching, the max area for each operation to attempt.
const DEFAULT_PIVOT_MAX_SEARCH_AREA: usize = 1024;
/// For f/b pivot-searching, the max *possible* area for each operation to attempt when it grows in capacity
const DEFAULT_PIVOT_MAX_POSSIBLE_SEARCH_AREA: usize = (1024 * 1024 * 1024) * 2; // 2GB
/// The number of loaded pages of memory above which non-page-bound operations assume they're working on HP (huge-page) mapped data.
const DEFAULT_MEM_DETECT_HUGE_SIZE_PAGES: usize = 4;
lazy_static::lazy_static! {
/// Real system page size.
static ref REAL_PAGE_SIZE: std::ffi::c_int = {
use std::ffi::c_int;
extern "C" {
fn getpagesize() -> c_int;
}
unsafe {
getpagesize()
}
};
}
/// Get a recommended bound for the pivot search area (if there is one.)
///
/// # Returns
/// The recommended max bound for a pivot search area, or `None` for unbounded search.
///
/// # Page kinds
/// If the operation is using huge-page mapped memory, set `use_hp` to true.
#[inline]
fn get_max_pivot_search_area(_use_hp: bool) -> Option<NonZeroUsize>
{
use std::ffi::c_int;
lazy_static::lazy_static! {
static ref PAGE_SIZE: usize = {
match *REAL_PAGE_SIZE {
c_int::MIN..=0 => DEFAULT_PIVOT_MAX_SEARCH_AREA,
// Very large (hp)
very_large if very_large as usize > DEFAULT_PIVOT_MAX_POSSIBLE_SEARCH_AREA => 0,
// Large (limit to upper bound of non-hp)
large if large as usize >= DEFAULT_PIVOT_MAX_SEARCH_AREA => std::cmp::min(large as usize, DEFAULT_PIVOT_MAX_POSSIBLE_SEARCH_AREA),
// Smaller than default bound
small => small as usize
}
};
}
//XXX: Should we return a different value if `use_hp` is enabled? ("using hugepage")
NonZeroUsize::new(*PAGE_SIZE)
}
impl SearchPar {
#[inline(always)]
pub const fn new() -> Self
{
Self::with_capacity(DEFAULT_PIVOT_MAX_SEARCH_AREA)
}
#[inline]
pub const fn with_capacity(cap_start: usize) -> Self
{
Self { cap_start }
}
#[inline]
pub const fn cap(&self) -> usize
{
self.cap_start
}
#[inline(always)]
pub unsafe fn cap_mut(&mut self) -> &mut usize
{
&mut self.cap_start
}
}
impl Default for SearchPar
{
#[inline]
fn default() -> Self
{
Self {
cap_start: match *REAL_PAGE_SIZE {
std::ffi::c_int::MIN..=0 => DEFAULT_PIVOT_MAX_SEARCH_AREA,
above_zero => above_zero as usize,
},
}
}
}
#[cfg(feature="threads")]
impl MidpointFBSearcher<u8> for SearchPar
{
#[inline(always)]
fn search_forward<'a>(&self, haystack: &'a [u8], needle: u8) -> Option<&'a u8> {
memchr::memchr(needle, haystack).map(move |i| &haystack[i])
}
#[inline(always)]
fn search_backward<'a>(&self, haystack: &'a [u8], needle: u8) -> Option<&'a u8> {
memchr::memrchr(needle, haystack).map(move |i| &haystack[i])
}
fn search_combined<'a>(&self, haystack: &'a [u8], begin: usize, needle: u8) -> Option<&'a u8> {
let complete = crossbeam::atomic::AtomicCell::new(false);
std::thread::scope(|s| {
//let mut complete_val = UnsafeCell::new(false);
//let complete: parking_lot::Once = parking_lot::Once::new();
// let value_cont = (parking_lot::Condvar::new(), parking_lot::FairMutex::new(None::<&'a u8>));
let (mut hb, mut hf) = haystack.split_at(begin);
let max_cap = match get_max_pivot_search_area(hf.len() > (DEFAULT_PIVOT_MAX_SEARCH_AREA * DEFAULT_MEM_DETECT_HUGE_SIZE_PAGES)){ // Assume huge-page memory if len is larger than 4 pages.
// On debug builds, cap search area to one system page only.
_ignore if cfg!(debug_assertions) && (*REAL_PAGE_SIZE) > 0 =>
// SAFETY: We have checked if `*REAL_PAGE_SIZE` is non-zero above.
Some(unsafe { NonZeroUsize::new_unchecked(*REAL_PAGE_SIZE as usize) }),
// Otherwise, use the detected value.
cap => cap,
};
let forward = if hf.len() > 0 {
let cap = self.cap_start;
let sf = &self;
let complete = &complete;
Some(s.spawn(move || -> Option<_> {
let mut cap = std::cmp::min(cap, hf.len());
while !hf.is_empty() {
    // Clamp the window to the remaining slice (and keep it at least one byte so we always make progress.)
    let window = std::cmp::min(cap.max(1), hf.len());
    if let Some(x) = sf.search_forward(&hf[..window], needle) {
        complete.store(true);
        //complete.call_once(|| complete_val = true);
        return Some(x);
    } else if complete.load() {
        break;
    }
    // Cut the searched `window` bytes from the start.
    hf = &hf[window..];
    // Grow `cap` geometrically (not passing `max_cap` if there is one set.)
    cap = max_cap.map(|max| std::cmp::min(max.get(), cap << 1)).unwrap_or_else(|| cap << 1);
}
None::<&'a u8>
}))
} else {
None
};
let backward = if hb.len() > 0 {
let cap = self.cap_start;
let sf = &self;
let complete = &complete;
Some(s.spawn(move || -> Option<_> {
let mut cap = std::cmp::min(cap, hb.len());
while !hb.is_empty() {
    // Clamp the window to the remaining slice (and keep it at least one byte so we always make progress.)
    let window = std::cmp::min(cap.max(1), hb.len());
    if let Some(x) = sf.search_backward(&hb[(hb.len() - window)..], needle) {
        complete.store(true);
        //complete.call_once(|| complete_val = true);
        return Some(x);
    } else if complete.load() {
        break;
    }
    // Cut the searched `window` bytes from the end.
    hb = &hb[..(hb.len() - window)];
    // Grow `cap` geometrically (not passing `max_cap` if there is one set.)
    cap = max_cap.map(|max| std::cmp::min(max.get(), cap << 1)).unwrap_or_else(|| cap << 1);
}
None::<&'a u8>
}))
} else {
None
};
match (forward, backward) {
(None, None) => None,
(None, Some(back)) => back.join().unwrap_or(None),
(Some(forward), None) => forward.join().unwrap_or(None),
(Some(forward), Some(backward)) => forward.join().unwrap_or(None)
    .or_else(move || backward.join().unwrap_or(None)),
}
})
}
}
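
// Usage sketch (illustrative, not part of this commit): `SearchPar::search_combined()` spawns
// up to two scoped threads per call, so it is only worthwhile for large buffers.
#[cfg(all(test, feature = "threads"))]
mod search_par_sketch {
    use super::*;

    #[test]
    fn finds_a_needle_ahead_of_the_midpoint() {
        let mut hay = vec![b'x'; 8192];
        hay[6000] = b'\n'; // the only needle, in the forward half
        let hit = SearchPar::new().search_combined(&hay[..], hay.len() / 2, b'\n')
            .expect("needle exists");
        let idx = (hit as *const u8 as usize) - (hay.as_ptr() as usize);
        assert_eq!(idx, 6000);
    }
}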
fn partition_once_with<'a, S>(buffer: &'a [u8], needle: u8, method: S) -> (&'a [u8], &'a [u8])
where S: MidpointFBSearcher<u8>
{
todo!("Perform one single buffer partition (buffer/2.at_nearest_mpr(needle)) (using `method.search_combined()`) and return its parts. If we can fast-path skip the `search_combined()` then that is okay (e.g. if the buffer/2 is small enough that we should just use `SearchSeq`, we can use `SearchSeq` instead of `S`, and so on.) (XXX: Also see below about thread spawning on parallelised partitions and re-using thread pools (we may be able to do this manually with crossbeam, or we might just have to embrace using `spawn_blocking()` async/a default tokio multithreaded-runtime) since parallel partitions need at least two threads to search both directions at a time.)")
}
//XXX: Should we add a `SearchAsync`? Or an impl for SearchPar that uses already-spawned threads? TODO: It would be best if we could re-use extant threads instead of spawning two on each partition...
//Parallel (feature="threads") byte area partition-on-nearest-newline-to-halve impl, and non-parallel (default) impl. These impls can differ in their desired depths of partitioning (using parallel impls should balance num of partitions to num of logical cpus & input(/desired chunk) size.)
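
// A possible shape for `partition_once_with()` (illustrative sketch under a hypothetical name,
// not part of this commit; the real body above is still `todo!()` and is expected to also
// fast-path small halves to `SearchSeq` and reuse threads, per the notes above.)
#[cfg(test)]
#[allow(dead_code)]
fn partition_once_with_sketch<'a, S>(buffer: &'a [u8], needle: u8, method: S) -> (&'a [u8], &'a [u8])
where S: MidpointFBSearcher<u8>
{
    let mid = buffer.len() / 2;
    match method.search_combined(buffer, mid, needle) {
        Some(hit) => {
            // Split just *after* the found needle so the delimiter stays with the front part.
            let idx = (hit as *const u8 as usize) - (buffer.as_ptr() as usize);
            buffer.split_at(idx + 1)
        },
        // No needle anywhere: the whole buffer is a single partition.
        None => (buffer, &buffer[buffer.len()..]),
    }
}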
