From 633466b901636f704359a74485d7fc8058f998ad Mon Sep 17 00:00:00 2001 From: Avril Date: Sun, 11 Oct 2020 10:55:44 +0100 Subject: [PATCH] added /sentance/ chunking --- Cargo.lock | 5 +- Cargo.toml | 5 +- markov.toml | 5 +- src/api/sentance.rs | 2 - src/config.rs | 23 ++++ src/ext.rs | 27 ++++ src/feed.rs | 18 ++- src/gen.rs | 2 +- src/main.rs | 41 +++++- src/sanitise/.#filter.rs | 1 + src/sanitise/filter.rs | 260 +++++++++++++++++++++++++++++++++++++++ src/sanitise/mod.rs | 3 + src/sentance.rs | 25 ++++ src/state.rs | 9 +- src/util.rs | 41 ++++++ 15 files changed, 449 insertions(+), 18 deletions(-) delete mode 100644 src/api/sentance.rs create mode 100644 src/ext.rs create mode 120000 src/sanitise/.#filter.rs create mode 100644 src/sanitise/filter.rs create mode 100644 src/sentance.rs create mode 100644 src/util.rs diff --git a/Cargo.lock b/Cargo.lock index cc9c496..c366482 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -650,6 +650,7 @@ dependencies = [ "log", "lzzzz", "markov 1.1.0", + "once_cell", "pin-project", "pretty_env_logger", "serde", @@ -1269,9 +1270,9 @@ checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" [[package]] name = "smallmap" -version = "1.1.3" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2adda73259bbc3ff84f711425ebfb8c90e9dd32a12b05c6528dd49244ea8230f" +checksum = "97ce78b988fb0df3b438d106942c0c2438849ecf40e3418af55044f96d27514d" dependencies = [ "rustc_version", ] diff --git a/Cargo.toml b/Cargo.toml index cd93c39..c51f2df 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "markov" -version = "0.5.4" +version = "0.6" description = "Generate string of text from Markov chain fed by stdin" authors = ["Avril "] edition = "2018" @@ -63,5 +63,6 @@ toml = "0.5.6" async-compression = {version = "0.3.5", features=["tokio-02", "bzip2"], optional=true} pin-project = "0.4.26" libc = "0.2.79" -smallmap = "1.1.3" +smallmap = "1.1.5" lazy_static = "1.4.0" +once_cell = "1.4.1" diff --git a/markov.toml b/markov.toml index dc0a048..e2064b8 100644 --- a/markov.toml +++ b/markov.toml @@ -2,5 +2,8 @@ bindpoint = '127.0.0.1:8001' file = 'chain.dat' max_content_length = 4194304 max_gen_size = 256 -#save_interval_secs = 15 +#save_interval_secs = 2 trust_x_forwarded_for = false + +[filter] +exclude = "<>)([]/" diff --git a/src/api/sentance.rs b/src/api/sentance.rs deleted file mode 100644 index 9f8f411..0000000 --- a/src/api/sentance.rs +++ /dev/null @@ -1,2 +0,0 @@ -//! /sentance/ -use super::*; diff --git a/src/config.rs b/src/config.rs index 2940f49..ebae65e 100644 --- a/src/config.rs +++ b/src/config.rs @@ -25,6 +25,28 @@ pub struct Config pub max_gen_size: usize, pub save_interval_secs: Option, pub trust_x_forwarded_for: bool, + #[serde(default)] + pub filter: FilterConfig, +} + +#[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Hash, Serialize, Deserialize)] +pub struct FilterConfig +{ + exclude: String, + +} + +impl FilterConfig +{ + pub fn get_filter(&self) -> sanitise::filter::Filter + { + let filt: sanitise::filter::Filter = self.exclude.parse().unwrap(); + if !filt.is_empty() + { + warn!("Loaded exclude filter: {:?}", filt.iter().collect::()); + } + filt + } } impl Default for Config @@ -39,6 +61,7 @@ impl Default for Config max_gen_size: 256, save_interval_secs: Some(unsafe{NonZeroU64::new_unchecked(2)}), trust_x_forwarded_for: false, + filter: Default::default(), } } } diff --git a/src/ext.rs b/src/ext.rs new file mode 100644 index 0000000..eb9b5ce --- /dev/null +++ b/src/ext.rs @@ -0,0 +1,27 @@ +//! Extensions +use std::{ + iter, +}; + +pub trait StringJoinExt: Sized +{ + fn join>(self, sep: P) -> String; +} + +impl StringJoinExt for I +where I: IntoIterator, + T: AsRef +{ + fn join>(self, sep: P) -> String + { + let mut string = String::new(); + for (first, s) in iter::successors(Some(true), |_| Some(false)).zip(self.into_iter()) + { + if !first { + string.push_str(sep.as_ref()); + } + string.push_str(s.as_ref()); + } + string + } +} diff --git a/src/feed.rs b/src/feed.rs index 866c1bf..48150c2 100644 --- a/src/feed.rs +++ b/src/feed.rs @@ -44,6 +44,15 @@ pub async fn full(who: &IpAddr, state: State, body: impl Unpin + Stream { + { + let buffer = $buffer; + feed($chain, &buffer, $bounds) + } + } + } + cfg_if!{ if #[cfg(any(not(feature="split-newlines"), feature="always-aggregate"))] { let mut body = body; @@ -60,15 +69,17 @@ pub async fn full(who: &IpAddr, state: State, body: impl Unpin + Stream {:?}", who, buffer); let mut chain = state.chain().write().await; cfg_if! { if #[cfg(feature="split-newlines")] { for buffer in buffer.split('\n').filter(|line| !line.trim().is_empty()) { - feed(&mut chain, buffer, FEED_BOUNDS); + feed!(&mut chain, buffer, FEED_BOUNDS); + } } else { - feed(&mut chain, buffer, FEED_BOUNDS); + feed!(&mut chain, buffer, FEED_BOUNDS); } } @@ -81,12 +92,13 @@ pub async fn full(who: &IpAddr, state: State, body: impl Unpin + Stream {:?}", who, line); } written+=line.len(); diff --git a/src/gen.rs b/src/gen.rs index 6c1e4e5..911f2e0 100644 --- a/src/gen.rs +++ b/src/gen.rs @@ -2,7 +2,7 @@ use super::*; #[derive(Debug)] -pub struct GenBodyError(String); +pub struct GenBodyError(pub String); impl error::Error for GenBodyError{} impl fmt::Display for GenBodyError diff --git a/src/main.rs b/src/main.rs index c516086..46ad529 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,5 @@ #![feature(split_inclusive)] +#![feature(min_const_generics)] #![allow(dead_code)] @@ -60,6 +61,9 @@ macro_rules! status { }; } +mod ext; +use ext::*; +mod util; mod sanitise; mod bytes; mod chunking; @@ -76,6 +80,7 @@ use forwarded_list::XForwardedFor; mod feed; mod gen; +mod sentance; #[tokio::main] async fn main() { @@ -135,8 +140,8 @@ async fn main() { }; let push = warp::put() - .and(chain.clone()) .and(warp::path("put")) + .and(chain.clone()) .and(client_ip.clone()) .and(warp::body::content_length_limit(state.config().max_content_length)) .and(warp::body::stream()) @@ -173,9 +178,6 @@ async fn main() { .and_then(api::single) .with(warp::log("markov::api::single")) }; - let sentance = warp::post() - .and(warp::path("sentance")); //TODO: sanitise::Sentance::new_iter the body line - warp::path("api") .and(single) .recover(api::error::rejection) @@ -183,11 +185,12 @@ async fn main() { } } + let read = warp::get() .and(chain.clone()) - .and(warp::path("get")) .and(client_ip.clone()) - .and(warp::path::param().map(|opt: usize| Some(opt)).or(warp::any().map(|| Option::::None)).unify()) + .and(warp::path::param().map(|opt: usize| Some(opt)) + .or(warp::path::end().map(|| Option::::None)).unify()) .and_then(|state: State, host: IpAddr, num: Option| { async move { let (tx, rx) = mpsc::channel(state.config().max_gen_size); @@ -199,7 +202,33 @@ async fn main() { } }) .with(warp::log("markov::read")); + + let sentance = warp::get() + .and(warp::path("sentance")) //TODO: sanitise::Sentance::new_iter the body line + .and(chain.clone()) + .and(client_ip.clone()) + .and(warp::path::param().map(|opt: usize| Some(opt)) + .or(warp::path::end().map(|| Option::::None)).unify()) + .and_then(|state: State, host: IpAddr, num: Option| { + async move { + let (tx, rx) = mpsc::channel(state.config().max_gen_size); + tokio::spawn(sentance::body(state, num, tx)); + Ok::<_, std::convert::Infallible>(Response::new(Body::wrap_stream(rx.map(move |mut x| { + info!("{} (sentance) <- {:?}", host, x); + // match x.chars().last() { + // Some(chr) if sanitise::is_sentance_boundary(chr) => { + // x.push(' '); + // }, + // _ => (), + // } + x.push(' '); + Ok::<_, std::convert::Infallible>(x) + })))) + } + }) + .with(warp::log("markov::read::sentance")); + let read = warp::path("get").and(read.or(sentance)); #[cfg(feature="api")] let read = read.or(api); diff --git a/src/sanitise/.#filter.rs b/src/sanitise/.#filter.rs new file mode 120000 index 0000000..e39f68a --- /dev/null +++ b/src/sanitise/.#filter.rs @@ -0,0 +1 @@ +avril@eientei.880:1602382403 \ No newline at end of file diff --git a/src/sanitise/filter.rs b/src/sanitise/filter.rs new file mode 100644 index 0000000..c570dd7 --- /dev/null +++ b/src/sanitise/filter.rs @@ -0,0 +1,260 @@ +//! Filter out characters and such +use smallmap::Map as SmallMap; +use std::{ + borrow::Cow, + fmt, + iter::{ + self, + FromIterator, + }, + str, +}; +use once_cell::sync::OnceCell; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Filter(SmallMap); + +impl From<[char; N]> for Filter +{ + fn from(from: [char; N]) -> Self + { + let mut map = SmallMap::with_capacity(1 + (N / 256)); + for &chr in from.iter() + { + map.insert(chr, ()); + } + Self(map) + } +} + +impl<'a> From<&'a [char]> for Filter +{ + fn from(from: &'a [char]) -> Self + { + let mut map = SmallMap::new(); + for &chr in from.iter() + { + map.insert(chr, ()); + } + Self(map) + } +} +impl<'a> From<&'a str> for Filter +{ + fn from(from: &'a str) -> Self + { + let mut output = Self::new(); + output.insert(from.chars()); + output + } +} + +impl str::FromStr for Filter +{ + type Err = std::convert::Infallible; + fn from_str(s: &str) -> Result { + Ok(Self::from(s)) + } +} + +impl fmt::Display for Filter +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result + { + use std::fmt::Write; + for chr in self.iter() + { + f.write_char(chr)?; + } + Ok(()) + } +} + +pub struct FilterKeyIter<'a>(smallmap::iter::Iter<'a, char, ()>, usize); + +impl<'a> Iterator for FilterKeyIter<'a> +{ + type Item = char; + fn next(&mut self) -> Option + { + self.0.next().map(|&(x, _)| x) + } + fn size_hint(&self) -> (usize, Option) { + (self.1, Some(self.1)) + } +} +impl<'a> iter::FusedIterator for FilterKeyIter<'a>{} +impl<'a> iter::ExactSizeIterator for FilterKeyIter<'a>{} + +impl Filter +{ + pub fn new() -> Self + { + Self(SmallMap::new()) + } + pub fn insert>(&mut self, from: I) + { + for from in from.into_iter() + { + self.0.insert(from, ()); + } + } + + pub fn remove>(&mut self, from: I) + { + for from in from.into_iter() + { + self.0.remove(&from); + } + } + + pub fn len(&self) -> usize + { + self.0.len() + } + + pub fn is_empty(&self) -> bool + { + //TODO: impl this in smallmap itself + self.len() == 0 + } + + pub fn iter(&self) -> impl Iterator + '_ + { + self.0.iter() + .copied() + .map(|(x, _)| x) + //FilterKeyIter(self.0.iter(), self.0.len()) + } + + /// Should this character be filtered? + #[inline] pub fn check(&self, chr: char) -> bool + { + self.0.get(&chr).is_some() + } + + pub fn filter<'a, I: IntoIterator>(&'a self, from_iter: I) -> FilterIter<'a, I::IntoIter> + where I::IntoIter: 'a + { + FilterIter(&self, from_iter.into_iter().fuse()) + } + + pub fn filter_cow<'a>(&self, string: &'a (impl AsRef + 'a + ?Sized)) -> Cow<'a, str> + { + let string = string.as_ref(); + + if self.is_empty() { + return Cow::Borrowed(string); + } + + let mut output = Cow::Borrowed(string); + let mut i=0; + for chr in string.chars() + { + if self.check(chr) { + output.to_mut().remove(i); + } else { + i+=1; + } + } + + output + } + + pub fn filter_str<'a, T: AsRef+'a>(&'a self, string: &'a T) -> FilterStr<'a> + { + FilterStr(string.as_ref(), self, OnceCell::new()) + } +} + +impl FromIterator for Filter +{ + fn from_iter>(iter: I) -> Self + { + let mut output= Self::new(); + output.insert(iter); + output + } +} + +impl<'a> FilterStr<'a> +{ + pub fn as_str(&self) -> &str + { + fn fmt(this: &FilterStr<'_>) -> String + { + let chars = this.0.chars(); + let mut f: String = crate::util::hint_cap(&chars); + for chr in chars { + if !this.1.check(chr) { + f.push(chr); + } + } + f + } + &self.2.get_or_init(|| fmt(&self))[..] + } +} + +pub struct FilterStr<'a>(&'a str, &'a Filter, OnceCell); +impl<'a> fmt::Display for FilterStr<'a> +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result + { + write!(f, "{}", self.as_str()) + } +} +impl<'a> FilterStr<'a> +{ + pub fn filter(&self) -> &Filter + { + &self.1 + } +} + +pub struct FilterIter<'a, I>(&'a Filter, iter::Fuse); + +impl<'a, I: Iterator> Iterator for FilterIter<'a, I> +{ + type Item = char; + fn next(&mut self) -> Option + { + loop { + break match self.1.next() { + Some(chr) if !self.0.check(chr) => Some(chr), + None => None, + _ => continue, + } + } + } + + fn size_hint(&self) -> (usize, Option) { + let (_, high) = self.1.size_hint(); + (0, high) + } +} +impl<'a, I> FilterIter<'a, I> +{ + pub fn filter(&self) -> &Filter + { + self.0 + } +} + +impl<'a, I: Iterator> iter::FusedIterator for FilterIter<'a, I>{} + +#[cfg(test)] +mod tests +{ + use super::*; + #[test] + fn filter_cow() + { + let filter: Filter = " hi".chars().collect(); + + let string = "abcdef ghi jk1\nhian"; + + assert_eq!(filter.filter_str(&string).to_string(), filter.filter_cow(&string).to_string()); + assert_eq!(filter.filter_cow(&string).to_string(), filter.filter(string.chars()).collect::()); + } +} diff --git a/src/sanitise/mod.rs b/src/sanitise/mod.rs index 3608cc0..8f5fae1 100644 --- a/src/sanitise/mod.rs +++ b/src/sanitise/mod.rs @@ -8,6 +8,9 @@ mod sentance; pub use sentance::*; mod word; pub use word::*; + +pub mod filter; + /* pub fn take_sentance(from: &mut T, to: &mut U) -> Result { diff --git a/src/sentance.rs b/src/sentance.rs new file mode 100644 index 0000000..b7aee69 --- /dev/null +++ b/src/sentance.rs @@ -0,0 +1,25 @@ +//! /sentance/ +use super::*; + +pub async fn body(state: State, num: Option, mut output: mpsc::Sender) -> Result<(), gen::GenBodyError> +{ + let string = { + let chain = state.chain().read().await; + if chain.is_empty() { + return Ok(()); + } + + match num { + None => chain.generate_str(), + Some(num) => (0..num).map(|_| chain.generate_str()).join("\n"), + } + }; + + debug!("Taking {:?} from {:?}" ,num, string); + for sen in sanitise::Sentance::new_iter(&string).take(num.unwrap_or(1)) + { + output.send(sen.to_owned()).await.map_err(|e| gen::GenBodyError(e.0))?; + } + Ok(()) +} + diff --git a/src/state.rs b/src/state.rs index 6ec7883..3bebfc8 100644 --- a/src/state.rs +++ b/src/state.rs @@ -11,6 +11,7 @@ use config::Config; pub struct State { config: Arc, //to avoid cloning config + exclude: Arc, chain: Arc>>, save: Arc, @@ -20,11 +21,17 @@ pub struct State impl State { + pub fn filter(&self) -> &sanitise::filter::Filter + { + &self.exclude + } + pub fn new(config: Config, chain: Arc>>, save: Arc) -> Self { let (shutdown, shutdown_recv) = watch::channel(false); Self { - config: Arc::new(config), + exclude: Arc::new(config.filter.get_filter()), + config: Arc::new(config), chain, save, shutdown: Arc::new(shutdown), diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..2a5d5a3 --- /dev/null +++ b/src/util.rs @@ -0,0 +1,41 @@ +//! Utils + +pub trait NewCapacity: Sized +{ + fn new() -> Self; + fn with_capacity(cap: usize) -> Self; +} + +impl NewCapacity for String +{ + fn new() -> Self + { + Self::new() + } + + fn with_capacity(cap: usize) -> Self + { + Self::with_capacity(cap) + } +} + +impl NewCapacity for Vec +{ + fn new() -> Self + { + Self::new() + } + + fn with_capacity(cap: usize) -> Self + { + Self::with_capacity(cap) + } +} + +pub fn hint_cap(iter: &I) -> T +{ + match iter.size_hint() { + (0, Some(0)) | (0, None) => T::new(), + (_, Some(x)) | (x, _) => T::with_capacity(x) + } +}