From ecf7ff6f07368a05a8facef88f8a3a1aea693760 Mon Sep 17 00:00:00 2001 From: Avril Date: Sun, 11 Oct 2020 05:21:54 +0100 Subject: [PATCH] sentances --- Cargo.lock | 35 ++++++++++ Cargo.toml | 7 ++ src/feed.rs | 44 ++++++++---- src/main.rs | 3 +- src/sanitise/mod.rs | 62 +++++++++++++---- src/sanitise/sentance.rs | 146 +++++++++++++++++++++++++++++++++++++++ src/sanitise/word.rs | 109 +++++++++++++++++++++++++++-- 7 files changed, 373 insertions(+), 33 deletions(-) create mode 100644 src/sanitise/sentance.rs diff --git a/Cargo.lock b/Cargo.lock index 67345b2..ae411b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -645,6 +645,7 @@ dependencies = [ "cfg-if 1.0.0", "futures", "hyper", + "lazy_static", "libc", "log", "lzzzz", @@ -653,6 +654,7 @@ dependencies = [ "pretty_env_logger", "serde", "serde_cbor", + "smallmap", "tokio", "toml", "warp", @@ -1117,6 +1119,15 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver", +] + [[package]] name = "ryu" version = "1.0.5" @@ -1135,6 +1146,21 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "serde" version = "1.0.116" @@ -1241,6 +1267,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" +[[package]] +name = "smallmap" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2adda73259bbc3ff84f711425ebfb8c90e9dd32a12b05c6528dd49244ea8230f" +dependencies = [ + "rustc_version", +] + [[package]] name = "socket2" version = "0.3.15" diff --git a/Cargo.toml b/Cargo.toml index 2ad06f5..0b23ed6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,11 @@ compress-chain = ["async-compression"] # Treat each new line as a new set to feed instead of feeding the whole data at once split-newlines = [] +# Feed each sentance seperately, instead of just each line / whole body +# Maybe better without `split-newlines`? +# Kinda experimental +split-sentance = [] + # Always aggregate incoming buffer instead of streaming them # This will make feeds faster but allocate full buffers for the aggregated body # @@ -58,3 +63,5 @@ toml = "0.5.6" async-compression = {version = "0.3.5", features=["tokio-02", "bzip2"], optional=true} pin-project = "0.4.26" libc = "0.2.79" +smallmap = "1.1.3" +lazy_static = "1.4.0" diff --git a/src/feed.rs b/src/feed.rs index 0a9eb8f..7b119e8 100644 --- a/src/feed.rs +++ b/src/feed.rs @@ -1,22 +1,42 @@ //! Feeding the chain use super::*; +use sanitise::Sentance; const FEED_BOUNDS: std::ops::RangeFrom = 2..; //TODO: Add to config somehow -pub fn feed(chain: &mut Chain, what: impl AsRef, bounds: impl std::ops::RangeBounds) -> bool +pub fn feed(chain: &mut Chain, what: impl AsRef, bounds: impl std::ops::RangeBounds) { - let map = what.as_ref().split_whitespace() - .filter(|word| !word.is_empty()) - .map(|s| s.to_owned()).collect::>(); - debug_assert!(!bounds.contains(&0), "Cannot allow 0 size feeds"); - if bounds.contains(&map.len()) { - chain.feed(map); - true - } - else { - debug!("Ignoring feed of invalid length {}", map.len()); - false + cfg_if! { + if #[cfg(feature="split-sentance")] { + let map = Sentance::new_iter(&what) //get each sentance in string + .map(|what| what.split_whitespace() // .words() here will remove the punctuation. + .filter(|word| !word.is_empty()) + .map(|s| s.to_owned()).collect::>()); + debug_assert!(!bounds.contains(&0), "Cannot allow 0 size feeds"); + for map in map {// feed each sentance seperately + if bounds.contains(&map.len()) { + chain.feed(map); + } + else { + debug!("Ignoring feed of invalid length {}", map.len()); + } + } + } else { + let map = Sentance::new_iter(&what) //get each sentance in string + .map(|what| what.split_whitespace() // .words() here will remove the punctuation. + .filter(|word| !word.is_empty())) + .flatten() // add all into one buffer + .map(|s| s.to_owned()).collect::>(); + debug_assert!(!bounds.contains(&0), "Cannot allow 0 size feeds"); + if bounds.contains(&map.len()) { + chain.feed(map); + } + else { + debug!("Ignoring feed of invalid length {}", map.len()); + } + + } } } diff --git a/src/main.rs b/src/main.rs index 124eb63..7d7200e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -39,6 +39,7 @@ use futures::{ join_all, }, }; +use lazy_static::lazy_static; use cfg_if::cfg_if; macro_rules! if_debug { @@ -171,7 +172,7 @@ async fn main() { .with(warp::log("markov::api::single")) }; let sentance = warp::post() - .and(warp::path("sentance")); + .and(warp::path("sentance")); //TODO: sanitise::Sentance::new_iter the body line warp::path("api") .and(single) diff --git a/src/sanitise/mod.rs b/src/sanitise/mod.rs index 34ba611..3608cc0 100644 --- a/src/sanitise/mod.rs +++ b/src/sanitise/mod.rs @@ -1,37 +1,73 @@ //! Sanitisers use super::*; use std::{ - marker::Unpin, error, fmt, }; -use tokio::{ - prelude::*, - io::{ - AsyncRead, - AsyncBufRead - }, -}; - +mod sentance; +pub use sentance::*; mod word; pub use word::*; - +/* pub fn take_sentance(from: &mut T, to: &mut U) -> Result { todo!() -} +}*/ + #[derive(Debug)] pub enum Error { - + Word(WordError), + Sentance(SentanceError), } + impl error::Error for Error{} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - Ok(()) + match self { + Self::Word(_) => write!(f, "couldn't extract word"), + Self::Sentance(_) => write!(f, "couldn't extract sentance"), + } + } +} + +impl From for Error +{ + #[inline] fn from(from: WordError) -> Self + { + Self::Word(from) } } +impl From for Error +{ + #[inline] fn from(from: SentanceError) -> Self + { + Self::Sentance(from) + } +} + +#[cfg(test)] +mod tests +{ + use super::*; + #[test] + fn sentance() + { + let string = r#"Hello world. +I am a string, that is a string. Strings, I love them!!! + +Owo uwu"#; + let sentances = Sentance::new_iter(string); + for sentance in sentances { + let words = Word::new(sentance); + println!("Word in {:?} -> {:?}", sentance, words); + } + + + } +} diff --git a/src/sanitise/sentance.rs b/src/sanitise/sentance.rs new file mode 100644 index 0000000..9144240 --- /dev/null +++ b/src/sanitise/sentance.rs @@ -0,0 +1,146 @@ +//! Sentance splitting +use super::*; +use std::{ + borrow::{ + Borrow, + ToOwned, + }, + ops::{ + Deref,DerefMut, + }, +}; + +#[derive(Debug)] +pub struct SentanceError; + +/// A sentance +#[derive(Debug, PartialEq, Eq)] +#[repr(transparent)] +pub struct Sentance(str); + + +macro_rules! new { + ($str:expr) => { + unsafe {Sentance::new_unchecked($str)} + }; +} + +const DEFAULT_BOUNDARIES: &[char] = &['\n', '.', ':', '!']; + +lazy_static! { + static ref BOUNDARIES: smallmap::Map = { + let mut map = smallmap::Map::new(); + for &chr in DEFAULT_BOUNDARIES.iter() { + map.insert(chr, ()); + } + map + }; +} + +#[inline] pub fn is_sentance_boundary(chr: char) -> bool +{ + BOUNDARIES.contains_key(&chr) +} + +impl Sentance +{ + /// Create a new word reference without checking for sentance boundaries + pub unsafe fn new_unchecked<'a>(from: &'a str) -> &'a Self + { + std::mem::transmute(from) + } + + /// Create a single sentance + pub fn single<'a>(from: &'a (impl AsRef + 'a + ?Sized)) -> Result<&'a Self, SentanceError> + { + let from = from.as_ref(); + match from.find(is_sentance_boundary) { + Some(_) => Err(SentanceError), + _ => Ok(new!(from)), + } + } + + /// Create a new section of sentances from this string + #[inline] pub fn new<'a>(from: &'a (impl AsRef + 'a + ?Sized)) -> Vec<&'a Self> + { + Self::new_iter(from) + .collect() + } + + /// Create a new iterator over sentances from this string. + pub fn new_iter<'a>(from: &'a (impl AsRef +'a + ?Sized)) -> impl Iterator + { + let from = from.as_ref(); + from.split(is_sentance_boundary) + .map(|x| new!(x.trim())) + .filter(|x| !x.is_empty()) + } + + /// Get the words in this sentance + pub fn words(&self) -> impl Iterator + { + Word::new_iter(self) + } +} + +impl<'a> From<&'a str> for &'a Sentance +{ + fn from(from: &'a str) -> Self + { + new!(from) + } +} + +impl AsRef for Sentance +{ + fn as_ref(&self) -> &str + { + &self.0 + } +} + +impl AsRef for str +{ + fn as_ref(&self) -> &Sentance + { + new!(self) + } +} + +impl Borrow for String +{ + fn borrow(&self) -> &Sentance { + new!(&self[..]) + } +} + +impl ToOwned for Sentance +{ + type Owned = String; + fn to_owned(&self) -> Self::Owned { + self.0.to_owned() + } +} + +impl Deref for Sentance +{ + type Target = str; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Sentance +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl AsRef for Sentance +{ + #[inline] fn as_ref(&self) -> &Sentance + { + self + } +} diff --git a/src/sanitise/word.rs b/src/sanitise/word.rs index 0730282..365c11d 100644 --- a/src/sanitise/word.rs +++ b/src/sanitise/word.rs @@ -1,28 +1,87 @@ //! Word splitting use super::*; use std::{ - borrow::Borrow, + borrow::{ + Borrow, + ToOwned, + }, + ops::{ + Deref,DerefMut, + }, }; +#[derive(Debug)] +pub struct WordError; + +/// A word is a non-whitespace containing string representing part of a sentance #[derive(Debug, PartialEq, Eq)] #[repr(transparent)] pub struct Word(str); + +macro_rules! new { + ($str:expr) => { + unsafe {Word::new_unchecked($str)} + }; +} + +const DEFAULT_BOUNDARIES: &[char] = &['!', '.', ',']; + +lazy_static! { + static ref BOUNDARIES: smallmap::Map = { + let mut map = smallmap::Map::new(); + for &chr in DEFAULT_BOUNDARIES.iter() { + map.insert(chr, ()); + } + map + }; +} + +#[inline] pub fn is_word_boundary(chr: char) -> bool +{ + chr.is_whitespace() || BOUNDARIES.contains_key(&chr) +} + impl Word { - pub fn new<'a>(from: &'a str) -> &'a Self + /// Create a new word reference without checking for whitespace + pub unsafe fn new_unchecked<'a>(from: &'a str) -> &'a Self { - unsafe { - std::mem::transmute(from) + std::mem::transmute(from) + } + + /// Create a single word + pub fn single<'a>(from: &'a (impl AsRef +?Sized +'a)) -> Result<&'a Self, WordError> + { + let from = from.as_ref(); + match from.find(is_word_boundary) { + Some(_) => Err(WordError), + _ => Ok(new!(from)), } } + + /// Create a new section of words from this sentance + pub fn new<'a>(from: &'a (impl AsRef +?Sized+'a)) -> Vec<&'a Self> + { + Self::new_iter(from) + .collect() + } + + /// Create a new iterator over words from this sentance. + pub fn new_iter<'a>(from: &'a (impl AsRef +?Sized+'a)) -> impl Iterator + { + let from = from.as_ref(); + from.split(is_word_boundary) + .filter(|x| !x.is_empty()) + .map(|x| new!(x)) + } } impl<'a> From<&'a str> for &'a Word { fn from(from: &'a str) -> Self { - Word::new(from) + new!(from) } } @@ -38,8 +97,44 @@ impl AsRef for str { fn as_ref(&self) -> &Word { - Word::new(self) + new!(self) } } -//impl Borrow<> +impl Borrow for String +{ + fn borrow(&self) -> &Word { + new!(&self[..]) + } +} + +impl ToOwned for Word +{ + type Owned = String; + fn to_owned(&self) -> Self::Owned { + self.0.to_owned() + } +} + +impl Deref for Word +{ + type Target = str; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Word +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl AsRef for Word +{ + #[inline] fn as_ref(&self) -> &Word + { + self + } +}