From ecf7ff6f07368a05a8facef88f8a3a1aea693760 Mon Sep 17 00:00:00 2001
From: Avril <flanchan@cumallover.me>
Date: Sun, 11 Oct 2020 05:21:54 +0100
Subject: [PATCH] sentances

---
 Cargo.lock               |  35 ++++++++++
 Cargo.toml               |   7 ++
 src/feed.rs              |  44 ++++++++----
 src/main.rs              |   3 +-
 src/sanitise/mod.rs      |  62 +++++++++++++----
 src/sanitise/sentance.rs | 146 +++++++++++++++++++++++++++++++++++++++
 src/sanitise/word.rs     | 109 +++++++++++++++++++++++++++--
 7 files changed, 373 insertions(+), 33 deletions(-)
 create mode 100644 src/sanitise/sentance.rs
diff --git a/Cargo.lock b/Cargo.lock
index 67345b2..ae411b7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -645,6 +645,7 @@ dependencies = [
  "cfg-if 1.0.0",
  "futures",
  "hyper",
+ "lazy_static",
  "libc",
  "log",
  "lzzzz",
@@ -653,6 +654,7 @@ dependencies = [
  "pretty_env_logger",
  "serde",
  "serde_cbor",
+ "smallmap",
  "tokio",
  "toml",
  "warp",
@@ -1117,6 +1119,15 @@ dependencies = [
  "winapi 0.3.9",
 ]
 
+[[package]]
+name = "rustc_version"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
+dependencies = [
+ "semver",
+]
+
 [[package]]
 name = "ryu"
 version = "1.0.5"
@@ -1135,6 +1146,21 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2"
 
+[[package]]
+name = "semver"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
+dependencies = [
+ "semver-parser",
+]
+
+[[package]]
+name = "semver-parser"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
+
 [[package]]
 name = "serde"
 version = "1.0.116"
@@ -1241,6 +1267,15 @@ version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8"
 
+[[package]]
+name = "smallmap"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2adda73259bbc3ff84f711425ebfb8c90e9dd32a12b05c6528dd49244ea8230f"
+dependencies = [
+ "rustc_version",
+]
+
 [[package]]
 name = "socket2"
 version = "0.3.15"
diff --git a/Cargo.toml b/Cargo.toml
index 2ad06f5..0b23ed6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,11 @@ compress-chain = ["async-compression"]
 # Treat each new line as a new set to feed instead of feeding the whole data at once
 split-newlines = []
 
+# Feed each sentence separately, instead of just each line / whole body
+# Maybe better without `split-newlines`?
+# Kinda experimental
+split-sentance = []
+
 # Always aggregate incoming buffer instead of streaming them
 # This will make feeds faster but allocate full buffers for the aggregated body
 #
@@ -58,3 +63,5 @@ toml = "0.5.6"
 async-compression = {version = "0.3.5", features=["tokio-02", "bzip2"], optional=true}
 pin-project = "0.4.26"
 libc = "0.2.79"
+smallmap = "1.1.3"
+lazy_static = "1.4.0"
diff --git a/src/feed.rs b/src/feed.rs
index 0a9eb8f..7b119e8 100644
--- a/src/feed.rs
+++ b/src/feed.rs
@@ -1,22 +1,42 @@
 //! Feeding the chain
 use super::*;
+use sanitise::Sentance;
 
 const FEED_BOUNDS: std::ops::RangeFrom<usize> = 2..; //TODO: Add to config somehow
 
 
-pub fn feed(chain: &mut Chain<String>, what: impl AsRef<str>, bounds: impl std::ops::RangeBounds<usize>) -> bool
+pub fn feed(chain: &mut Chain<String>, what: impl AsRef<str>, bounds: impl std::ops::RangeBounds<usize>)
 {
-    let map = what.as_ref().split_whitespace()
-        .filter(|word| !word.is_empty())
-        .map(|s| s.to_owned()).collect::<Vec<_>>();
-    debug_assert!(!bounds.contains(&0), "Cannot allow 0 size feeds");
-    if bounds.contains(&map.len()) {
-	chain.feed(map);
-	true
-    }
-    else {
-	debug!("Ignoring feed of invalid length {}", map.len());
-	false
+    cfg_if! {
+	if #[cfg(feature="split-sentance")] { 
+	    let map = Sentance::new_iter(&what) //get each sentance in string
+		.map(|what| what.split_whitespace() // .words() here will remove the punctuation.
+		     .filter(|word| !word.is_empty())
+		     .map(|s| s.to_owned()).collect::<Vec<_>>());
+	    debug_assert!(!bounds.contains(&0), "Cannot allow 0 size feeds");
+	    for map in map {// feed each sentance seperately
+		if bounds.contains(&map.len()) {
+		    chain.feed(map);
+		}
+		else {
+		    debug!("Ignoring feed of invalid length {}", map.len());
+		}
+	    }
+	} else {
+	    let map = Sentance::new_iter(&what) //get each sentance in string
+		.map(|what| what.split_whitespace() // .words() here will remove the punctuation.
+		     .filter(|word| !word.is_empty()))
+		.flatten() // add all into one buffer
+		.map(|s| s.to_owned()).collect::<Vec<_>>();
+	    debug_assert!(!bounds.contains(&0), "Cannot allow 0 size feeds");
+	    if bounds.contains(&map.len()) {
+		chain.feed(map);
+	    }
+	    else {
+		debug!("Ignoring feed of invalid length {}", map.len());
+	    }
+	    
+	}
     }
 }
 
diff --git a/src/main.rs b/src/main.rs
index 124eb63..7d7200e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -39,6 +39,7 @@ use futures::{
 	join_all,
     },
 };
+use lazy_static::lazy_static;
 use cfg_if::cfg_if;
 
 macro_rules! if_debug {
@@ -171,7 +172,7 @@ async fn main() {
 			    .with(warp::log("markov::api::single"))
 		    };
 		    let sentance = warp::post()
-			.and(warp::path("sentance"));
+			.and(warp::path("sentance")); //TODO: sanitise::Sentance::new_iter the body line
 
 		    warp::path("api")
 			.and(single)
diff --git a/src/sanitise/mod.rs b/src/sanitise/mod.rs
index 34ba611..3608cc0 100644
--- a/src/sanitise/mod.rs
+++ b/src/sanitise/mod.rs
@@ -1,37 +1,73 @@
 //! Sanitisers
 use super::*;
 use std::{
-    marker::Unpin,
     error,
     fmt,
 };
-use tokio::{
-    prelude::*,
-    io::{
-	AsyncRead,
-	AsyncBufRead
-    },
-};
-
+mod sentance;
+pub use sentance::*;
 mod word;
 pub use word::*;
-
+/*
 pub fn take_sentance<T: AsyncBufRead+ ?Sized + Unpin, U: AsyncWrite + ?Sized + Unpin>(from: &mut T, to: &mut U) -> Result<usize, Error>
 {
     todo!()
-}
+}*/
+
 
 
 #[derive(Debug)]
 pub enum Error {
-    
+    Word(WordError),
+    Sentance(SentanceError),
 }
+
 impl error::Error for Error{}
+
 impl fmt::Display for Error
 {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
     {
-	Ok(())
+	match self {
+	    Self::Word(_) => write!(f, "couldn't extract word"),
+	    Self::Sentance(_) => write!(f, "couldn't extract sentance"),
+	}
+    }
+}
+
+impl From<WordError> for Error
+{
+    #[inline] fn from(from: WordError) -> Self
+    {
+	Self::Word(from)
     }
 }
 
+impl From<SentanceError> for Error
+{
+    #[inline]  fn from(from: SentanceError) -> Self
+    {
+	Self::Sentance(from)
+    }
+}
+
+#[cfg(test)]
+mod tests
+{
+    use super::*;
+    #[test]
+    fn sentance()
+    {
+	let string = r#"Hello world.
+I am a string, that is a string. Strings, I love them!!!
+
+Owo uwu"#;
+	let sentances = Sentance::new_iter(string);
+	for sentance in sentances {
+	    let words = Word::new(sentance);
+	    println!("Word in {:?} -> {:?}", sentance, words);
+	}
+
+	
+    }
+}
diff --git a/src/sanitise/sentance.rs b/src/sanitise/sentance.rs
new file mode 100644
index 0000000..9144240
--- /dev/null
+++ b/src/sanitise/sentance.rs
@@ -0,0 +1,146 @@
+//! Sentance splitting
+use super::*;
+use std::{
+    borrow::{
+	Borrow,
+	ToOwned,
+    },
+    ops::{
+	Deref,DerefMut,
+    },
+};
+
+#[derive(Debug)]
+pub struct SentanceError;
+
+/// A sentance
+#[derive(Debug, PartialEq, Eq)]
+#[repr(transparent)]
+pub struct Sentance(str);
+
+
+macro_rules! new {
+    ($str:expr) => {
+	unsafe {Sentance::new_unchecked($str)}
+    };
+}
+
+const DEFAULT_BOUNDARIES: &[char] = &['\n', '.', ':', '!'];
+
+lazy_static! {
+    static ref BOUNDARIES: smallmap::Map<char, ()> = {
+	let mut map = smallmap::Map::new();
+	for &chr in DEFAULT_BOUNDARIES.iter() {
+	    map.insert(chr, ());
+	}
+	map
+    };
+}
+
+#[inline] pub fn is_sentance_boundary(chr: char) -> bool
+{
+    BOUNDARIES.contains_key(&chr)
+}
+
+impl Sentance
+{
+    /// Create a new sentence reference without checking for sentence boundaries
+    pub unsafe fn new_unchecked<'a>(from: &'a str) -> &'a Self
+    {
+	std::mem::transmute(from)
+    }
+
+    /// Create a single sentance
+    pub fn single<'a>(from: &'a (impl AsRef<str> + 'a + ?Sized)) -> Result<&'a Self, SentanceError>
+    {
+	let from = from.as_ref();
+	match from.find(is_sentance_boundary) {
+	    Some(_) => Err(SentanceError),
+	    _ => Ok(new!(from)),
+	}
+    }
+
+    /// Create a new section of sentances from this string
+    #[inline] pub fn new<'a>(from: &'a (impl AsRef<str> + 'a + ?Sized)) -> Vec<&'a Self>
+    {
+	Self::new_iter(from)
+	    .collect()
+    }
+
+    /// Create a new iterator over sentances from this string.
+    pub fn new_iter<'a>(from: &'a (impl AsRef<str> +'a + ?Sized)) -> impl Iterator<Item = &'a Self>
+    {
+	let from = from.as_ref();
+	from.split(is_sentance_boundary)
+	    .map(|x| new!(x.trim()))
+	    .filter(|x| !x.is_empty())
+    }
+
+    /// Get the words in this sentance
+    pub fn words(&self) -> impl Iterator<Item = &'_ Word>
+    {
+	Word::new_iter(self)
+    }
+}
+
+impl<'a> From<&'a str> for &'a Sentance
+{
+    fn from(from: &'a str) -> Self
+    {
+	new!(from)
+    }
+}
+
+impl AsRef<str> for Sentance
+{
+    fn as_ref(&self) -> &str
+    {
+	&self.0
+    }
+}
+
+impl AsRef<Sentance> for str
+{
+    fn as_ref(&self) -> &Sentance
+    {
+	new!(self)
+    }
+}
+
+impl Borrow<Sentance> for String
+{
+    fn borrow(&self) -> &Sentance {
+	new!(&self[..])
+    }
+}
+
+impl ToOwned for Sentance
+{
+    type Owned = String;
+    fn to_owned(&self) -> Self::Owned {
+	self.0.to_owned()
+    }
+}
+
+impl Deref for Sentance
+{
+    type Target = str;
+    fn deref(&self) -> &Self::Target {
+	&self.0
+    }
+}
+
+impl DerefMut for Sentance
+{
+    fn deref_mut(&mut self) -> &mut Self::Target {
+	&mut self.0
+    }
+}
+
+impl AsRef<Sentance> for Sentance
+{
+    #[inline] fn as_ref(&self) -> &Sentance
+    {
+	self
+    }
+}
diff --git a/src/sanitise/word.rs b/src/sanitise/word.rs
index 0730282..365c11d 100644
--- a/src/sanitise/word.rs
+++ b/src/sanitise/word.rs
@@ -1,28 +1,87 @@
 //! Word splitting
 use super::*;
 use std::{
-    borrow::Borrow,
+    borrow::{
+	Borrow,
+	ToOwned,
+    },
+    ops::{
+	Deref,DerefMut,
+    },
 };
 
+#[derive(Debug)]
+pub struct WordError;
+
+/// A word is a string containing no whitespace, representing part of a sentence
 #[derive(Debug, PartialEq, Eq)]
 #[repr(transparent)]
 pub struct Word(str);
 
+
+macro_rules! new {
+    ($str:expr) => {
+	unsafe {Word::new_unchecked($str)}
+    };
+}
+
+const DEFAULT_BOUNDARIES: &[char] = &['!', '.', ','];
+
+lazy_static! {
+    static ref BOUNDARIES: smallmap::Map<char, ()> = {
+	let mut map = smallmap::Map::new();
+	for &chr in DEFAULT_BOUNDARIES.iter() {
+	    map.insert(chr, ());
+	}
+	map
+    };
+}
+
+#[inline] pub fn is_word_boundary(chr: char) -> bool
+{
+    chr.is_whitespace() || BOUNDARIES.contains_key(&chr)
+}
+
 impl Word
 {
-    pub fn new<'a>(from: &'a str) -> &'a Self
+    /// Create a new word reference without checking for whitespace or other word boundaries
+    pub unsafe fn new_unchecked<'a>(from: &'a str) -> &'a Self
     {
-	unsafe {
-	    std::mem::transmute(from)
+	std::mem::transmute(from)
+    }
+
+    /// Create a single word
+    pub fn single<'a>(from: &'a (impl AsRef<Sentance> +?Sized +'a)) -> Result<&'a Self, WordError>
+    {
+	let from = from.as_ref();
+	match from.find(is_word_boundary) {
+	    Some(_) => Err(WordError),
+	    _ => Ok(new!(from)),
 	}
     }
+
+    /// Create a new section of words from this sentance
+    pub fn new<'a>(from: &'a (impl AsRef<Sentance> +?Sized+'a)) -> Vec<&'a Self>
+    {
+	Self::new_iter(from)
+	    .collect()
+    }
+
+    /// Create a new iterator over words from this sentance.
+    pub fn new_iter<'a>(from: &'a (impl AsRef<Sentance> +?Sized+'a)) -> impl Iterator<Item = &'a Self>
+    {
+	let from = from.as_ref();
+	from.split(is_word_boundary)
+	    .filter(|x| !x.is_empty())
+	    .map(|x| new!(x))
+    }
 }
 
 impl<'a> From<&'a str> for &'a Word
 {
     fn from(from: &'a str) -> Self
     {
-	Word::new(from)
+	new!(from)
     }
 }
 
@@ -38,8 +97,44 @@ impl AsRef<Word> for str
 {
     fn as_ref(&self) -> &Word
     {
-	Word::new(self)
+	new!(self)
     }
 }
 
-//impl Borrow<>
+impl Borrow<Word> for String
+{
+    fn borrow(&self) -> &Word {
+	new!(&self[..])
+    }
+}
+
+impl ToOwned for Word
+{
+    type Owned = String;
+    fn to_owned(&self) -> Self::Owned {
+	self.0.to_owned()
+    }
+}
+
+impl Deref for Word
+{
+    type Target = str;
+    fn deref(&self) -> &Self::Target {
+	&self.0
+    }
+}
+
+impl DerefMut for Word
+{
+    fn deref_mut(&mut self) -> &mut Self::Target {
+	&mut self.0
+    }
+}
+
+impl AsRef<Word> for Word
+{
+    #[inline] fn as_ref(&self) -> &Word
+    {
+	self
+    }
+}