serve
Avril 4 years ago
parent 51a5d0aeba
commit ecf7ff6f07
Signed by: flanchan
GPG Key ID: 284488987C31F630

35
Cargo.lock generated

@ -645,6 +645,7 @@ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
"futures", "futures",
"hyper", "hyper",
"lazy_static",
"libc", "libc",
"log", "log",
"lzzzz", "lzzzz",
@ -653,6 +654,7 @@ dependencies = [
"pretty_env_logger", "pretty_env_logger",
"serde", "serde",
"serde_cbor", "serde_cbor",
"smallmap",
"tokio", "tokio",
"toml", "toml",
"warp", "warp",
@ -1117,6 +1119,15 @@ dependencies = [
"winapi 0.3.9", "winapi 0.3.9",
] ]
[[package]]
name = "rustc_version"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
dependencies = [
"semver",
]
[[package]] [[package]]
name = "ryu" name = "ryu"
version = "1.0.5" version = "1.0.5"
@ -1135,6 +1146,21 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2"
[[package]]
name = "semver"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
dependencies = [
"semver-parser",
]
[[package]]
name = "semver-parser"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]] [[package]]
name = "serde" name = "serde"
version = "1.0.116" version = "1.0.116"
@ -1241,6 +1267,15 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8"
[[package]]
name = "smallmap"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2adda73259bbc3ff84f711425ebfb8c90e9dd32a12b05c6528dd49244ea8230f"
dependencies = [
"rustc_version",
]
[[package]] [[package]]
name = "socket2" name = "socket2"
version = "0.3.15" version = "0.3.15"

@ -16,6 +16,11 @@ compress-chain = ["async-compression"]
# Treat each new line as a new set to feed instead of feeding the whole data at once # Treat each new line as a new set to feed instead of feeding the whole data at once
split-newlines = [] split-newlines = []
# Feed each sentance separately, instead of just each line / whole body
# Maybe better without `split-newlines`?
# Kinda experimental
split-sentance = []
# Always aggregate incoming buffer instead of streaming them # Always aggregate incoming buffer instead of streaming them
# This will make feeds faster but allocate full buffers for the aggregated body # This will make feeds faster but allocate full buffers for the aggregated body
# #
@ -58,3 +63,5 @@ toml = "0.5.6"
async-compression = {version = "0.3.5", features=["tokio-02", "bzip2"], optional=true} async-compression = {version = "0.3.5", features=["tokio-02", "bzip2"], optional=true}
pin-project = "0.4.26" pin-project = "0.4.26"
libc = "0.2.79" libc = "0.2.79"
smallmap = "1.1.3"
lazy_static = "1.4.0"

@ -1,22 +1,42 @@
//! Feeding the chain //! Feeding the chain
use super::*; use super::*;
use sanitise::Sentance;
const FEED_BOUNDS: std::ops::RangeFrom<usize> = 2..; //TODO: Add to config somehow const FEED_BOUNDS: std::ops::RangeFrom<usize> = 2..; //TODO: Add to config somehow
pub fn feed(chain: &mut Chain<String>, what: impl AsRef<str>, bounds: impl std::ops::RangeBounds<usize>) -> bool pub fn feed(chain: &mut Chain<String>, what: impl AsRef<str>, bounds: impl std::ops::RangeBounds<usize>)
{ {
let map = what.as_ref().split_whitespace() cfg_if! {
.filter(|word| !word.is_empty()) if #[cfg(feature="split-sentance")] {
.map(|s| s.to_owned()).collect::<Vec<_>>(); let map = Sentance::new_iter(&what) //get each sentance in string
debug_assert!(!bounds.contains(&0), "Cannot allow 0 size feeds"); .map(|what| what.split_whitespace() // .words() here will remove the punctuation.
if bounds.contains(&map.len()) { .filter(|word| !word.is_empty())
chain.feed(map); .map(|s| s.to_owned()).collect::<Vec<_>>());
true debug_assert!(!bounds.contains(&0), "Cannot allow 0 size feeds");
} for map in map {// feed each sentance seperately
else { if bounds.contains(&map.len()) {
debug!("Ignoring feed of invalid length {}", map.len()); chain.feed(map);
false }
else {
debug!("Ignoring feed of invalid length {}", map.len());
}
}
} else {
let map = Sentance::new_iter(&what) //get each sentance in string
.map(|what| what.split_whitespace() // .words() here will remove the punctuation.
.filter(|word| !word.is_empty()))
.flatten() // add all into one buffer
.map(|s| s.to_owned()).collect::<Vec<_>>();
debug_assert!(!bounds.contains(&0), "Cannot allow 0 size feeds");
if bounds.contains(&map.len()) {
chain.feed(map);
}
else {
debug!("Ignoring feed of invalid length {}", map.len());
}
}
} }
} }

@ -39,6 +39,7 @@ use futures::{
join_all, join_all,
}, },
}; };
use lazy_static::lazy_static;
use cfg_if::cfg_if; use cfg_if::cfg_if;
macro_rules! if_debug { macro_rules! if_debug {
@ -171,7 +172,7 @@ async fn main() {
.with(warp::log("markov::api::single")) .with(warp::log("markov::api::single"))
}; };
let sentance = warp::post() let sentance = warp::post()
.and(warp::path("sentance")); .and(warp::path("sentance")); //TODO: sanitise::Sentance::new_iter the body line
warp::path("api") warp::path("api")
.and(single) .and(single)

@ -1,37 +1,73 @@
//! Sanitisers //! Sanitisers
use super::*; use super::*;
use std::{ use std::{
marker::Unpin,
error, error,
fmt, fmt,
}; };
use tokio::{ mod sentance;
prelude::*, pub use sentance::*;
io::{
AsyncRead,
AsyncBufRead
},
};
mod word; mod word;
pub use word::*; pub use word::*;
/*
pub fn take_sentance<T: AsyncBufRead+ ?Sized + Unpin, U: AsyncWrite + ?Sized + Unpin>(from: &mut T, to: &mut U) -> Result<usize, Error> pub fn take_sentance<T: AsyncBufRead+ ?Sized + Unpin, U: AsyncWrite + ?Sized + Unpin>(from: &mut T, to: &mut U) -> Result<usize, Error>
{ {
todo!() todo!()
} }*/
#[derive(Debug)] #[derive(Debug)]
pub enum Error { pub enum Error {
Word(WordError),
Sentance(SentanceError),
} }
impl error::Error for Error{} impl error::Error for Error{}
impl fmt::Display for Error impl fmt::Display for Error
{ {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
{ {
Ok(()) match self {
Self::Word(_) => write!(f, "couldn't extract word"),
Self::Sentance(_) => write!(f, "couldn't extract sentance"),
}
}
}
impl From<WordError> for Error
{
#[inline] fn from(from: WordError) -> Self
{
Self::Word(from)
} }
} }
impl From<SentanceError> for Error
{
#[inline] fn from(from: SentanceError) -> Self
{
Self::Sentance(from)
}
}
#[cfg(test)]
mod tests
{
use super::*;
#[test]
fn sentance()
{
let string = r#"Hello world.
I am a string, that is a string. Strings, I love them!!!
Owo uwu"#;
let sentances = Sentance::new_iter(string);
for sentance in sentances {
let words = Word::new(sentance);
println!("Word in {:?} -> {:?}", sentance, words);
}
}
}

@ -0,0 +1,146 @@
//! Sentance splitting
use super::*;
use std::{
borrow::{
Borrow,
ToOwned,
},
ops::{
Deref,DerefMut,
},
};
/// Error returned when a string cannot be treated as a single sentance
/// (it contains at least one sentance boundary character).
#[derive(Debug)]
pub struct SentanceError;
/// A borrowed sentance of text.
///
/// Values produced by `Sentance::new_iter` are trimmed and non-empty and
/// contain no boundary characters; unchecked conversions (`From<&str>`,
/// `AsRef<Sentance> for str`) bypass that validation — see NOTE on those
/// impls. ("sentance" spelling is intentional and used crate-wide.)
///
/// `#[repr(transparent)]` over `str` is what makes the
/// `&str` -> `&Sentance` transmute in `new_unchecked` layout-sound.
#[derive(Debug, PartialEq, Eq)]
#[repr(transparent)]
pub struct Sentance(str);
// Shorthand for the unchecked constructor; used where the input is
// already known (or assumed) to be free of sentance boundaries.
macro_rules! new {
    ($str:expr) => {
	unsafe {Sentance::new_unchecked($str)}
    };
}
/// Characters that, by default, terminate a sentance.
const DEFAULT_BOUNDARIES: &[char] = &['\n', '.', ':', '!'];
// Boundary set as a lazily-built map for membership testing.
// NOTE(review): for 4 characters, a linear scan of `DEFAULT_BOUNDARIES`
// would likely be just as fast and would avoid the lazy-init indirection —
// consider dropping the map.
lazy_static! {
    static ref BOUNDARIES: smallmap::Map<char, ()> = {
	let mut map = smallmap::Map::new();
	for &chr in DEFAULT_BOUNDARIES.iter() {
	    map.insert(chr, ());
	}
	map
    };
}
/// True when `chr` separates two sentances (see `DEFAULT_BOUNDARIES`).
#[inline] pub fn is_sentance_boundary(chr: char) -> bool
{
    BOUNDARIES.contains_key(&chr)
}
impl Sentance
{
    /// Create a sentance reference without validating that `from` is free
    /// of sentance boundary characters.
    ///
    /// # Safety
    /// Memory-safe because `Sentance` is `#[repr(transparent)]` over `str`,
    /// so the reference transmute only reinterprets the pointee type.
    /// Callers are expected to uphold the no-boundary convention, though
    /// violating it cannot cause UB by itself.
    pub unsafe fn new_unchecked<'a>(from: &'a str) -> &'a Self
    {
	std::mem::transmute(from)
    }

    /// Try to view `from` as exactly one sentance.
    ///
    /// Returns `Err(SentanceError)` when the text contains any sentance
    /// boundary character.
    pub fn single<'a>(from: &'a (impl AsRef<str> + 'a + ?Sized)) -> Result<&'a Self, SentanceError>
    {
	let text = from.as_ref();
	if text.contains(is_sentance_boundary) {
	    Err(SentanceError)
	} else {
	    Ok(new!(text))
	}
    }

    /// Split `from` into all of its sentances, collected into a `Vec`.
    #[inline] pub fn new<'a>(from: &'a (impl AsRef<str> + 'a + ?Sized)) -> Vec<&'a Self>
    {
	Self::new_iter(from).collect()
    }

    /// Iterate over the sentances in `from`.
    ///
    /// Each piece is trimmed of surrounding whitespace; pieces that are
    /// empty after trimming are skipped entirely.
    pub fn new_iter<'a>(from: &'a (impl AsRef<str> +'a + ?Sized)) -> impl Iterator<Item = &'a Self>
    {
	from.as_ref()
	    .split(is_sentance_boundary)
	    .filter_map(|piece| {
		let piece = piece.trim();
		if piece.is_empty() { None } else { Some(new!(piece)) }
	    })
    }

    /// Iterate over the words of this sentance.
    pub fn words(&self) -> impl Iterator<Item = &'_ Word>
    {
	Word::new_iter(self)
    }
}
impl<'a> From<&'a str> for &'a Sentance
{
    /// Reinterpret a raw `&str` as a sentance reference.
    ///
    /// NOTE(review): no boundary validation happens here — a string holding
    /// several sentances still converts; confirm this is intended.
    fn from(s: &'a str) -> Self
    {
	new!(s)
    }
}

impl AsRef<str> for Sentance
{
    /// Borrow the underlying string slice.
    fn as_ref(&self) -> &str
    {
	&self.0
    }
}

impl AsRef<Sentance> for str
{
    /// View any string slice as a sentance (unvalidated, like `From<&str>`).
    fn as_ref(&self) -> &Sentance
    {
	new!(self)
    }
}

impl Borrow<Sentance> for String
{
    /// Borrow an owned `String` as a sentance (unvalidated).
    fn borrow(&self) -> &Sentance
    {
	new!(self.as_str())
    }
}

impl ToOwned for Sentance
{
    type Owned = String;

    /// Copy the sentance text into an owned `String`.
    fn to_owned(&self) -> Self::Owned
    {
	String::from(&self.0)
    }
}

impl Deref for Sentance
{
    type Target = str;

    /// Allow `str` methods to be called on `Sentance` directly.
    fn deref(&self) -> &Self::Target
    {
	&self.0
    }
}

impl DerefMut for Sentance
{
    fn deref_mut(&mut self) -> &mut Self::Target
    {
	&mut self.0
    }
}

impl AsRef<Sentance> for Sentance
{
    /// Identity conversion, so generic `AsRef<Sentance>` callers accept
    /// a `&Sentance` directly.
    #[inline] fn as_ref(&self) -> &Sentance
    {
	self
    }
}

@ -1,28 +1,87 @@
//! Word splitting //! Word splitting
use super::*; use super::*;
use std::{ use std::{
borrow::Borrow, borrow::{
Borrow,
ToOwned,
},
ops::{
Deref,DerefMut,
},
}; };
#[derive(Debug)]
pub struct WordError;
/// A word is a non-whitespace containing string representing part of a sentance
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
#[repr(transparent)] #[repr(transparent)]
pub struct Word(str); pub struct Word(str);
macro_rules! new {
($str:expr) => {
unsafe {Word::new_unchecked($str)}
};
}
const DEFAULT_BOUNDARIES: &[char] = &['!', '.', ','];
lazy_static! {
static ref BOUNDARIES: smallmap::Map<char, ()> = {
let mut map = smallmap::Map::new();
for &chr in DEFAULT_BOUNDARIES.iter() {
map.insert(chr, ());
}
map
};
}
#[inline] pub fn is_word_boundary(chr: char) -> bool
{
chr.is_whitespace() || BOUNDARIES.contains_key(&chr)
}
impl Word impl Word
{ {
pub fn new<'a>(from: &'a str) -> &'a Self /// Create a new word reference without checking for whitespace
pub unsafe fn new_unchecked<'a>(from: &'a str) -> &'a Self
{ {
unsafe { std::mem::transmute(from)
std::mem::transmute(from) }
/// Create a single word
pub fn single<'a>(from: &'a (impl AsRef<Sentance> +?Sized +'a)) -> Result<&'a Self, WordError>
{
let from = from.as_ref();
match from.find(is_word_boundary) {
Some(_) => Err(WordError),
_ => Ok(new!(from)),
} }
} }
/// Create a new section of words from this sentance
pub fn new<'a>(from: &'a (impl AsRef<Sentance> +?Sized+'a)) -> Vec<&'a Self>
{
Self::new_iter(from)
.collect()
}
/// Create a new iterator over words from this sentance.
pub fn new_iter<'a>(from: &'a (impl AsRef<Sentance> +?Sized+'a)) -> impl Iterator<Item = &'a Self>
{
let from = from.as_ref();
from.split(is_word_boundary)
.filter(|x| !x.is_empty())
.map(|x| new!(x))
}
} }
impl<'a> From<&'a str> for &'a Word impl<'a> From<&'a str> for &'a Word
{ {
fn from(from: &'a str) -> Self fn from(from: &'a str) -> Self
{ {
Word::new(from) new!(from)
} }
} }
@ -38,8 +97,44 @@ impl AsRef<Word> for str
{ {
fn as_ref(&self) -> &Word fn as_ref(&self) -> &Word
{ {
Word::new(self) new!(self)
} }
} }
//impl Borrow<> impl Borrow<Word> for String
{
fn borrow(&self) -> &Word {
new!(&self[..])
}
}
impl ToOwned for Word
{
type Owned = String;
fn to_owned(&self) -> Self::Owned {
self.0.to_owned()
}
}
impl Deref for Word
{
type Target = str;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl DerefMut for Word
{
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl AsRef<Word> for Word
{
#[inline] fn as_ref(&self) -> &Word
{
self
}
}

Loading…
Cancel
Save