start FTS

Branch: main
Author: Ziyang Hu (1 year ago)
Commit: 56123c172e (parent: ee8cf79397)

Cargo.lock (generated)
@@ -42,9 +42,9 @@ dependencies = [
[[package]]
name = "aho-corasick"
-version = "0.7.20"
+version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
+checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04"
dependencies = [
"memchr",
]
@@ -235,9 +235,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "axum"
-version = "0.6.15"
+version = "0.6.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b32c5ea3aabaf4deb5f5ced2d688ec0844c881c9e6c696a8b769a05fc691e62"
+checksum = "113713495a32dd0ab52baf5c10044725aa3aec00b31beda84218e469029b72a3"
dependencies = [
"async-trait",
"axum-core",
@@ -402,9 +402,9 @@ dependencies = [
[[package]]
name = "bumpalo"
-version = "3.12.0"
+version = "3.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
+checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8"
[[package]]
name = "byte-slice-cast"
@@ -461,6 +461,15 @@ dependencies = [
"jobserver",
]
+[[package]]
+name = "cedarwood"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
+dependencies = [
+"smallvec",
+]
[[package]]
name = "cesu8"
version = "1.1.0"
@@ -553,9 +562,9 @@ dependencies = [
[[package]]
name = "clap"
-version = "4.2.2"
+version = "4.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b802d85aaf3a1cdb02b224ba472ebdea62014fccfcb269b95a4d76443b5ee5a"
+checksum = "956ac1f6381d8d82ab4684768f89c0ea3afe66925ceadb4eeb3fc452ffc55d62"
dependencies = [
"clap_builder",
"clap_derive",
@@ -564,9 +573,9 @@ dependencies = [
[[package]]
name = "clap_builder"
-version = "4.2.2"
+version = "4.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14a1a858f532119338887a4b8e1af9c60de8249cd7bafd68036a489e261e37b6"
+checksum = "84080e799e54cff944f4b4a4b0e71630b0e0443b25b985175c7dddc1a859b749"
dependencies = [
"anstream",
"anstyle",
@@ -678,6 +687,7 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
name = "cozo"
version = "0.6.0"
dependencies = [
+"aho-corasick",
"approx",
"base64 0.21.0",
"byteorder",
@@ -692,6 +702,7 @@ dependencies = [
"env_logger",
"graph",
"itertools 0.10.5",
+"jieba-rs",
"js-sys",
"lazy_static",
"log",
@@ -709,6 +720,7 @@ dependencies = [
"rmp",
"rmp-serde",
"rmpv",
+"rust-stemmers",
"rustc-hash",
"serde",
"serde_bytes",
@@ -726,6 +738,7 @@ dependencies = [
"tokio",
"unicode-normalization",
"uuid",
+"whatlang",
]
[[package]]
@@ -736,7 +749,7 @@ dependencies = [
"axum",
"axum-macros",
"chrono",
-"clap 4.2.2",
+"clap 4.2.4",
"cozo",
"crossbeam",
"ctrlc",
@@ -834,9 +847,9 @@ dependencies = [
[[package]]
name = "cpufeatures"
-version = "0.2.6"
+version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181"
+checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
dependencies = [
"libc",
]
@@ -1828,6 +1841,21 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
+[[package]]
+name = "jieba-rs"
+version = "0.6.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37228e06c75842d1097432d94d02f37fe3ebfca9791c2e8fef6e9db17ed128c1"
+dependencies = [
+"cedarwood",
+"fxhash",
+"hashbrown",
+"lazy_static",
+"phf",
+"phf_codegen",
+"regex",
+]
[[package]]
name = "jni"
version = "0.21.1"
@@ -1882,9 +1910,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
-version = "0.2.141"
+version = "0.2.142"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5"
+checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317"
[[package]]
name = "libloading"
@@ -1938,9 +1966,9 @@ dependencies = [
[[package]]
name = "linux-raw-sys"
-version = "0.3.1"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d59d8c75012853d2e872fb56bc8a2e53718e2cafe1a4c823143141c6d90c322f"
+checksum = "9b085a4f2cde5781fc4b1717f2e86c62f5cda49de7ba99a7c2eae02b61c9064c"
[[package]]
name = "litrs"
@@ -1985,9 +2013,9 @@ checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"
[[package]]
name = "matrixmultiply"
-version = "0.3.2"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "add85d4dd35074e6fedc608f8c8f513a3548619a9024b751949ef0e8e45a4d84"
+checksum = "bb99c395ae250e1bf9133673f03ca9f97b7e71b705436bf8f089453445d1e9fe"
dependencies = [
"rawpointer",
]
@@ -2024,9 +2052,9 @@ checksum = "8452105ba047068f40ff7093dd1d9da90898e63dd61736462e9cdda6a90ad3c3"
[[package]]
name = "miette"
-version = "5.7.0"
+version = "5.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7abdc09c381c9336b9f2e9bd6067a9a5290d20e2d2e2296f275456121c33ae89"
+checksum = "92a992891d5579caa9efd8e601f82e30a1caa79a27a5db075dde30ecb9eab357"
dependencies = [
"backtrace",
"backtrace-ext",
@@ -2045,9 +2073,9 @@ dependencies = [
[[package]]
name = "miette-derive"
-version = "5.7.0"
+version = "5.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8842972f23939443013dfd3720f46772b743e86f1a81d120d4b6fb090f87de1c"
+checksum = "4c65c625186a9bcce6699394bee511e1b1aec689aa7e3be1bf4e996e75834153"
dependencies = [
"proc-macro2",
"quote",
@@ -2333,9 +2361,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
[[package]]
name = "openssl"
-version = "0.10.50"
+version = "0.10.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e30d8bc91859781f0a943411186324d580f2bbeb71b452fe91ae344806af3f1"
+checksum = "97ea2d98598bf9ada7ea6ee8a30fb74f9156b63bbe495d64ec2b87c269d2dda3"
dependencies = [
"bitflags",
"cfg-if 1.0.0",
@@ -2365,18 +2393,18 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-src"
-version = "111.25.2+1.1.1t"
+version = "111.25.3+1.1.1t"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "320708a054ad9b3bf314688b5db87cf4d6683d64cfc835e2337924ae62bf4431"
+checksum = "924757a6a226bf60da5f7dd0311a34d2b52283dd82ddeb103208ddc66362f80c"
dependencies = [
"cc",
]
[[package]]
name = "openssl-sys"
-version = "0.9.85"
+version = "0.9.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d3d193fb1488ad46ffe3aaabc912cc931d02ee8518fe2959aea8ef52718b0c0"
+checksum = "992bac49bdbab4423199c654a5515bd2a6c6a23bf03f2dd3bdb7e5ae6259bc69"
dependencies = [
"cc",
"libc",
@@ -3046,9 +3074,9 @@ dependencies = [
[[package]]
name = "regex"
-version = "1.7.3"
+version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"
+checksum = "af83e617f331cc6ae2da5443c602dfa5af81e517212d9d611a5b3ba1777b5370"
dependencies = [
"aho-corasick",
"memchr",
@@ -3057,9 +3085,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
-version = "0.6.29"
+version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
+checksum = "a5996294f19bd3aae0453a862ad728f60e6600695733dd5df01da90c54363a3c"
[[package]]
name = "reqwest"
@@ -3145,6 +3173,16 @@ dependencies = [
"rmp",
]
+[[package]]
+name = "rust-stemmers"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
+dependencies = [
+"serde",
+"serde_derive",
+]
[[package]]
name = "rustc-demangle"
version = "0.1.23"
@@ -3159,9 +3197,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustix"
-version = "0.37.11"
+version = "0.37.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77"
+checksum = "f79bef90eb6d984c72722595b5b1348ab39275a5e5123faca6863bf07d75a4e0"
dependencies = [
"bitflags",
"errno",
@@ -3558,9 +3596,9 @@ dependencies = [
[[package]]
name = "supports-hyperlinks"
-version = "2.0.0"
+version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b4806e0b03b9906e76b018a5d821ebf198c8e9dc0829ed3328eeeb5094aed60"
+checksum = "f84231692eb0d4d41e4cdd0cabfdd2e6cd9e255e65f80c9aa7c98dd502b4233d"
dependencies = [
"is-terminal",
]
@@ -4351,6 +4389,16 @@ dependencies = [
"winapi",
]
+[[package]]
+name = "whatlang"
+version = "0.16.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043"
+dependencies = [
+"hashbrown",
+"once_cell",
+]
[[package]]
name = "which"
version = "4.4.0"

@ -130,3 +130,8 @@ crossbeam = "0.8.2"
ndarray = { version = "0.15.6", features = ["serde"] }
sha2 = "0.10.6"
rustc-hash = "1.1.0"
# For the FTS feature
jieba-rs = "0.6.7"
aho-corasick = "1.0.1"
rust-stemmers = "1.2.0"
fast2s = "0.3.1"

@@ -0,0 +1,6 @@
Stop words:
```
https://raw.githubusercontent.com/stopwords-iso/stopwords-iso/master/python/stopwordsiso/stopwords-iso.json
```
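
As an illustration (not part of this commit), the JSON at that URL maps ISO 639-1 language codes to flat word lists, so loading it could look like the following sketch, assuming `serde_json` is available:
```
// Hypothetical sketch: parse the stopwords-iso JSON into per-language sets.
use std::collections::{HashMap, HashSet};

fn load_stopwords(raw: &str) -> serde_json::Result<HashMap<String, HashSet<String>>> {
    // Shape of the file: {"en": ["a", "about", ...], "de": [...], ...}
    let by_lang: HashMap<String, Vec<String>> = serde_json::from_str(raw)?;
    Ok(by_lang
        .into_iter()
        .map(|(lang, words)| (lang, words.into_iter().collect()))
        .collect())
}
```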

@@ -0,0 +1,15 @@
/*
* Code under this module is adapted from the Cang-jie project
* https://github.com/DCjanus/cang-jie
* All code here is licensed under the MIT license, as in the original project.
*/
pub(crate) mod options;
pub(crate) mod stream;
pub(crate) mod tokenizer;
pub(crate) use {
options::TokenizerOption, stream::CangjieTokenStream, tokenizer::CangJieTokenizer,
};
pub const CANG_JIE: &str = "CANG_JIE";
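
A hedged sketch of how a tokenizer could be registered under this name, mirroring the `TokenizerManager::register` calls in the tokenizer tests further below (the actual registration site is not part of this file):
```
// Sketch only: register the CangJie tokenizer under its canonical name.
// Assumes TextAnalyzer::from accepts any Tokenizer, as used elsewhere here.
use crate::fts::cangjie::{CangJieTokenizer, CANG_JIE};
use crate::fts::tokenizer::{TextAnalyzer, TokenizerManager};

fn register_cangjie(manager: &TokenizerManager) {
    manager.register(CANG_JIE, TextAnalyzer::from(CangJieTokenizer::default()));
}
```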

@@ -0,0 +1,19 @@
/// Tokenizer Option
#[derive(Debug, Clone)]
pub enum TokenizerOption {
/// Cut the input text, return all possible words
All,
/// Cut the input text
Default {
/// `hmm`: enable HMM or not
hmm: bool,
},
/// Cut the input text in search mode
ForSearch {
/// `hmm`: enable HMM or not
hmm: bool,
},
/// Cut the input text into UTF-8 characters
Unicode,
}
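
Each variant selects a different jieba-rs cutting mode in `tokenizer.rs` below. For illustration, a minimal sketch of a search-mode tokenizer, assuming the bundled jieba dictionary loaded by `Jieba::new()`:
```
// Sketch only: a CangJieTokenizer that cuts in search mode with HMM enabled.
use std::sync::Arc;
use jieba_rs::Jieba;
use crate::fts::cangjie::{CangJieTokenizer, TokenizerOption};

fn search_mode_tokenizer() -> CangJieTokenizer {
    CangJieTokenizer {
        worker: Arc::new(Jieba::new()), // loads the default dictionary
        option: TokenizerOption::ForSearch { hmm: true },
    }
}
```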

@@ -0,0 +1,52 @@
use crate::fts::tokenizer::Token;
#[derive(Debug)]
pub struct CangjieTokenStream<'a> {
result: Vec<&'a str>,
// Index of the next word in `result`; starts at 0
index: usize,
offset_from: usize,
token: Token,
}
impl<'a> CangjieTokenStream<'a> {
pub fn new(result: Vec<&'a str>) -> Self {
CangjieTokenStream {
result,
index: 0,
offset_from: 0,
token: Token::default(),
}
}
}
impl<'a> crate::fts::tokenizer::TokenStream for CangjieTokenStream<'a> {
fn advance(&mut self) -> bool {
if self.index < self.result.len() {
let current_word = self.result[self.index];
let offset_to = self.offset_from + current_word.len();
self.token = Token {
offset_from: self.offset_from,
offset_to,
position: self.index,
text: current_word.to_string(),
position_length: self.result.len(),
};
self.index += 1;
self.offset_from = offset_to;
true
} else {
false
}
}
fn token(&self) -> &crate::fts::tokenizer::Token {
&self.token
}
fn token_mut(&mut self) -> &mut crate::fts::tokenizer::Token {
&mut self.token
}
}

@@ -0,0 +1,45 @@
use super::{options::TokenizerOption, stream::CangjieTokenStream};
use jieba_rs::Jieba;
use log::trace;
use std::sync::Arc;
use crate::fts::tokenizer::BoxTokenStream;
#[derive(Clone, Debug)]
pub struct CangJieTokenizer {
/// Separation algorithm provider
pub worker: Arc<Jieba>,
/// Separation config
pub option: TokenizerOption,
}
impl Default for CangJieTokenizer {
fn default() -> Self {
CangJieTokenizer {
worker: Arc::new(Jieba::empty()),
option: TokenizerOption::Default { hmm: false },
}
}
}
impl crate::fts::tokenizer::Tokenizer for CangJieTokenizer {
/// Cut text into tokens
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let result = match self.option {
TokenizerOption::All => self.worker.cut_all(text),
TokenizerOption::Default { hmm: use_hmm } => self.worker.cut(text, use_hmm),
TokenizerOption::ForSearch { hmm: use_hmm } => {
self.worker.cut_for_search(text, use_hmm)
}
TokenizerOption::Unicode => {
text.chars()
.fold((0usize, vec![]), |(offset, mut result), the_char| {
result.push(&text[offset..offset + the_char.len_utf8()]);
(offset + the_char.len_utf8(), result)
})
.1
}
};
trace!("{:?}->{:?}", text, result);
BoxTokenStream::from(CangjieTokenStream::new(result))
}
}

@@ -0,0 +1,10 @@
/*
* Copyright 2023, The Cozo Project Authors.
*
* This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
* If a copy of the MPL was not distributed with this file,
* You can obtain one at https://mozilla.org/MPL/2.0/.
*/
pub(crate) mod tokenizer;
pub(crate) mod cangjie;

@@ -0,0 +1,91 @@
//! # Example
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(RawTokenizer)
//! .filter(AlphaNumOnlyFilter);
//!
//! let mut stream = tokenizer.token_stream("hello there");
//! // is none because the raw filter emits one token that
//! // contains a space
//! assert!(stream.next().is_none());
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter);
//!
//! let mut stream = tokenizer.token_stream("hello there 💣");
//! assert!(stream.next().is_some());
//! assert!(stream.next().is_some());
//! // the "emoji" is dropped because its not an alphanum
//! assert!(stream.next().is_none());
//! ```
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
/// `TokenFilter` that removes all tokens that contain characters
/// that are not ASCII alphanumeric.
#[derive(Clone)]
pub struct AlphaNumOnlyFilter;
pub struct AlphaNumOnlyFilterStream<'a> {
tail: BoxTokenStream<'a>,
}
impl<'a> AlphaNumOnlyFilterStream<'a> {
fn predicate(&self, token: &Token) -> bool {
token.text.chars().all(|c| c.is_ascii_alphanumeric())
}
}
impl TokenFilter for AlphaNumOnlyFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream })
}
}
impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
}
false
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
}
#[cfg(test)]
mod tests {
use crate::fts::tokenizer::tests::assert_token;
use crate::fts::tokenizer::{AlphaNumOnlyFilter, SimpleTokenizer, TextAnalyzer, Token};
#[test]
fn test_alphanum_only() {
let tokens = token_stream_helper("I am a cat. 我輩は猫である。(1906)");
assert_eq!(tokens.len(), 5);
assert_token(&tokens[0], 0, "I", 0, 1);
assert_token(&tokens[1], 1, "am", 2, 4);
assert_token(&tokens[2], 2, "a", 5, 6);
assert_token(&tokens[3], 3, "cat", 7, 10);
assert_token(&tokens[4], 5, "1906", 37, 41);
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let a = TextAnalyzer::from(SimpleTokenizer).filter(AlphaNumOnlyFilter);
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
}

File diff suppressed because it is too large.

@@ -0,0 +1,41 @@
use crate::fts::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};
#[derive(Clone)]
pub(crate) struct EmptyTokenizer;
impl Tokenizer for EmptyTokenizer {
fn token_stream<'a>(&self, _text: &'a str) -> BoxTokenStream<'a> {
EmptyTokenStream::default().into()
}
}
#[derive(Default)]
struct EmptyTokenStream {
token: Token,
}
impl TokenStream for EmptyTokenStream {
fn advance(&mut self) -> bool {
false
}
fn token(&self) -> &super::Token {
&self.token
}
fn token_mut(&mut self) -> &mut super::Token {
&mut self.token
}
}
#[cfg(test)]
mod tests {
use crate::fts::tokenizer::Tokenizer;
#[test]
fn test_empty_tokenizer() {
let tokenizer = super::EmptyTokenizer;
let mut empty = tokenizer.token_stream("whatever string");
assert!(!empty.advance());
}
}

@@ -0,0 +1,86 @@
use std::mem;
use super::{Token, TokenFilter, TokenStream};
use crate::fts::tokenizer::BoxTokenStream;
impl TokenFilter for LowerCaser {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(LowerCaserTokenStream {
tail: token_stream,
buffer: String::with_capacity(100),
})
}
}
/// Token filter that lowercases terms.
#[derive(Clone)]
pub struct LowerCaser;
pub struct LowerCaserTokenStream<'a> {
buffer: String,
tail: BoxTokenStream<'a>,
}
// writes a lowercased version of text into output.
fn to_lowercase_unicode(text: &str, output: &mut String) {
output.clear();
for c in text.chars() {
// Contrary to the std, we do not take care of the sigma special case.
// This has a normalizing effect, which is fine for search.
output.extend(c.to_lowercase());
}
}
impl<'a> TokenStream for LowerCaserTokenStream<'a> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
}
if self.token_mut().text.is_ascii() {
// fast track for ascii.
self.token_mut().text.make_ascii_lowercase();
} else {
to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
}
#[cfg(test)]
mod tests {
use crate::fts::tokenizer::tests::assert_token;
use crate::fts::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, Token};
#[test]
fn test_to_lower_case() {
let tokens = token_stream_helper("Tree");
assert_eq!(tokens.len(), 1);
assert_token(&tokens[0], 0, "tree", 0, 4);
let tokens = token_stream_helper("Русский текст");
assert_eq!(tokens.len(), 2);
assert_token(&tokens[0], 0, "русский", 0, 14);
assert_token(&tokens[1], 1, "текст", 15, 25);
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let mut token_stream = TextAnalyzer::from(SimpleTokenizer)
.filter(LowerCaser)
.token_stream(text);
let mut tokens = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
}

@@ -0,0 +1,306 @@
/*
* Code under this module is adapted from the Tantivy project
* https://github.com/quickwit-oss/tantivy/tree/0.19.2/src/tokenizer
* All code here is licensed under the MIT license, as in the original project.
*/
//! Tokenizers are in charge of chopping text into a stream of tokens
//! ready for indexing.
//!
//! You must define in your schema which tokenizer should be used for
//! each of your fields:
//!
//! ```rust
//! use tantivy::schema::*;
//!
//! let mut schema_builder = Schema::builder();
//!
//! let text_options = TextOptions::default()
//! .set_indexing_options(
//! TextFieldIndexing::default()
//! .set_tokenizer("en_stem")
//! .set_index_option(IndexRecordOption::Basic)
//! )
//! .set_stored();
//!
//! let id_options = TextOptions::default()
//! .set_indexing_options(
//! TextFieldIndexing::default()
//! .set_tokenizer("raw_ids")
//! .set_index_option(IndexRecordOption::WithFreqsAndPositions)
//! )
//! .set_stored();
//!
//! schema_builder.add_text_field("title", text_options.clone());
//! schema_builder.add_text_field("text", text_options);
//! schema_builder.add_text_field("uuid", id_options);
//!
//! let schema = schema_builder.build();
//! ```
//!
//! By default, `tantivy` offers the following tokenizers:
//!
//! ## `default`
//!
//! `default` is the tokenizer that will be used if you do not
//! assign a specific tokenizer to your text field.
//! It will chop your text on punctuation and whitespace,
//! remove tokens that are longer than 40 characters, and lowercase your text.
//!
//! ## `raw`
//! Does not actually tokenize your text; it keeps it entirely unprocessed.
//! It can be useful to index UUIDs or URLs, for instance.
//!
//! ## `en_stem`
//!
//! In addition to what `default` does, the `en_stem` tokenizer also
//! applies stemming to your tokens. Stemming consists of trimming words to
//! remove their inflection. This tokenizer is slower than the default one,
//! but is recommended to improve recall.
//!
//!
//! # Custom tokenizers
//!
//! You can write your own tokenizer by implementing the [`Tokenizer`] trait
//! or you can extend an existing [`Tokenizer`] by chaining it with several
//! [`TokenFilter`]s.
//!
//! For instance, the `en_stem` tokenizer is defined as follows.
//!
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser)
//! .filter(Stemmer::new(Language::English));
//! ```
//!
//! Once your tokenizer is defined, you need to
//! register it with a name in your index's [`TokenizerManager`].
//!
//! ```rust
//! # use tantivy::schema::Schema;
//! # use tantivy::tokenizer::*;
//! # use tantivy::Index;
//! #
//! let custom_en_tokenizer = SimpleTokenizer;
//! # let schema = Schema::builder().build();
//! let index = Index::create_in_ram(schema);
//! index.tokenizers()
//! .register("custom_en", custom_en_tokenizer);
//! ```
//!
//! If you build your schema programmatically, a complete example
//! could look like the following.
//!
//! Note that tokens with a length greater than or equal to
//! [`MAX_TOKEN_LEN`] are simply ignored downstream.
//!
//! # Example
//!
//! ```rust
//! use tantivy::schema::{Schema, IndexRecordOption, TextOptions, TextFieldIndexing};
//! use tantivy::tokenizer::*;
//! use tantivy::Index;
//!
//! let mut schema_builder = Schema::builder();
//! let text_field_indexing = TextFieldIndexing::default()
//! .set_tokenizer("custom_en")
//! .set_index_option(IndexRecordOption::WithFreqsAndPositions);
//! let text_options = TextOptions::default()
//! .set_indexing_options(text_field_indexing)
//! .set_stored();
//! schema_builder.add_text_field("title", text_options);
//! let schema = schema_builder.build();
//! let index = Index::create_in_ram(schema);
//!
//! // We need to register our tokenizer:
//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser);
//! index
//! .tokenizers()
//! .register("custom_en", custom_en_tokenizer);
//! ```
mod alphanum_only;
mod ascii_folding_filter;
mod empty_tokenizer;
mod lower_caser;
mod ngram_tokenizer;
mod raw_tokenizer;
mod remove_long;
mod simple_tokenizer;
mod split_compound_words;
mod stemmer;
mod stop_word_filter;
mod tokenized_string;
mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
pub use self::lower_caser::LowerCaser;
pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::raw_tokenizer::RawTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{
BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
};
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;
/// Maximum authorized length (in bytes) for a token.
///
/// Tokenizers are in charge of not emitting tokens larger than this value.
/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
/// `2^16 - 1 - 5`, the token will simply be ignored downstream.
pub const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;
#[cfg(test)]
pub mod tests {
use super::{
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
};
use crate::fts::tokenizer::TextAnalyzer;
/// This is a function that can be used in tests and doc tests
/// to assert a token's correctness.
pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
assert_eq!(
token.position, position,
"expected position {} but {:?}",
position, token
);
assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
assert_eq!(
token.offset_from, from,
"expected offset_from {} but {:?}",
from, token
);
assert_eq!(
token.offset_to, to,
"expected offset_to {} but {:?}",
to, token
);
}
#[test]
fn test_raw_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("raw").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);
assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
}
#[test]
fn test_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "hello", 0, 5);
assert_token(&tokens[1], 1, "happi", 7, 12);
assert_token(&tokens[2], 2, "tax", 13, 16);
assert_token(&tokens[3], 3, "payer", 17, 22);
}
#[test]
fn test_non_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"el_stem",
TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::Greek)),
);
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "καλημερ", 0, 16);
assert_token(&tokens[1], 1, "χαρουμεν", 18, 36);
assert_token(&tokens[2], 2, "φορολογουμεν", 37, 63);
}
#[test]
fn test_tokenizer_empty() {
let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
{
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
{
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
}
#[test]
fn test_whitespace_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
ws_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "Hello,", 0, 6);
assert_token(&tokens[1], 1, "happy", 7, 12);
assert_token(&tokens[2], 2, "tax", 13, 16);
assert_token(&tokens[3], 3, "payer!", 17, 23);
}
}

@@ -0,0 +1,456 @@
use super::{Token, TokenStream, Tokenizer};
use crate::fts::tokenizer::BoxTokenStream;
/// Tokenize the text by splitting words into n-grams of the given size(s)
///
/// With this tokenizer, the `position` is always 0.
/// Beware however: in the presence of multiple values for the same field,
/// the position will be `POSITION_GAP * index of value`.
///
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
///
/// | Term | he | hel | el | ell | ll | llo | lo |
/// |----------|-----|-----|-----|-----|-----|-----|----|
/// | Position | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
/// | Offsets | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
///
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
///
/// | Term | he | hel | hell | hello |
/// |----------|-----|-----|-------|-------|
/// | Position | 0 | 0 | 0 | 0 |
/// | Offsets | 0,2 | 0,3 | 0,4 | 0,5 |
///
/// Example 3: `hεllo` (non-ascii) would be tokenized as (min_gram: 2, max_gram: 5, prefix_only:
/// **true**)
///
/// | Term | hε | hεl | hεll | hεllo |
/// |----------|-----|-----|-------|-------|
/// | Position | 0 | 0 | 0 | 0 |
/// | Offsets | 0,3 | 0,4 | 0,5 | 0,6 |
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let tokenizer = NgramTokenizer::new(2, 3, false);
/// let mut stream = tokenizer.token_stream("hello");
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "he");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 2);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "hel");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 3);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "el");
/// assert_eq!(token.offset_from, 1);
/// assert_eq!(token.offset_to, 3);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "ell");
/// assert_eq!(token.offset_from, 1);
/// assert_eq!(token.offset_to, 4);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "ll");
/// assert_eq!(token.offset_from, 2);
/// assert_eq!(token.offset_to, 4);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "llo");
/// assert_eq!(token.offset_from, 2);
/// assert_eq!(token.offset_to, 5);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "lo");
/// assert_eq!(token.offset_from, 3);
/// assert_eq!(token.offset_to, 5);
/// }
/// assert!(stream.next().is_none());
/// ```
#[derive(Clone)]
pub struct NgramTokenizer {
/// min size of the n-gram
min_gram: usize,
/// max size of the n-gram
max_gram: usize,
/// if true, will only parse the leading edge of the input
prefix_only: bool,
}
impl NgramTokenizer {
/// Configures a new Ngram tokenizer
pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
assert!(min_gram > 0, "min_gram must be greater than 0");
assert!(
min_gram <= max_gram,
"min_gram must not be greater than max_gram"
);
NgramTokenizer {
min_gram,
max_gram,
prefix_only,
}
}
/// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
///
/// This is as opposed to only prefix ngrams.
pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
Self::new(min_gram, max_gram, false)
}
/// Create a `NGramTokenizer` which only generates tokens for the
/// prefix ngrams.
pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
Self::new(min_gram, max_gram, true)
}
}
/// TokenStream associated with the `NgramTokenizer`
pub struct NgramTokenStream<'a> {
/// parameters
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
/// true if the NgramTokenStream is in prefix mode.
prefix_only: bool,
/// input
text: &'a str,
/// output
token: Token,
}
impl Tokenizer for NgramTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
From::from(NgramTokenStream {
ngram_charidx_iterator: StutteringIterator::new(
CodepointFrontiers::for_str(text),
self.min_gram,
self.max_gram,
),
prefix_only: self.prefix_only,
text,
token: Token::default(),
})
}
}
impl<'a> TokenStream for NgramTokenStream<'a> {
fn advance(&mut self) -> bool {
if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
if self.prefix_only && offset_from > 0 {
return false;
}
self.token.position = 0;
self.token.offset_from = offset_from;
self.token.offset_to = offset_to;
self.token.text.clear();
self.token.text.push_str(&self.text[offset_from..offset_to]);
true
} else {
false
}
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
}
}
/// This iterator takes an underlying Iterator
/// and emits all of the pairs `(a,b)` such that
/// a and b are items emitted by the iterator at
/// an interval between `min_gram` and `max_gram`.
///
/// The pairs are emitted in order of appearance,
/// `a` first, then `b`.
///
/// See `test_stutterring_iterator` for an example of its
/// output.
struct StutteringIterator<T> {
underlying: T,
min_gram: usize,
max_gram: usize,
memory: Vec<usize>,
cursor: usize,
gram_len: usize,
}
impl<T> StutteringIterator<T>
where T: Iterator<Item = usize>
{
pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
assert!(min_gram > 0);
let memory: Vec<usize> = (&mut underlying).take(max_gram + 1).collect();
if memory.len() <= min_gram {
// returns an empty iterator
StutteringIterator {
underlying,
min_gram: 1,
max_gram: 0,
memory,
cursor: 0,
gram_len: 0,
}
} else {
StutteringIterator {
underlying,
min_gram,
max_gram: memory.len() - 1,
memory,
cursor: 0,
gram_len: min_gram,
}
}
}
}
impl<T> Iterator for StutteringIterator<T>
where T: Iterator<Item = usize>
{
type Item = (usize, usize);
fn next(&mut self) -> Option<(usize, usize)> {
if self.gram_len > self.max_gram {
// we have exhausted all options
// starting at `self.memory[self.cursor]`.
//
// Time to advance.
self.gram_len = self.min_gram;
if let Some(next_val) = self.underlying.next() {
self.memory[self.cursor] = next_val;
} else {
self.max_gram -= 1;
}
self.cursor += 1;
if self.cursor >= self.memory.len() {
self.cursor = 0;
}
}
if self.max_gram < self.min_gram {
return None;
}
let start = self.memory[self.cursor % self.memory.len()];
let stop = self.memory[(self.cursor + self.gram_len) % self.memory.len()];
self.gram_len += 1;
Some((start, stop))
}
}
/// Emits all of the offsets where a codepoint starts
/// or a codepoint ends.
///
/// By convention, we emit `[0]` for the empty string.
struct CodepointFrontiers<'a> {
s: &'a str,
next_el: Option<usize>,
}
impl<'a> CodepointFrontiers<'a> {
fn for_str(s: &'a str) -> Self {
CodepointFrontiers {
s,
next_el: Some(0),
}
}
}
impl<'a> Iterator for CodepointFrontiers<'a> {
type Item = usize;
fn next(&mut self) -> Option<usize> {
self.next_el.map(|offset| {
if self.s.is_empty() {
self.next_el = None;
} else {
let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
self.s = &self.s[first_codepoint_width..];
self.next_el = Some(offset + first_codepoint_width);
}
offset
})
}
}
const CODEPOINT_UTF8_WIDTH: [u8; 16] = [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4];
// Number of bytes to encode a codepoint in UTF-8 given
// the first byte.
//
// To do that we count the number of leading bits set to `1`.
fn utf8_codepoint_width(b: u8) -> usize {
let higher_4_bits = (b as usize) >> 4;
CODEPOINT_UTF8_WIDTH[higher_4_bits] as usize
}
#[cfg(test)]
mod tests {
use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator};
use crate::fts::tokenizer::tests::assert_token;
use crate::fts::tokenizer::tokenizer::Tokenizer;
use crate::fts::tokenizer::{BoxTokenStream, Token};
fn test_helper(mut tokenizer: BoxTokenStream<'_>) -> Vec<Token> {
let mut tokens: Vec<Token> = vec![];
tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
tokens
}
#[test]
fn test_utf8_codepoint_width() {
// 0xxx
for i in 0..128 {
assert_eq!(utf8_codepoint_width(i), 1);
}
// 110xx
for i in (128 | 64)..(128 | 64 | 32) {
assert_eq!(utf8_codepoint_width(i), 2);
}
// 1110xx
for i in (128 | 64 | 32)..(128 | 64 | 32 | 16) {
assert_eq!(utf8_codepoint_width(i), 3);
}
// 1111xx
for i in (128 | 64 | 32 | 16)..256 {
assert_eq!(utf8_codepoint_width(i as u8), 4);
}
}
#[test]
fn test_codepoint_frontiers() {
assert_eq!(CodepointFrontiers::for_str("").collect::<Vec<_>>(), vec![0]);
assert_eq!(
CodepointFrontiers::for_str("abcd").collect::<Vec<_>>(),
vec![0, 1, 2, 3, 4]
);
assert_eq!(
CodepointFrontiers::for_str("aあ").collect::<Vec<_>>(),
vec![0, 1, 4]
);
}
#[test]
fn test_ngram_tokenizer_1_2_false() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "he", 0, 2);
assert_token(&tokens[2], 0, "e", 1, 2);
assert_token(&tokens[3], 0, "el", 1, 3);
assert_token(&tokens[4], 0, "l", 2, 3);
assert_token(&tokens[5], 0, "ll", 2, 4);
assert_token(&tokens[6], 0, "l", 3, 4);
assert_token(&tokens[7], 0, "lo", 3, 5);
assert_token(&tokens[8], 0, "o", 4, 5);
}
#[test]
fn test_ngram_tokenizer_min_max_equal() {
let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "hel", 0, 3);
assert_token(&tokens[1], 0, "ell", 1, 4);
assert_token(&tokens[2], 0, "llo", 2, 5);
}
#[test]
fn test_ngram_tokenizer_2_5_prefix() {
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "fr", 0, 2);
assert_token(&tokens[1], 0, "fra", 0, 3);
assert_token(&tokens[2], 0, "fran", 0, 4);
assert_token(&tokens[3], 0, "frank", 0, 5);
}
#[test]
fn test_ngram_non_ascii_1_2() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "hε", 0, 3);
assert_token(&tokens[2], 0, "ε", 1, 3);
assert_token(&tokens[3], 0, "εl", 1, 4);
assert_token(&tokens[4], 0, "l", 3, 4);
assert_token(&tokens[5], 0, "ll", 3, 5);
assert_token(&tokens[6], 0, "l", 4, 5);
assert_token(&tokens[7], 0, "lo", 4, 6);
assert_token(&tokens[8], 0, "o", 5, 6);
}
#[test]
fn test_ngram_non_ascii_2_5_prefix() {
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "hε", 0, 3);
assert_token(&tokens[1], 0, "hεl", 0, 4);
assert_token(&tokens[2], 0, "hεll", 0, 5);
assert_token(&tokens[3], 0, "hεllo", 0, 6);
}
#[test]
fn test_ngram_empty() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
assert!(tokens.is_empty());
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
assert!(tokens.is_empty());
}
#[test]
#[should_panic(expected = "min_gram must be greater than 0")]
fn test_ngram_min_max_interval_empty() {
test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
}
#[test]
#[should_panic(expected = "min_gram must not be greater than max_gram")]
fn test_invalid_interval_should_panic_if_smaller() {
NgramTokenizer::all_ngrams(2, 1);
}
#[test]
fn test_stutterring_iterator_empty() {
let rg: Vec<usize> = vec![0];
let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
assert_eq!(it.next(), None);
}
#[test]
fn test_stutterring_iterator() {
let mut it = StutteringIterator::new(0..10, 1, 2);
assert_eq!(it.next(), Some((0, 1)));
assert_eq!(it.next(), Some((0, 2)));
assert_eq!(it.next(), Some((1, 2)));
assert_eq!(it.next(), Some((1, 3)));
assert_eq!(it.next(), Some((2, 3)));
assert_eq!(it.next(), Some((2, 4)));
assert_eq!(it.next(), Some((3, 4)));
assert_eq!(it.next(), Some((3, 5)));
assert_eq!(it.next(), Some((4, 5)));
assert_eq!(it.next(), Some((4, 6)));
assert_eq!(it.next(), Some((5, 6)));
assert_eq!(it.next(), Some((5, 7)));
assert_eq!(it.next(), Some((6, 7)));
assert_eq!(it.next(), Some((6, 8)));
assert_eq!(it.next(), Some((7, 8)));
assert_eq!(it.next(), Some((7, 9)));
assert_eq!(it.next(), Some((8, 9)));
assert_eq!(it.next(), None);
}
}

@@ -0,0 +1,68 @@
use super::{Token, TokenStream, Tokenizer};
use crate::fts::tokenizer::BoxTokenStream;
/// For each value of the field, emit a single unprocessed token.
#[derive(Clone)]
pub struct RawTokenizer;
pub struct RawTokenStream {
token: Token,
has_token: bool,
}
impl Tokenizer for RawTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let token = Token {
offset_from: 0,
offset_to: text.len(),
position: 0,
text: text.to_string(),
position_length: 1,
};
RawTokenStream {
token,
has_token: true,
}
.into()
}
}
impl TokenStream for RawTokenStream {
fn advance(&mut self) -> bool {
let result = self.has_token;
self.has_token = false;
result
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
}
}
#[cfg(test)]
mod tests {
use crate::fts::tokenizer::tests::assert_token;
use crate::fts::tokenizer::{RawTokenizer, TextAnalyzer, Token};
#[test]
fn test_raw_tokenizer() {
let tokens = token_stream_helper("Hello, happy tax payer!");
assert_eq!(tokens.len(), 1);
assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let a = TextAnalyzer::from(RawTokenizer);
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
}

@@ -0,0 +1,96 @@
//! # Example
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(RemoveLongFilter::limit(5));
//!
//! let mut stream = tokenizer.token_stream("toolong nice");
//! // because `toolong` is more than 5 characters, it is filtered
//! // out of the token stream.
//! assert_eq!(stream.next().unwrap().text, "nice");
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream};
use crate::fts::tokenizer::BoxTokenStream;
/// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation).
///
/// It is especially useful when indexing unconstrained content.
/// e.g. mail containing base64-encoded pictures.
#[derive(Clone)]
pub struct RemoveLongFilter {
length_limit: usize,
}
impl RemoveLongFilter {
/// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
pub fn limit(length_limit: usize) -> RemoveLongFilter {
RemoveLongFilter { length_limit }
}
}
impl<'a> RemoveLongFilterStream<'a> {
fn predicate(&self, token: &Token) -> bool {
token.text.len() < self.token_length_limit
}
}
impl TokenFilter for RemoveLongFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(RemoveLongFilterStream {
token_length_limit: self.length_limit,
tail: token_stream,
})
}
}
pub struct RemoveLongFilterStream<'a> {
token_length_limit: usize,
tail: BoxTokenStream<'a>,
}
impl<'a> TokenStream for RemoveLongFilterStream<'a> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
}
false
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
}
#[cfg(test)]
mod tests {
use crate::fts::tokenizer::tests::assert_token;
use crate::fts::tokenizer::{RemoveLongFilter, SimpleTokenizer, TextAnalyzer, Token};
#[test]
fn test_remove_long() {
let tokens = token_stream_helper("hello tantivy, happy searching!");
assert_eq!(tokens.len(), 2);
assert_token(&tokens[0], 0, "hello", 0, 5);
assert_token(&tokens[1], 2, "happy", 15, 20);
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let a = TextAnalyzer::from(SimpleTokenizer).filter(RemoveLongFilter::limit(6));
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
}

@@ -0,0 +1,86 @@
use std::str::CharIndices;
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
/// Tokenize the text by splitting on whitespace and punctuation.
#[derive(Clone)]
pub struct SimpleTokenizer;
pub struct SimpleTokenStream<'a> {
text: &'a str,
chars: CharIndices<'a>,
token: Token,
}
impl Tokenizer for SimpleTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
BoxTokenStream::from(SimpleTokenStream {
text,
chars: text.char_indices(),
token: Token::default(),
})
}
}
impl<'a> SimpleTokenStream<'a> {
// search for the end of the current token.
fn search_token_end(&mut self) -> usize {
(&mut self.chars)
.filter(|&(_, ref c)| !c.is_alphanumeric())
.map(|(offset, _)| offset)
.next()
.unwrap_or(self.text.len())
}
}
impl<'a> TokenStream for SimpleTokenStream<'a> {
fn advance(&mut self) -> bool {
self.token.text.clear();
self.token.position = self.token.position.wrapping_add(1);
while let Some((offset_from, c)) = self.chars.next() {
if c.is_alphanumeric() {
let offset_to = self.search_token_end();
self.token.offset_from = offset_from;
self.token.offset_to = offset_to;
self.token.text.push_str(&self.text[offset_from..offset_to]);
return true;
}
}
false
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
}
}
#[cfg(test)]
mod tests {
use crate::fts::tokenizer::tests::assert_token;
use crate::fts::tokenizer::{SimpleTokenizer, TextAnalyzer, Token};
#[test]
fn test_simple_tokenizer() {
let tokens = token_stream_helper("Hello, happy tax payer!");
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "Hello", 0, 5);
assert_token(&tokens[1], 1, "happy", 7, 12);
assert_token(&tokens[2], 2, "tax", 13, 16);
assert_token(&tokens[3], 3, "payer", 17, 22);
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let a = TextAnalyzer::from(SimpleTokenizer);
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
}

@@ -0,0 +1,249 @@
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use miette::{IntoDiagnostic, Result};
/// A [`TokenFilter`] which splits compound words into their parts
/// based on a given dictionary.
///
/// Words will only be split if they can be fully decomposed into
/// consecutive matches against the given dictionary.
///
/// This is mostly useful to split [compound nouns][compound] common to many
/// Germanic languages into their constituents.
///
/// # Example
///
/// The quality of the dictionary determines the quality of the splits,
/// e.g. the missing stem "back" of "backen" implies that "brotbackautomat"
/// is not split in the following example.
///
/// ```rust
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
///
/// let tokenizer =
/// TextAnalyzer::from(SimpleTokenizer).filter(SplitCompoundWords::from_dictionary([
/// "dampf", "schiff", "fahrt", "brot", "backen", "automat",
/// ]));
///
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
/// assert_eq!(stream.next().unwrap().text, "dampf");
/// assert_eq!(stream.next().unwrap().text, "schiff");
/// assert_eq!(stream.next().unwrap().text, "fahrt");
/// assert_eq!(stream.next(), None);
///
/// let mut stream = tokenizer.token_stream("brotbackautomat");
/// assert_eq!(stream.next().unwrap().text, "brotbackautomat");
/// assert_eq!(stream.next(), None);
/// ```
///
/// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
#[derive(Clone)]
pub struct SplitCompoundWords {
dict: AhoCorasick,
}
impl SplitCompoundWords {
/// Create a filter from a given dictionary.
///
/// The dictionary will be used to construct an [`AhoCorasick`] automaton
/// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
/// more control over its construction is required.
pub fn from_dictionary<I, P>(dict: I) -> Result<Self>
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
let dict = AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostLongest)
.build(dict)
.into_diagnostic()?;
Ok(Self::from_automaton(dict))
}
}
impl SplitCompoundWords {
/// Create a filter from a given automaton.
///
/// The automaton should use one of the leftmost-first match kinds
/// and it should not be anchored.
pub fn from_automaton(dict: AhoCorasick) -> Self {
Self { dict }
}
}
impl TokenFilter for SplitCompoundWords {
fn transform<'a>(&self, stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: stream,
cuts: Vec::new(),
parts: Vec::new(),
})
}
}
struct SplitCompoundWordsTokenStream<'a> {
dict: AhoCorasick,
tail: BoxTokenStream<'a>,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<'a> SplitCompoundWordsTokenStream<'a> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
let token = self.tail.token();
let mut text = token.text.as_str();
self.cuts.clear();
let mut pos = 0;
for match_ in self.dict.find_iter(text) {
if pos != match_.start() {
break;
}
self.cuts.push(pos);
pos = match_.end();
}
if pos == token.text.len() {
// Fill `self.parts` in reverse order,
// so that `self.parts.pop()` yields
// the tokens in their original order.
for pos in self.cuts.iter().rev() {
let (head, tail) = text.split_at(*pos);
text = head;
self.parts.push(Token {
text: tail.to_owned(),
..*token
});
}
}
}
}
impl<'a> TokenStream for SplitCompoundWordsTokenStream<'a> {
fn advance(&mut self) -> bool {
self.parts.pop();
if !self.parts.is_empty() {
return true;
}
if !self.tail.advance() {
return false;
}
// Will yield either `self.parts.last()` or
// `self.tail.token()` if it could not be split.
self.split();
true
}
fn token(&self) -> &Token {
self.parts.last().unwrap_or_else(|| self.tail.token())
}
fn token_mut(&mut self) -> &mut Token {
self.parts
.last_mut()
.unwrap_or_else(|| self.tail.token_mut())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::fts::tokenizer::{SimpleTokenizer, TextAnalyzer};
#[test]
fn splitting_compound_words_works() {
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(SplitCompoundWords::from_dictionary(["foo", "bar"]).unwrap());
{
let mut stream = tokenizer.token_stream("");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foo bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobarbaz");
assert_eq!(stream.next().unwrap().text, "foobarbaz");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("baz foobar qux");
assert_eq!(stream.next().unwrap().text, "baz");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "qux");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar foo bar foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobazbar foo bar foobar");
assert_eq!(stream.next().unwrap().text, "foobazbar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("foobar qux foobar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "qux");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next(), None);
}
{
let mut stream = tokenizer.token_stream("barfoo");
assert_eq!(stream.next().unwrap().text, "bar");
assert_eq!(stream.next().unwrap().text, "foo");
assert_eq!(stream.next(), None);
}
}
}

@@ -0,0 +1,125 @@
use std::borrow::Cow;
use std::mem;
use rust_stemmers::{self, Algorithm};
use super::{Token, TokenFilter, TokenStream};
use crate::fts::tokenizer::BoxTokenStream;
/// Available stemmer languages.
#[derive(Debug, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq, Copy, Clone)]
#[allow(missing_docs)]
pub enum Language {
Arabic,
Danish,
Dutch,
English,
Finnish,
French,
German,
Greek,
Hungarian,
Italian,
Norwegian,
Portuguese,
Romanian,
Russian,
Spanish,
Swedish,
Tamil,
Turkish,
}
impl Language {
fn algorithm(self) -> Algorithm {
use self::Language::*;
match self {
Arabic => Algorithm::Arabic,
Danish => Algorithm::Danish,
Dutch => Algorithm::Dutch,
English => Algorithm::English,
Finnish => Algorithm::Finnish,
French => Algorithm::French,
German => Algorithm::German,
Greek => Algorithm::Greek,
Hungarian => Algorithm::Hungarian,
Italian => Algorithm::Italian,
Norwegian => Algorithm::Norwegian,
Portuguese => Algorithm::Portuguese,
Romanian => Algorithm::Romanian,
Russian => Algorithm::Russian,
Spanish => Algorithm::Spanish,
Swedish => Algorithm::Swedish,
Tamil => Algorithm::Tamil,
Turkish => Algorithm::Turkish,
}
}
}
/// `Stemmer` token filter. Several languages are supported, see [`Language`] for the available
/// languages.
/// Tokens are expected to be lowercased beforehand.
#[derive(Clone)]
pub struct Stemmer {
stemmer_algorithm: Algorithm,
}
impl Stemmer {
/// Creates a new `Stemmer` [`TokenFilter`] for a given language algorithm.
pub fn new(language: Language) -> Stemmer {
Stemmer {
stemmer_algorithm: language.algorithm(),
}
}
}
impl Default for Stemmer {
/// Creates a new `Stemmer` [`TokenFilter`] for [`Language::English`].
fn default() -> Self {
Stemmer::new(Language::English)
}
}
impl TokenFilter for Stemmer {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
BoxTokenStream::from(StemmerTokenStream {
tail: token_stream,
stemmer: inner_stemmer,
buffer: String::new(),
})
}
}
pub struct StemmerTokenStream<'a> {
tail: BoxTokenStream<'a>,
stemmer: rust_stemmers::Stemmer,
buffer: String,
}
impl<'a> TokenStream for StemmerTokenStream<'a> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
}
let token = self.tail.token_mut();
let stemmed_str = self.stemmer.stem(&token.text);
match stemmed_str {
Cow::Owned(stemmed_str) => token.text = stemmed_str,
Cow::Borrowed(stemmed_str) => {
self.buffer.clear();
self.buffer.push_str(stemmed_str);
mem::swap(&mut token.text, &mut self.buffer);
}
}
true
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
}
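
Because the stemmer expects lowercased input, the usual pipeline chains `LowerCaser` before `Stemmer`, as in the `en_stem` analyzer described in `mod.rs`. A minimal sketch:
```
// Sketch only: lowercase before stemming, as the Stemmer expects.
use crate::fts::tokenizer::{Language, LowerCaser, SimpleTokenizer, Stemmer, TextAnalyzer};

fn english_stemming_analyzer() -> TextAnalyzer {
    TextAnalyzer::from(SimpleTokenizer)
        .filter(LowerCaser)
        .filter(Stemmer::new(Language::English))
}
```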

@@ -0,0 +1,42 @@
import requests
LANGUAGES = [
"danish",
"dutch",
"finnish",
"french",
"german",
"italian",
"norwegian",
"portuguese",
"russian",
"spanish",
"swedish",
]
with requests.Session() as sess, open("stopwords.rs", "w", encoding="utf-8") as mod:
mod.write("/*\n")
mod.write(
"These stop word lists are from the Snowball project (https://snowballstem.org/)\nwhich carries the following copyright and license:\n\n"
)
resp = sess.get(
"https://raw.githubusercontent.com/snowballstem/snowball/master/COPYING"
)
resp.raise_for_status()
mod.write(resp.text)
mod.write("*/\n\n")
for lang in LANGUAGES:
resp = sess.get(f"https://snowballstem.org/algorithms/{lang}/stop.txt")
resp.raise_for_status()
mod.write(f"pub const {lang.upper()}: &[&str] = &[\n")
for line in resp.text.splitlines():
line, _, _ = line.partition("|")
for word in line.split():
mod.write(f' "{word}",\n')
mod.write("];\n\n")

@@ -0,0 +1,141 @@
//! # Example
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
//!
//! let mut stream = tokenizer.token_stream("the fox is crafty");
//! assert_eq!(stream.next().unwrap().text, "fox");
//! assert_eq!(stream.next().unwrap().text, "crafty");
//! assert!(stream.next().is_none());
//! ```
#[cfg(feature = "stopwords")]
#[rustfmt::skip]
mod stopwords;
use std::sync::Arc;
use rustc_hash::FxHashSet;
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
#[cfg(feature = "stopwords")]
use super::Language;
/// `TokenFilter` that removes stop words from a token stream
#[derive(Clone)]
pub struct StopWordFilter {
words: Arc<FxHashSet<String>>,
}
impl StopWordFilter {
/// Creates a new [`StopWordFilter`] for the given [`Language`]
///
/// Returns `Some` if a list of stop words is available and `None` otherwise.
#[cfg(feature = "stopwords")]
pub fn new(language: Language) -> Option<Self> {
let words = match language {
Language::Danish => stopwords::DANISH,
Language::Dutch => stopwords::DUTCH,
Language::English => {
// This is the same list of words used by the Apache-licensed Lucene project,
// c.f. https://github.com/apache/lucene/blob/d5d6dc079395c47cd6d12dcce3bcfdd2c7d9dc63/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L46
&[
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
"into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
"their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
]
}
Language::Finnish => stopwords::FINNISH,
Language::French => stopwords::FRENCH,
Language::German => stopwords::GERMAN,
Language::Italian => stopwords::ITALIAN,
Language::Norwegian => stopwords::NORWEGIAN,
Language::Portuguese => stopwords::PORTUGUESE,
Language::Russian => stopwords::RUSSIAN,
Language::Spanish => stopwords::SPANISH,
Language::Swedish => stopwords::SWEDISH,
_ => return None,
};
Some(Self::remove(words.iter().map(|&word| word.to_owned())))
}
/// Creates a `StopWordFilter` given a list of words to remove
pub fn remove<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter {
StopWordFilter {
words: Arc::new(words.into_iter().collect()),
}
}
}
pub struct StopWordFilterStream<'a> {
words: Arc<FxHashSet<String>>,
tail: BoxTokenStream<'a>,
}
impl TokenFilter for StopWordFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
BoxTokenStream::from(StopWordFilterStream {
words: self.words.clone(),
tail: token_stream,
})
}
}
impl<'a> StopWordFilterStream<'a> {
fn predicate(&self, token: &Token) -> bool {
!self.words.contains(&token.text)
}
}
impl<'a> TokenStream for StopWordFilterStream<'a> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
}
false
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
}
#[cfg(test)]
mod tests {
use crate::fts::tokenizer::tests::assert_token;
use crate::fts::tokenizer::{SimpleTokenizer, StopWordFilter, TextAnalyzer, Token};
#[test]
fn test_stop_word() {
let tokens = token_stream_helper("i am a cat. as yet i have no name.");
assert_eq!(tokens.len(), 5);
assert_token(&tokens[0], 3, "cat", 7, 10);
assert_token(&tokens[1], 5, "yet", 15, 18);
assert_token(&tokens[2], 7, "have", 21, 25);
assert_token(&tokens[3], 8, "no", 26, 28);
assert_token(&tokens[4], 9, "name", 29, 33);
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let stops = vec![
"a".to_string(),
"as".to_string(),
"am".to_string(),
"i".to_string(),
];
let a = TextAnalyzer::from(SimpleTokenizer).filter(StopWordFilter::remove(stops));
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
}
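A hedged sketch of a test for the feature-gated constructor (assumes a build with the `stopwords` feature; the module name is illustrative):

#[cfg(all(test, feature = "stopwords"))]
mod language_tests {
    use super::*;
    #[test]
    fn bundled_stop_word_lists() {
        // English is always bundled; languages without a list yield None.
        assert!(StopWordFilter::new(Language::English).is_some());
    }
}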

File diff suppressed because it is too large

@ -0,0 +1,100 @@
use std::cmp::Ordering;
use crate::fts::tokenizer::{Token, TokenStream};
/// Struct representing pre-tokenized text
#[derive(Debug, Clone, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq)]
pub struct PreTokenizedString {
/// Original text
pub text: String,
/// Tokens derived from the text
pub tokens: Vec<Token>,
}
impl Ord for PreTokenizedString {
fn cmp(&self, other: &Self) -> Ordering {
self.text.cmp(&other.text)
}
}
impl PartialOrd for PreTokenizedString {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
/// [`TokenStream`] implementation which wraps [`PreTokenizedString`]
pub struct PreTokenizedStream {
tokenized_string: PreTokenizedString,
current_token: i64,
}
impl From<PreTokenizedString> for PreTokenizedStream {
fn from(s: PreTokenizedString) -> PreTokenizedStream {
PreTokenizedStream {
tokenized_string: s,
current_token: -1,
}
}
}
impl TokenStream for PreTokenizedStream {
fn advance(&mut self) -> bool {
self.current_token += 1;
self.current_token < self.tokenized_string.tokens.len() as i64
}
fn token(&self) -> &Token {
assert!(
self.current_token >= 0,
"TokenStream not initialized. You should call advance() at least once."
);
&self.tokenized_string.tokens[self.current_token as usize]
}
fn token_mut(&mut self) -> &mut Token {
assert!(
self.current_token >= 0,
"TokenStream not initialized. You should call advance() at least once."
);
&mut self.tokenized_string.tokens[self.current_token as usize]
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::fts::tokenizer::Token;
#[test]
fn test_tokenized_stream() {
let tok_text = PreTokenizedString {
text: String::from("A a"),
tokens: vec![
Token {
offset_from: 0,
offset_to: 1,
position: 0,
text: String::from("A"),
position_length: 1,
},
Token {
offset_from: 2,
offset_to: 3,
position: 1,
text: String::from("a"),
position_length: 1,
},
],
};
let mut token_stream = PreTokenizedStream::from(tok_text.clone());
for expected_token in tok_text.tokens {
assert!(token_stream.advance());
assert_eq!(token_stream.token(), &expected_token);
}
assert!(!token_stream.advance());
}
}
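Because PreTokenizedString derives Serialize/Deserialize, pre-tokenized text can round-trip through storage. A minimal sketch, assuming serde_json is available as a dependency:

let s = PreTokenizedString {
    text: String::from("A"),
    tokens: vec![Token {
        offset_from: 0,
        offset_to: 1,
        position: 0,
        text: String::from("A"),
        position_length: 1,
    }],
};
let json = serde_json::to_string(&s).unwrap();
let back: PreTokenizedString = serde_json::from_str(&json).unwrap();
assert_eq!(back, s);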

@ -0,0 +1,310 @@
//! The tokenizer module contains all of the tools used to process
//! text in `tantivy`.
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};
use crate::fts::tokenizer::empty_tokenizer::EmptyTokenizer;
/// Token
#[derive(Debug, Clone, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq)]
pub struct Token {
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
/// Position, expressed in number of tokens.
pub position: usize,
/// Actual text content of the token.
pub text: String,
/// Length of the token, expressed in number of original token positions it spans.
pub position_length: usize,
}
impl Default for Token {
fn default() -> Token {
Token {
offset_from: 0,
offset_to: 0,
position: usize::MAX,
text: String::with_capacity(200),
position_length: 1,
}
}
}
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub struct TextAnalyzer {
tokenizer: Box<dyn Tokenizer>,
token_filters: Vec<BoxTokenFilter>,
}
impl Default for TextAnalyzer {
fn default() -> TextAnalyzer {
TextAnalyzer::from(EmptyTokenizer)
}
}
impl<T: Tokenizer> From<T> for TextAnalyzer {
fn from(tokenizer: T) -> Self {
TextAnalyzer::new(tokenizer, Vec::new())
}
}
impl TextAnalyzer {
/// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
/// `TextAnalyzer::from(tokenizer)`.
pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
TextAnalyzer {
tokenizer: Box::new(tokenizer),
token_filters,
}
}
/// Appends a token filter to the current tokenizer.
///
/// The method consumes the current `TokenStream` and returns a
/// new one.
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser)
/// .filter(Stemmer::default());
/// ```
#[must_use]
pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
self.token_filters.push(token_filter.into());
self
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let mut token_stream = self.tokenizer.token_stream(text);
for token_filter in &self.token_filters {
token_stream = token_filter.transform(token_stream);
}
token_stream
}
}
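A hedged sketch of the explicit constructor, equivalent to the builder-style chain in the example above (all names are from this module):

let analyzer = TextAnalyzer::new(
    SimpleTokenizer,
    vec![
        BoxTokenFilter::from(RemoveLongFilter::limit(40)),
        BoxTokenFilter::from(LowerCaser),
    ],
);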
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
token_filters: self
.token_filters
.iter()
.map(|token_filter| token_filter.box_clone())
.collect(),
}
}
}
/// A `Tokenizer` is in charge of splitting text into a stream of tokens
/// before indexing.
///
/// See the [module documentation](crate::fts::tokenizer) for more detail.
///
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
/// Creates a token stream for a given `str`.
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
}
pub trait TokenizerClone {
fn box_clone(&self) -> Box<dyn Tokenizer>;
}
impl<T: Tokenizer + Clone> TokenizerClone for T {
fn box_clone(&self) -> Box<dyn Tokenizer> {
Box::new(self.clone())
}
}
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
token_stream.advance()
}
fn token<'b>(&'b self) -> &'b Token {
let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
token_stream.token()
}
fn token_mut<'b>(&'b mut self) -> &'b mut Token {
let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
token_stream.token_mut()
}
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// See [`TokenStream`] for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T> From<T> for BoxTokenStream<'a>
where
T: TokenStream + 'a,
{
fn from(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}
impl<'a> Deref for BoxTokenStream<'a> {
type Target = dyn TokenStream + 'a;
fn deref(&self) -> &Self::Target {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
}
}
/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See [`TokenFilter`] for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);
impl Deref for BoxTokenFilter {
type Target = dyn TokenFilter;
fn deref(&self) -> &dyn TokenFilter {
&*self.0
}
}
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}
/// `TokenStream` is the result of the tokenization.
///
/// It is a consumable stream of `Token`s.
///
/// # Example
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "hello");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 5);
/// assert_eq!(token.position, 0);
/// }
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "happy");
/// assert_eq!(token.offset_from, 7);
/// assert_eq!(token.offset_to, 12);
/// assert_eq!(token.position, 1);
/// }
/// ```
pub trait TokenStream {
/// Advance to the next token
///
/// Returns false if there are no other tokens.
fn advance(&mut self) -> bool;
/// Returns a reference to the current token.
fn token(&self) -> &Token;
/// Returns a mutable reference to the current token.
fn token_mut(&mut self) -> &mut Token;
/// Helper to iterate over tokens. It
/// simply combines a call to `.advance()`
/// and `.token()`.
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// while let Some(token) = token_stream.next() {
/// println!("Token {:?}", token.text);
/// }
/// ```
fn next(&mut self) -> Option<&Token> {
if self.advance() {
Some(self.token())
} else {
None
}
}
/// Helper function to consume the entire `TokenStream`
/// and push the tokens to a sink function.
///
/// TODO: remove this.
fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
while self.advance() {
sink(self.token());
}
}
}
pub trait TokenFilterClone {
fn box_clone(&self) -> BoxTokenFilter;
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Wraps a token stream and returns the modified one.
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
}
impl<T: TokenFilter + Clone> TokenFilterClone for T {
fn box_clone(&self) -> BoxTokenFilter {
BoxTokenFilter::from(self.clone())
}
}
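The TokenFilter/TokenFilterClone pair above is the extension point for custom filters. A hedged sketch mirroring the stemmer's structure: a filter that upper-cases every token (illustrative, not part of the commit):

#[derive(Clone)]
struct UpperCaser;

impl TokenFilter for UpperCaser {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(UpperCaserStream { tail: token_stream })
    }
}

struct UpperCaserStream<'a> {
    tail: BoxTokenStream<'a>,
}

impl<'a> TokenStream for UpperCaserStream<'a> {
    fn advance(&mut self) -> bool {
        // Pull the next token from the wrapped stream, then rewrite it in place.
        if !self.tail.advance() {
            return false;
        }
        let token = self.tail.token_mut();
        token.text = token.text.to_uppercase();
        true
    }
    fn token(&self) -> &Token {
        self.tail.token()
    }
    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}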
#[cfg(test)]
mod test {
use super::Token;
#[test]
fn clone() {
let t1 = Token {
position: 1,
offset_from: 2,
offset_to: 3,
text: "abc".to_string(),
position_length: 1,
};
let t2 = t1.clone();
assert_eq!(t1.position, t2.position);
assert_eq!(t1.offset_from, t2.offset_from);
assert_eq!(t1.offset_to, t2.offset_to);
assert_eq!(t1.text, t2.text);
}
}

@ -0,0 +1,78 @@
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use crate::fts::tokenizer::stemmer::Language;
use crate::fts::tokenizer::tokenizer::TextAnalyzer;
use crate::fts::tokenizer::{
LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, WhitespaceTokenizer,
};
/// The tokenizer manager serves as a store for
/// all of the pre-configured tokenizer pipelines.
///
/// By default, it is populated with the following tokenizers.
///
/// * `raw` : does not process nor tokenize the text.
/// * `default` : chops the text according to whitespace and
/// punctuation, removes tokens that are too long, and lowercases
/// tokens.
/// * `en_stem` : like `default`, but also applies stemming to the
/// resulting tokens. Stemming can improve the recall of your
/// search engine.
/// * `whitespace` : splits the text on whitespace.
#[derive(Clone)]
pub struct TokenizerManager {
tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
}
impl TokenizerManager {
/// Creates an empty tokenizer manager.
pub fn new() -> Self {
Self {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
}
}
/// Registers a new tokenizer associated with a given name.
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
where TextAnalyzer: From<T> {
let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
self.tokenizers
.write()
.expect("Acquiring the lock should never fail")
.insert(tokenizer_name.to_string(), boxed_tokenizer);
}
/// Accesses a tokenizer given its name.
pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
self.tokenizers
.read()
.expect("Acquiring the lock should never fail")
.get(tokenizer_name)
.cloned()
}
}
impl Default for TokenizerManager {
/// Creates a `TokenizerManager` prepopulated with
/// the default pre-configured tokenizers of `tantivy`.
fn default() -> TokenizerManager {
let manager = TokenizerManager::new();
manager.register("raw", RawTokenizer);
manager.register(
"default",
TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser),
);
manager.register(
"en_stem",
TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::English)),
);
manager.register("whitespace", WhitespaceTokenizer);
manager
}
}
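A short usage sketch (not part of the commit): fetching a pre-configured pipeline and registering a custom one.

let manager = TokenizerManager::default();
let en_stem = manager.get("en_stem").expect("registered by default");
let mut stream = en_stem.token_stream("Searching searches");
assert_eq!(stream.next().unwrap().text, "search");
manager.register("ws", WhitespaceTokenizer);
assert!(manager.get("ws").is_some());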

@ -0,0 +1,86 @@
use std::str::CharIndices;
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
/// Tokenize the text by splitting on whitespace.
#[derive(Clone)]
pub struct WhitespaceTokenizer;
pub struct WhitespaceTokenStream<'a> {
text: &'a str,
chars: CharIndices<'a>,
token: Token,
}
impl Tokenizer for WhitespaceTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
BoxTokenStream::from(WhitespaceTokenStream {
text,
chars: text.char_indices(),
token: Token::default(),
})
}
}
impl<'a> WhitespaceTokenStream<'a> {
// search for the end of the current token.
fn search_token_end(&mut self) -> usize {
(&mut self.chars)
.filter(|&(_, ref c)| c.is_ascii_whitespace())
.map(|(offset, _)| offset)
.next()
.unwrap_or(self.text.len())
}
}
impl<'a> TokenStream for WhitespaceTokenStream<'a> {
fn advance(&mut self) -> bool {
self.token.text.clear();
self.token.position = self.token.position.wrapping_add(1);
while let Some((offset_from, c)) = self.chars.next() {
if !c.is_ascii_whitespace() {
let offset_to = self.search_token_end();
self.token.offset_from = offset_from;
self.token.offset_to = offset_to;
self.token.text.push_str(&self.text[offset_from..offset_to]);
return true;
}
}
false
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
}
}
#[cfg(test)]
mod tests {
use crate::fts::tokenizer::tests::assert_token;
use crate::fts::tokenizer::{TextAnalyzer, Token, WhitespaceTokenizer};
#[test]
fn test_whitespace_tokenizer() {
let tokens = token_stream_helper("Hello, happy tax payer!");
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "Hello,", 0, 6);
assert_token(&tokens[1], 1, "happy", 7, 12);
assert_token(&tokens[2], 2, "tax", 13, 16);
assert_token(&tokens[3], 3, "payer!", 17, 23);
}
fn token_stream_helper(text: &str) -> Vec<Token> {
let a = TextAnalyzer::from(WhitespaceTokenizer);
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
}

@ -82,6 +82,7 @@ pub(crate) mod query;
pub(crate) mod runtime;
pub(crate) mod storage;
pub(crate) mod utils;
pub(crate) mod fts;
/// A dispatcher for concrete storage implementations, wrapping [Db]. This is done so that
/// client code does not have to deal with generic code constantly. You may prefer to use

@ -923,3 +923,34 @@ fn test_insertions() {
println!("{} {}", row[0], row[1]);
}
}
#[test]
fn tantivy_tokenizers() {
use crate::fts::cangjie::*;
use crate::fts::tokenizer::*;
use jieba_rs::Jieba;
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::English));
let mut token_stream = tokenizer.token_stream("It is closer to Apache Lucene than to Elasticsearch or Apache Solr in the sense it is not an off-the-shelf search engine server, but rather a crate that can be used to build such a search engine.");
while let Some(token) = token_stream.next() {
println!("Token {:?}", token.text);
}
println!("XXXXXXXXXXXXX");
let tokenizer = TextAnalyzer::from(CangJieTokenizer {
worker: std::sync::Arc::new(Jieba::new()),
option: TokenizerOption::Default { hmm: false },
})
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::English));
let mut token_stream = tokenizer.token_stream("这个产品Finchat.io是一个相对比较有特色的文档问答类网站它集成了750多家公司的经融数据。感觉是把财报等数据借助Embedding都向量化了然后接入ChatGPT进行对话。");
while let Some(token) = token_stream.next() {
println!("Token {:?}", token.text);
}
}
