clean up imported code

main
Ziyang Hu 1 year ago
parent 271f36301d
commit a880431726

@@ -6,5 +6,264 @@
* You can obtain one at https://mozilla.org/MPL/2.0/.
*/
pub(crate) mod tokenizer;
use crate::data::memcmp::MemCmpEncoder;
use crate::fts::cangjie::tokenizer::CangJieTokenizer;
use crate::fts::tokenizer::{
AlphaNumOnlyFilter, AsciiFoldingFilter, BoxTokenFilter, Language, LowerCaser, NgramTokenizer,
RawTokenizer, RemoveLongFilter, SimpleTokenizer, SplitCompoundWords, Stemmer, StopWordFilter,
TextAnalyzer, Tokenizer, WhitespaceTokenizer,
};
use crate::DataValue;
use jieba_rs::Jieba;
use miette::{bail, ensure, miette, Result};
use sha2::digest::FixedOutput;
use sha2::{Digest, Sha256};
use smartstring::{LazyCompact, SmartString};
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
pub(crate) mod cangjie;
pub(crate) mod tokenizer;
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde_derive::Serialize, serde_derive::Deserialize)]
pub(crate) struct TokenizerFilterConfig {
pub(crate) name: SmartString<LazyCompact>,
pub(crate) args: Vec<DataValue>,
}
impl TokenizerFilterConfig {
pub(crate) fn config_hash(&self, filters: &[Self]) -> impl AsRef<[u8]> {
let mut hasher = Sha256::new();
hasher.update(self.name.as_bytes());
let mut args_vec = vec![];
for arg in &self.args {
args_vec.encode_datavalue(arg);
}
hasher.update(&args_vec);
for filter in filters {
hasher.update(filter.name.as_bytes());
args_vec.clear();
for arg in &filter.args {
args_vec.encode_datavalue(arg);
}
hasher.update(&args_vec);
}
hasher.finalize_fixed()
}
pub(crate) fn build(&self, filters: &[Self]) -> Result<TextAnalyzer> {
let tokenizer = self.construct_tokenizer()?;
let token_filters = filters
.iter()
.map(|filter| filter.construct_token_filter())
.collect::<Result<Vec<_>>>()?;
Ok(TextAnalyzer {
tokenizer,
token_filters,
})
}
pub(crate) fn construct_tokenizer(&self) -> Result<Box<dyn Tokenizer>> {
Ok(match &self.name as &str {
"Raw" => Box::new(RawTokenizer),
"Simple" => Box::new(SimpleTokenizer),
"Whitespace" => Box::new(WhitespaceTokenizer),
"NGram" => {
let min_gram = self
.args
.get(0)
.ok_or_else(|| miette!("Missing first argument `min_gram`"))?
.get_int()
.ok_or_else(|| miette!("First argument `min_gram` must be an integer"))?;
let max_gram = self
.args
.get(1)
.unwrap_or(&DataValue::from(min_gram))
.get_int()
.ok_or_else(|| miette!("Second argument `max_gram` must be an integer"))?;
let prefix_only = self
.args
.get(2)
.unwrap_or(&DataValue::Bool(false))
.get_bool()
.ok_or_else(|| miette!("Third argument `prefix_only` must be a boolean"))?;
ensure!(min_gram >= 1, "min_gram must be >= 1");
ensure!(max_gram >= min_gram, "max_gram must be >= min_gram");
Box::new(NgramTokenizer::new(
min_gram as usize,
max_gram as usize,
prefix_only,
))
}
"Cangjie" => {
let hmm = match self.args.get(1) {
None => false,
Some(d) => d.get_bool().ok_or_else(|| {
miette!("Second argument `use_hmm` to Cangjie must be a boolean")
})?,
};
let option = match self.args.get(0) {
None => cangjie::options::TokenizerOption::Default { hmm },
Some(d) => {
let s = d.get_str().ok_or_else(|| {
miette!("First argument `kind` to Cangjie must be a string")
})?;
match s {
"default" => cangjie::options::TokenizerOption::Default { hmm },
"all" => cangjie::options::TokenizerOption::All,
"search" => cangjie::options::TokenizerOption::ForSearch { hmm },
"unicode" => cangjie::options::TokenizerOption::Unicode,
_ => bail!("Unknown Cangjie kind: {}", s),
}
}
};
Box::new(CangJieTokenizer {
worker: std::sync::Arc::new(Jieba::new()),
option,
})
}
_ => bail!("Unknown tokenizer: {}", self.name),
})
}
pub(crate) fn construct_token_filter(&self) -> Result<BoxTokenFilter> {
Ok(match &self.name as &str {
"AlphaNumOnly" => AlphaNumOnlyFilter.into(),
"AsciiFolding" => AsciiFoldingFilter.into(),
"LowerCase" => LowerCaser.into(),
"RemoveLong" => RemoveLongFilter::limit(
self.args
.get(0)
.ok_or_else(|| miette!("Missing first argument `min_length`"))?
.get_int()
.ok_or_else(|| miette!("First argument `min_length` must be an integer"))?
as usize,
)
.into(),
"SplitCompoundWords" => {
let mut list_values = Vec::new();
match self
.args
.get(0)
.ok_or_else(|| miette!("Missing first argument `compound_words_list`"))?
{
DataValue::List(l) => {
for v in l {
list_values.push(
v.get_str()
.ok_or_else(|| {
miette!("First argument `compound_words_list` must be a list of strings")
})?,
);
}
}
_ => bail!("First argument `compound_words_list` must be a list of strings"),
}
SplitCompoundWords::from_dictionary(list_values)
.map_err(|e| miette!("Failed to load dictionary: {}", e))?
.into()
}
"Stemmer" => {
let language = match self
.args
.get(0)
.ok_or_else(|| miette!("Missing first argument `language` to Stemmer"))?
.get_str()
.ok_or_else(|| {
miette!("First argument `language` to Stemmer must be a string")
})? {
"arabic" => Language::Arabic,
"danish" => Language::Danish,
"dutch" => Language::Dutch,
"english" => Language::English,
"finnish" => Language::Finnish,
"french" => Language::French,
"german" => Language::German,
"greek" => Language::Greek,
"hungarian" => Language::Hungarian,
"italian" => Language::Italian,
"norwegian" => Language::Norwegian,
"portuguese" => Language::Portuguese,
"romanian" => Language::Romanian,
"russian" => Language::Russian,
"spanish" => Language::Spanish,
"swedish" => Language::Swedish,
"tamil" => Language::Tamil,
"turkish" => Language::Turkish,
_ => bail!("Unsupported language: {}", self.name),
};
Stemmer::new(language).into()
}
"Stopwords" => {
match self.args.get(0).ok_or_else(|| {
miette!("Filter Stopwords requires language name or a list of stopwords")
})? {
DataValue::Str(name) => StopWordFilter::for_lang(name)?.into(),
DataValue::List(l) => {
let mut stopwords = Vec::new();
for v in l {
stopwords.push(
v.get_str()
.ok_or_else(|| {
miette!(
"First argument `stopwords` must be a list of strings"
)
})?
.to_string(),
);
}
StopWordFilter::new(stopwords).into()
}
_ => bail!("Filter Stopwords requires language name or a list of stopwords"),
}
}
_ => bail!("Unknown token filter: {}", self.name),
})
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde_derive::Serialize, serde_derive::Deserialize)]
pub(crate) struct FtsIndexConfig {
base_relation: SmartString<LazyCompact>,
index_name: SmartString<LazyCompact>,
fts_fields: Vec<SmartString<LazyCompact>>,
tokenizer: TokenizerFilterConfig,
filters: Vec<TokenizerFilterConfig>,
}
#[derive(Default)]
pub(crate) struct TokenizerCache {
named_cache: RwLock<HashMap<SmartString<LazyCompact>, Arc<TextAnalyzer>>>,
hashed_cache: RwLock<HashMap<Vec<u8>, Arc<TextAnalyzer>>>,
}
impl TokenizerCache {
pub(crate) fn get(
&self,
tokenizer_name: &str,
tokenizer: &TokenizerFilterConfig,
filters: &[TokenizerFilterConfig],
) -> Result<Arc<TextAnalyzer>> {
{
let idx_cache = self.named_cache.read().unwrap();
if let Some(analyzer) = idx_cache.get(tokenizer_name) {
return Ok(analyzer.clone());
}
}
let hash = tokenizer.config_hash(filters);
{
let hashed_cache = self.hashed_cache.read().unwrap();
if let Some(analyzer) = hashed_cache.get(hash.as_ref()) {
let mut idx_cache = self.named_cache.write().unwrap();
idx_cache.insert(tokenizer_name.into(), analyzer.clone());
return Ok(analyzer.clone());
}
}
{
let analyzer = Arc::new(tokenizer.build(filters)?);
let mut hashed_cache = self.hashed_cache.write().unwrap();
hashed_cache.insert(hash.as_ref().to_vec(), analyzer.clone());
let mut idx_cache = self.named_cache.write().unwrap();
idx_cache.insert(tokenizer_name.into(), analyzer.clone());
return Ok(analyzer);
}
}
}
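For orientation, a minimal usage sketch of the types above (illustrative only, not part of this commit; the cache name `my_ngram` and the `DataValue` conversions are assumptions based on the surrounding code):

use crate::fts::{TokenizerCache, TokenizerFilterConfig};
use crate::DataValue;
use miette::Result;

fn example(cache: &TokenizerCache) -> Result<()> {
    // An `NGram` tokenizer with min_gram = 2, max_gram = 3, prefix_only = false.
    let tokenizer = TokenizerFilterConfig {
        name: "NGram".into(),
        args: vec![DataValue::from(2i64), DataValue::from(3i64), DataValue::Bool(false)],
    };
    let filters = vec![TokenizerFilterConfig { name: "LowerCase".into(), args: vec![] }];
    // `get` checks the name cache first, then the `config_hash` (SHA-256 over the
    // tokenizer/filter names and memcmp-encoded args), and only then calls `build`.
    let analyzer = cache.get("my_ngram", &tokenizer, &filters)?;
    let _stream = analyzer.token_stream("hello world");
    Ok(())
}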

@@ -136,7 +136,6 @@ mod stemmer;
mod stop_word_filter;
mod tokenized_string;
mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;
pub(crate) use self::alphanum_only::AlphaNumOnlyFilter;
@@ -149,24 +148,16 @@ pub(crate) use self::simple_tokenizer::SimpleTokenizer;
pub(crate) use self::split_compound_words::SplitCompoundWords;
pub(crate) use self::stemmer::{Language, Stemmer};
pub(crate) use self::stop_word_filter::StopWordFilter;
pub(crate) use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
// pub(crate) use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub(crate) use self::tokenizer::{
BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
};
pub(crate) use self::tokenizer_manager::TokenizerManager;
pub(crate) use self::whitespace_tokenizer::WhitespaceTokenizer;
/// Maximum authorized length (in bytes) for a token.
///
/// Tokenizers are in charge of not emitting tokens larger than this value.
/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
/// `2^16 - 1 - 5`, the token will simply be ignored downstream.
pub(crate) const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;
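// Illustrative sketch only, not part of this commit: a conforming tokenizer or
// filter can honour this contract with a simple guard (`within_token_limit` is a
// hypothetical helper, not an existing API).
#[allow(dead_code)]
fn within_token_limit(text: &str) -> bool {
    // Tokens longer than `MAX_TOKEN_LEN` bytes are silently ignored downstream.
    text.len() <= MAX_TOKEN_LEN
}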
#[cfg(test)]
pub(crate) mod tests {
use super::{
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token,
};
use crate::fts::tokenizer::TextAnalyzer;
@@ -190,117 +181,4 @@ pub(crate) mod tests {
to, token
);
}
#[test]
fn test_raw_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("raw").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);
assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
}
#[test]
fn test_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "hello", 0, 5);
assert_token(&tokens[1], 1, "happi", 7, 12);
assert_token(&tokens[2], 2, "tax", 13, 16);
assert_token(&tokens[3], 3, "payer", 17, 22);
}
#[test]
fn test_non_en_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register(
"el_stem",
TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::Greek)),
);
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer
.token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "καλημερ", 0, 16);
assert_token(&tokens[1], 1, "χαρουμεν", 18, 36);
assert_token(&tokens[2], 2, "φορολογουμεν", 37, 63);
}
#[test]
fn test_tokenizer_empty() {
let tokenizer_manager = TokenizerManager::default();
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
{
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
{
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
en_tokenizer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
}
#[test]
fn test_whitespace_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
let ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
ws_tokenizer
.token_stream("Hello, happy tax payer!")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "Hello,", 0, 6);
assert_token(&tokens[1], 1, "happy", 7, 12);
assert_token(&tokens[2], 2, "tax", 13, 16);
assert_token(&tokens[3], 3, "payer!", 17, 23);
}
}

@@ -1,42 +1,23 @@
import requests
LANGUAGES = [
"danish",
"dutch",
"finnish",
"french",
"german",
"italian",
"norwegian",
"portuguese",
"russian",
"spanish",
"swedish",
]
resp = requests.get("https://raw.githubusercontent.com/stopwords-iso/stopwords-iso/master/stopwords-iso.json")
resp.raise_for_status()
data = resp.json()
with requests.Session() as sess, open("stopwords.rs", "w") as mod:
mod.write("/*\n")
mod.write(
"These stop word lists are from the Snowball project (https://snowballstem.org/)\nwhich carries the following copyright and license:\n\n"
"These stop word lists are from the stopwords-iso project (https://github.com/stopwords-iso/stopwords-iso/) "
"which carries the MIT license."
)
mod.write("\n*/\n\n")
resp = sess.get(
"https://raw.githubusercontent.com/snowballstem/snowball/master/COPYING"
)
resp.raise_for_status()
mod.write(resp.text)
mod.write("*/\n\n")
for lang in LANGUAGES:
resp = sess.get(f"https://snowballstem.org/algorithms/{lang}/stop.txt")
resp.raise_for_status()
for lang, words in data.items():
mod.write(f"pub(crate) const {lang.upper()}: &[&str] = &[\n")
for line in resp.text.splitlines():
line, _, _ = line.partition("|")
for word in line.split():
mod.write(f' "{word}",\n')
for word in words:
mod.write(f' r#"{word}"#,\n')
mod.write("];\n\n")
print(f'"{lang}" => stopwords::{lang.upper()},')
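For reference, the generated stopwords.rs is expected to look roughly as follows (a sketch inferred from the write calls above; the actual word lists come from stopwords-iso):

/*
These stop word lists are from the stopwords-iso project (https://github.com/stopwords-iso/stopwords-iso/) which carries the MIT license.
*/

pub(crate) const EN: &[&str] = &[
    r#"a"#,
    r#"about"#,
    // ... one entry per stop word ...
];

The printed match arms (e.g. `"en" => stopwords::EN,`) are then pasted into `StopWordFilter::for_lang`.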

@@ -1,5 +1,5 @@
//! # Example
//! ```rust
//! ```text
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
@@ -16,9 +16,9 @@ mod stopwords;
use std::sync::Arc;
use rustc_hash::FxHashSet;
use crate::fts::tokenizer::Language;
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
use miette::{bail, Result};
/// `TokenFilter` that removes stop words from a token stream
#[derive(Clone)]
@@ -30,36 +30,74 @@ impl StopWordFilter {
/// Creates a new [`StopWordFilter`] for the given two-letter language code
///
/// Returns an error if no stop word list is available for that language.
pub(crate) fn new(language: Language) -> Option<Self> {
pub(crate) fn for_lang(language: &str) -> Result<Self> {
let words = match language {
Language::Danish => stopwords::DANISH,
Language::Dutch => stopwords::DUTCH,
Language::English => {
// This is the same list of words used by the Apache-licensed Lucene project,
// c.f. https://github.com/apache/lucene/blob/d5d6dc079395c47cd6d12dcce3bcfdd2c7d9dc63/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L46
&[
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
"into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
"their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
]
}
Language::Finnish => stopwords::FINNISH,
Language::French => stopwords::FRENCH,
Language::German => stopwords::GERMAN,
Language::Italian => stopwords::ITALIAN,
Language::Norwegian => stopwords::NORWEGIAN,
Language::Portuguese => stopwords::PORTUGUESE,
Language::Russian => stopwords::RUSSIAN,
Language::Spanish => stopwords::SPANISH,
Language::Swedish => stopwords::SWEDISH,
_ => return None,
"af" => stopwords::AF,
"ar" => stopwords::AR,
"hy" => stopwords::HY,
"eu" => stopwords::EU,
"bn" => stopwords::BN,
"br" => stopwords::BR,
"bg" => stopwords::BG,
"ca" => stopwords::CA,
"zh" => stopwords::ZH,
"hr" => stopwords::HR,
"cs" => stopwords::CS,
"da" => stopwords::DA,
"nl" => stopwords::NL,
"en" => stopwords::EN,
"eo" => stopwords::EO,
"et" => stopwords::ET,
"fi" => stopwords::FI,
"fr" => stopwords::FR,
"gl" => stopwords::GL,
"de" => stopwords::DE,
"el" => stopwords::EL,
"gu" => stopwords::GU,
"ha" => stopwords::HA,
"he" => stopwords::HE,
"hi" => stopwords::HI,
"hu" => stopwords::HU,
"id" => stopwords::ID,
"ga" => stopwords::GA,
"it" => stopwords::IT,
"ja" => stopwords::JA,
"ko" => stopwords::KO,
"ku" => stopwords::KU,
"la" => stopwords::LA,
"lt" => stopwords::LT,
"lv" => stopwords::LV,
"ms" => stopwords::MS,
"mr" => stopwords::MR,
"no" => stopwords::NO,
"fa" => stopwords::FA,
"pl" => stopwords::PL,
"pt" => stopwords::PT,
"ro" => stopwords::RO,
"ru" => stopwords::RU,
"sk" => stopwords::SK,
"sl" => stopwords::SL,
"so" => stopwords::SO,
"st" => stopwords::ST,
"es" => stopwords::ES,
"sw" => stopwords::SW,
"sv" => stopwords::SV,
"th" => stopwords::TH,
"tl" => stopwords::TL,
"tr" => stopwords::TR,
"uk" => stopwords::UK,
"ur" => stopwords::UR,
"vi" => stopwords::VI,
"yo" => stopwords::YO,
"zu" => stopwords::ZU,
_ => bail!("Unsupported language: {}", language),
};
Some(Self::remove(words.iter().map(|&word| word.to_owned())))
Ok(Self::new(words.iter().map(|&word| word.to_owned())))
}
/// Creates a `StopWordFilter` given a list of words to remove
pub(crate) fn remove<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter {
pub(crate) fn new<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter {
StopWordFilter {
words: Arc::new(words.into_iter().collect()),
}
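// Illustrative usage, not part of this commit: `for_lang` takes one of the
// two-letter codes listed above and fails on unknown languages, while `new`
// accepts any word list.
//
//     let english = StopWordFilter::for_lang("en")?;
//     let custom = StopWordFilter::new(["foo".to_string(), "bar".to_string()]);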
@@ -128,7 +166,7 @@ mod tests {
"am".to_string(),
"i".to_string(),
];
let a = TextAnalyzer::from(SimpleTokenizer).filter(StopWordFilter::remove(stops));
let a = TextAnalyzer::from(SimpleTokenizer).filter(StopWordFilter::new(stops));
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {

File diff suppressed because it is too large

@@ -39,8 +39,8 @@ impl Default for Token {
///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub(crate) struct TextAnalyzer {
tokenizer: Box<dyn Tokenizer>,
token_filters: Vec<BoxTokenFilter>,
pub(crate) tokenizer: Box<dyn Tokenizer>,
pub(crate) token_filters: Vec<BoxTokenFilter>,
}
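// Illustrative note, not part of this commit: with these fields pub(crate),
// `TokenizerFilterConfig::build` in `crate::fts` can assemble an analyzer directly,
//
//     TextAnalyzer { tokenizer: Box::new(SimpleTokenizer), token_filters: vec![] }
//
// which is why the `TokenizerManager` indirection removed elsewhere in this
// commit is no longer needed.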
impl Default for TextAnalyzer {

@@ -1,78 +0,0 @@
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use crate::fts::tokenizer::stemmer::Language;
use crate::fts::tokenizer::tokenizer::TextAnalyzer;
use crate::fts::tokenizer::{
LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, WhitespaceTokenizer,
};
/// The tokenizer manager serves as a store for
/// all of the pre-configured tokenizer pipelines.
///
/// By default, it is populated with the following tokenizers.
///
/// * `raw` : does not process nor tokenize the text.
/// * `default` : Chops the text according to whitespace and
/// punctuation, removes tokens that are too long, and lowercases
/// tokens
/// * `en_stem` : Like `default`, but also applies stemming on the
/// resulting tokens. Stemming can improve the recall of your
/// search engine.
/// * `whitespace` : Splits the text on whitespaces.
#[derive(Clone)]
pub(crate) struct TokenizerManager {
tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
}
impl TokenizerManager {
/// Creates an empty tokenizer manager.
pub(crate) fn new() -> Self {
Self {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
}
}
/// Registers a new tokenizer associated with a given name.
pub(crate) fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
where TextAnalyzer: From<T> {
let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
self.tokenizers
.write()
.expect("Acquiring the lock should never fail")
.insert(tokenizer_name.to_string(), boxed_tokenizer);
}
/// Accessing a tokenizer given its name.
pub(crate) fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
self.tokenizers
.read()
.expect("Acquiring the lock should never fail")
.get(tokenizer_name)
.cloned()
}
}
impl Default for TokenizerManager {
/// Creates a `TokenizerManager` prepopulated with
/// the default pre-configured tokenizers of `tantivy`.
fn default() -> TokenizerManager {
let manager = TokenizerManager::new();
manager.register("raw", RawTokenizer);
manager.register(
"default",
TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser),
);
manager.register(
"en_stem",
TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::English)),
);
manager.register("whitespace", WhitespaceTokenizer);
manager
}
}
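Under the new configuration API in crate::fts, the removed `en_stem` pipeline above corresponds roughly to the following (a sketch assuming the usual `DataValue` `From` conversions; not part of this commit):

use crate::fts::tokenizer::TextAnalyzer;
use crate::fts::TokenizerFilterConfig;
use crate::DataValue;
use miette::Result;

// Simple tokenizer + RemoveLong(40) + LowerCase + Stemmer("english"),
// mirroring the `en_stem` registration this file used to provide.
fn en_stem_equivalent() -> Result<TextAnalyzer> {
    let tokenizer = TokenizerFilterConfig { name: "Simple".into(), args: vec![] };
    let filters = vec![
        TokenizerFilterConfig { name: "RemoveLong".into(), args: vec![DataValue::from(40i64)] },
        TokenizerFilterConfig { name: "LowerCase".into(), args: vec![] },
        TokenizerFilterConfig { name: "Stemmer".into(), args: vec![DataValue::from("english")] },
    ];
    tokenizer.build(&filters)
}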

@@ -57,6 +57,7 @@ use crate::runtime::transact::SessionTx;
use crate::storage::temp::TempStorage;
use crate::storage::{Storage, StoreTx};
use crate::{decode_tuple_from_kv, FixedRule};
use crate::fts::TokenizerCache;
pub(crate) struct RunningQueryHandle {
pub(crate) started_at: f64,
@@ -91,6 +92,7 @@ pub struct Db<S> {
pub(crate) queries_count: Arc<AtomicU64>,
pub(crate) running_queries: Arc<Mutex<BTreeMap<u64, RunningQueryHandle>>>,
pub(crate) fixed_rules: Arc<ShardedLock<BTreeMap<String, Arc<Box<dyn FixedRule>>>>>,
pub(crate) tokenizers: Arc<TokenizerCache>,
#[cfg(not(target_arch = "wasm32"))]
callback_count: Arc<AtomicU32>,
#[cfg(not(target_arch = "wasm32"))]
@@ -239,6 +241,7 @@ impl<'s, S: Storage<'s>> Db<S> {
queries_count: Default::default(),
running_queries: Default::default(),
fixed_rules: Arc::new(ShardedLock::new(DEFAULT_FIXED_RULES.clone())),
tokenizers: Arc::new(Default::default()),
#[cfg(not(target_arch = "wasm32"))]
callback_count: Default::default(),
// callback_receiver: Arc::new(receiver),

@@ -19,6 +19,7 @@ use crate::data::expr::Expr;
use crate::data::symb::Symbol;
use crate::data::value::DataValue;
use crate::fixed_rule::FixedRulePayload;
use crate::fts::{TokenizerCache, TokenizerFilterConfig};
use crate::parse::SourceSpan;
use crate::runtime::callback::CallbackOp;
use crate::runtime::db::Poison;
@@ -613,16 +614,15 @@ fn test_index()
#[test]
fn test_json_objects() {
let db = new_cozo_mem().unwrap();
db.run_script(
"?[a] := a = {'a': 1}",
Default::default(),
).unwrap();
db.run_script("?[a] := a = {'a': 1}", Default::default())
.unwrap();
db.run_script(
r"?[a] := a = {
'a': 1
}",
Default::default(),
).unwrap();
)
.unwrap();
}
#[test]
@@ -944,15 +944,22 @@ fn test_insertions()
#[test]
fn tentivy_tokenizers() {
use crate::fts::cangjie::tokenizer::CangJieTokenizer;
use crate::fts::cangjie::options::TokenizerOption;
use crate::fts::tokenizer::*;
use jieba_rs::Jieba;
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::English));
let tokenizers = TokenizerCache::default();
let tokenizer = tokenizers
.get(
"simple",
&TokenizerFilterConfig {
name: "Simple".into(),
args: vec![],
},
&[],
)
.unwrap();
// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
// .filter(RemoveLongFilter::limit(40))
// .filter(LowerCaser)
// .filter(Stemmer::new(Language::English));
let mut token_stream = tokenizer.token_stream("It is closer to Apache Lucene than to Elasticsearch or Apache Solr in the sense it is not an off-the-shelf search engine server, but rather a crate that can be used to build such a search engine.");
while let Some(token) = token_stream.next() {
println!("Token {:?}", token.text);
@@ -960,13 +967,16 @@ fn tentivy_tokenizers() {
println!("XXXXXXXXXXXXX");
let tokenizer = TextAnalyzer::from(CangJieTokenizer {
worker: std::sync::Arc::new(Jieba::new()),
option: TokenizerOption::Default { hmm: false },
})
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(Stemmer::new(Language::English));
let tokenizer = tokenizers
.get(
"cangjie",
&TokenizerFilterConfig {
name: "Cangjie".into(),
args: vec![],
},
&[],
)
.unwrap();
let mut token_stream = tokenizer.token_stream("这个产品Finchat.io是一个相对比较有特色的文档问答类网站它集成了750多家公司的经融数据。感觉是把财报等数据借助Embedding都向量化了然后接入ChatGPT进行对话。");
while let Some(token) = token_stream.next() {
