Ziyang Hu 1 year ago
parent 113c91a5de
commit 147101b7f3

@ -22,10 +22,10 @@ use smartstring::{LazyCompact, SmartString};
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
pub(crate) mod ast;
pub(crate) mod cangjie; pub(crate) mod cangjie;
pub(crate) mod tokenizer;
pub(crate) mod indexing; pub(crate) mod indexing;
pub(crate) mod ast; pub(crate) mod tokenizer;
#[derive(Debug, Clone, PartialEq, serde_derive::Serialize, serde_derive::Deserialize)] #[derive(Debug, Clone, PartialEq, serde_derive::Serialize, serde_derive::Deserialize)]
pub(crate) struct FtsIndexManifest { pub(crate) struct FtsIndexManifest {
@ -139,7 +139,7 @@ impl TokenizerConfig {
Ok(match &self.name as &str { Ok(match &self.name as &str {
"AlphaNumOnly" => AlphaNumOnlyFilter.into(), "AlphaNumOnly" => AlphaNumOnlyFilter.into(),
"AsciiFolding" => AsciiFoldingFilter.into(), "AsciiFolding" => AsciiFoldingFilter.into(),
"LowerCase" => LowerCaser.into(), "LowerCase" | "Lowercase" => LowerCaser.into(),
"RemoveLong" => RemoveLongFilter::limit( "RemoveLong" => RemoveLongFilter::limit(
self.args self.args
.get(0) .get(0)
@ -180,7 +180,10 @@ impl TokenizerConfig {
.get_str() .get_str()
.ok_or_else(|| { .ok_or_else(|| {
miette!("First argument `language` to Stemmer must be a string") miette!("First argument `language` to Stemmer must be a string")
})? { })?
.to_lowercase()
.as_str()
{
"arabic" => Language::Arabic, "arabic" => Language::Arabic,
"danish" => Language::Danish, "danish" => Language::Danish,
"dutch" => Language::Dutch, "dutch" => Language::Dutch,
@ -199,7 +202,7 @@ impl TokenizerConfig {
"swedish" => Language::Swedish, "swedish" => Language::Swedish,
"tamil" => Language::Tamil, "tamil" => Language::Tamil,
"turkish" => Language::Turkish, "turkish" => Language::Turkish,
_ => bail!("Unsupported language: {}", self.name), lang => bail!("Unsupported language: {}", lang),
}; };
Stemmer::new(language).into() Stemmer::new(language).into()
} }
@ -226,7 +229,7 @@ impl TokenizerConfig {
_ => bail!("Filter Stopwords requires language name or a list of stopwords"), _ => bail!("Filter Stopwords requires language name or a list of stopwords"),
} }
} }
_ => bail!("Unknown token filter: {}", self.name), _ => bail!("Unknown token filter: {:?}", self.name),
}) })
} }
} }

@ -320,7 +320,7 @@ pub(crate) fn parse_sys(
expr.partial_eval()?; expr.partial_eval()?;
match expr { match expr {
Expr::Apply { op, args, .. } => { Expr::Apply { op, args, .. } => {
if op.name != "LIST" { if op.name != "OP_LIST" {
bail!("Filters must be a list of filters"); bail!("Filters must be a list of filters");
} }
for arg in args.iter() { for arg in args.iter() {
@ -454,7 +454,7 @@ pub(crate) fn parse_sys(
expr.partial_eval()?; expr.partial_eval()?;
match expr { match expr {
Expr::Apply { op, args, .. } => { Expr::Apply { op, args, .. } => {
if op.name != "LIST" { if op.name != "OP_LIST" {
bail!("Filters must be a list of filters"); bail!("Filters must be a list of filters");
} }
for arg in args.iter() { for arg in args.iter() {

@ -914,7 +914,11 @@ fn test_fts_indexing() {
) )
.unwrap(); .unwrap();
db.run_script( db.run_script(
r"::fts create a:fts {extractor: v, tokenizer: Simple }", r"::fts create a:fts {
extractor: v,
tokenizer: Simple,
filters: [Lowercase, Stemmer('English'), Stopwords('en')]
}",
Default::default(), Default::default(),
) )
.unwrap(); .unwrap();
@ -939,6 +943,7 @@ fn test_fts_indexing() {
for row in res.into_json()["rows"].as_array().unwrap() { for row in res.into_json()["rows"].as_array().unwrap() {
println!("{}", row); println!("{}", row);
} }
println!("query");
let res = db let res = db
.run_script( .run_script(
r"?[k, v, s] := ~a:fts{k, v | query: 'world', k: 2, bind_score: s}", r"?[k, v, s] := ~a:fts{k, v | query: 'world', k: 2, bind_score: s}",

Loading…
Cancel
Save