clean up imported code
parent
271f36301d
commit
a880431726
@ -1,42 +1,23 @@
|
||||
"""Generate `stopwords.rs` from the stopwords-iso word lists.

Downloads the aggregated stopwords-iso JSON (one entry per ISO 639-1
language code mapping to its stop-word list) and writes a Rust source
file containing one `pub(crate) const <LANG>: &[&str]` table per
language.  It also prints the match arms to paste into the
language-name -> table dispatch in the consuming Rust code.
"""
import requests

resp = requests.get(
    "https://raw.githubusercontent.com/stopwords-iso/stopwords-iso/master/stopwords-iso.json"
)
resp.raise_for_status()
data = resp.json()

with open("stopwords.rs", "w") as mod:
    # License header for the generated file.
    mod.write("/*\n")
    mod.write(
        "These stop word lists are from the stopwords-iso project (https://github.com/stopwords-iso/stopwords-iso/) "
        "which carries the MIT license."
    )
    mod.write("\n*/\n\n")

    # `data` maps language code -> list of stop words.  Use a distinct
    # loop variable (`words`) so the mapping itself is not shadowed.
    for lang, words in data.items():
        mod.write(f"pub(crate) const {lang.upper()}: &[&str] = &[\n")

        for word in words:
            # Raw string literals: stop words may contain quotes or
            # backslashes that would break an ordinary Rust string.
            mod.write(f'    r#"{word}"#,\n')

        mod.write("];\n\n")

        # Match arm for the language-dispatch table in the Rust crate.
        print(f'"{lang}" => stopwords::{lang.upper()},')
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,78 +0,0 @@
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use crate::fts::tokenizer::stemmer::Language;
|
||||
use crate::fts::tokenizer::tokenizer::TextAnalyzer;
|
||||
use crate::fts::tokenizer::{
|
||||
LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, WhitespaceTokenizer,
|
||||
};
|
||||
|
||||
/// The tokenizer manager serves as a store for
/// all of the pre-configured tokenizer pipelines.
///
/// By default, it is populated with the following managers.
///
/// * `raw` : does not process nor tokenize the text.
/// * `default` : Chops the text according to whitespace and
///   punctuation, removes tokens that are too long, and lowercases
///   tokens
/// * `en_stem` : Like `default`, but also applies stemming on the
///   resulting tokens. Stemming can improve the recall of your
///   search engine.
/// * `whitespace` : Splits the text on whitespaces.
#[derive(Clone)]
pub(crate) struct TokenizerManager {
    // Shared, lock-protected registry mapping a tokenizer name to its
    // analysis pipeline; the Arc makes cloning the manager cheap and
    // lets all clones see the same registrations.
    tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
}
|
||||
|
||||
impl TokenizerManager {
|
||||
/// Creates an empty tokenizer manager.
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
tokenizers: Arc::new(RwLock::new(HashMap::new())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Registers a new tokenizer associated with a given name.
|
||||
pub(crate) fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
|
||||
where TextAnalyzer: From<T> {
|
||||
let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
|
||||
self.tokenizers
|
||||
.write()
|
||||
.expect("Acquiring the lock should never fail")
|
||||
.insert(tokenizer_name.to_string(), boxed_tokenizer);
|
||||
}
|
||||
|
||||
/// Accessing a tokenizer given its name.
|
||||
pub(crate) fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
|
||||
self.tokenizers
|
||||
.read()
|
||||
.expect("Acquiring the lock should never fail")
|
||||
.get(tokenizer_name)
|
||||
.cloned()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TokenizerManager {
|
||||
/// Creates an `TokenizerManager` prepopulated with
|
||||
/// the default pre-configured tokenizers of `tantivy`.
|
||||
fn default() -> TokenizerManager {
|
||||
let manager = TokenizerManager::new();
|
||||
manager.register("raw", RawTokenizer);
|
||||
manager.register(
|
||||
"default",
|
||||
TextAnalyzer::from(SimpleTokenizer)
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser),
|
||||
);
|
||||
manager.register(
|
||||
"en_stem",
|
||||
TextAnalyzer::from(SimpleTokenizer)
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new(Language::English)),
|
||||
);
|
||||
manager.register("whitespace", WhitespaceTokenizer);
|
||||
manager
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue