clean up imported code
parent
271f36301d
commit
a880431726
@ -1,42 +1,23 @@
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
LANGUAGES = [
|
resp = requests.get("https://raw.githubusercontent.com/stopwords-iso/stopwords-iso/master/stopwords-iso.json")
|
||||||
"danish",
|
resp.raise_for_status()
|
||||||
"dutch",
|
data = resp.json()
|
||||||
"finnish",
|
|
||||||
"french",
|
|
||||||
"german",
|
|
||||||
"italian",
|
|
||||||
"norwegian",
|
|
||||||
"portuguese",
|
|
||||||
"russian",
|
|
||||||
"spanish",
|
|
||||||
"swedish",
|
|
||||||
]
|
|
||||||
|
|
||||||
with requests.Session() as sess, open("stopwords.rs", "w") as mod:
|
with requests.Session() as sess, open("stopwords.rs", "w") as mod:
|
||||||
mod.write("/*\n")
|
mod.write("/*\n")
|
||||||
mod.write(
|
mod.write(
|
||||||
"These stop word lists are from the Snowball project (https://snowballstem.org/)\nwhich carries the following copyright and license:\n\n"
|
"These stop word lists are from the stopwords-iso project (https://github.com/stopwords-iso/stopwords-iso/) "
|
||||||
|
"which carries the MIT license."
|
||||||
)
|
)
|
||||||
|
mod.write("\n*/\n\n")
|
||||||
|
|
||||||
resp = sess.get(
|
for lang, data in data.items():
|
||||||
"https://raw.githubusercontent.com/snowballstem/snowball/master/COPYING"
|
|
||||||
)
|
|
||||||
resp.raise_for_status()
|
|
||||||
mod.write(resp.text)
|
|
||||||
mod.write("*/\n\n")
|
|
||||||
|
|
||||||
for lang in LANGUAGES:
|
|
||||||
resp = sess.get(f"https://snowballstem.org/algorithms/{lang}/stop.txt")
|
|
||||||
resp.raise_for_status()
|
|
||||||
|
|
||||||
mod.write(f"pub(crate) const {lang.upper()}: &[&str] = &[\n")
|
mod.write(f"pub(crate) const {lang.upper()}: &[&str] = &[\n")
|
||||||
|
|
||||||
for line in resp.text.splitlines():
|
for word in data:
|
||||||
line, _, _ = line.partition("|")
|
mod.write(f' r#"{word}"#,\n')
|
||||||
|
|
||||||
for word in line.split():
|
|
||||||
mod.write(f' "{word}",\n')
|
|
||||||
|
|
||||||
mod.write("];\n\n")
|
mod.write("];\n\n")
|
||||||
|
print(f'"{lang}" => stopwords::{lang.upper()},')
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,78 +0,0 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::sync::{Arc, RwLock};
|
|
||||||
|
|
||||||
use crate::fts::tokenizer::stemmer::Language;
|
|
||||||
use crate::fts::tokenizer::tokenizer::TextAnalyzer;
|
|
||||||
use crate::fts::tokenizer::{
|
|
||||||
LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, WhitespaceTokenizer,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// The tokenizer manager serves as a store for
|
|
||||||
/// all of the pre-configured tokenizer pipelines.
|
|
||||||
///
|
|
||||||
/// By default, it is populated with the following tokenizers.
|
|
||||||
///
|
|
||||||
/// * `raw` : does not process nor tokenize the text.
|
|
||||||
/// * `default` : Chops the text according to whitespace and
|
|
||||||
/// punctuation, removes tokens that are too long, and lowercases
|
|
||||||
/// tokens
|
|
||||||
/// * `en_stem` : Like `default`, but also applies stemming on the
|
|
||||||
/// resulting tokens. Stemming can improve the recall of your
|
|
||||||
/// search engine.
|
|
||||||
/// * `whitespace` : Splits the text on whitespace.
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub(crate) struct TokenizerManager {
|
|
||||||
tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TokenizerManager {
|
|
||||||
/// Creates an empty tokenizer manager.
|
|
||||||
pub(crate) fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
tokenizers: Arc::new(RwLock::new(HashMap::new())),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Registers a new tokenizer associated with a given name.
|
|
||||||
pub(crate) fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
|
|
||||||
where TextAnalyzer: From<T> {
|
|
||||||
let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
|
|
||||||
self.tokenizers
|
|
||||||
.write()
|
|
||||||
.expect("Acquiring the lock should never fail")
|
|
||||||
.insert(tokenizer_name.to_string(), boxed_tokenizer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns a clone of the tokenizer registered under the given name, if any.
|
|
||||||
pub(crate) fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
|
|
||||||
self.tokenizers
|
|
||||||
.read()
|
|
||||||
.expect("Acquiring the lock should never fail")
|
|
||||||
.get(tokenizer_name)
|
|
||||||
.cloned()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for TokenizerManager {
|
|
||||||
/// Creates a `TokenizerManager` prepopulated with
|
|
||||||
/// the default pre-configured tokenizers of `tantivy`.
|
|
||||||
fn default() -> TokenizerManager {
|
|
||||||
let manager = TokenizerManager::new();
|
|
||||||
manager.register("raw", RawTokenizer);
|
|
||||||
manager.register(
|
|
||||||
"default",
|
|
||||||
TextAnalyzer::from(SimpleTokenizer)
|
|
||||||
.filter(RemoveLongFilter::limit(40))
|
|
||||||
.filter(LowerCaser),
|
|
||||||
);
|
|
||||||
manager.register(
|
|
||||||
"en_stem",
|
|
||||||
TextAnalyzer::from(SimpleTokenizer)
|
|
||||||
.filter(RemoveLongFilter::limit(40))
|
|
||||||
.filter(LowerCaser)
|
|
||||||
.filter(Stemmer::new(Language::English)),
|
|
||||||
);
|
|
||||||
manager.register("whitespace", WhitespaceTokenizer);
|
|
||||||
manager
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue