imported code visibility

Branch: main
Ziyang Hu, 1 year ago
parent 2a5e568d58
commit 271f36301d

Cargo.lock (generated, 32 lines changed)

@@ -336,6 +336,15 @@ version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
+[[package]]
+name = "bincode"
+version = "1.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
+dependencies = [
+"serde",
+]
 [[package]]
 name = "bindgen"
 version = "0.57.0"
@@ -700,6 +709,7 @@ dependencies = [
 "document-features",
 "either",
 "env_logger",
+"fast2s",
 "graph",
 "itertools 0.10.5",
 "jieba-rs",
@@ -738,7 +748,6 @@ dependencies = [
 "tokio",
 "unicode-normalization",
 "uuid",
-"whatlang",
 ]
 [[package]]
@@ -1203,6 +1212,17 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c"
+[[package]]
+name = "fast2s"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1316063b5422f1f7bf4cc784c959eaf04b843de7c9ecbd4190c60614aa23b27e"
+dependencies = [
+"bincode",
+"hashbrown",
+"lazy_static",
+]
 [[package]]
 name = "fastrand"
 version = "1.9.0"
@@ -4389,16 +4409,6 @@ dependencies = [
 "winapi",
 ]
-[[package]]
-name = "whatlang"
-version = "0.16.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043"
-dependencies = [
-"hashbrown",
-"once_cell",
-]
 [[package]]
 name = "which"
 version = "4.4.0"

@@ -7,9 +7,3 @@
 pub(crate) mod options;
 pub(crate) mod stream;
 pub(crate) mod tokenizer;
-pub(crate) use {
-options::TokenizerOption, stream::CangjieTokenStream, tokenizer::CangJieTokenizer,
-};
-pub const CANG_JIE: &str = "CANG_JIE";

@@ -1,6 +1,6 @@
 /// Tokenizer Option
 #[derive(Debug, Clone)]
-pub enum TokenizerOption {
+pub(crate) enum TokenizerOption {
 /// Cut the input text, return all possible words
 All,
 /// Cut the input text

@@ -1,7 +1,7 @@
 use crate::fts::tokenizer::Token;
 #[derive(Debug)]
-pub struct CangjieTokenStream<'a> {
+pub(crate) struct CangjieTokenStream<'a> {
 result: Vec<&'a str>,
 // Begin with 1
 index: usize,
@@ -10,7 +10,7 @@ pub struct CangjieTokenStream<'a> {
 }
 impl<'a> CangjieTokenStream<'a> {
-pub fn new(result: Vec<&'a str>) -> Self {
+pub(crate) fn new(result: Vec<&'a str>) -> Self {
 CangjieTokenStream {
 result,
 index: 0,

@@ -5,11 +5,11 @@ use std::sync::Arc;
 use crate::fts::tokenizer::BoxTokenStream;
 #[derive(Clone, Debug)]
-pub struct CangJieTokenizer {
+pub(crate) struct CangJieTokenizer {
 /// Separation algorithm provider
-pub worker: Arc<Jieba>,
+pub(crate) worker: Arc<Jieba>,
 /// Separation config
-pub option: TokenizerOption,
+pub(crate) option: TokenizerOption,
 }
 impl Default for CangJieTokenizer {

@@ -1,32 +1,32 @@
-//! # Example
-//! ```rust
-//! use tantivy::tokenizer::*;
-//!
-//! let tokenizer = TextAnalyzer::from(RawTokenizer)
-//! .filter(AlphaNumOnlyFilter);
-//!
-//! let mut stream = tokenizer.token_stream("hello there");
-//! // is none because the raw filter emits one token that
-//! // contains a space
-//! assert!(stream.next().is_none());
-//!
-//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-//! .filter(AlphaNumOnlyFilter);
-//!
-//! let mut stream = tokenizer.token_stream("hello there 💣");
-//! assert!(stream.next().is_some());
-//! assert!(stream.next().is_some());
-//! // the "emoji" is dropped because its not an alphanum
-//! assert!(stream.next().is_none());
-//! ```
+// # Example
+// ```rust
+// use tantivy::tokenizer::*;
+//
+// let tokenizer = TextAnalyzer::from(RawTokenizer)
+// .filter(AlphaNumOnlyFilter);
+//
+// let mut stream = tokenizer.token_stream("hello there");
+// // is none because the raw filter emits one token that
+// // contains a space
+// assert!(stream.next().is_none());
+//
+// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+// .filter(AlphaNumOnlyFilter);
+//
+// let mut stream = tokenizer.token_stream("hello there 💣");
+// assert!(stream.next().is_some());
+// assert!(stream.next().is_some());
+// // the "emoji" is dropped because its not an alphanum
+// assert!(stream.next().is_none());
+// ```
 use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
 /// `TokenFilter` that removes all tokens that contain non
 /// ascii alphanumeric characters.
 #[derive(Clone)]
-pub struct AlphaNumOnlyFilter;
-pub struct AlphaNumOnlyFilterStream<'a> {
+pub(crate) struct AlphaNumOnlyFilter;
+pub(crate) struct AlphaNumOnlyFilterStream<'a> {
 tail: BoxTokenStream<'a>,
 }

@@ -6,7 +6,7 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
 /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
 /// block) into their ASCII equivalents, if one exists.
 #[derive(Clone)]
-pub struct AsciiFoldingFilter;
+pub(crate) struct AsciiFoldingFilter;
 impl TokenFilter for AsciiFoldingFilter {
 fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
@@ -17,7 +17,7 @@ impl TokenFilter for AsciiFoldingFilter {
 }
 }
-pub struct AsciiFoldingFilterTokenStream<'a> {
+pub(crate) struct AsciiFoldingFilterTokenStream<'a> {
 buffer: String,
 tail: BoxTokenStream<'a>,
 }

@@ -14,9 +14,9 @@ impl TokenFilter for LowerCaser {
 /// Token filter that lowercase terms.
 #[derive(Clone)]
-pub struct LowerCaser;
-pub struct LowerCaserTokenStream<'a> {
+pub(crate) struct LowerCaser;
+pub(crate) struct LowerCaserTokenStream<'a> {
 buffer: String,
 tail: BoxTokenStream<'a>,
 }

@@ -10,7 +10,7 @@
 //! You must define in your schema which tokenizer should be used for
 //! each of your fields :
 //!
-//! ```rust
+//! ```text
 //! use tantivy::schema::*;
 //!
 //! let mut schema_builder = Schema::builder();
@@ -67,7 +67,7 @@
 //!
 //! For instance, the `en_stem` is defined as follows.
 //!
-//! ```rust
+//! ```text
 //! use tantivy::tokenizer::*;
 //!
 //! let en_stem = TextAnalyzer::from(SimpleTokenizer)
@@ -79,7 +79,7 @@
 //! Once your tokenizer is defined, you need to
 //! register it with a name in your index's [`TokenizerManager`].
 //!
-//! ```rust
+//! ```text
 //! # use tantivy::schema::Schema;
 //! # use tantivy::tokenizer::*;
 //! # use tantivy::Index;
@@ -99,7 +99,7 @@
 //!
 //! # Example
 //!
-//! ```rust
+//! ```text
 //! use tantivy::schema::{Schema, IndexRecordOption, TextOptions, TextFieldIndexing};
 //! use tantivy::tokenizer::*;
 //! use tantivy::Index;
@@ -139,32 +139,32 @@ mod tokenizer;
 mod tokenizer_manager;
 mod whitespace_tokenizer;
-pub use self::alphanum_only::AlphaNumOnlyFilter;
-pub use self::ascii_folding_filter::AsciiFoldingFilter;
-pub use self::lower_caser::LowerCaser;
-pub use self::ngram_tokenizer::NgramTokenizer;
-pub use self::raw_tokenizer::RawTokenizer;
-pub use self::remove_long::RemoveLongFilter;
-pub use self::simple_tokenizer::SimpleTokenizer;
-pub use self::split_compound_words::SplitCompoundWords;
-pub use self::stemmer::{Language, Stemmer};
-pub use self::stop_word_filter::StopWordFilter;
-pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
-pub use self::tokenizer::{
+pub(crate) use self::alphanum_only::AlphaNumOnlyFilter;
+pub(crate) use self::ascii_folding_filter::AsciiFoldingFilter;
+pub(crate) use self::lower_caser::LowerCaser;
+pub(crate) use self::ngram_tokenizer::NgramTokenizer;
+pub(crate) use self::raw_tokenizer::RawTokenizer;
+pub(crate) use self::remove_long::RemoveLongFilter;
+pub(crate) use self::simple_tokenizer::SimpleTokenizer;
+pub(crate) use self::split_compound_words::SplitCompoundWords;
+pub(crate) use self::stemmer::{Language, Stemmer};
+pub(crate) use self::stop_word_filter::StopWordFilter;
+pub(crate) use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
+pub(crate) use self::tokenizer::{
 BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
 };
-pub use self::tokenizer_manager::TokenizerManager;
-pub use self::whitespace_tokenizer::WhitespaceTokenizer;
+pub(crate) use self::tokenizer_manager::TokenizerManager;
+pub(crate) use self::whitespace_tokenizer::WhitespaceTokenizer;
 /// Maximum authorized len (in bytes) for a token.
 ///
 /// Tokenizers are in charge of not emitting tokens larger than this value.
 /// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
 /// `2^16 - 1 - 5`, the token will simply be ignored downstream.
-pub const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;
+pub(crate) const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;
 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {
 use super::{
 Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
 };
@@ -172,7 +172,7 @@ pub mod tests {
 /// This is a function that can be used in tests and doc tests
 /// to assert a token's correctness.
-pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
+pub(crate) fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
 assert_eq!(
 token.position, position,
 "expected position {} but {:?}",

@@ -31,7 +31,7 @@ use crate::fts::tokenizer::BoxTokenStream;
 ///
 /// # Example
 ///
-/// ```rust
+/// ```text
 /// use tantivy::tokenizer::*;
 ///
 /// let tokenizer = NgramTokenizer::new(2, 3, false);
@@ -81,7 +81,7 @@ use crate::fts::tokenizer::BoxTokenStream;
 /// assert!(stream.next().is_none());
 /// ```
 #[derive(Clone)]
-pub struct NgramTokenizer {
+pub(crate) struct NgramTokenizer {
 /// min size of the n-gram
 min_gram: usize,
 /// max size of the n-gram
@@ -92,7 +92,7 @@ pub struct NgramTokenizer {
 impl NgramTokenizer {
 /// Configures a new Ngram tokenizer
-pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
+pub(crate) fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
 assert!(min_gram > 0, "min_gram must be greater than 0");
 assert!(
 min_gram <= max_gram,
@@ -108,19 +108,19 @@ impl NgramTokenizer {
 /// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
 ///
 /// This is as opposed to only prefix ngrams .
-pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+pub(crate) fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
 Self::new(min_gram, max_gram, false)
 }
 /// Create a `NGramTokenizer` which only generates tokens for the
 /// prefix ngrams.
-pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+pub(crate) fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
 Self::new(min_gram, max_gram, true)
 }
 }
 /// TokenStream associate to the `NgramTokenizer`
-pub struct NgramTokenStream<'a> {
+pub(crate) struct NgramTokenStream<'a> {
 /// parameters
 ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
 /// true if the NgramTokenStream is in prefix mode.
@@ -194,7 +194,7 @@ struct StutteringIterator<T> {
 impl<T> StutteringIterator<T>
 where T: Iterator<Item = usize>
 {
-pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
+pub(crate) fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
 assert!(min_gram > 0);
 let memory: Vec<usize> = (&mut underlying).take(max_gram + 1).collect();
 if memory.len() <= min_gram {

@@ -3,9 +3,9 @@ use crate::fts::tokenizer::BoxTokenStream;
 /// For each value of the field, emit a single unprocessed token.
 #[derive(Clone)]
-pub struct RawTokenizer;
-pub struct RawTokenStream {
+pub(crate) struct RawTokenizer;
+pub(crate) struct RawTokenStream {
 token: Token,
 has_token: bool,
 }

@@ -1,5 +1,5 @@
 //! # Example
-//! ```rust
+//! ```text
 //! use tantivy::tokenizer::*;
 //!
 //! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
@@ -20,13 +20,13 @@ use crate::fts::tokenizer::BoxTokenStream;
 /// It is especially useful when indexing unconstrained content.
 /// e.g. Mail containing base-64 encoded pictures etc.
 #[derive(Clone)]
-pub struct RemoveLongFilter {
+pub(crate) struct RemoveLongFilter {
 length_limit: usize,
 }
 impl RemoveLongFilter {
 /// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
-pub fn limit(length_limit: usize) -> RemoveLongFilter {
+pub(crate) fn limit(length_limit: usize) -> RemoveLongFilter {
 RemoveLongFilter { length_limit }
 }
 }
@@ -46,7 +46,7 @@ impl TokenFilter for RemoveLongFilter {
 }
 }
-pub struct RemoveLongFilterStream<'a> {
+pub(crate) struct RemoveLongFilterStream<'a> {
 token_length_limit: usize,
 tail: BoxTokenStream<'a>,
 }

@@ -4,9 +4,9 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
 /// Tokenize the text by splitting on whitespaces and punctuation.
 #[derive(Clone)]
-pub struct SimpleTokenizer;
-pub struct SimpleTokenStream<'a> {
+pub(crate) struct SimpleTokenizer;
+pub(crate) struct SimpleTokenStream<'a> {
 text: &'a str,
 chars: CharIndices<'a>,
 token: Token,

@@ -17,7 +17,7 @@ use miette::{IntoDiagnostic, Result};
 /// e.g. the missing stem "back" of "backen" implies that "brotbackautomat"
 /// is not split in the following example.
 ///
-/// ```rust
+/// ```text
 /// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
 ///
 /// let tokenizer =
@@ -38,7 +38,7 @@ use miette::{IntoDiagnostic, Result};
 ///
 /// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
 #[derive(Clone)]
-pub struct SplitCompoundWords {
+pub(crate) struct SplitCompoundWords {
 dict: AhoCorasick,
 }
@@ -48,7 +48,7 @@ impl SplitCompoundWords {
 /// The dictionary will be used to construct an [`AhoCorasick`] automaton
 /// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
 /// more control over its construction is required.
-pub fn from_dictionary<I, P>(dict: I) -> Result<Self>
+pub(crate) fn from_dictionary<I, P>(dict: I) -> Result<Self>
 where
 I: IntoIterator<Item = P>,
 P: AsRef<[u8]>,
@@ -67,7 +67,7 @@ impl SplitCompoundWords {
 ///
 /// The automaton should use one of the leftmost-first match kinds
 /// and it should not be anchored.
-pub fn from_automaton(dict: AhoCorasick) -> Self {
+pub(crate) fn from_automaton(dict: AhoCorasick) -> Self {
 Self { dict }
 }
 }

@@ -9,7 +9,7 @@ use crate::fts::tokenizer::BoxTokenStream;
 /// Available stemmer languages.
 #[derive(Debug, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq, Copy, Clone)]
 #[allow(missing_docs)]
-pub enum Language {
+pub(crate) enum Language {
 Arabic,
 Danish,
 Dutch,
@@ -60,13 +60,13 @@ impl Language {
 /// languages.
 /// Tokens are expected to be lowercased beforehand.
 #[derive(Clone)]
-pub struct Stemmer {
+pub(crate) struct Stemmer {
 stemmer_algorithm: Algorithm,
 }
 impl Stemmer {
 /// Creates a new `Stemmer` [`TokenFilter`] for a given language algorithm.
-pub fn new(language: Language) -> Stemmer {
+pub(crate) fn new(language: Language) -> Stemmer {
 Stemmer {
 stemmer_algorithm: language.algorithm(),
 }
@@ -91,7 +91,7 @@ impl TokenFilter for Stemmer {
 }
 }
-pub struct StemmerTokenStream<'a> {
+pub(crate) struct StemmerTokenStream<'a> {
 tail: BoxTokenStream<'a>,
 stemmer: rust_stemmers::Stemmer,
 buffer: String,

@@ -31,7 +31,7 @@ with requests.Session() as sess, open("stopwords.rs", "w") as mod:
 resp = sess.get(f"https://snowballstem.org/algorithms/{lang}/stop.txt")
 resp.raise_for_status()
-mod.write(f"pub const {lang.upper()}: &[&str] = &[\n")
+mod.write(f"pub(crate) const {lang.upper()}: &[&str] = &[\n")
 for line in resp.text.splitlines():
 line, _, _ = line.partition("|")

@@ -10,19 +10,19 @@
 //! assert_eq!(stream.next().unwrap().text, "crafty");
 //! assert!(stream.next().is_none());
 //! ```
-#[cfg(feature = "stopwords")]
 #[rustfmt::skip]
 mod stopwords;
 use std::sync::Arc;
 use rustc_hash::FxHashSet;
+use crate::fts::tokenizer::Language;
 use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
 /// `TokenFilter` that removes stop words from a token stream
 #[derive(Clone)]
-pub struct StopWordFilter {
+pub(crate) struct StopWordFilter {
 words: Arc<FxHashSet<String>>,
 }
@@ -30,8 +30,7 @@ impl StopWordFilter {
 /// Creates a new [`StopWordFilter`] for the given [`Language`]
 ///
 /// Returns `Some` if a list of stop words is available and `None` otherwise.
-#[cfg(feature = "stopwords")]
-pub fn new(language: Language) -> Option<Self> {
+pub(crate) fn new(language: Language) -> Option<Self> {
 let words = match language {
 Language::Danish => stopwords::DANISH,
 Language::Dutch => stopwords::DUTCH,
@@ -60,14 +59,14 @@ impl StopWordFilter {
 }
 /// Creates a `StopWordFilter` given a list of words to remove
-pub fn remove<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter {
+pub(crate) fn remove<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter {
 StopWordFilter {
 words: Arc::new(words.into_iter().collect()),
 }
 }
 }
-pub struct StopWordFilterStream<'a> {
+pub(crate) struct StopWordFilterStream<'a> {
 words: Arc<FxHashSet<String>>,
 tail: BoxTokenStream<'a>,
 }

@@ -33,7 +33,7 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-pub const DANISH: &[&str] = &[
+pub(crate) const DANISH: &[&str] = &[
 "og",
 "i",
 "jeg",
@@ -130,7 +130,7 @@ pub const DANISH: &[&str] = &[
 "sådan",
 ];
-pub const DUTCH: &[&str] = &[
+pub(crate) const DUTCH: &[&str] = &[
 "de",
 "en",
 "van",
@@ -234,7 +234,7 @@ pub const DUTCH: &[&str] = &[
 "andere",
 ];
-pub const FINNISH: &[&str] = &[
+pub(crate) const FINNISH: &[&str] = &[
 "olla",
 "olen",
 "olet",
@@ -471,7 +471,7 @@ pub const FINNISH: &[&str] = &[
 "itse",
 ];
-pub const FRENCH: &[&str] = &[
+pub(crate) const FRENCH: &[&str] = &[
 "au",
 "aux",
 "avec",
@@ -628,7 +628,7 @@ pub const FRENCH: &[&str] = &[
 "soi",
 ];
-pub const GERMAN: &[&str] = &[
+pub(crate) const GERMAN: &[&str] = &[
 "aber",
 "alle",
 "allem",
@@ -862,7 +862,7 @@ pub const GERMAN: &[&str] = &[
 "zwischen",
 ];
-pub const ITALIAN: &[&str] = &[
+pub(crate) const ITALIAN: &[&str] = &[
 "ad",
 "al",
 "allo",
@@ -1144,7 +1144,7 @@ pub const ITALIAN: &[&str] = &[
 "stando",
 ];
-pub const NORWEGIAN: &[&str] = &[
+pub(crate) const NORWEGIAN: &[&str] = &[
 "og",
 "i",
 "jeg",
@@ -1319,7 +1319,7 @@ pub const NORWEGIAN: &[&str] = &[
 "vart",
 ];
-pub const PORTUGUESE: &[&str] = &[
+pub(crate) const PORTUGUESE: &[&str] = &[
 "de",
 "a",
 "o",
@@ -1525,7 +1525,7 @@ pub const PORTUGUESE: &[&str] = &[
 "teriam",
 ];
-pub const RUSSIAN: &[&str] = &[
+pub(crate) const RUSSIAN: &[&str] = &[
 "и",
 "в",
 "во",
@@ -1687,7 +1687,7 @@ pub const RUSSIAN: &[&str] = &[
 "между",
 ];
-pub const SPANISH: &[&str] = &[
+pub(crate) const SPANISH: &[&str] = &[
 "de",
 "la",
 "que",
@@ -1998,7 +1998,7 @@ pub const SPANISH: &[&str] = &[
 "tened",
 ];
-pub const SWEDISH: &[&str] = &[
+pub(crate) const SWEDISH: &[&str] = &[
 "och",
 "det",
 "att",

@@ -4,11 +4,11 @@ use crate::fts::tokenizer::{Token, TokenStream};
 /// Struct representing pre-tokenized text
 #[derive(Debug, Clone, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq)]
-pub struct PreTokenizedString {
+pub(crate) struct PreTokenizedString {
 /// Original text
-pub text: String,
+pub(crate) text: String,
 /// Tokens derived from the text
-pub tokens: Vec<Token>,
+pub(crate) tokens: Vec<Token>,
 }
 impl Ord for PreTokenizedString {
@@ -24,7 +24,7 @@ impl PartialOrd for PreTokenizedString {
 }
 /// [`TokenStream`] implementation which wraps [`PreTokenizedString`]
-pub struct PreTokenizedStream {
+pub(crate) struct PreTokenizedStream {
 tokenized_string: PreTokenizedString,
 current_token: i64,
 }

@@ -7,20 +7,20 @@ use crate::fts::tokenizer::empty_tokenizer::EmptyTokenizer;
 /// Token
 #[derive(Debug, Clone, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq)]
-pub struct Token {
+pub(crate) struct Token {
 /// Offset (byte index) of the first character of the token.
 /// Offsets shall not be modified by token filters.
-pub offset_from: usize,
+pub(crate) offset_from: usize,
 /// Offset (byte index) of the last character of the token + 1.
 /// The text that generated the token should be obtained by
 /// &text[token.offset_from..token.offset_to]
-pub offset_to: usize,
+pub(crate) offset_to: usize,
 /// Position, expressed in number of tokens.
-pub position: usize,
+pub(crate) position: usize,
 /// Actual text content of the token.
-pub text: String,
+pub(crate) text: String,
 /// Is the length expressed in term of number of original tokens.
-pub position_length: usize,
+pub(crate) position_length: usize,
 }
 impl Default for Token {
@@ -38,7 +38,7 @@ impl Default for Token {
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
-pub struct TextAnalyzer {
+pub(crate) struct TextAnalyzer {
 tokenizer: Box<dyn Tokenizer>,
 token_filters: Vec<BoxTokenFilter>,
 }
@@ -60,7 +60,7 @@ impl TextAnalyzer {
 ///
 /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
 /// `TextAnalyzer::from(tokenizer)`.
-pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
+pub(crate) fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
 TextAnalyzer {
 tokenizer: Box::new(tokenizer),
 token_filters,
@@ -74,7 +74,7 @@ impl TextAnalyzer {
 ///
 /// # Example
 ///
-/// ```rust
+/// ```text
 /// use tantivy::tokenizer::*;
 ///
 /// let en_stem = TextAnalyzer::from(SimpleTokenizer)
@@ -83,13 +83,13 @@ impl TextAnalyzer {
 /// .filter(Stemmer::default());
 /// ```
 #[must_use]
-pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
+pub(crate) fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
 self.token_filters.push(token_filter.into());
 self
 }
 /// Creates a token stream for a given `str`.
-pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+pub(crate) fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
 let mut token_stream = self.tokenizer.token_stream(text);
 for token_filter in &self.token_filters {
 token_stream = token_filter.transform(token_stream);
@@ -119,12 +119,12 @@ impl Clone for TextAnalyzer {
 /// # Warning
 ///
 /// This API may change to use associated types.
-pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
+pub(crate) trait Tokenizer: 'static + Send + Sync + TokenizerClone {
 /// Creates a token stream for a given `str`.
 fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
 }
-pub trait TokenizerClone {
+pub(crate) trait TokenizerClone {
 fn box_clone(&self) -> Box<dyn Tokenizer>;
 }
@@ -154,7 +154,7 @@ impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
 /// Simple wrapper of `Box<dyn TokenStream + 'a>`.
 ///
 /// See [`TokenStream`] for more information.
-pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
+pub(crate) struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
 impl<'a, T> From<T> for BoxTokenStream<'a>
 where
@@ -181,7 +181,7 @@ impl<'a> DerefMut for BoxTokenStream<'a> {
 /// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
 ///
 /// See [`TokenFilter`] for more information.
-pub struct BoxTokenFilter(Box<dyn TokenFilter>);
+pub(crate) struct BoxTokenFilter(Box<dyn TokenFilter>);
 impl Deref for BoxTokenFilter {
 type Target = dyn TokenFilter;
@@ -203,7 +203,7 @@ impl<T: TokenFilter> From<T> for BoxTokenFilter {
 ///
 /// # Example
 ///
-/// ```
+/// ```text
 /// use tantivy::tokenizer::*;
 ///
 /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
@@ -225,7 +225,7 @@ impl<T: TokenFilter> From<T> for BoxTokenFilter {
 /// assert_eq!(token.position, 1);
 /// }
 /// ```
-pub trait TokenStream {
+pub(crate) trait TokenStream {
 /// Advance to the next token
 ///
 /// Returns false if there are no other tokens.
@@ -241,7 +241,7 @@ pub trait TokenStream {
 /// simply combines a call to `.advance()`
 /// and `.token()`.
 ///
-/// ```
+/// ```text
 /// use tantivy::tokenizer::*;
 ///
 /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
@@ -271,12 +271,12 @@ pub trait TokenStream {
 }
 }
-pub trait TokenFilterClone {
+pub(crate) trait TokenFilterClone {
 fn box_clone(&self) -> BoxTokenFilter;
 }
 /// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
+pub(crate) trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
 /// Wraps a token stream and returns the modified one.
 fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
 }

@@ -21,20 +21,20 @@ use crate::fts::tokenizer::{
 /// search engine.
 /// * `whitespace` : Splits the text on whitespaces.
 #[derive(Clone)]
-pub struct TokenizerManager {
+pub(crate) struct TokenizerManager {
 tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
 }
 impl TokenizerManager {
 /// Creates an empty tokenizer manager.
-pub fn new() -> Self {
+pub(crate) fn new() -> Self {
 Self {
 tokenizers: Arc::new(RwLock::new(HashMap::new())),
 }
 }
 /// Registers a new tokenizer associated with a given name.
-pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
+pub(crate) fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
 where TextAnalyzer: From<T> {
 let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
 self.tokenizers
@@ -44,7 +44,7 @@ impl TokenizerManager {
 }
 /// Accessing a tokenizer given its name.
-pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
+pub(crate) fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
 self.tokenizers
 .read()
 .expect("Acquiring the lock should never fail")

@@ -4,9 +4,9 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
 /// Tokenize the text by splitting on whitespaces.
 #[derive(Clone)]
-pub struct WhitespaceTokenizer;
-pub struct WhitespaceTokenStream<'a> {
+pub(crate) struct WhitespaceTokenizer;
+pub(crate) struct WhitespaceTokenStream<'a> {
 text: &'a str,
 chars: CharIndices<'a>,
 token: Token,

@@ -610,6 +610,21 @@ fn test_index() {
 assert!(joins.contains(&json!(":friends:rev")));
 }
+#[test]
+fn test_json_objects() {
+let db = new_cozo_mem().unwrap();
+db.run_script(
+"?[a] := a = {'a': 1}",
+Default::default(),
+).unwrap();
+db.run_script(
+r"?[a] := a = {
+'a': 1
+}",
+Default::default(),
+).unwrap();
+}
 #[test]
 fn test_custom_rules() {
 let db = new_cozo_mem().unwrap();
@@ -899,7 +914,10 @@ fn test_insertions() {
 db.run_script(r"?[k, v] := *a{k, v}", Default::default())
 .unwrap();
 db.run_script(
-r"::hnsw create a:i {fields: [v], dim: 1536, ef: 16, m: 32, filter: k % 3 == 0}",
+r"::hnsw create a:i {
+fields: [v], dim: 1536, ef: 16, filter: k % 3 == 0,
+m: 32
+}",
 Default::default(),
 )
 .unwrap();
@@ -926,7 +944,8 @@ fn test_insertions() {
 #[test]
 fn tentivy_tokenizers() {
-use crate::fts::cangjie::*;
+use crate::fts::cangjie::tokenizer::CangJieTokenizer;
+use crate::fts::cangjie::options::TokenizerOption;
 use crate::fts::tokenizer::*;
 use jieba_rs::Jieba;