diff --git a/Cargo.lock b/Cargo.lock index 82318533..2b800839 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -336,6 +336,15 @@ version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bindgen" version = "0.57.0" @@ -700,6 +709,7 @@ dependencies = [ "document-features", "either", "env_logger", + "fast2s", "graph", "itertools 0.10.5", "jieba-rs", @@ -738,7 +748,6 @@ dependencies = [ "tokio", "unicode-normalization", "uuid", - "whatlang", ] [[package]] @@ -1203,6 +1212,17 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c" +[[package]] +name = "fast2s" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1316063b5422f1f7bf4cc784c959eaf04b843de7c9ecbd4190c60614aa23b27e" +dependencies = [ + "bincode", + "hashbrown", + "lazy_static", +] + [[package]] name = "fastrand" version = "1.9.0" @@ -4389,16 +4409,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "whatlang" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043" -dependencies = [ - "hashbrown", - "once_cell", -] - [[package]] name = "which" version = "4.4.0" diff --git a/cozo-core/src/fts/cangjie/mod.rs b/cozo-core/src/fts/cangjie/mod.rs index 047c2e03..6c9f26e5 100644 --- a/cozo-core/src/fts/cangjie/mod.rs +++ b/cozo-core/src/fts/cangjie/mod.rs @@ -7,9 +7,3 @@ pub(crate) mod options; pub(crate) mod stream; pub(crate) mod tokenizer; - -pub(crate) use { - options::TokenizerOption, stream::CangjieTokenStream, tokenizer::CangJieTokenizer, -}; - -pub const CANG_JIE: &str = "CANG_JIE"; diff --git a/cozo-core/src/fts/cangjie/options.rs b/cozo-core/src/fts/cangjie/options.rs index ba11e67c..ed3825f6 100644 --- a/cozo-core/src/fts/cangjie/options.rs +++ b/cozo-core/src/fts/cangjie/options.rs @@ -1,6 +1,6 @@ /// Tokenizer Option #[derive(Debug, Clone)] -pub enum TokenizerOption { +pub(crate) enum TokenizerOption { /// Cut the input text, return all possible words All, /// Cut the input text diff --git a/cozo-core/src/fts/cangjie/stream.rs b/cozo-core/src/fts/cangjie/stream.rs index e0568cc2..6f94235e 100644 --- a/cozo-core/src/fts/cangjie/stream.rs +++ b/cozo-core/src/fts/cangjie/stream.rs @@ -1,7 +1,7 @@ use crate::fts::tokenizer::Token; #[derive(Debug)] -pub struct CangjieTokenStream<'a> { +pub(crate) struct CangjieTokenStream<'a> { result: Vec<&'a str>, // Begin with 1 index: usize, @@ -10,7 +10,7 @@ pub struct CangjieTokenStream<'a> { } impl<'a> CangjieTokenStream<'a> { - pub fn new(result: Vec<&'a str>) -> Self { + pub(crate) fn new(result: Vec<&'a str>) -> Self { CangjieTokenStream { result, index: 0, diff --git a/cozo-core/src/fts/cangjie/tokenizer.rs b/cozo-core/src/fts/cangjie/tokenizer.rs index 09589260..d14a9321 100644 --- a/cozo-core/src/fts/cangjie/tokenizer.rs +++ b/cozo-core/src/fts/cangjie/tokenizer.rs @@ -5,11 +5,11 @@ use std::sync::Arc; use crate::fts::tokenizer::BoxTokenStream; #[derive(Clone, Debug)] -pub struct CangJieTokenizer { +pub(crate) struct CangJieTokenizer { /// Separation algorithm provider - pub worker: 
Arc<Jieba>,
+    pub(crate) worker: Arc<Jieba>,
     /// Separation config
-    pub option: TokenizerOption,
+    pub(crate) option: TokenizerOption,
 }
 impl Default for CangJieTokenizer {
diff --git a/cozo-core/src/fts/tokenizer/alphanum_only.rs b/cozo-core/src/fts/tokenizer/alphanum_only.rs
index ada5899f..ad0e8f49 100644
--- a/cozo-core/src/fts/tokenizer/alphanum_only.rs
+++ b/cozo-core/src/fts/tokenizer/alphanum_only.rs
@@ -1,32 +1,32 @@
-//! # Example
-//! ```rust
-//! use tantivy::tokenizer::*;
-//!
-//! let tokenizer = TextAnalyzer::from(RawTokenizer)
-//!     .filter(AlphaNumOnlyFilter);
-//!
-//! let mut stream = tokenizer.token_stream("hello there");
-//! // is none because the raw filter emits one token that
-//! // contains a space
-//! assert!(stream.next().is_none());
-//!
-//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-//!     .filter(AlphaNumOnlyFilter);
-//!
-//! let mut stream = tokenizer.token_stream("hello there 💣");
-//! assert!(stream.next().is_some());
-//! assert!(stream.next().is_some());
-//! // the "emoji" is dropped because its not an alphanum
-//! assert!(stream.next().is_none());
-//! ```
+// # Example
+// ```rust
+// use tantivy::tokenizer::*;
+//
+// let tokenizer = TextAnalyzer::from(RawTokenizer)
+//     .filter(AlphaNumOnlyFilter);
+//
+// let mut stream = tokenizer.token_stream("hello there");
+// // is none because the raw filter emits one token that
+// // contains a space
+// assert!(stream.next().is_none());
+//
+// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+//     .filter(AlphaNumOnlyFilter);
+//
+// let mut stream = tokenizer.token_stream("hello there 💣");
+// assert!(stream.next().is_some());
+// assert!(stream.next().is_some());
+// // the "emoji" is dropped because its not an alphanum
+// assert!(stream.next().is_none());
+// ```
 use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
 /// `TokenFilter` that removes all tokens that contain non
 /// ascii alphanumeric characters.
 #[derive(Clone)]
-pub struct AlphaNumOnlyFilter;
+pub(crate) struct AlphaNumOnlyFilter;
-pub struct AlphaNumOnlyFilterStream<'a> {
+pub(crate) struct AlphaNumOnlyFilterStream<'a> {
     tail: BoxTokenStream<'a>,
 }
diff --git a/cozo-core/src/fts/tokenizer/ascii_folding_filter.rs b/cozo-core/src/fts/tokenizer/ascii_folding_filter.rs
index 17e37698..1d5317ec 100644
--- a/cozo-core/src/fts/tokenizer/ascii_folding_filter.rs
+++ b/cozo-core/src/fts/tokenizer/ascii_folding_filter.rs
@@ -6,7 +6,7 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
 /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
 /// block) into their ASCII equivalents, if one exists.
 #[derive(Clone)]
-pub struct AsciiFoldingFilter;
+pub(crate) struct AsciiFoldingFilter;
 impl TokenFilter for AsciiFoldingFilter {
     fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
@@ -17,7 +17,7 @@ impl TokenFilter for AsciiFoldingFilter {
     }
 }
-pub struct AsciiFoldingFilterTokenStream<'a> {
+pub(crate) struct AsciiFoldingFilterTokenStream<'a> {
     buffer: String,
     tail: BoxTokenStream<'a>,
 }
diff --git a/cozo-core/src/fts/tokenizer/lower_caser.rs b/cozo-core/src/fts/tokenizer/lower_caser.rs
index d08808c1..3ab6453e 100644
--- a/cozo-core/src/fts/tokenizer/lower_caser.rs
+++ b/cozo-core/src/fts/tokenizer/lower_caser.rs
@@ -14,9 +14,9 @@ impl TokenFilter for LowerCaser {
 /// Token filter that lowercase terms.
#[derive(Clone)] -pub struct LowerCaser; +pub(crate) struct LowerCaser; -pub struct LowerCaserTokenStream<'a> { +pub(crate) struct LowerCaserTokenStream<'a> { buffer: String, tail: BoxTokenStream<'a>, } diff --git a/cozo-core/src/fts/tokenizer/mod.rs b/cozo-core/src/fts/tokenizer/mod.rs index 86adb03f..02bcae0b 100644 --- a/cozo-core/src/fts/tokenizer/mod.rs +++ b/cozo-core/src/fts/tokenizer/mod.rs @@ -10,7 +10,7 @@ //! You must define in your schema which tokenizer should be used for //! each of your fields : //! -//! ```rust +//! ```text //! use tantivy::schema::*; //! //! let mut schema_builder = Schema::builder(); @@ -67,7 +67,7 @@ //! //! For instance, the `en_stem` is defined as follows. //! -//! ```rust +//! ```text //! use tantivy::tokenizer::*; //! //! let en_stem = TextAnalyzer::from(SimpleTokenizer) @@ -79,7 +79,7 @@ //! Once your tokenizer is defined, you need to //! register it with a name in your index's [`TokenizerManager`]. //! -//! ```rust +//! ```text //! # use tantivy::schema::Schema; //! # use tantivy::tokenizer::*; //! # use tantivy::Index; @@ -99,7 +99,7 @@ //! //! # Example //! -//! ```rust +//! ```text //! use tantivy::schema::{Schema, IndexRecordOption, TextOptions, TextFieldIndexing}; //! use tantivy::tokenizer::*; //! use tantivy::Index; @@ -139,32 +139,32 @@ mod tokenizer; mod tokenizer_manager; mod whitespace_tokenizer; -pub use self::alphanum_only::AlphaNumOnlyFilter; -pub use self::ascii_folding_filter::AsciiFoldingFilter; -pub use self::lower_caser::LowerCaser; -pub use self::ngram_tokenizer::NgramTokenizer; -pub use self::raw_tokenizer::RawTokenizer; -pub use self::remove_long::RemoveLongFilter; -pub use self::simple_tokenizer::SimpleTokenizer; -pub use self::split_compound_words::SplitCompoundWords; -pub use self::stemmer::{Language, Stemmer}; -pub use self::stop_word_filter::StopWordFilter; -pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; -pub use self::tokenizer::{ +pub(crate) use self::alphanum_only::AlphaNumOnlyFilter; +pub(crate) use self::ascii_folding_filter::AsciiFoldingFilter; +pub(crate) use self::lower_caser::LowerCaser; +pub(crate) use self::ngram_tokenizer::NgramTokenizer; +pub(crate) use self::raw_tokenizer::RawTokenizer; +pub(crate) use self::remove_long::RemoveLongFilter; +pub(crate) use self::simple_tokenizer::SimpleTokenizer; +pub(crate) use self::split_compound_words::SplitCompoundWords; +pub(crate) use self::stemmer::{Language, Stemmer}; +pub(crate) use self::stop_word_filter::StopWordFilter; +pub(crate) use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; +pub(crate) use self::tokenizer::{ BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer, }; -pub use self::tokenizer_manager::TokenizerManager; -pub use self::whitespace_tokenizer::WhitespaceTokenizer; +pub(crate) use self::tokenizer_manager::TokenizerManager; +pub(crate) use self::whitespace_tokenizer::WhitespaceTokenizer; /// Maximum authorized len (in bytes) for a token. /// /// Tokenizers are in charge of not emitting tokens larger than this value. /// Currently, if a faulty tokenizer implementation emits tokens with a length larger than /// `2^16 - 1 - 5`, the token will simply be ignored downstream. 
-pub const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;
+pub(crate) const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;
 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {
     use super::{
         Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
     };
@@ -172,7 +172,7 @@ pub mod tests {
     /// This is a function that can be used in tests and doc tests
     /// to assert a token's correctness.
-    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
+    pub(crate) fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
         assert_eq!(
             token.position, position,
             "expected position {} but {:?}",
diff --git a/cozo-core/src/fts/tokenizer/ngram_tokenizer.rs b/cozo-core/src/fts/tokenizer/ngram_tokenizer.rs
index 9b8b2bb0..561542e1 100644
--- a/cozo-core/src/fts/tokenizer/ngram_tokenizer.rs
+++ b/cozo-core/src/fts/tokenizer/ngram_tokenizer.rs
@@ -31,7 +31,7 @@ use crate::fts::tokenizer::BoxTokenStream;
 ///
 /// # Example
 ///
-/// ```rust
+/// ```text
 /// use tantivy::tokenizer::*;
 ///
 /// let tokenizer = NgramTokenizer::new(2, 3, false);
@@ -81,7 +81,7 @@ use crate::fts::tokenizer::BoxTokenStream;
 /// assert!(stream.next().is_none());
 /// ```
 #[derive(Clone)]
-pub struct NgramTokenizer {
+pub(crate) struct NgramTokenizer {
     /// min size of the n-gram
     min_gram: usize,
     /// max size of the n-gram
@@ -92,7 +92,7 @@ pub struct NgramTokenizer {
 impl NgramTokenizer {
     /// Configures a new Ngram tokenizer
-    pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
+    pub(crate) fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
         assert!(min_gram > 0, "min_gram must be greater than 0");
         assert!(
             min_gram <= max_gram,
@@ -108,19 +108,19 @@ impl NgramTokenizer {
     /// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
     ///
     /// This is as opposed to only prefix ngrams .
-    pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+    pub(crate) fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
         Self::new(min_gram, max_gram, false)
     }
     /// Create a `NGramTokenizer` which only generates tokens for the
     /// prefix ngrams.
-    pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+    pub(crate) fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
         Self::new(min_gram, max_gram, true)
     }
 }
 /// TokenStream associate to the `NgramTokenizer`
-pub struct NgramTokenStream<'a> {
+pub(crate) struct NgramTokenStream<'a> {
     /// parameters
     ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
     /// true if the NgramTokenStream is in prefix mode.
@@ -194,7 +194,7 @@ struct StutteringIterator<T> {
 impl<T> StutteringIterator<T>
 where T: Iterator<Item = usize>
 {
-    pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
+    pub(crate) fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
         assert!(min_gram > 0);
         let memory: Vec<usize> = (&mut underlying).take(max_gram + 1).collect();
         if memory.len() <= min_gram {
diff --git a/cozo-core/src/fts/tokenizer/raw_tokenizer.rs b/cozo-core/src/fts/tokenizer/raw_tokenizer.rs
index 607cedf6..70976908 100644
--- a/cozo-core/src/fts/tokenizer/raw_tokenizer.rs
+++ b/cozo-core/src/fts/tokenizer/raw_tokenizer.rs
@@ -3,9 +3,9 @@ use crate::fts::tokenizer::BoxTokenStream;
 /// For each value of the field, emit a single unprocessed token.
 #[derive(Clone)]
-pub struct RawTokenizer;
+pub(crate) struct RawTokenizer;
-pub struct RawTokenStream {
+pub(crate) struct RawTokenStream {
     token: Token,
     has_token: bool,
 }
diff --git a/cozo-core/src/fts/tokenizer/remove_long.rs b/cozo-core/src/fts/tokenizer/remove_long.rs
index 9f992bf1..3714b7e5 100644
--- a/cozo-core/src/fts/tokenizer/remove_long.rs
+++ b/cozo-core/src/fts/tokenizer/remove_long.rs
@@ -1,5 +1,5 @@
 //! # Example
-//! ```rust
+//! ```text
 //! use tantivy::tokenizer::*;
 //!
 //! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
@@ -20,13 +20,13 @@ use crate::fts::tokenizer::BoxTokenStream;
 /// It is especially useful when indexing unconstrained content.
 /// e.g. Mail containing base-64 encoded pictures etc.
 #[derive(Clone)]
-pub struct RemoveLongFilter {
+pub(crate) struct RemoveLongFilter {
     length_limit: usize,
 }
 impl RemoveLongFilter {
     /// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
-    pub fn limit(length_limit: usize) -> RemoveLongFilter {
+    pub(crate) fn limit(length_limit: usize) -> RemoveLongFilter {
         RemoveLongFilter { length_limit }
     }
 }
@@ -46,7 +46,7 @@ impl TokenFilter for RemoveLongFilter {
     }
 }
-pub struct RemoveLongFilterStream<'a> {
+pub(crate) struct RemoveLongFilterStream<'a> {
     token_length_limit: usize,
     tail: BoxTokenStream<'a>,
 }
diff --git a/cozo-core/src/fts/tokenizer/simple_tokenizer.rs b/cozo-core/src/fts/tokenizer/simple_tokenizer.rs
index 7ec71182..76dd7c20 100644
--- a/cozo-core/src/fts/tokenizer/simple_tokenizer.rs
+++ b/cozo-core/src/fts/tokenizer/simple_tokenizer.rs
@@ -4,9 +4,9 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
 /// Tokenize the text by splitting on whitespaces and punctuation.
 #[derive(Clone)]
-pub struct SimpleTokenizer;
+pub(crate) struct SimpleTokenizer;
-pub struct SimpleTokenStream<'a> {
+pub(crate) struct SimpleTokenStream<'a> {
     text: &'a str,
     chars: CharIndices<'a>,
     token: Token,
diff --git a/cozo-core/src/fts/tokenizer/split_compound_words.rs b/cozo-core/src/fts/tokenizer/split_compound_words.rs
index ce3cf323..5b8b002c 100644
--- a/cozo-core/src/fts/tokenizer/split_compound_words.rs
+++ b/cozo-core/src/fts/tokenizer/split_compound_words.rs
@@ -17,7 +17,7 @@ use miette::{IntoDiagnostic, Result};
 /// e.g. the missing stem "back" of "backen" implies that "brotbackautomat"
 /// is not split in the following example.
 ///
-/// ```rust
+/// ```text
 /// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
 ///
 /// let tokenizer =
@@ -38,7 +38,7 @@ use miette::{IntoDiagnostic, Result};
 ///
 /// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
 #[derive(Clone)]
-pub struct SplitCompoundWords {
+pub(crate) struct SplitCompoundWords {
     dict: AhoCorasick,
 }
@@ -48,7 +48,7 @@ impl SplitCompoundWords {
     /// The dictionary will be used to construct an [`AhoCorasick`] automaton
     /// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
     /// more control over its construction is required.
-    pub fn from_dictionary<P, I>(dict: I) -> Result<Self>
+    pub(crate) fn from_dictionary<P, I>(dict: I) -> Result<Self>
     where
         I: IntoIterator<Item = P>,
         P: AsRef<[u8]>,
@@ -67,7 +67,7 @@ impl SplitCompoundWords {
     ///
     /// The automaton should use one of the leftmost-first match kinds
     /// and it should not be anchored.
-    pub fn from_automaton(dict: AhoCorasick) -> Self {
+    pub(crate) fn from_automaton(dict: AhoCorasick) -> Self {
         Self { dict }
     }
 }
diff --git a/cozo-core/src/fts/tokenizer/stemmer.rs b/cozo-core/src/fts/tokenizer/stemmer.rs
index 5a946446..ee7b6316 100644
--- a/cozo-core/src/fts/tokenizer/stemmer.rs
+++ b/cozo-core/src/fts/tokenizer/stemmer.rs
@@ -9,7 +9,7 @@ use crate::fts::tokenizer::BoxTokenStream;
 /// Available stemmer languages.
 #[derive(Debug, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq, Copy, Clone)]
 #[allow(missing_docs)]
-pub enum Language {
+pub(crate) enum Language {
     Arabic,
     Danish,
     Dutch,
@@ -60,13 +60,13 @@ impl Language {
 /// languages.
 /// Tokens are expected to be lowercased beforehand.
 #[derive(Clone)]
-pub struct Stemmer {
+pub(crate) struct Stemmer {
     stemmer_algorithm: Algorithm,
 }
 impl Stemmer {
     /// Creates a new `Stemmer` [`TokenFilter`] for a given language algorithm.
-    pub fn new(language: Language) -> Stemmer {
+    pub(crate) fn new(language: Language) -> Stemmer {
         Stemmer {
             stemmer_algorithm: language.algorithm(),
         }
@@ -91,7 +91,7 @@ impl TokenFilter for Stemmer {
     }
 }
-pub struct StemmerTokenStream<'a> {
+pub(crate) struct StemmerTokenStream<'a> {
     tail: BoxTokenStream<'a>,
     stemmer: rust_stemmers::Stemmer,
     buffer: String,
diff --git a/cozo-core/src/fts/tokenizer/stop_word_filter/gen_stopwords.py b/cozo-core/src/fts/tokenizer/stop_word_filter/gen_stopwords.py
index 333fa92c..22f1df5e 100644
--- a/cozo-core/src/fts/tokenizer/stop_word_filter/gen_stopwords.py
+++ b/cozo-core/src/fts/tokenizer/stop_word_filter/gen_stopwords.py
@@ -31,7 +31,7 @@ with requests.Session() as sess, open("stopwords.rs", "w") as mod:
         resp = sess.get(f"https://snowballstem.org/algorithms/{lang}/stop.txt")
         resp.raise_for_status()
-        mod.write(f"pub const {lang.upper()}: &[&str] = &[\n")
+        mod.write(f"pub(crate) const {lang.upper()}: &[&str] = &[\n")
         for line in resp.text.splitlines():
             line, _, _ = line.partition("|")
diff --git a/cozo-core/src/fts/tokenizer/stop_word_filter/mod.rs b/cozo-core/src/fts/tokenizer/stop_word_filter/mod.rs
index 1d68d56a..65e3ace7 100644
--- a/cozo-core/src/fts/tokenizer/stop_word_filter/mod.rs
+++ b/cozo-core/src/fts/tokenizer/stop_word_filter/mod.rs
@@ -10,19 +10,19 @@
 //! assert_eq!(stream.next().unwrap().text, "crafty");
 //! assert!(stream.next().is_none());
 //! ```
-#[cfg(feature = "stopwords")]
 #[rustfmt::skip]
 mod stopwords;
 use std::sync::Arc;
 use rustc_hash::FxHashSet;
+use crate::fts::tokenizer::Language;
 use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
 /// `TokenFilter` that removes stop words from a token stream
 #[derive(Clone)]
-pub struct StopWordFilter {
+pub(crate) struct StopWordFilter {
     words: Arc<FxHashSet<String>>,
 }
@@ -30,8 +30,7 @@ impl StopWordFilter {
     /// Creates a new [`StopWordFilter`] for the given [`Language`]
     ///
     /// Returns `Some` if a list of stop words is available and `None` otherwise.
-    #[cfg(feature = "stopwords")]
-    pub fn new(language: Language) -> Option<StopWordFilter> {
+    pub(crate) fn new(language: Language) -> Option<StopWordFilter> {
         let words = match language {
             Language::Danish => stopwords::DANISH,
             Language::Dutch => stopwords::DUTCH,
@@ -60,14 +59,14 @@ impl StopWordFilter {
     }
     /// Creates a `StopWordFilter` given a list of words to remove
-    pub fn remove<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter {
+    pub(crate) fn remove<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter {
         StopWordFilter {
             words: Arc::new(words.into_iter().collect()),
         }
     }
 }
-pub struct StopWordFilterStream<'a> {
+pub(crate) struct StopWordFilterStream<'a> {
     words: Arc<FxHashSet<String>>,
     tail: BoxTokenStream<'a>,
 }
diff --git a/cozo-core/src/fts/tokenizer/stop_word_filter/stopwords.rs b/cozo-core/src/fts/tokenizer/stop_word_filter/stopwords.rs
index 7fc47ac4..ad6010f3 100644
--- a/cozo-core/src/fts/tokenizer/stop_word_filter/stopwords.rs
+++ b/cozo-core/src/fts/tokenizer/stop_word_filter/stopwords.rs
@@ -33,7 +33,7 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-pub const DANISH: &[&str] = &[
+pub(crate) const DANISH: &[&str] = &[
     "og",
     "i",
     "jeg",
@@ -130,7 +130,7 @@ pub const DANISH: &[&str] = &[
     "sådan",
 ];
-pub const DUTCH: &[&str] = &[
+pub(crate) const DUTCH: &[&str] = &[
     "de",
     "en",
     "van",
@@ -234,7 +234,7 @@ pub const DUTCH: &[&str] = &[
     "andere",
 ];
-pub const FINNISH: &[&str] = &[
+pub(crate) const FINNISH: &[&str] = &[
     "olla",
     "olen",
     "olet",
@@ -471,7 +471,7 @@ pub const FINNISH: &[&str] = &[
     "itse",
 ];
-pub const FRENCH: &[&str] = &[
+pub(crate) const FRENCH: &[&str] = &[
     "au",
     "aux",
     "avec",
@@ -628,7 +628,7 @@ pub const FRENCH: &[&str] = &[
     "soi",
 ];
-pub const GERMAN: &[&str] = &[
+pub(crate) const GERMAN: &[&str] = &[
     "aber",
     "alle",
     "allem",
@@ -862,7 +862,7 @@ pub const GERMAN: &[&str] = &[
     "zwischen",
 ];
-pub const ITALIAN: &[&str] = &[
+pub(crate) const ITALIAN: &[&str] = &[
     "ad",
     "al",
     "allo",
@@ -1144,7 +1144,7 @@ pub const ITALIAN: &[&str] = &[
     "stando",
 ];
-pub const NORWEGIAN: &[&str] = &[
+pub(crate) const NORWEGIAN: &[&str] = &[
     "og",
     "i",
     "jeg",
@@ -1319,7 +1319,7 @@ pub const NORWEGIAN: &[&str] = &[
     "vart",
 ];
-pub const PORTUGUESE: &[&str] = &[
+pub(crate) const PORTUGUESE: &[&str] = &[
     "de",
     "a",
     "o",
@@ -1525,7 +1525,7 @@ pub const PORTUGUESE: &[&str] = &[
     "teriam",
 ];
-pub const RUSSIAN: &[&str] = &[
+pub(crate) const RUSSIAN: &[&str] = &[
     "и",
     "в",
     "во",
@@ -1687,7 +1687,7 @@ pub const RUSSIAN: &[&str] = &[
     "между",
 ];
-pub const SPANISH: &[&str] = &[
+pub(crate) const SPANISH: &[&str] = &[
     "de",
     "la",
     "que",
@@ -1998,7 +1998,7 @@ pub const SPANISH: &[&str] = &[
     "tened",
 ];
-pub const SWEDISH: &[&str] = &[
+pub(crate) const SWEDISH: &[&str] = &[
     "och",
     "det",
     "att",
diff --git a/cozo-core/src/fts/tokenizer/tokenized_string.rs b/cozo-core/src/fts/tokenizer/tokenized_string.rs
index 1e369134..ccb579cf 100644
--- a/cozo-core/src/fts/tokenizer/tokenized_string.rs
+++ b/cozo-core/src/fts/tokenizer/tokenized_string.rs
@@ -4,11 +4,11 @@ use crate::fts::tokenizer::{Token, TokenStream};
 /// Struct representing pre-tokenized text
 #[derive(Debug, Clone, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq)]
-pub struct PreTokenizedString {
+pub(crate) struct PreTokenizedString {
     /// Original text
-    pub text: String,
+    pub(crate) text: String,
     /// Tokens derived from the text
-    pub tokens: Vec<Token>,
+    pub(crate) tokens: Vec<Token>,
 }
 impl Ord for PreTokenizedString {
@@ -24,7 +24,7 @@ impl PartialOrd for PreTokenizedString {
 }
 /// [`TokenStream`] implementation which wraps [`PreTokenizedString`]
-pub struct PreTokenizedStream {
+pub(crate) struct PreTokenizedStream {
     tokenized_string: PreTokenizedString,
     current_token: i64,
 }
diff --git a/cozo-core/src/fts/tokenizer/tokenizer.rs b/cozo-core/src/fts/tokenizer/tokenizer.rs
index 7e80db97..61e7fac4 100644
--- a/cozo-core/src/fts/tokenizer/tokenizer.rs
+++ b/cozo-core/src/fts/tokenizer/tokenizer.rs
@@ -7,20 +7,20 @@ use crate::fts::tokenizer::empty_tokenizer::EmptyTokenizer;
 /// Token
 #[derive(Debug, Clone, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq)]
-pub struct Token {
+pub(crate) struct Token {
     /// Offset (byte index) of the first character of the token.
     /// Offsets shall not be modified by token filters.
-    pub offset_from: usize,
+    pub(crate) offset_from: usize,
     /// Offset (byte index) of the last character of the token + 1.
     /// The text that generated the token should be obtained by
     /// &text[token.offset_from..token.offset_to]
-    pub offset_to: usize,
+    pub(crate) offset_to: usize,
     /// Position, expressed in number of tokens.
-    pub position: usize,
+    pub(crate) position: usize,
     /// Actual text content of the token.
-    pub text: String,
+    pub(crate) text: String,
     /// Is the length expressed in term of number of original tokens.
-    pub position_length: usize,
+    pub(crate) position_length: usize,
 }
 impl Default for Token {
@@ -38,7 +38,7 @@ impl Default for Token {
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
-pub struct TextAnalyzer {
+pub(crate) struct TextAnalyzer {
     tokenizer: Box<dyn Tokenizer>,
     token_filters: Vec<BoxTokenFilter>,
 }
@@ -60,7 +60,7 @@ impl TextAnalyzer {
     ///
     /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
     /// `TextAnalyzer::from(tokenizer)`.
-    pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
+    pub(crate) fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
         TextAnalyzer {
             tokenizer: Box::new(tokenizer),
             token_filters,
@@ -74,7 +74,7 @@ impl TextAnalyzer {
     ///
     /// # Example
     ///
-    /// ```rust
+    /// ```text
     /// use tantivy::tokenizer::*;
     ///
     /// let en_stem = TextAnalyzer::from(SimpleTokenizer)
     ///     .filter(RemoveLongFilter::limit(40))
     ///     .filter(LowerCaser)
     ///     .filter(Stemmer::default());
     /// ```
     #[must_use]
-    pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
+    pub(crate) fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
         self.token_filters.push(token_filter.into());
         self
     }
     /// Creates a token stream for a given `str`.
-    pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+    pub(crate) fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
         let mut token_stream = self.tokenizer.token_stream(text);
         for token_filter in &self.token_filters {
             token_stream = token_filter.transform(token_stream);
@@ -119,12 +119,12 @@ impl Clone for TextAnalyzer {
 /// # Warning
 ///
 /// This API may change to use associated types.
-pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
+pub(crate) trait Tokenizer: 'static + Send + Sync + TokenizerClone {
     /// Creates a token stream for a given `str`.
     fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
 }
-pub trait TokenizerClone {
+pub(crate) trait TokenizerClone {
     fn box_clone(&self) -> Box<dyn Tokenizer>;
 }
@@ -154,7 +154,7 @@ impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
 /// Simple wrapper of `Box<dyn TokenStream + 'a>`.
 ///
 /// See [`TokenStream`] for more information.
-pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
+pub(crate) struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
 impl<'a, T> From<T> for BoxTokenStream<'a>
 where
@@ -181,7 +181,7 @@ impl<'a> DerefMut for BoxTokenStream<'a> {
 /// Simple wrapper of `Box<dyn TokenFilter>`.
 ///
 /// See [`TokenFilter`] for more information.
-pub struct BoxTokenFilter(Box<dyn TokenFilter>);
+pub(crate) struct BoxTokenFilter(Box<dyn TokenFilter>);
 impl Deref for BoxTokenFilter {
     type Target = dyn TokenFilter;
@@ -203,7 +203,7 @@ impl<T: TokenFilter> From<T> for BoxTokenFilter {
 ///
 /// # Example
 ///
-/// ```
+/// ```text
 /// use tantivy::tokenizer::*;
 ///
 /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
@@ -225,7 +225,7 @@ impl<T: TokenFilter> From<T> for BoxTokenFilter {
 ///     assert_eq!(token.position, 1);
 /// }
 /// ```
-pub trait TokenStream {
+pub(crate) trait TokenStream {
     /// Advance to the next token
     ///
     /// Returns false if there are no other tokens.
@@ -241,7 +241,7 @@ pub trait TokenStream {
     /// simply combines a call to `.advance()`
     /// and `.token()`.
     ///
-    /// ```
+    /// ```text
     /// use tantivy::tokenizer::*;
     ///
     /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
@@ -271,12 +271,12 @@ pub trait TokenStream {
     }
 }
-pub trait TokenFilterClone {
+pub(crate) trait TokenFilterClone {
     fn box_clone(&self) -> BoxTokenFilter;
 }
 /// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
+pub(crate) trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
     /// Wraps a token stream and returns the modified one.
     fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
 }
diff --git a/cozo-core/src/fts/tokenizer/tokenizer_manager.rs b/cozo-core/src/fts/tokenizer/tokenizer_manager.rs
index 471c85ac..6b0d47b2 100644
--- a/cozo-core/src/fts/tokenizer/tokenizer_manager.rs
+++ b/cozo-core/src/fts/tokenizer/tokenizer_manager.rs
@@ -21,20 +21,20 @@ use crate::fts::tokenizer::{
 /// search engine.
 /// * `whitespace` : Splits the text on whitespaces.
 #[derive(Clone)]
-pub struct TokenizerManager {
+pub(crate) struct TokenizerManager {
     tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
 }
 impl TokenizerManager {
     /// Creates an empty tokenizer manager.
-    pub fn new() -> Self {
+    pub(crate) fn new() -> Self {
         Self {
             tokenizers: Arc::new(RwLock::new(HashMap::new())),
         }
     }
     /// Registers a new tokenizer associated with a given name.
-    pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
+    pub(crate) fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
     where TextAnalyzer: From<T> {
         let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
         self.tokenizers
@@ -44,7 +44,7 @@ impl TokenizerManager {
     }
     /// Accessing a tokenizer given its name.
-    pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
+    pub(crate) fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
         self.tokenizers
             .read()
             .expect("Acquiring the lock should never fail")
diff --git a/cozo-core/src/fts/tokenizer/whitespace_tokenizer.rs b/cozo-core/src/fts/tokenizer/whitespace_tokenizer.rs
index a12e6440..ba6b6658 100644
--- a/cozo-core/src/fts/tokenizer/whitespace_tokenizer.rs
+++ b/cozo-core/src/fts/tokenizer/whitespace_tokenizer.rs
@@ -4,9 +4,9 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
 /// Tokenize the text by splitting on whitespaces.
#[derive(Clone)] -pub struct WhitespaceTokenizer; +pub(crate) struct WhitespaceTokenizer; -pub struct WhitespaceTokenStream<'a> { +pub(crate) struct WhitespaceTokenStream<'a> { text: &'a str, chars: CharIndices<'a>, token: Token, diff --git a/cozo-core/src/runtime/tests.rs b/cozo-core/src/runtime/tests.rs index 795411d3..ed2814c1 100644 --- a/cozo-core/src/runtime/tests.rs +++ b/cozo-core/src/runtime/tests.rs @@ -610,6 +610,21 @@ fn test_index() { assert!(joins.contains(&json!(":friends:rev"))); } +#[test] +fn test_json_objects() { + let db = new_cozo_mem().unwrap(); + db.run_script( + "?[a] := a = {'a': 1}", + Default::default(), + ).unwrap(); + db.run_script( + r"?[a] := a = { + 'a': 1 + }", + Default::default(), + ).unwrap(); +} + #[test] fn test_custom_rules() { let db = new_cozo_mem().unwrap(); @@ -899,7 +914,10 @@ fn test_insertions() { db.run_script(r"?[k, v] := *a{k, v}", Default::default()) .unwrap(); db.run_script( - r"::hnsw create a:i {fields: [v], dim: 1536, ef: 16, m: 32, filter: k % 3 == 0}", + r"::hnsw create a:i { + fields: [v], dim: 1536, ef: 16, filter: k % 3 == 0, + m: 32 + }", Default::default(), ) .unwrap(); @@ -926,7 +944,8 @@ fn test_insertions() { #[test] fn tentivy_tokenizers() { - use crate::fts::cangjie::*; + use crate::fts::cangjie::tokenizer::CangJieTokenizer; + use crate::fts::cangjie::options::TokenizerOption; use crate::fts::tokenizer::*; use jieba_rs::Jieba;