imported code visibility

main
Ziyang Hu 1 year ago
parent 2a5e568d58
commit 271f36301d

Cargo.lock (generated)

@ -336,6 +336,15 @@ version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
"serde",
]
[[package]]
name = "bindgen"
version = "0.57.0"
@ -700,6 +709,7 @@ dependencies = [
"document-features",
"either",
"env_logger",
"fast2s",
"graph",
"itertools 0.10.5",
"jieba-rs",
@ -738,7 +748,6 @@ dependencies = [
"tokio",
"unicode-normalization",
"uuid",
"whatlang",
]
[[package]]
@ -1203,6 +1212,17 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c"
[[package]]
name = "fast2s"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1316063b5422f1f7bf4cc784c959eaf04b843de7c9ecbd4190c60614aa23b27e"
dependencies = [
"bincode",
"hashbrown",
"lazy_static",
]
[[package]]
name = "fastrand"
version = "1.9.0"
@ -4389,16 +4409,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "whatlang"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043"
dependencies = [
"hashbrown",
"once_cell",
]
[[package]]
name = "which"
version = "4.4.0"

@ -7,9 +7,3 @@
pub(crate) mod options;
pub(crate) mod stream;
pub(crate) mod tokenizer;
pub(crate) use {
options::TokenizerOption, stream::CangjieTokenStream, tokenizer::CangJieTokenizer,
};
pub const CANG_JIE: &str = "CANG_JIE";
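With the `pub(crate) use` re-export and the `CANG_JIE` constant removed from this module, call sites now spell out the full paths, as the updated `tentivy_tokenizers` test at the bottom of this commit does:

    use crate::fts::cangjie::tokenizer::CangJieTokenizer;
    use crate::fts::cangjie::options::TokenizerOption;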

@ -1,6 +1,6 @@
/// Tokenizer Option
#[derive(Debug, Clone)]
pub enum TokenizerOption {
pub(crate) enum TokenizerOption {
/// Cut the input text, return all possible words
All,
/// Cut the input text
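This hunk, like most of the commit, narrows a vendored item from `pub` to `pub(crate)`: it stays usable everywhere inside the crate but is no longer exported from it. A minimal sketch of the distinction, with hypothetical names:

    mod fts_demo {
        pub struct Exported;          // would be part of the public API if the module were public
        pub(crate) struct CrateLocal; // reachable anywhere in this crate, never from dependent crates
    }

    fn inside_the_crate() {
        let _ = fts_demo::Exported;   // fine
        let _ = fts_demo::CrateLocal; // also fine, because we are in the defining crate
    }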

@ -1,7 +1,7 @@
use crate::fts::tokenizer::Token;
#[derive(Debug)]
pub struct CangjieTokenStream<'a> {
pub(crate) struct CangjieTokenStream<'a> {
result: Vec<&'a str>,
// Begin with 1
index: usize,
@ -10,7 +10,7 @@ pub struct CangjieTokenStream<'a> {
}
impl<'a> CangjieTokenStream<'a> {
pub fn new(result: Vec<&'a str>) -> Self {
pub(crate) fn new(result: Vec<&'a str>) -> Self {
CangjieTokenStream {
result,
index: 0,

@ -5,11 +5,11 @@ use std::sync::Arc;
use crate::fts::tokenizer::BoxTokenStream;
#[derive(Clone, Debug)]
pub struct CangJieTokenizer {
pub(crate) struct CangJieTokenizer {
/// Separation algorithm provider
pub worker: Arc<Jieba>,
pub(crate) worker: Arc<Jieba>,
/// Separation config
pub option: TokenizerOption,
pub(crate) option: TokenizerOption,
}
impl Default for CangJieTokenizer {
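The fields themselves are unchanged, only their visibility narrows, so constructing the tokenizer inside the crate still looks like this sketch (using `Jieba::new()` from jieba-rs with its default dictionary):

    use std::sync::Arc;
    use jieba_rs::Jieba;

    let tokenizer = CangJieTokenizer {
        worker: Arc::new(Jieba::new()),
        option: TokenizerOption::All,
    };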

@ -1,32 +1,32 @@
//! # Example
//! ```rust
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(RawTokenizer)
//! .filter(AlphaNumOnlyFilter);
//!
//! let mut stream = tokenizer.token_stream("hello there");
//! // is none because the raw filter emits one token that
//! // contains a space
//! assert!(stream.next().is_none());
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
//! .filter(AlphaNumOnlyFilter);
//!
//! let mut stream = tokenizer.token_stream("hello there 💣");
//! assert!(stream.next().is_some());
//! assert!(stream.next().is_some());
//! // the "emoji" is dropped because its not an alphanum
//! assert!(stream.next().is_none());
//! ```
// # Example
// ```rust
// use tantivy::tokenizer::*;
//
// let tokenizer = TextAnalyzer::from(RawTokenizer)
// .filter(AlphaNumOnlyFilter);
//
// let mut stream = tokenizer.token_stream("hello there");
// // is none because the raw filter emits one token that
// // contains a space
// assert!(stream.next().is_none());
//
// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
// .filter(AlphaNumOnlyFilter);
//
// let mut stream = tokenizer.token_stream("hello there 💣");
// assert!(stream.next().is_some());
// assert!(stream.next().is_some());
// // the "emoji" is dropped because its not an alphanum
// assert!(stream.next().is_none());
// ```
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
/// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters.
#[derive(Clone)]
pub struct AlphaNumOnlyFilter;
pub(crate) struct AlphaNumOnlyFilter;
pub struct AlphaNumOnlyFilterStream<'a> {
pub(crate) struct AlphaNumOnlyFilterStream<'a> {
tail: BoxTokenStream<'a>,
}
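Demoting the module-level `//!` doc comment to plain `//` comments here, and switching later doc fences from ```rust to ```text, have the same effect: rustdoc no longer compiles or runs these snippets as doctests, which would otherwise fail because they reference the upstream `tantivy::tokenizer::*` paths rather than this vendored, now `pub(crate)`, copy. A minimal sketch of the ```text form, with a hypothetical item name:

    /// ```text
    /// // Rendered in the docs but neither compiled nor run by `cargo test`,
    /// // so it may keep referring to the upstream API.
    /// use tantivy::tokenizer::*;
    /// let tokenizer = TextAnalyzer::from(RawTokenizer).filter(AlphaNumOnlyFilter);
    /// ```
    pub(crate) struct DoctestDisabled; // hypothetical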

@ -6,7 +6,7 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
/// block) into their ASCII equivalents, if one exists.
#[derive(Clone)]
pub struct AsciiFoldingFilter;
pub(crate) struct AsciiFoldingFilter;
impl TokenFilter for AsciiFoldingFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
@ -17,7 +17,7 @@ impl TokenFilter for AsciiFoldingFilter {
}
}
pub struct AsciiFoldingFilterTokenStream<'a> {
pub(crate) struct AsciiFoldingFilterTokenStream<'a> {
buffer: String,
tail: BoxTokenStream<'a>,
}
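A usage sketch for the folding filter above, chained like the other filters in this commit; the folded output is an assumption based on the doc comment, not taken from the source:

    let analyzer = TextAnalyzer::from(SimpleTokenizer).filter(AsciiFoldingFilter);
    let mut stream = analyzer.token_stream("Frühling");
    // The non-ASCII "ü" should be folded to its ASCII equivalent, yielding "Fruhling".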

@ -14,9 +14,9 @@ impl TokenFilter for LowerCaser {
/// Token filter that lowercase terms.
#[derive(Clone)]
pub struct LowerCaser;
pub(crate) struct LowerCaser;
pub struct LowerCaserTokenStream<'a> {
pub(crate) struct LowerCaserTokenStream<'a> {
buffer: String,
tail: BoxTokenStream<'a>,
}

@ -10,7 +10,7 @@
//! You must define in your schema which tokenizer should be used for
//! each of your fields :
//!
//! ```rust
//! ```text
//! use tantivy::schema::*;
//!
//! let mut schema_builder = Schema::builder();
@ -67,7 +67,7 @@
//!
//! For instance, the `en_stem` is defined as follows.
//!
//! ```rust
//! ```text
//! use tantivy::tokenizer::*;
//!
//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
@ -79,7 +79,7 @@
//! Once your tokenizer is defined, you need to
//! register it with a name in your index's [`TokenizerManager`].
//!
//! ```rust
//! ```text
//! # use tantivy::schema::Schema;
//! # use tantivy::tokenizer::*;
//! # use tantivy::Index;
@ -99,7 +99,7 @@
//!
//! # Example
//!
//! ```rust
//! ```text
//! use tantivy::schema::{Schema, IndexRecordOption, TextOptions, TextFieldIndexing};
//! use tantivy::tokenizer::*;
//! use tantivy::Index;
@ -139,32 +139,32 @@ mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter;
pub use self::lower_caser::LowerCaser;
pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::raw_tokenizer::RawTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{
pub(crate) use self::alphanum_only::AlphaNumOnlyFilter;
pub(crate) use self::ascii_folding_filter::AsciiFoldingFilter;
pub(crate) use self::lower_caser::LowerCaser;
pub(crate) use self::ngram_tokenizer::NgramTokenizer;
pub(crate) use self::raw_tokenizer::RawTokenizer;
pub(crate) use self::remove_long::RemoveLongFilter;
pub(crate) use self::simple_tokenizer::SimpleTokenizer;
pub(crate) use self::split_compound_words::SplitCompoundWords;
pub(crate) use self::stemmer::{Language, Stemmer};
pub(crate) use self::stop_word_filter::StopWordFilter;
pub(crate) use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub(crate) use self::tokenizer::{
BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
};
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;
pub(crate) use self::tokenizer_manager::TokenizerManager;
pub(crate) use self::whitespace_tokenizer::WhitespaceTokenizer;
/// Maximum authorized len (in bytes) for a token.
///
/// Tokenizers are in charge of not emitting tokens larger than this value.
/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
/// `2^16 - 1 - 5`, the token will simply be ignored downstream.
pub const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;
pub(crate) const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;
#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
use super::{
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
};
@ -172,7 +172,7 @@ pub mod tests {
/// This is a function that can be used in tests and doc tests
/// to assert a token's correctness.
pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
pub(crate) fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
assert_eq!(
token.position, position,
"expected position {} but {:?}",

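Two notes on the module file above: `MAX_TOKEN_LEN` stays `u16::MAX as usize - 5`, i.e. 65535 - 5 = 65530 = 2^16 - 1 - 5, matching its doc comment; and `assert_token` keeps its signature, so a call still looks like this sketch (values are illustrative only):

    let token = Token {
        offset_from: 0,
        offset_to: 5,
        position: 0,
        text: "hello".to_string(),
        position_length: 1,
    };
    assert_token(&token, 0, "hello", 0, 5);
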
@ -31,7 +31,7 @@ use crate::fts::tokenizer::BoxTokenStream;
///
/// # Example
///
/// ```rust
/// ```text
/// use tantivy::tokenizer::*;
///
/// let tokenizer = NgramTokenizer::new(2, 3, false);
@ -81,7 +81,7 @@ use crate::fts::tokenizer::BoxTokenStream;
/// assert!(stream.next().is_none());
/// ```
#[derive(Clone)]
pub struct NgramTokenizer {
pub(crate) struct NgramTokenizer {
/// min size of the n-gram
min_gram: usize,
/// max size of the n-gram
@ -92,7 +92,7 @@ pub struct NgramTokenizer {
impl NgramTokenizer {
/// Configures a new Ngram tokenizer
pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
pub(crate) fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
assert!(min_gram > 0, "min_gram must be greater than 0");
assert!(
min_gram <= max_gram,
@ -108,19 +108,19 @@ impl NgramTokenizer {
/// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
///
/// This is as opposed to only prefix ngrams .
pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
pub(crate) fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
Self::new(min_gram, max_gram, false)
}
/// Create a `NGramTokenizer` which only generates tokens for the
/// prefix ngrams.
pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
pub(crate) fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
Self::new(min_gram, max_gram, true)
}
}
/// TokenStream associate to the `NgramTokenizer`
pub struct NgramTokenStream<'a> {
pub(crate) struct NgramTokenStream<'a> {
/// parameters
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
/// true if the NgramTokenStream is in prefix mode.
@ -194,7 +194,7 @@ struct StutteringIterator<T> {
impl<T> StutteringIterator<T>
where T: Iterator<Item = usize>
{
pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
pub(crate) fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
assert!(min_gram > 0);
let memory: Vec<usize> = (&mut underlying).take(max_gram + 1).collect();
if memory.len() <= min_gram {
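As the hunks above show, the two convenience constructors differ only in the `prefix_only` flag they forward to `new`, which asserts `min_gram > 0` and `min_gram <= max_gram`:

    let all = NgramTokenizer::all_ngrams(2, 3);       // NgramTokenizer::new(2, 3, false)
    let prefixes = NgramTokenizer::prefix_only(2, 3); // NgramTokenizer::new(2, 3, true)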

@ -3,9 +3,9 @@ use crate::fts::tokenizer::BoxTokenStream;
/// For each value of the field, emit a single unprocessed token.
#[derive(Clone)]
pub struct RawTokenizer;
pub(crate) struct RawTokenizer;
pub struct RawTokenStream {
pub(crate) struct RawTokenStream {
token: Token,
has_token: bool,
}

@ -1,5 +1,5 @@
//! # Example
//! ```rust
//! ```text
//! use tantivy::tokenizer::*;
//!
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
@ -20,13 +20,13 @@ use crate::fts::tokenizer::BoxTokenStream;
/// It is especially useful when indexing unconstrained content.
/// e.g. Mail containing base-64 encoded pictures etc.
#[derive(Clone)]
pub struct RemoveLongFilter {
pub(crate) struct RemoveLongFilter {
length_limit: usize,
}
impl RemoveLongFilter {
/// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation.
pub fn limit(length_limit: usize) -> RemoveLongFilter {
pub(crate) fn limit(length_limit: usize) -> RemoveLongFilter {
RemoveLongFilter { length_limit }
}
}
@ -46,7 +46,7 @@ impl TokenFilter for RemoveLongFilter {
}
}
pub struct RemoveLongFilterStream<'a> {
pub(crate) struct RemoveLongFilterStream<'a> {
token_length_limit: usize,
tail: BoxTokenStream<'a>,
}
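Usage stays as in the now-disabled doc example above; a minimal sketch, with the 40-byte limit chosen purely for illustration:

    // Drop any token whose UTF-8 representation exceeds 40 bytes.
    let analyzer = TextAnalyzer::from(SimpleTokenizer).filter(RemoveLongFilter::limit(40));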

@ -4,9 +4,9 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
/// Tokenize the text by splitting on whitespaces and punctuation.
#[derive(Clone)]
pub struct SimpleTokenizer;
pub(crate) struct SimpleTokenizer;
pub struct SimpleTokenStream<'a> {
pub(crate) struct SimpleTokenStream<'a> {
text: &'a str,
chars: CharIndices<'a>,
token: Token,

@ -17,7 +17,7 @@ use miette::{IntoDiagnostic, Result};
/// e.g. the missing stem "back" of "backen" implies that "brotbackautomat"
/// is not split in the following example.
///
/// ```rust
/// ```text
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
///
/// let tokenizer =
@ -38,7 +38,7 @@ use miette::{IntoDiagnostic, Result};
///
/// [compound]: https://en.wikipedia.org/wiki/Compound_(linguistics)
#[derive(Clone)]
pub struct SplitCompoundWords {
pub(crate) struct SplitCompoundWords {
dict: AhoCorasick,
}
@ -48,7 +48,7 @@ impl SplitCompoundWords {
/// The dictionary will be used to construct an [`AhoCorasick`] automaton
/// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
/// more control over its construction is required.
pub fn from_dictionary<I, P>(dict: I) -> Result<Self>
pub(crate) fn from_dictionary<I, P>(dict: I) -> Result<Self>
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
@ -67,7 +67,7 @@ impl SplitCompoundWords {
///
/// The automaton should use one of the leftmost-first match kinds
/// and it should not be anchored.
pub fn from_automaton(dict: AhoCorasick) -> Self {
pub(crate) fn from_automaton(dict: AhoCorasick) -> Self {
Self { dict }
}
}
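A sketch of the dictionary constructor above; `from_dictionary` returns a `miette` `Result`, propagated here with `?`, and the word list is illustrative only:

    let splitter = SplitCompoundWords::from_dictionary(["brot", "backen", "automat"])?;
    let analyzer = TextAnalyzer::from(SimpleTokenizer).filter(splitter);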

@ -9,7 +9,7 @@ use crate::fts::tokenizer::BoxTokenStream;
/// Available stemmer languages.
#[derive(Debug, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq, Copy, Clone)]
#[allow(missing_docs)]
pub enum Language {
pub(crate) enum Language {
Arabic,
Danish,
Dutch,
@ -60,13 +60,13 @@ impl Language {
/// languages.
/// Tokens are expected to be lowercased beforehand.
#[derive(Clone)]
pub struct Stemmer {
pub(crate) struct Stemmer {
stemmer_algorithm: Algorithm,
}
impl Stemmer {
/// Creates a new `Stemmer` [`TokenFilter`] for a given language algorithm.
pub fn new(language: Language) -> Stemmer {
pub(crate) fn new(language: Language) -> Stemmer {
Stemmer {
stemmer_algorithm: language.algorithm(),
}
@ -91,7 +91,7 @@ impl TokenFilter for Stemmer {
}
}
pub struct StemmerTokenStream<'a> {
pub(crate) struct StemmerTokenStream<'a> {
tail: BoxTokenStream<'a>,
stemmer: rust_stemmers::Stemmer,
buffer: String,
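Because the doc above notes that tokens are expected to be lowercased beforehand, the stemmer normally sits after a `LowerCaser` in the chain; a minimal sketch using a language visible in the enum:

    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(LowerCaser)
        .filter(Stemmer::new(Language::Danish));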

@ -31,7 +31,7 @@ with requests.Session() as sess, open("stopwords.rs", "w") as mod:
resp = sess.get(f"https://snowballstem.org/algorithms/{lang}/stop.txt")
resp.raise_for_status()
mod.write(f"pub const {lang.upper()}: &[&str] = &[\n")
mod.write(f"pub(crate) const {lang.upper()}: &[&str] = &[\n")
for line in resp.text.splitlines():
line, _, _ = line.partition("|")

@ -10,19 +10,19 @@
//! assert_eq!(stream.next().unwrap().text, "crafty");
//! assert!(stream.next().is_none());
//! ```
#[cfg(feature = "stopwords")]
#[rustfmt::skip]
mod stopwords;
use std::sync::Arc;
use rustc_hash::FxHashSet;
use crate::fts::tokenizer::Language;
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
/// `TokenFilter` that removes stop words from a token stream
#[derive(Clone)]
pub struct StopWordFilter {
pub(crate) struct StopWordFilter {
words: Arc<FxHashSet<String>>,
}
@ -30,8 +30,7 @@ impl StopWordFilter {
/// Creates a new [`StopWordFilter`] for the given [`Language`]
///
/// Returns `Some` if a list of stop words is available and `None` otherwise.
#[cfg(feature = "stopwords")]
pub fn new(language: Language) -> Option<Self> {
pub(crate) fn new(language: Language) -> Option<Self> {
let words = match language {
Language::Danish => stopwords::DANISH,
Language::Dutch => stopwords::DUTCH,
@ -60,14 +59,14 @@ impl StopWordFilter {
}
/// Creates a `StopWordFilter` given a list of words to remove
pub fn remove<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter {
pub(crate) fn remove<W: IntoIterator<Item = String>>(words: W) -> StopWordFilter {
StopWordFilter {
words: Arc::new(words.into_iter().collect()),
}
}
}
pub struct StopWordFilterStream<'a> {
pub(crate) struct StopWordFilterStream<'a> {
words: Arc<FxHashSet<String>>,
tail: BoxTokenStream<'a>,
}
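With the `stopwords` feature gate removed, the built-in lists are always compiled in; `new` still returns `Option<Self>`, while `remove` builds a filter from an ad-hoc list. A sketch:

    let danish = StopWordFilter::new(Language::Danish);           // Some(_) per the match above
    let custom = StopWordFilter::remove(vec!["foo".to_string()]); // caller-supplied words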

@ -33,7 +33,7 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
pub const DANISH: &[&str] = &[
pub(crate) const DANISH: &[&str] = &[
"og",
"i",
"jeg",
@ -130,7 +130,7 @@ pub const DANISH: &[&str] = &[
"sådan",
];
pub const DUTCH: &[&str] = &[
pub(crate) const DUTCH: &[&str] = &[
"de",
"en",
"van",
@ -234,7 +234,7 @@ pub const DUTCH: &[&str] = &[
"andere",
];
pub const FINNISH: &[&str] = &[
pub(crate) const FINNISH: &[&str] = &[
"olla",
"olen",
"olet",
@ -471,7 +471,7 @@ pub const FINNISH: &[&str] = &[
"itse",
];
pub const FRENCH: &[&str] = &[
pub(crate) const FRENCH: &[&str] = &[
"au",
"aux",
"avec",
@ -628,7 +628,7 @@ pub const FRENCH: &[&str] = &[
"soi",
];
pub const GERMAN: &[&str] = &[
pub(crate) const GERMAN: &[&str] = &[
"aber",
"alle",
"allem",
@ -862,7 +862,7 @@ pub const GERMAN: &[&str] = &[
"zwischen",
];
pub const ITALIAN: &[&str] = &[
pub(crate) const ITALIAN: &[&str] = &[
"ad",
"al",
"allo",
@ -1144,7 +1144,7 @@ pub const ITALIAN: &[&str] = &[
"stando",
];
pub const NORWEGIAN: &[&str] = &[
pub(crate) const NORWEGIAN: &[&str] = &[
"og",
"i",
"jeg",
@ -1319,7 +1319,7 @@ pub const NORWEGIAN: &[&str] = &[
"vart",
];
pub const PORTUGUESE: &[&str] = &[
pub(crate) const PORTUGUESE: &[&str] = &[
"de",
"a",
"o",
@ -1525,7 +1525,7 @@ pub const PORTUGUESE: &[&str] = &[
"teriam",
];
pub const RUSSIAN: &[&str] = &[
pub(crate) const RUSSIAN: &[&str] = &[
"и",
"в",
"во",
@ -1687,7 +1687,7 @@ pub const RUSSIAN: &[&str] = &[
"между",
];
pub const SPANISH: &[&str] = &[
pub(crate) const SPANISH: &[&str] = &[
"de",
"la",
"que",
@ -1998,7 +1998,7 @@ pub const SPANISH: &[&str] = &[
"tened",
];
pub const SWEDISH: &[&str] = &[
pub(crate) const SWEDISH: &[&str] = &[
"och",
"det",
"att",

@ -4,11 +4,11 @@ use crate::fts::tokenizer::{Token, TokenStream};
/// Struct representing pre-tokenized text
#[derive(Debug, Clone, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq)]
pub struct PreTokenizedString {
pub(crate) struct PreTokenizedString {
/// Original text
pub text: String,
pub(crate) text: String,
/// Tokens derived from the text
pub tokens: Vec<Token>,
pub(crate) tokens: Vec<Token>,
}
impl Ord for PreTokenizedString {
@ -24,7 +24,7 @@ impl PartialOrd for PreTokenizedString {
}
/// [`TokenStream`] implementation which wraps [`PreTokenizedString`]
pub struct PreTokenizedStream {
pub(crate) struct PreTokenizedStream {
tokenized_string: PreTokenizedString,
current_token: i64,
}

@ -7,20 +7,20 @@ use crate::fts::tokenizer::empty_tokenizer::EmptyTokenizer;
/// Token
#[derive(Debug, Clone, serde_derive::Serialize, serde_derive::Deserialize, Eq, PartialEq)]
pub struct Token {
pub(crate) struct Token {
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
pub(crate) offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
pub(crate) offset_to: usize,
/// Position, expressed in number of tokens.
pub position: usize,
pub(crate) position: usize,
/// Actual text content of the token.
pub text: String,
pub(crate) text: String,
/// Is the length expressed in term of number of original tokens.
pub position_length: usize,
pub(crate) position_length: usize,
}
impl Default for Token {
@ -38,7 +38,7 @@ impl Default for Token {
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub struct TextAnalyzer {
pub(crate) struct TextAnalyzer {
tokenizer: Box<dyn Tokenizer>,
token_filters: Vec<BoxTokenFilter>,
}
@ -60,7 +60,7 @@ impl TextAnalyzer {
///
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
/// `TextAnalyzer::from(tokenizer)`.
pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
pub(crate) fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
TextAnalyzer {
tokenizer: Box::new(tokenizer),
token_filters,
@ -74,7 +74,7 @@ impl TextAnalyzer {
///
/// # Example
///
/// ```rust
/// ```text
/// use tantivy::tokenizer::*;
///
/// let en_stem = TextAnalyzer::from(SimpleTokenizer)
@ -83,13 +83,13 @@ impl TextAnalyzer {
/// .filter(Stemmer::default());
/// ```
#[must_use]
pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
pub(crate) fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
self.token_filters.push(token_filter.into());
self
}
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
pub(crate) fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let mut token_stream = self.tokenizer.token_stream(text);
for token_filter in &self.token_filters {
token_stream = token_filter.transform(token_stream);
@ -119,12 +119,12 @@ impl Clone for TextAnalyzer {
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
pub(crate) trait Tokenizer: 'static + Send + Sync + TokenizerClone {
/// Creates a token stream for a given `str`.
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
}
pub trait TokenizerClone {
pub(crate) trait TokenizerClone {
fn box_clone(&self) -> Box<dyn Tokenizer>;
}
@ -154,7 +154,7 @@ impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// See [`TokenStream`] for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
pub(crate) struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T> From<T> for BoxTokenStream<'a>
where
@ -181,7 +181,7 @@ impl<'a> DerefMut for BoxTokenStream<'a> {
/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See [`TokenFilter`] for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);
pub(crate) struct BoxTokenFilter(Box<dyn TokenFilter>);
impl Deref for BoxTokenFilter {
type Target = dyn TokenFilter;
@ -203,7 +203,7 @@ impl<T: TokenFilter> From<T> for BoxTokenFilter {
///
/// # Example
///
/// ```
/// ```text
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
@ -225,7 +225,7 @@ impl<T: TokenFilter> From<T> for BoxTokenFilter {
/// assert_eq!(token.position, 1);
/// }
/// ```
pub trait TokenStream {
pub(crate) trait TokenStream {
/// Advance to the next token
///
/// Returns false if there are no other tokens.
@ -241,7 +241,7 @@ pub trait TokenStream {
/// simply combines a call to `.advance()`
/// and `.token()`.
///
/// ```
/// ```text
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
@ -271,12 +271,12 @@ pub trait TokenStream {
}
}
pub trait TokenFilterClone {
pub(crate) trait TokenFilterClone {
fn box_clone(&self) -> BoxTokenFilter;
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
pub(crate) trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Wraps a token stream and returns the modified one.
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
}
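Putting the pieces of this file together, the analyzer is still built and driven the same way inside the crate; a sketch based on the now-disabled doc examples:

    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(Stemmer::default());
    let mut stream = analyzer.token_stream("Hello, happy tax payer!");
    while stream.advance() {
        let token = stream.token();
        // token.text, token.position, token.offset_from, token.offset_to are all pub(crate) now.
    }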

@ -21,20 +21,20 @@ use crate::fts::tokenizer::{
/// search engine.
/// * `whitespace` : Splits the text on whitespaces.
#[derive(Clone)]
pub struct TokenizerManager {
pub(crate) struct TokenizerManager {
tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
}
impl TokenizerManager {
/// Creates an empty tokenizer manager.
pub fn new() -> Self {
pub(crate) fn new() -> Self {
Self {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
}
}
/// Registers a new tokenizer associated with a given name.
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
pub(crate) fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
where TextAnalyzer: From<T> {
let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
self.tokenizers
@ -44,7 +44,7 @@ impl TokenizerManager {
}
/// Accessing a tokenizer given its name.
pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
pub(crate) fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
self.tokenizers
.read()
.expect("Acquiring the lock should never fail")
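The manager's API is unchanged apart from visibility; a sketch of register/get, where the name "lowercase_only" is purely illustrative:

    let manager = TokenizerManager::new();
    manager.register("lowercase_only", TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser));
    let analyzer: Option<TextAnalyzer> = manager.get("lowercase_only");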

@ -4,9 +4,9 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
/// Tokenize the text by splitting on whitespaces.
#[derive(Clone)]
pub struct WhitespaceTokenizer;
pub(crate) struct WhitespaceTokenizer;
pub struct WhitespaceTokenStream<'a> {
pub(crate) struct WhitespaceTokenStream<'a> {
text: &'a str,
chars: CharIndices<'a>,
token: Token,

@ -610,6 +610,21 @@ fn test_index() {
assert!(joins.contains(&json!(":friends:rev")));
}
#[test]
fn test_json_objects() {
let db = new_cozo_mem().unwrap();
db.run_script(
"?[a] := a = {'a': 1}",
Default::default(),
).unwrap();
db.run_script(
r"?[a] := a = {
'a': 1
}",
Default::default(),
).unwrap();
}
#[test]
fn test_custom_rules() {
let db = new_cozo_mem().unwrap();
@ -899,7 +914,10 @@ fn test_insertions() {
db.run_script(r"?[k, v] := *a{k, v}", Default::default())
.unwrap();
db.run_script(
r"::hnsw create a:i {fields: [v], dim: 1536, ef: 16, m: 32, filter: k % 3 == 0}",
r"::hnsw create a:i {
fields: [v], dim: 1536, ef: 16, filter: k % 3 == 0,
m: 32
}",
Default::default(),
)
.unwrap();
@ -926,7 +944,8 @@ fn test_insertions() {
#[test]
fn tentivy_tokenizers() {
use crate::fts::cangjie::*;
use crate::fts::cangjie::tokenizer::CangJieTokenizer;
use crate::fts::cangjie::options::TokenizerOption;
use crate::fts::tokenizer::*;
use jieba_rs::Jieba;
