From 9933a637a90ee64995849b9db0314fe0b8e6d554 Mon Sep 17 00:00:00 2001
From: Ziyang Hu
Date: Tue, 25 Apr 2023 21:39:29 +0800
Subject: [PATCH] fts index during creation

---
 cozo-core/src/fts/indexing.rs                 | 105 ++++++++++
 cozo-core/src/fts/mod.rs                      |   1 +
 cozo-core/src/fts/tokenizer/mod.rs            |   4 +-
 .../src/fts/tokenizer/ngram_tokenizer.rs      | 182 +++++++++---------
 .../src/fts/tokenizer/simple_tokenizer.rs     |   2 +-
 .../{tokenizer.rs => tokenizer_impl.rs}       |   2 +-
 .../src/fts/tokenizer/whitespace_tokenizer.rs |   2 +-
 cozo-core/src/runtime/db.rs                   |   4 +-
 cozo-core/src/runtime/relation.rs             |  39 +++-
 cozo-core/src/runtime/tests.rs                |  33 ++++
 cozo-core/src/runtime/transact.rs             |   2 +
 11 files changed, 278 insertions(+), 98 deletions(-)
 create mode 100644 cozo-core/src/fts/indexing.rs
 rename cozo-core/src/fts/tokenizer/{tokenizer.rs => tokenizer_impl.rs} (99%)

diff --git a/cozo-core/src/fts/indexing.rs b/cozo-core/src/fts/indexing.rs
new file mode 100644
index 00000000..a3464ee3
--- /dev/null
+++ b/cozo-core/src/fts/indexing.rs
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2023, The Cozo Project Authors.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
+ * If a copy of the MPL was not distributed with this file,
+ * You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+
+use crate::data::expr::{eval_bytecode, Bytecode};
+use crate::fts::tokenizer::TextAnalyzer;
+use crate::runtime::relation::RelationHandle;
+use crate::runtime::transact::SessionTx;
+use crate::DataValue;
+use miette::{bail, Diagnostic, Result};
+use rustc_hash::{FxHashMap, FxHashSet};
+use smartstring::{LazyCompact, SmartString};
+use std::collections::HashMap;
+use thiserror::Error;
+
+impl<'a> SessionTx<'a> {
+    pub(crate) fn put_fts_index_item(
+        &mut self,
+        tuple: &[DataValue],
+        extractor: &[Bytecode],
+        stack: &mut Vec<DataValue>,
+        tokenizer: &TextAnalyzer,
+        rel_handle: &RelationHandle,
+        idx_handle: &RelationHandle,
+    ) -> Result<()> {
+        let to_index = match eval_bytecode(extractor, tuple, stack)? {
+            DataValue::Null => return Ok(()),
+            DataValue::Str(s) => s,
+            val => {
+                #[derive(Debug, Diagnostic, Error)]
+                #[error("FTS index extractor must return a string, got {0}")]
+                #[diagnostic(code(eval::fts::extractor::invalid_return_type))]
+                struct FtsExtractError(String);
+
+                bail!(FtsExtractError(format!("{}", val)))
+            }
+        };
+        let mut token_stream = tokenizer.token_stream(&to_index);
+        let mut collector: HashMap<_, (Vec<_>, Vec<_>), _> = FxHashMap::default();
+        while let Some(token) = token_stream.next() {
+            let text = SmartString::<LazyCompact>::from(&token.text);
+            let (fr, to) = collector.entry(text).or_default();
+            fr.push(DataValue::from(token.offset_from as i64));
+            to.push(DataValue::from(token.offset_to as i64));
+        }
+        let mut key = Vec::with_capacity(1 + rel_handle.metadata.keys.len());
+        key.push(DataValue::Bot);
+        for k in &tuple[..rel_handle.metadata.keys.len()] {
+            key.push(k.clone());
+        }
+        let mut val = vec![DataValue::Bot, DataValue::Bot];
+        for (text, (from, to)) in collector {
+            key[0] = DataValue::Str(text);
+            val[0] = DataValue::List(from);
+            val[1] = DataValue::List(to);
+            let key_bytes = idx_handle.encode_key_for_store(&key, Default::default())?;
+            let val_bytes = idx_handle.encode_val_only_for_store(&val, Default::default())?;
+            self.store_tx.put(&key_bytes, &val_bytes)?;
+        }
+        Ok(())
+    }
+    pub(crate) fn del_fts_index_item(
+        &mut self,
+        tuple: &[DataValue],
+        extractor: &[Bytecode],
+        stack: &mut Vec<DataValue>,
+        tokenizer: &TextAnalyzer,
+        rel_handle: &RelationHandle,
+        idx_handle: &RelationHandle,
+    ) -> Result<()> {
+        let to_index = match eval_bytecode(extractor, tuple, stack)? {
+            DataValue::Null => return Ok(()),
+            DataValue::Str(s) => s,
+            val => {
+                #[derive(Debug, Diagnostic, Error)]
+                #[error("FTS index extractor must return a string, got {0}")]
+                #[diagnostic(code(eval::fts::extractor::invalid_return_type))]
+                struct FtsExtractError(String);
+
+                bail!(FtsExtractError(format!("{}", val)))
+            }
+        };
+        let mut token_stream = tokenizer.token_stream(&to_index);
+        let mut collector = FxHashSet::default();
+        while let Some(token) = token_stream.next() {
+            let text = SmartString::<LazyCompact>::from(&token.text);
+            collector.insert(text);
+        }
+        let mut key = Vec::with_capacity(1 + rel_handle.metadata.keys.len());
+        key.push(DataValue::Bot);
+        for k in &tuple[..rel_handle.metadata.keys.len()] {
+            key.push(k.clone());
+        }
+        for text in collector {
+            key[0] = DataValue::Str(text);
+            let key_bytes = idx_handle.encode_key_for_store(&key, Default::default())?;
+            self.store_tx.del(&key_bytes)?;
+        }
+        Ok(())
+    }
+}
diff --git a/cozo-core/src/fts/mod.rs b/cozo-core/src/fts/mod.rs
index 2d47f113..4464f4d4 100644
--- a/cozo-core/src/fts/mod.rs
+++ b/cozo-core/src/fts/mod.rs
@@ -24,6 +24,7 @@ use std::sync::{Arc, RwLock};
 
 pub(crate) mod cangjie;
 pub(crate) mod tokenizer;
+pub(crate) mod indexing;
 
 #[derive(Debug, Clone, PartialEq, serde_derive::Serialize, serde_derive::Deserialize)]
 pub(crate) struct FtsIndexManifest {
diff --git a/cozo-core/src/fts/tokenizer/mod.rs b/cozo-core/src/fts/tokenizer/mod.rs
index 41668eca..a90e3184 100644
--- a/cozo-core/src/fts/tokenizer/mod.rs
+++ b/cozo-core/src/fts/tokenizer/mod.rs
@@ -135,7 +135,7 @@ mod split_compound_words;
 mod stemmer;
 mod stop_word_filter;
 mod tokenized_string;
-mod tokenizer;
+mod tokenizer_impl;
 mod whitespace_tokenizer;
 
 pub(crate) use self::alphanum_only::AlphaNumOnlyFilter;
@@ -149,7 +149,7 @@ pub(crate) use self::split_compound_words::SplitCompoundWords;
 pub(crate) use self::stemmer::{Language, Stemmer};
 pub(crate) use self::stop_word_filter::StopWordFilter;
 // pub(crate) use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
-pub(crate) use self::tokenizer::{
+pub(crate) use self::tokenizer_impl::{
     BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
 };
 pub(crate) use self::whitespace_tokenizer::WhitespaceTokenizer;
diff --git a/cozo-core/src/fts/tokenizer/ngram_tokenizer.rs b/cozo-core/src/fts/tokenizer/ngram_tokenizer.rs
index 561542e1..2f7982a3 100644
--- a/cozo-core/src/fts/tokenizer/ngram_tokenizer.rs
+++ b/cozo-core/src/fts/tokenizer/ngram_tokenizer.rs
@@ -105,18 +105,18 @@ impl NgramTokenizer {
         }
     }
 
-    /// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
-    ///
-    /// This is as opposed to only prefix ngrams .
-    pub(crate) fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
-        Self::new(min_gram, max_gram, false)
-    }
-
-    /// Create a `NGramTokenizer` which only generates tokens for the
-    /// prefix ngrams.
-    pub(crate) fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
-        Self::new(min_gram, max_gram, true)
-    }
+    // /// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
+    // ///
+    // /// This is as opposed to only prefix ngrams .
+    // pub(crate) fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+    //     Self::new(min_gram, max_gram, false)
+    // }
+    //
+    // /// Create a `NGramTokenizer` which only generates tokens for the
+    // /// prefix ngrams.
+    // pub(crate) fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+    //     Self::new(min_gram, max_gram, true)
+    // }
 }
 
 /// TokenStream associate to the `NgramTokenizer`
@@ -303,7 +303,7 @@ mod tests {
 
     use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator};
     use crate::fts::tokenizer::tests::assert_token;
-    use crate::fts::tokenizer::tokenizer::Tokenizer;
+    // use crate::fts::tokenizer::tokenizer_impl::Tokenizer;
     use crate::fts::tokenizer::{BoxTokenStream, Token};
 
     fn test_helper(mut tokenizer: BoxTokenStream<'_>) -> Vec<Token> {
@@ -345,84 +345,84 @@
         );
     }
 
-    #[test]
-    fn test_ngram_tokenizer_1_2_false() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
-        assert_eq!(tokens.len(), 9);
-        assert_token(&tokens[0], 0, "h", 0, 1);
-        assert_token(&tokens[1], 0, "he", 0, 2);
-        assert_token(&tokens[2], 0, "e", 1, 2);
-        assert_token(&tokens[3], 0, "el", 1, 3);
-        assert_token(&tokens[4], 0, "l", 2, 3);
-        assert_token(&tokens[5], 0, "ll", 2, 4);
-        assert_token(&tokens[6], 0, "l", 3, 4);
-        assert_token(&tokens[7], 0, "lo", 3, 5);
-        assert_token(&tokens[8], 0, "o", 4, 5);
-    }
-
-    #[test]
-    fn test_ngram_tokenizer_min_max_equal() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
-        assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hel", 0, 3);
-        assert_token(&tokens[1], 0, "ell", 1, 4);
-        assert_token(&tokens[2], 0, "llo", 2, 5);
-    }
-
-    #[test]
-    fn test_ngram_tokenizer_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
-        assert_eq!(tokens.len(), 4);
-        assert_token(&tokens[0], 0, "fr", 0, 2);
-        assert_token(&tokens[1], 0, "fra", 0, 3);
-        assert_token(&tokens[2], 0, "fran", 0, 4);
-        assert_token(&tokens[3], 0, "frank", 0, 5);
-    }
-
-    #[test]
-    fn test_ngram_non_ascii_1_2() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
-        assert_eq!(tokens.len(), 9);
-        assert_token(&tokens[0], 0, "h", 0, 1);
-        assert_token(&tokens[1], 0, "hε", 0, 3);
-        assert_token(&tokens[2], 0, "ε", 1, 3);
-        assert_token(&tokens[3], 0, "εl", 1, 4);
-        assert_token(&tokens[4], 0, "l", 3, 4);
-        assert_token(&tokens[5], 0, "ll", 3, 5);
-        assert_token(&tokens[6], 0, "l", 4, 5);
-        assert_token(&tokens[7], 0, "lo", 4, 6);
-        assert_token(&tokens[8], 0, "o", 5, 6);
-    }
-
-    #[test]
-    fn test_ngram_non_ascii_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
-        assert_eq!(tokens.len(), 4);
-        assert_token(&tokens[0], 0, "hε", 0, 3);
-        assert_token(&tokens[1], 0, "hεl", 0, 4);
-        assert_token(&tokens[2], 0, "hεll", 0, 5);
-        assert_token(&tokens[3], 0, "hεllo", 0, 6);
-    }
-
-    #[test]
-    fn test_ngram_empty() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
-        assert!(tokens.is_empty());
-        let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
-        assert!(tokens.is_empty());
-    }
-
-    #[test]
-    #[should_panic(expected = "min_gram must be greater than 0")]
-    fn test_ngram_min_max_interval_empty() {
-        test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
-    }
-
-    #[test]
-    #[should_panic(expected = "min_gram must not be greater than max_gram")]
-    fn test_invalid_interval_should_panic_if_smaller() {
-        NgramTokenizer::all_ngrams(2, 1);
-    }
+    // #[test]
+    // fn test_ngram_tokenizer_1_2_false() {
+    //     let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
+    //     assert_eq!(tokens.len(), 9);
+    //     assert_token(&tokens[0], 0, "h", 0, 1);
+    //     assert_token(&tokens[1], 0, "he", 0, 2);
+    //     assert_token(&tokens[2], 0, "e", 1, 2);
+    //     assert_token(&tokens[3], 0, "el", 1, 3);
+    //     assert_token(&tokens[4], 0, "l", 2, 3);
+    //     assert_token(&tokens[5], 0, "ll", 2, 4);
+    //     assert_token(&tokens[6], 0, "l", 3, 4);
+    //     assert_token(&tokens[7], 0, "lo", 3, 5);
+    //     assert_token(&tokens[8], 0, "o", 4, 5);
+    // }
+
+    // #[test]
+    // fn test_ngram_tokenizer_min_max_equal() {
+    //     let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
+    //     assert_eq!(tokens.len(), 3);
+    //     assert_token(&tokens[0], 0, "hel", 0, 3);
+    //     assert_token(&tokens[1], 0, "ell", 1, 4);
+    //     assert_token(&tokens[2], 0, "llo", 2, 5);
+    // }
+
+    // #[test]
+    // fn test_ngram_tokenizer_2_5_prefix() {
+    //     let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
+    //     assert_eq!(tokens.len(), 4);
+    //     assert_token(&tokens[0], 0, "fr", 0, 2);
+    //     assert_token(&tokens[1], 0, "fra", 0, 3);
+    //     assert_token(&tokens[2], 0, "fran", 0, 4);
+    //     assert_token(&tokens[3], 0, "frank", 0, 5);
+    // }
+
+    // #[test]
+    // fn test_ngram_non_ascii_1_2() {
+    //     let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
+    //     assert_eq!(tokens.len(), 9);
+    //     assert_token(&tokens[0], 0, "h", 0, 1);
+    //     assert_token(&tokens[1], 0, "hε", 0, 3);
+    //     assert_token(&tokens[2], 0, "ε", 1, 3);
+    //     assert_token(&tokens[3], 0, "εl", 1, 4);
+    //     assert_token(&tokens[4], 0, "l", 3, 4);
+    //     assert_token(&tokens[5], 0, "ll", 3, 5);
+    //     assert_token(&tokens[6], 0, "l", 4, 5);
+    //     assert_token(&tokens[7], 0, "lo", 4, 6);
+    //     assert_token(&tokens[8], 0, "o", 5, 6);
+    // }
+
+    // #[test]
+    // fn test_ngram_non_ascii_2_5_prefix() {
+    //     let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
+    //     assert_eq!(tokens.len(), 4);
+    //     assert_token(&tokens[0], 0, "hε", 0, 3);
+    //     assert_token(&tokens[1], 0, "hεl", 0, 4);
+    //     assert_token(&tokens[2], 0, "hεll", 0, 5);
+    //     assert_token(&tokens[3], 0, "hεllo", 0, 6);
+    // }
+
+    // #[test]
+    // fn test_ngram_empty() {
+    //     let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
+    //     assert!(tokens.is_empty());
+    //     let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
+    //     assert!(tokens.is_empty());
+    // }
+
+    // #[test]
+    // #[should_panic(expected = "min_gram must be greater than 0")]
+    // fn test_ngram_min_max_interval_empty() {
+    //     test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
+    // }
+
+    // #[test]
+    // #[should_panic(expected = "min_gram must not be greater than max_gram")]
+    // fn test_invalid_interval_should_panic_if_smaller() {
+    //     NgramTokenizer::all_ngrams(2, 1);
+    // }
 
     #[test]
     fn test_stutterring_iterator_empty() {
diff --git a/cozo-core/src/fts/tokenizer/simple_tokenizer.rs b/cozo-core/src/fts/tokenizer/simple_tokenizer.rs
index 76dd7c20..924d0960 100644
--- a/cozo-core/src/fts/tokenizer/simple_tokenizer.rs
+++ b/cozo-core/src/fts/tokenizer/simple_tokenizer.rs
@@ -26,7 +26,7 @@ impl<'a> SimpleTokenStream<'a> {
     // search for the end of the current token.
     fn search_token_end(&mut self) -> usize {
         (&mut self.chars)
-            .filter(|&(_, ref c)| !c.is_alphanumeric())
+            .filter(|(_, c)| !c.is_alphanumeric())
            .map(|(offset, _)| offset)
            .next()
            .unwrap_or(self.text.len())
diff --git a/cozo-core/src/fts/tokenizer/tokenizer.rs b/cozo-core/src/fts/tokenizer/tokenizer_impl.rs
similarity index 99%
rename from cozo-core/src/fts/tokenizer/tokenizer.rs
rename to cozo-core/src/fts/tokenizer/tokenizer_impl.rs
index 8b5e732d..b8711c51 100644
--- a/cozo-core/src/fts/tokenizer/tokenizer.rs
+++ b/cozo-core/src/fts/tokenizer/tokenizer_impl.rs
@@ -82,7 +82,7 @@ impl TextAnalyzer {
     ///     .filter(LowerCaser)
     ///     .filter(Stemmer::default());
     /// ```
-    #[must_use]
+    #[allow(unused)]
     pub(crate) fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
         self.token_filters.push(token_filter.into());
         self
diff --git a/cozo-core/src/fts/tokenizer/whitespace_tokenizer.rs b/cozo-core/src/fts/tokenizer/whitespace_tokenizer.rs
index ba6b6658..5579899f 100644
--- a/cozo-core/src/fts/tokenizer/whitespace_tokenizer.rs
+++ b/cozo-core/src/fts/tokenizer/whitespace_tokenizer.rs
@@ -26,7 +26,7 @@ impl<'a> WhitespaceTokenStream<'a> {
     // search for the end of the current token.
     fn search_token_end(&mut self) -> usize {
         (&mut self.chars)
-            .filter(|&(_, ref c)| c.is_ascii_whitespace())
+            .filter(|(_, c)| c.is_ascii_whitespace())
            .map(|(offset, _)| offset)
            .next()
            .unwrap_or(self.text.len())
diff --git a/cozo-core/src/runtime/db.rs b/cozo-core/src/runtime/db.rs
index caddebd6..f909f228 100644
--- a/cozo-core/src/runtime/db.rs
+++ b/cozo-core/src/runtime/db.rs
@@ -39,6 +39,7 @@ use crate::data::relation::ColumnDef;
 use crate::data::tuple::{Tuple, TupleT};
 use crate::data::value::{DataValue, ValidityTs, LARGEST_UTF_CHAR};
 use crate::fixed_rule::DEFAULT_FIXED_RULES;
+use crate::fts::TokenizerCache;
 use crate::parse::sys::SysOp;
 use crate::parse::{parse_script, CozoScript, SourceSpan};
 use crate::query::compile::{CompiledProgram, CompiledRule, CompiledRuleSet};
@@ -57,7 +58,6 @@ use crate::runtime::transact::SessionTx;
 use crate::storage::temp::TempStorage;
 use crate::storage::{Storage, StoreTx};
 use crate::{decode_tuple_from_kv, FixedRule};
-use crate::fts::TokenizerCache;
 
 pub(crate) struct RunningQueryHandle {
     pub(crate) started_at: f64,
@@ -806,6 +806,7 @@ impl<'s, S: Storage<'s>> Db<S> {
             temp_store_tx: self.temp_db.transact(true)?,
             relation_store_id: self.relation_store_id.clone(),
             temp_store_id: Default::default(),
+            tokenizers: self.tokenizers.clone(),
         };
         Ok(ret)
     }
@@ -815,6 +816,7 @@ impl<'s, S: Storage<'s>> Db<S> {
             temp_store_tx: self.temp_db.transact(true)?,
             relation_store_id: self.relation_store_id.clone(),
             temp_store_id: Default::default(),
+            tokenizers: self.tokenizers.clone(),
         };
         Ok(ret)
     }
diff --git a/cozo-core/src/runtime/relation.rs b/cozo-core/src/runtime/relation.rs
index 17db1e05..c92986b4 100644
--- a/cozo-core/src/runtime/relation.rs
+++ b/cozo-core/src/runtime/relation.rs
@@ -749,7 +749,44 @@ impl<'a> SessionTx<'a> {
             filters: config.filters,
         };
 
-        // populate index TODO
+        // populate index
+        let tokenizer =
+            self.tokenizers
+                .get(&idx_handle.name, &manifest.tokenizer, &manifest.filters)?;
+
+        let parsed = CozoScriptParser::parse(Rule::expr, &manifest.extractor)
+            .into_diagnostic()?
+            .next()
+            .unwrap();
+        let mut code_expr = build_expr(parsed, &Default::default())?;
+        let binding_map = rel_handle.raw_binding_map();
+        code_expr.fill_binding_indices(&binding_map)?;
+        let extractor = code_expr.compile()?;
+
+        let mut stack = vec![];
+
+        let existing: Vec<_> = rel_handle.scan_all(self).try_collect()?;
+        for tuple in existing {
+            let key_part = &tuple[..rel_handle.metadata.keys.len()];
+            if rel_handle.exists(self, key_part)? {
+                self.del_fts_index_item(
+                    &tuple,
+                    &extractor,
+                    &mut stack,
+                    &tokenizer,
+                    &rel_handle,
+                    &idx_handle,
+                )?;
+            }
+            self.put_fts_index_item(
+                &tuple,
+                &extractor,
+                &mut stack,
+                &tokenizer,
+                &rel_handle,
+                &idx_handle,
+            )?;
+        }
 
         rel_handle
             .fts_indices
diff --git a/cozo-core/src/runtime/tests.rs b/cozo-core/src/runtime/tests.rs
index 172cda3f..1833675e 100644
--- a/cozo-core/src/runtime/tests.rs
+++ b/cozo-core/src/runtime/tests.rs
@@ -901,6 +901,39 @@ fn test_vec_index() {
     }
 }
 
+#[test]
+fn test_fts_indexing() {
+    let db = DbInstance::new("mem", "", "").unwrap();
+    db.run_script(r":create a {k: String => v: String}", Default::default())
+        .unwrap();
+    db.run_script(
+        r"?[k, v] <- [['a', 'hello world!'], ['b', 'the world is round']] :put a {k => v}",
+        Default::default(),
+    )
+    .unwrap();
+    db.run_script(
+        r"::fts create a:fts {extractor: v, tokenizer: Simple }",
+        Default::default(),
+    )
+    .unwrap();
+    db.run_script(
+        r"?[k, v] <- [['c', 'see you at the end of the world!']] :put a {k => v}",
+        Default::default(),
+    )
+    .unwrap();
+    let res = db
+        .run_script(
+            r"
+        ?[word, src_k, offset_from, offset_to] := *a:fts{word, src_k, offset_from, offset_to}
+        ",
+            Default::default(),
+        )
+        .unwrap();
+    for row in res.into_json()["rows"].as_array().unwrap() {
+        println!("{}", row);
+    }
+}
+
 #[test]
 fn test_insertions() {
     let db = DbInstance::new("mem", "", "").unwrap();
diff --git a/cozo-core/src/runtime/transact.rs b/cozo-core/src/runtime/transact.rs
index b64c05dd..ad7f052a 100644
--- a/cozo-core/src/runtime/transact.rs
+++ b/cozo-core/src/runtime/transact.rs
@@ -13,6 +13,7 @@ use miette::{bail, Result};
 
 use crate::data::tuple::TupleT;
 use crate::data::value::DataValue;
+use crate::fts::TokenizerCache;
 use crate::runtime::relation::RelationId;
 use crate::storage::temp::TempTx;
 use crate::storage::StoreTx;
@@ -22,6 +23,7 @@ pub struct SessionTx<'a> {
     pub(crate) temp_store_tx: TempTx,
     pub(crate) relation_store_id: Arc<AtomicU64>,
     pub(crate) temp_store_id: AtomicU32,
+    pub(crate) tokenizers: Arc<TokenizerCache>,
 }
 
 pub const CURRENT_STORAGE_VERSION: [u8; 1] = [0x00];