fts index during creation

main
Ziyang Hu 1 year ago
parent ac8ccbc094
commit 9933a637a9

@@ -0,0 +1,105 @@
+/*
+ * Copyright 2023, The Cozo Project Authors.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
+ * If a copy of the MPL was not distributed with this file,
+ * You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+
+use crate::data::expr::{eval_bytecode, Bytecode};
+use crate::fts::tokenizer::TextAnalyzer;
+use crate::runtime::relation::RelationHandle;
+use crate::runtime::transact::SessionTx;
+use crate::DataValue;
+use miette::{bail, Diagnostic, Result};
+use rustc_hash::{FxHashMap, FxHashSet};
+use smartstring::{LazyCompact, SmartString};
+use std::collections::HashMap;
+use thiserror::Error;
+
+impl<'a> SessionTx<'a> {
+    pub(crate) fn put_fts_index_item(
+        &mut self,
+        tuple: &[DataValue],
+        extractor: &[Bytecode],
+        stack: &mut Vec<DataValue>,
+        tokenizer: &TextAnalyzer,
+        rel_handle: &RelationHandle,
+        idx_handle: &RelationHandle,
+    ) -> Result<()> {
+        let to_index = match eval_bytecode(extractor, tuple, stack)? {
+            DataValue::Null => return Ok(()),
+            DataValue::Str(s) => s,
+            val => {
+                #[derive(Debug, Diagnostic, Error)]
+                #[error("FTS index extractor must return a string, got {0}")]
+                #[diagnostic(code(eval::fts::extractor::invalid_return_type))]
+                struct FtsExtractError(String);
+
+                bail!(FtsExtractError(format!("{}", val)))
+            }
+        };
+        let mut token_stream = tokenizer.token_stream(&to_index);
+        let mut collector: HashMap<_, (Vec<_>, Vec<_>), _> = FxHashMap::default();
+        while let Some(token) = token_stream.next() {
+            let text = SmartString::<LazyCompact>::from(&token.text);
+            let (fr, to) = collector.entry(text).or_default();
+            fr.push(DataValue::from(token.offset_from as i64));
+            to.push(DataValue::from(token.offset_to as i64));
+        }
+        let mut key = Vec::with_capacity(1 + rel_handle.metadata.keys.len());
+        key.push(DataValue::Bot);
+        for k in &tuple[..rel_handle.metadata.keys.len()] {
+            key.push(k.clone());
+        }
+        let mut val = vec![DataValue::Bot, DataValue::Bot];
+        for (text, (from, to)) in collector {
+            key[0] = DataValue::Str(text);
+            val[0] = DataValue::List(from);
+            val[1] = DataValue::List(to);
+            let key_bytes = idx_handle.encode_key_for_store(&key, Default::default())?;
+            let val_bytes = idx_handle.encode_val_only_for_store(&val, Default::default())?;
+            self.store_tx.put(&key_bytes, &val_bytes)?;
+        }
+        Ok(())
+    }
+
+    pub(crate) fn del_fts_index_item(
+        &mut self,
+        tuple: &[DataValue],
+        extractor: &[Bytecode],
+        stack: &mut Vec<DataValue>,
+        tokenizer: &TextAnalyzer,
+        rel_handle: &RelationHandle,
+        idx_handle: &RelationHandle,
+    ) -> Result<()> {
+        let to_index = match eval_bytecode(extractor, tuple, stack)? {
+            DataValue::Null => return Ok(()),
+            DataValue::Str(s) => s,
+            val => {
+                #[derive(Debug, Diagnostic, Error)]
+                #[error("FTS index extractor must return a string, got {0}")]
+                #[diagnostic(code(eval::fts::extractor::invalid_return_type))]
+                struct FtsExtractError(String);
+
+                bail!(FtsExtractError(format!("{}", val)))
+            }
+        };
+        let mut token_stream = tokenizer.token_stream(&to_index);
+        let mut collector = FxHashSet::default();
+        while let Some(token) = token_stream.next() {
+            let text = SmartString::<LazyCompact>::from(&token.text);
+            collector.insert(text);
+        }
+        let mut key = Vec::with_capacity(1 + rel_handle.metadata.keys.len());
+        key.push(DataValue::Bot);
+        for k in &tuple[..rel_handle.metadata.keys.len()] {
+            key.push(k.clone());
+        }
+        for text in collector {
+            key[0] = DataValue::Str(text);
+            let key_bytes = idx_handle.encode_key_for_store(&key, Default::default())?;
+            self.store_tx.del(&key_bytes)?;
+        }
+        Ok(())
+    }
+}
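To make the data layout concrete, here is a minimal, self-contained sketch of the grouping step that put_fts_index_item performs before writing to the store: every occurrence of a token is collected into two parallel offset lists keyed by the token text, and each resulting entry becomes one index key/value pair. The Token struct and toy_tokenize helper below are illustrative stand-ins for the real TextAnalyzer pipeline, not the crate's API.

use std::collections::HashMap;

/// A token as produced by a text analyzer: the term text plus its
/// byte offsets in the source string (names are illustrative only).
struct Token {
    text: String,
    offset_from: usize,
    offset_to: usize,
}

/// Naive whitespace "tokenizer" standing in for `TextAnalyzer`.
fn toy_tokenize(s: &str) -> Vec<Token> {
    let mut out = Vec::new();
    let mut start = None;
    for (i, c) in s.char_indices() {
        match (c.is_whitespace(), start) {
            (false, None) => start = Some(i),
            (true, Some(b)) => {
                out.push(Token { text: s[b..i].to_lowercase(), offset_from: b, offset_to: i });
                start = None;
            }
            _ => {}
        }
    }
    if let Some(b) = start {
        out.push(Token { text: s[b..].to_lowercase(), offset_from: b, offset_to: s.len() });
    }
    out
}

fn main() {
    // Group offsets by token text, mirroring the `collector` map in
    // `put_fts_index_item`: one entry per distinct token, with parallel
    // lists of start and end offsets as the value to be stored.
    let text = "the world is round the world is old";
    let mut collector: HashMap<String, (Vec<usize>, Vec<usize>)> = HashMap::new();
    for tok in toy_tokenize(text) {
        let (from, to) = collector.entry(tok.text).or_default();
        from.push(tok.offset_from);
        to.push(tok.offset_to);
    }
    // Each entry corresponds to one key/value pair written to the index:
    // key   = [token text, ...primary key of the source row]
    // value = [offsets_from, offsets_to]
    for (word, (from, to)) in &collector {
        println!("{word:>6} -> from {from:?}, to {to:?}");
    }
}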

@@ -24,6 +24,7 @@ use std::sync::{Arc, RwLock};
 pub(crate) mod cangjie;
 pub(crate) mod tokenizer;
+pub(crate) mod indexing;

 #[derive(Debug, Clone, PartialEq, serde_derive::Serialize, serde_derive::Deserialize)]
 pub(crate) struct FtsIndexManifest {

@@ -135,7 +135,7 @@ mod split_compound_words;
 mod stemmer;
 mod stop_word_filter;
 mod tokenized_string;
-mod tokenizer;
+mod tokenizer_impl;
 mod whitespace_tokenizer;

 pub(crate) use self::alphanum_only::AlphaNumOnlyFilter;
@@ -149,7 +149,7 @@ pub(crate) use self::split_compound_words::SplitCompoundWords;
 pub(crate) use self::stemmer::{Language, Stemmer};
 pub(crate) use self::stop_word_filter::StopWordFilter;
 // pub(crate) use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
-pub(crate) use self::tokenizer::{
+pub(crate) use self::tokenizer_impl::{
     BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
 };
 pub(crate) use self::whitespace_tokenizer::WhitespaceTokenizer;

@@ -105,18 +105,18 @@ impl NgramTokenizer {
         }
     }

-    /// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
-    ///
-    /// This is as opposed to only prefix ngrams.
-    pub(crate) fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
-        Self::new(min_gram, max_gram, false)
-    }
-
-    /// Create a `NGramTokenizer` which only generates tokens for the
-    /// prefix ngrams.
-    pub(crate) fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
-        Self::new(min_gram, max_gram, true)
-    }
+    // /// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
+    // ///
+    // /// This is as opposed to only prefix ngrams.
+    // pub(crate) fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+    //     Self::new(min_gram, max_gram, false)
+    // }
+    //
+    // /// Create a `NGramTokenizer` which only generates tokens for the
+    // /// prefix ngrams.
+    // pub(crate) fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+    //     Self::new(min_gram, max_gram, true)
+    // }
 }

 /// TokenStream associate to the `NgramTokenizer`
@@ -303,7 +303,7 @@ mod tests {
     use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator};
     use crate::fts::tokenizer::tests::assert_token;
-    use crate::fts::tokenizer::tokenizer::Tokenizer;
+    // use crate::fts::tokenizer::tokenizer_impl::Tokenizer;
     use crate::fts::tokenizer::{BoxTokenStream, Token};

     fn test_helper(mut tokenizer: BoxTokenStream<'_>) -> Vec<Token> {
@@ -345,84 +345,84 @@ mod tests {
         );
     }

-    #[test]
-    fn test_ngram_tokenizer_1_2_false() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
-        assert_eq!(tokens.len(), 9);
-        assert_token(&tokens[0], 0, "h", 0, 1);
-        assert_token(&tokens[1], 0, "he", 0, 2);
-        assert_token(&tokens[2], 0, "e", 1, 2);
-        assert_token(&tokens[3], 0, "el", 1, 3);
-        assert_token(&tokens[4], 0, "l", 2, 3);
-        assert_token(&tokens[5], 0, "ll", 2, 4);
-        assert_token(&tokens[6], 0, "l", 3, 4);
-        assert_token(&tokens[7], 0, "lo", 3, 5);
-        assert_token(&tokens[8], 0, "o", 4, 5);
-    }
+    // #[test]
+    // fn test_ngram_tokenizer_1_2_false() {
+    //     let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
+    //     assert_eq!(tokens.len(), 9);
+    //     assert_token(&tokens[0], 0, "h", 0, 1);
+    //     assert_token(&tokens[1], 0, "he", 0, 2);
+    //     assert_token(&tokens[2], 0, "e", 1, 2);
+    //     assert_token(&tokens[3], 0, "el", 1, 3);
+    //     assert_token(&tokens[4], 0, "l", 2, 3);
+    //     assert_token(&tokens[5], 0, "ll", 2, 4);
+    //     assert_token(&tokens[6], 0, "l", 3, 4);
+    //     assert_token(&tokens[7], 0, "lo", 3, 5);
+    //     assert_token(&tokens[8], 0, "o", 4, 5);
+    // }

-    #[test]
-    fn test_ngram_tokenizer_min_max_equal() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
-        assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hel", 0, 3);
-        assert_token(&tokens[1], 0, "ell", 1, 4);
-        assert_token(&tokens[2], 0, "llo", 2, 5);
-    }
+    // #[test]
+    // fn test_ngram_tokenizer_min_max_equal() {
+    //     let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
+    //     assert_eq!(tokens.len(), 3);
+    //     assert_token(&tokens[0], 0, "hel", 0, 3);
+    //     assert_token(&tokens[1], 0, "ell", 1, 4);
+    //     assert_token(&tokens[2], 0, "llo", 2, 5);
+    // }

-    #[test]
-    fn test_ngram_tokenizer_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
-        assert_eq!(tokens.len(), 4);
-        assert_token(&tokens[0], 0, "fr", 0, 2);
-        assert_token(&tokens[1], 0, "fra", 0, 3);
-        assert_token(&tokens[2], 0, "fran", 0, 4);
-        assert_token(&tokens[3], 0, "frank", 0, 5);
-    }
+    // #[test]
+    // fn test_ngram_tokenizer_2_5_prefix() {
+    //     let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
+    //     assert_eq!(tokens.len(), 4);
+    //     assert_token(&tokens[0], 0, "fr", 0, 2);
+    //     assert_token(&tokens[1], 0, "fra", 0, 3);
+    //     assert_token(&tokens[2], 0, "fran", 0, 4);
+    //     assert_token(&tokens[3], 0, "frank", 0, 5);
+    // }

-    #[test]
-    fn test_ngram_non_ascii_1_2() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
-        assert_eq!(tokens.len(), 9);
-        assert_token(&tokens[0], 0, "h", 0, 1);
-        assert_token(&tokens[1], 0, "hε", 0, 3);
-        assert_token(&tokens[2], 0, "ε", 1, 3);
-        assert_token(&tokens[3], 0, "εl", 1, 4);
-        assert_token(&tokens[4], 0, "l", 3, 4);
-        assert_token(&tokens[5], 0, "ll", 3, 5);
-        assert_token(&tokens[6], 0, "l", 4, 5);
-        assert_token(&tokens[7], 0, "lo", 4, 6);
-        assert_token(&tokens[8], 0, "o", 5, 6);
-    }
+    // #[test]
+    // fn test_ngram_non_ascii_1_2() {
+    //     let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
+    //     assert_eq!(tokens.len(), 9);
+    //     assert_token(&tokens[0], 0, "h", 0, 1);
+    //     assert_token(&tokens[1], 0, "hε", 0, 3);
+    //     assert_token(&tokens[2], 0, "ε", 1, 3);
+    //     assert_token(&tokens[3], 0, "εl", 1, 4);
+    //     assert_token(&tokens[4], 0, "l", 3, 4);
+    //     assert_token(&tokens[5], 0, "ll", 3, 5);
+    //     assert_token(&tokens[6], 0, "l", 4, 5);
+    //     assert_token(&tokens[7], 0, "lo", 4, 6);
+    //     assert_token(&tokens[8], 0, "o", 5, 6);
+    // }

-    #[test]
-    fn test_ngram_non_ascii_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
-        assert_eq!(tokens.len(), 4);
-        assert_token(&tokens[0], 0, "hε", 0, 3);
-        assert_token(&tokens[1], 0, "hεl", 0, 4);
-        assert_token(&tokens[2], 0, "hεll", 0, 5);
-        assert_token(&tokens[3], 0, "hεllo", 0, 6);
-    }
+    // #[test]
+    // fn test_ngram_non_ascii_2_5_prefix() {
+    //     let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
+    //     assert_eq!(tokens.len(), 4);
+    //     assert_token(&tokens[0], 0, "hε", 0, 3);
+    //     assert_token(&tokens[1], 0, "hεl", 0, 4);
+    //     assert_token(&tokens[2], 0, "hεll", 0, 5);
+    //     assert_token(&tokens[3], 0, "hεllo", 0, 6);
+    // }

-    #[test]
-    fn test_ngram_empty() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
-        assert!(tokens.is_empty());
-        let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
-        assert!(tokens.is_empty());
-    }
+    // #[test]
+    // fn test_ngram_empty() {
+    //     let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
+    //     assert!(tokens.is_empty());
+    //     let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
+    //     assert!(tokens.is_empty());
+    // }

-    #[test]
-    #[should_panic(expected = "min_gram must be greater than 0")]
-    fn test_ngram_min_max_interval_empty() {
-        test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
-    }
+    // #[test]
+    // #[should_panic(expected = "min_gram must be greater than 0")]
+    // fn test_ngram_min_max_interval_empty() {
+    //     test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
+    // }

-    #[test]
-    #[should_panic(expected = "min_gram must not be greater than max_gram")]
-    fn test_invalid_interval_should_panic_if_smaller() {
-        NgramTokenizer::all_ngrams(2, 1);
-    }
+    // #[test]
+    // #[should_panic(expected = "min_gram must not be greater than max_gram")]
+    // fn test_invalid_interval_should_panic_if_smaller() {
+    //     NgramTokenizer::all_ngrams(2, 1);
+    // }

     #[test]
     fn test_stutterring_iterator_empty() {

@@ -26,7 +26,7 @@ impl<'a> SimpleTokenStream<'a> {
     // search for the end of the current token.
     fn search_token_end(&mut self) -> usize {
         (&mut self.chars)
-            .filter(|&(_, ref c)| !c.is_alphanumeric())
+            .filter(|(_, c)| !c.is_alphanumeric())
             .map(|(offset, _)| offset)
             .next()
             .unwrap_or(self.text.len())
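For context, the closure rewrite in this hunk (and the identical one in the whitespace tokenizer hunk below) is purely stylistic: with match ergonomics, |(_, c)| binds c as &char exactly as the older |&(_, ref c)| pattern did, so search_token_end behaves the same. A small standalone check, independent of the crate:

fn main() {
    let text = "ab1 cd";
    // Pre-match-ergonomics style: destructure through an explicit `&` and `ref`.
    let end_old = text
        .char_indices()
        .filter(|&(_, ref c)| !c.is_alphanumeric())
        .map(|(offset, _)| offset)
        .next()
        .unwrap_or(text.len());
    // Style used in the diff: the reference is matched directly and
    // `c` is still bound as `&char`.
    let end_new = text
        .char_indices()
        .filter(|(_, c)| !c.is_alphanumeric())
        .map(|(offset, _)| offset)
        .next()
        .unwrap_or(text.len());
    assert_eq!(end_old, end_new); // both find the first non-alphanumeric byte offset: 3
    println!("token ends at byte {end_new}");
}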

@@ -82,7 +82,7 @@ impl TextAnalyzer {
     /// .filter(LowerCaser)
     /// .filter(Stemmer::default());
     /// ```
-    #[must_use]
+    #[allow(unused)]
     pub(crate) fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
         self.token_filters.push(token_filter.into());
         self

@@ -26,7 +26,7 @@ impl<'a> WhitespaceTokenStream<'a> {
     // search for the end of the current token.
     fn search_token_end(&mut self) -> usize {
         (&mut self.chars)
-            .filter(|&(_, ref c)| c.is_ascii_whitespace())
+            .filter(|(_, c)| c.is_ascii_whitespace())
            .map(|(offset, _)| offset)
            .next()
            .unwrap_or(self.text.len())

@@ -39,6 +39,7 @@ use crate::data::relation::ColumnDef;
 use crate::data::tuple::{Tuple, TupleT};
 use crate::data::value::{DataValue, ValidityTs, LARGEST_UTF_CHAR};
 use crate::fixed_rule::DEFAULT_FIXED_RULES;
+use crate::fts::TokenizerCache;
 use crate::parse::sys::SysOp;
 use crate::parse::{parse_script, CozoScript, SourceSpan};
 use crate::query::compile::{CompiledProgram, CompiledRule, CompiledRuleSet};
@@ -57,7 +58,6 @@ use crate::runtime::transact::SessionTx;
 use crate::storage::temp::TempStorage;
 use crate::storage::{Storage, StoreTx};
 use crate::{decode_tuple_from_kv, FixedRule};
-use crate::fts::TokenizerCache;

 pub(crate) struct RunningQueryHandle {
     pub(crate) started_at: f64,
@@ -806,6 +806,7 @@ impl<'s, S: Storage<'s>> Db<S> {
             temp_store_tx: self.temp_db.transact(true)?,
             relation_store_id: self.relation_store_id.clone(),
             temp_store_id: Default::default(),
+            tokenizers: self.tokenizers.clone(),
         };
         Ok(ret)
     }
@@ -815,6 +816,7 @@ impl<'s, S: Storage<'s>> Db<S> {
             temp_store_tx: self.temp_db.transact(true)?,
             relation_store_id: self.relation_store_id.clone(),
             temp_store_id: Default::default(),
+            tokenizers: self.tokenizers.clone(),
         };
         Ok(ret)
     }
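The two transact constructors above now hand every new SessionTx a handle to the database-level tokenizer cache. A minimal sketch of the sharing pattern, with ToyTokenizerCache standing in for the real TokenizerCache (whose internals are not shown in this diff): cloning an Arc only bumps a reference count, so all transactions see one cache and an analyzer is built at most once.

use std::collections::HashMap;
use std::sync::{Arc, RwLock};

// Stand-in for `TokenizerCache`: a lazily filled, lock-protected map.
// This is only an illustration of the sharing pattern, not the crate's type.
#[derive(Default)]
struct ToyTokenizerCache {
    built: RwLock<HashMap<String, Arc<String>>>,
}

impl ToyTokenizerCache {
    fn get(&self, name: &str) -> Arc<String> {
        if let Some(hit) = self.built.read().unwrap().get(name) {
            return hit.clone();
        }
        let analyzer = Arc::new(format!("analyzer for {name}")); // pretend this is expensive to build
        self.built
            .write()
            .unwrap()
            .entry(name.to_string())
            .or_insert(analyzer)
            .clone()
    }
}

struct ToySessionTx {
    tokenizers: Arc<ToyTokenizerCache>,
}

fn main() {
    let db_level_cache = Arc::new(ToyTokenizerCache::default());
    // `tokenizers: self.tokenizers.clone()` only bumps a reference count;
    // every transaction sees the same cache and reuses built analyzers.
    let tx1 = ToySessionTx { tokenizers: db_level_cache.clone() };
    let tx2 = ToySessionTx { tokenizers: db_level_cache.clone() };
    let a = tx1.tokenizers.get("Simple");
    let b = tx2.tokenizers.get("Simple");
    assert!(Arc::ptr_eq(&a, &b)); // built once, shared by both transactions
    println!("{a}");
}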

@@ -749,7 +749,44 @@ impl<'a> SessionTx<'a> {
             filters: config.filters,
         };

-        // populate index TODO
+        // populate index
+        let tokenizer =
+            self.tokenizers
+                .get(&idx_handle.name, &manifest.tokenizer, &manifest.filters)?;
+
+        let parsed = CozoScriptParser::parse(Rule::expr, &manifest.extractor)
+            .into_diagnostic()?
+            .next()
+            .unwrap();
+        let mut code_expr = build_expr(parsed, &Default::default())?;
+        let binding_map = rel_handle.raw_binding_map();
+        code_expr.fill_binding_indices(&binding_map)?;
+        let extractor = code_expr.compile()?;
+
+        let mut stack = vec![];
+
+        let existing: Vec<_> = rel_handle.scan_all(self).try_collect()?;
+        for tuple in existing {
+            let key_part = &tuple[..rel_handle.metadata.keys.len()];
+            if rel_handle.exists(self, key_part)? {
+                self.del_fts_index_item(
+                    &tuple,
+                    &extractor,
+                    &mut stack,
+                    &tokenizer,
+                    &rel_handle,
+                    &idx_handle,
+                )?;
+            }
+            self.put_fts_index_item(
+                &tuple,
+                &extractor,
+                &mut stack,
+                &tokenizer,
+                &rel_handle,
+                &idx_handle,
+            )?;
+        }
+
         rel_handle
             .fts_indices
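A compressed sketch of what the backfill loop above does for each existing row: remove whatever the index already holds for that row, then insert entries derived from its current value, so re-running the population is harmless. The ToyIndex map and toy_tokens helper are illustrative stand-ins; in the simplification below all of a row's entries are cleared, whereas the real del_fts_index_item re-tokenizes the row and deletes those exact keys through the storage transaction.

use std::collections::BTreeMap;

// A stand-in ordered key-value store; the real code writes through the
// storage transaction (`self.store_tx.put` / `.del`).
type ToyIndex = BTreeMap<(String, String), usize>; // (token, row key) -> occurrence count

fn toy_tokens(text: &str) -> impl Iterator<Item = String> + '_ {
    text.split_whitespace().map(|w| w.to_lowercase())
}

// Delete whatever the index currently holds for this row, then insert the
// entries derived from its current value; running it twice changes nothing,
// which is what the backfill loop relies on.
fn reindex_row(index: &mut ToyIndex, row_key: &str, text: &str) {
    // del_fts_index_item analogue: drop every (token, row_key) entry.
    index.retain(|(_, k), _| k != row_key);
    // put_fts_index_item analogue: re-derive entries from the row's value.
    for token in toy_tokens(text) {
        *index.entry((token, row_key.to_string())).or_insert(0) += 1;
    }
}

fn main() {
    let rows = [("a", "hello world!"), ("b", "the world is round")];
    let mut index = ToyIndex::new();
    // Index creation walks all existing rows, as in the loop above.
    for (k, v) in rows {
        reindex_row(&mut index, k, v);
    }
    // Re-running the backfill (or updating a row) does not duplicate entries.
    reindex_row(&mut index, "b", "the world is round");
    for ((token, key), n) in &index {
        println!("{token:<7} {key} x{n}");
    }
}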

@@ -901,6 +901,39 @@ fn test_vec_index() {
     }
 }

+#[test]
+fn test_fts_indexing() {
+    let db = DbInstance::new("mem", "", "").unwrap();
+    db.run_script(r":create a {k: String => v: String}", Default::default())
+        .unwrap();
+    db.run_script(
+        r"?[k, v] <- [['a', 'hello world!'], ['b', 'the world is round']] :put a {k => v}",
+        Default::default(),
+    )
+    .unwrap();
+    db.run_script(
+        r"::fts create a:fts {extractor: v, tokenizer: Simple }",
+        Default::default(),
+    )
+    .unwrap();
+    db.run_script(
+        r"?[k, v] <- [['c', 'see you at the end of the world!']] :put a {k => v}",
+        Default::default(),
+    )
+    .unwrap();
+    let res = db
+        .run_script(
+            r"
+            ?[word, src_k, offset_from, offset_to] := *a:fts{word, src_k, offset_from, offset_to}
+            ",
+            Default::default(),
+        )
+        .unwrap();
+    for row in res.into_json()["rows"].as_array().unwrap() {
+        println!("{}", row);
+    }
+}
+
 #[test]
 fn test_insertions() {
     let db = DbInstance::new("mem", "", "").unwrap();

@@ -13,6 +13,7 @@ use miette::{bail, Result};
 use crate::data::tuple::TupleT;
 use crate::data::value::DataValue;
+use crate::fts::TokenizerCache;
 use crate::runtime::relation::RelationId;
 use crate::storage::temp::TempTx;
 use crate::storage::StoreTx;
@@ -22,6 +23,7 @@ pub struct SessionTx<'a> {
     pub(crate) temp_store_tx: TempTx,
     pub(crate) relation_store_id: Arc<AtomicU64>,
     pub(crate) temp_store_id: AtomicU32,
+    pub(crate) tokenizers: Arc<TokenizerCache>,
 }

 pub const CURRENT_STORAGE_VERSION: [u8; 1] = [0x00];
