fts index during creation
parent
ac8ccbc094
commit
9933a637a9
@ -0,0 +1,105 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2023, The Cozo Project Authors.
|
||||||
|
*
|
||||||
|
* This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
|
||||||
|
* If a copy of the MPL was not distributed with this file,
|
||||||
|
* You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||||
|
*/
|
||||||
|
|
||||||
|
use crate::data::expr::{eval_bytecode, Bytecode};
|
||||||
|
use crate::fts::tokenizer::TextAnalyzer;
|
||||||
|
use crate::runtime::relation::RelationHandle;
|
||||||
|
use crate::runtime::transact::SessionTx;
|
||||||
|
use crate::DataValue;
|
||||||
|
use miette::{bail, Diagnostic, Result};
|
||||||
|
use rustc_hash::{FxHashMap, FxHashSet};
|
||||||
|
use smartstring::{LazyCompact, SmartString};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
impl<'a> SessionTx<'a> {
|
||||||
|
pub(crate) fn put_fts_index_item(
|
||||||
|
&mut self,
|
||||||
|
tuple: &[DataValue],
|
||||||
|
extractor: &[Bytecode],
|
||||||
|
stack: &mut Vec<DataValue>,
|
||||||
|
tokenizer: &TextAnalyzer,
|
||||||
|
rel_handle: &RelationHandle,
|
||||||
|
idx_handle: &RelationHandle,
|
||||||
|
) -> Result<()> {
|
||||||
|
let to_index = match eval_bytecode(extractor, tuple, stack)? {
|
||||||
|
DataValue::Null => return Ok(()),
|
||||||
|
DataValue::Str(s) => s,
|
||||||
|
val => {
|
||||||
|
#[derive(Debug, Diagnostic, Error)]
|
||||||
|
#[error("FTS index extractor must return a string, got {0}")]
|
||||||
|
#[diagnostic(code(eval::fts::extractor::invalid_return_type))]
|
||||||
|
struct FtsExtractError(String);
|
||||||
|
|
||||||
|
bail!(FtsExtractError(format!("{}", val)))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let mut token_stream = tokenizer.token_stream(&to_index);
|
||||||
|
let mut collector: HashMap<_, (Vec<_>, Vec<_>), _> = FxHashMap::default();
|
||||||
|
while let Some(token) = token_stream.next() {
|
||||||
|
let text = SmartString::<LazyCompact>::from(&token.text);
|
||||||
|
let (fr, to) = collector.entry(text).or_default();
|
||||||
|
fr.push(DataValue::from(token.offset_from as i64));
|
||||||
|
to.push(DataValue::from(token.offset_to as i64));
|
||||||
|
}
|
||||||
|
let mut key = Vec::with_capacity(1 + rel_handle.metadata.keys.len());
|
||||||
|
key.push(DataValue::Bot);
|
||||||
|
for k in &tuple[..rel_handle.metadata.keys.len()] {
|
||||||
|
key.push(k.clone());
|
||||||
|
}
|
||||||
|
let mut val = vec![DataValue::Bot, DataValue::Bot];
|
||||||
|
for (text, (from, to)) in collector {
|
||||||
|
key[0] = DataValue::Str(text);
|
||||||
|
val[0] = DataValue::List(from);
|
||||||
|
val[1] = DataValue::List(to);
|
||||||
|
let key_bytes = idx_handle.encode_key_for_store(&key, Default::default())?;
|
||||||
|
let val_bytes = idx_handle.encode_val_only_for_store(&val, Default::default())?;
|
||||||
|
self.store_tx.put(&key_bytes, &val_bytes)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
pub(crate) fn del_fts_index_item(
|
||||||
|
&mut self,
|
||||||
|
tuple: &[DataValue],
|
||||||
|
extractor: &[Bytecode],
|
||||||
|
stack: &mut Vec<DataValue>,
|
||||||
|
tokenizer: &TextAnalyzer,
|
||||||
|
rel_handle: &RelationHandle,
|
||||||
|
idx_handle: &RelationHandle,
|
||||||
|
) -> Result<()> {
|
||||||
|
let to_index = match eval_bytecode(extractor, tuple, stack)? {
|
||||||
|
DataValue::Null => return Ok(()),
|
||||||
|
DataValue::Str(s) => s,
|
||||||
|
val => {
|
||||||
|
#[derive(Debug, Diagnostic, Error)]
|
||||||
|
#[error("FTS index extractor must return a string, got {0}")]
|
||||||
|
#[diagnostic(code(eval::fts::extractor::invalid_return_type))]
|
||||||
|
struct FtsExtractError(String);
|
||||||
|
|
||||||
|
bail!(FtsExtractError(format!("{}", val)))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let mut token_stream = tokenizer.token_stream(&to_index);
|
||||||
|
let mut collector = FxHashSet::default();
|
||||||
|
while let Some(token) = token_stream.next() {
|
||||||
|
let text = SmartString::<LazyCompact>::from(&token.text);
|
||||||
|
collector.insert(text);
|
||||||
|
}
|
||||||
|
let mut key = Vec::with_capacity(1 + rel_handle.metadata.keys.len());
|
||||||
|
key.push(DataValue::Bot);
|
||||||
|
for k in &tuple[..rel_handle.metadata.keys.len()] {
|
||||||
|
key.push(k.clone());
|
||||||
|
}
|
||||||
|
for text in collector {
|
||||||
|
key[0] = DataValue::Str(text);
|
||||||
|
let key_bytes = idx_handle.encode_key_for_store(&key, Default::default())?;
|
||||||
|
self.store_tx.del(&key_bytes)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue