diff --git a/cozo-core/src/cozoscript.pest b/cozo-core/src/cozoscript.pest
index 36df7164..2fa0be52 100644
--- a/cozo-core/src/cozoscript.pest
+++ b/cozo-core/src/cozoscript.pest
@@ -241,8 +241,19 @@ loop_block = {("%mark" ~ ident)? ~ "%loop" ~ imperative_block ~ "%end"}
 temp_swap = {"%swap" ~ underscore_ident ~ underscore_ident}
 debug_stmt = {"%debug" ~ (ident | underscore_ident)}
 
-/*
-
-yield is no longer necessary!
-
-*/
\ No newline at end of file
+fts_doc = {SOI ~ fts_expr+ ~ EOI}
+// A keyword only counts as such when it is not the prefix of a longer word, so that
+// words like `ANDROID` or `ORANGE` can still be used as plain search terms.
+fts_phrase_simple = @{!(("AND" | "OR" | "NOT" | "NEAR") ~ !XID_CONTINUE) ~ (XID_CONTINUE+)}
+fts_phrase_group = {fts_phrase_simple+}
+fts_prefix_marker = @{"*"}
+fts_booster = {"^" ~ (dot_float | pos_int)}
+fts_phrase = {(fts_phrase_group | quoted_string | s_quoted_string | raw_string) ~ fts_prefix_marker? ~ fts_booster?}
+fts_near = {"NEAR" ~ ("/" ~ pos_int)? ~ "(" ~ fts_phrase+ ~ ")"}
+fts_term = _{fts_phrase | fts_near | fts_grouped}
+fts_grouped = {"(" ~ fts_expr+ ~ ")"}
+fts_expr = {fts_term ~ (fts_op ~ fts_term)*}
+fts_op = _{fts_and | fts_or | fts_not}
+fts_and = {"AND"}
+fts_or = {"OR"}
+fts_not = {"NOT"}
\ No newline at end of file
diff --git a/cozo-core/src/fts/ast.rs b/cozo-core/src/fts/ast.rs
new file mode 100644
index 00000000..c952d36b
--- /dev/null
+++ b/cozo-core/src/fts/ast.rs
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2023, The Cozo Project Authors.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
+ * If a copy of the MPL was not distributed with this file,
+ * You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+
+use crate::fts::tokenizer::TextAnalyzer;
+use smartstring::{LazyCompact, SmartString};
+
+/// A single search term with its prefix flag and boost factor.
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) struct FtsLiteral {
+    pub(crate) value: SmartString<LazyCompact>,
+    pub(crate) is_prefix: bool,
+    pub(crate) booster: f64,
+}
+
+impl FtsLiteral {
+    /// Tokenizes `self.value` and appends one literal per token to `coll`.
+    ///
+    /// Every produced literal inherits `self.booster`. Prefix literals are pushed
+    /// unchanged, without going through the tokenizer.
+    pub(crate) fn tokenize(self, tokenizer: &TextAnalyzer, coll: &mut Vec<Self>) {
+        if self.is_prefix {
+            coll.push(self);
+            return;
+        }
+
+        let mut tokens = tokenizer.token_stream(&self.value);
+        while let Some(t) = tokens.next() {
+            coll.push(FtsLiteral {
+                value: SmartString::from(&t.text),
+                is_prefix: false,
+                booster: self.booster,
+            })
+        }
+    }
+}
+
+/// A `NEAR` clause: the literals involved and the distance bound.
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) struct FtsNear {
+    pub(crate) literals: Vec<FtsLiteral>,
+    pub(crate) distance: u32,
+}
+
+/// A full-text-search query expression.
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) enum FtsExpr {
+    Literal(FtsLiteral),
+    Near(FtsNear),
+    And(Vec<FtsExpr>),
+    Or(Vec<FtsExpr>),
+    Not(Box<FtsExpr>, Box<FtsExpr>),
+}
+
+impl FtsExpr {
+    /// Returns `true` for an `Or`, and for an `And` or `Not` that reaches one through
+    /// its children (`Not` only looks at its left-hand side).
+    pub(crate) fn needs_idf(&self) -> bool {
+        match self {
+            FtsExpr::Literal(_) => false,
+            FtsExpr::Near(_) => false,
+            FtsExpr::And(exprs) => exprs.iter().any(|e| e.needs_idf()),
+            FtsExpr::Or(_) => true,
+            FtsExpr::Not(lhs, _) => lhs.needs_idf(),
+        }
+    }
+
+    /// Tokenizes every literal in the tree, then flattens the result.
+    pub(crate) fn tokenize(self, tokenizer: &TextAnalyzer) -> Self {
+        self.do_tokenize(tokenizer).flatten()
+    }
+
+    /// Returns `true` for a literal with a zero booster or empty text, for a `Near`,
+    /// `And` or `Or` without children, and for a `Not` whose left-hand side is empty.
+    pub(crate) fn is_empty(&self) -> bool {
+        match self {
+            FtsExpr::Literal(l) => l.booster == 0. || l.value.is_empty(),
+            FtsExpr::Near(FtsNear { literals, .. }) => literals.is_empty(),
+            FtsExpr::And(v) => v.is_empty(),
+            FtsExpr::Or(v) => v.is_empty(),
+            FtsExpr::Not(lhs, _) => lhs.is_empty(),
+        }
+    }
+
+    /// Merges nested `And`/`Or` nodes of the same kind, drops empty children, unwraps
+    /// single-child `And`/`Or` nodes, and replaces a `Not` with an empty right-hand
+    /// side by its left-hand side.
+    pub(crate) fn flatten(self) -> Self {
+        match self {
+            FtsExpr::And(exprs) => {
+                let mut flattened = vec![];
+                for e in exprs {
+                    match e.flatten() {
+                        FtsExpr::And(es) => flattened.extend(es),
+                        e => {
+                            if !e.is_empty() {
+                                flattened.push(e)
+                            }
+                        }
+                    }
+                }
+                if flattened.len() == 1 {
+                    flattened.into_iter().next().unwrap()
+                } else {
+                    FtsExpr::And(flattened)
+                }
+            }
+            FtsExpr::Or(exprs) => {
+                let mut flattened = vec![];
+                for e in exprs {
+                    match e.flatten() {
+                        FtsExpr::Or(es) => flattened.extend(es),
+                        e => {
+                            if !e.is_empty() {
+                                flattened.push(e)
+                            }
+                        }
+                    }
+                }
+                if flattened.len() == 1 {
+                    flattened.into_iter().next().unwrap()
+                } else {
+                    FtsExpr::Or(flattened)
+                }
+            }
+            FtsExpr::Not(lhs, rhs) => {
+                let lhs = lhs.flatten();
+                let rhs = rhs.flatten();
+                if rhs.is_empty() {
+                    lhs
+                } else {
+                    FtsExpr::Not(Box::new(lhs), Box::new(rhs))
+                }
+            }
+            FtsExpr::Literal(l) => FtsExpr::Literal(l),
+            FtsExpr::Near(n) => FtsExpr::Near(n),
+        }
+    }
+
+    /// Recursively tokenizes all literals. A literal producing exactly one token stays
+    /// a `Literal`; any other token count becomes an `And` of the produced literals.
+    fn do_tokenize(self, tokenizer: &TextAnalyzer) -> Self {
+        match self {
+            FtsExpr::Literal(l) => {
+                let mut tokens = vec![];
+                l.tokenize(tokenizer, &mut tokens);
+                if tokens.len() == 1 {
+                    FtsExpr::Literal(tokens.into_iter().next().unwrap())
+                } else {
+                    FtsExpr::And(tokens.into_iter().map(FtsExpr::Literal).collect())
+                }
+            }
+            FtsExpr::Near(FtsNear { literals, distance }) => {
+                let mut tokens = vec![];
+                for l in literals {
+                    l.tokenize(tokenizer, &mut tokens);
+                }
+                FtsExpr::Near(FtsNear {
+                    literals: tokens,
+                    distance,
+                })
+            }
+            FtsExpr::And(exprs) => FtsExpr::And(
+                exprs
+                    .into_iter()
+                    .map(|e| e.do_tokenize(tokenizer))
+                    .collect(),
+            ),
+            FtsExpr::Or(exprs) => FtsExpr::Or(
+                exprs
+                    .into_iter()
+                    .map(|e| e.do_tokenize(tokenizer))
+                    .collect(),
+            ),
+            FtsExpr::Not(lhs, rhs) => FtsExpr::Not(
+                Box::new(lhs.do_tokenize(tokenizer)),
+                Box::new(rhs.do_tokenize(tokenizer)),
+            ),
+        }
+    }
+}
diff --git a/cozo-core/src/fts/mod.rs b/cozo-core/src/fts/mod.rs
index 4464f4d4..f52ca08b 100644
--- a/cozo-core/src/fts/mod.rs
+++ b/cozo-core/src/fts/mod.rs
@@ -25,6 +25,7 @@ use std::sync::{Arc, RwLock};
 pub(crate) mod cangjie;
 pub(crate) mod tokenizer;
 pub(crate) mod indexing;
+pub(crate) mod ast;
 
 #[derive(Debug, Clone, PartialEq, serde_derive::Serialize, serde_derive::Deserialize)]
 pub(crate) struct FtsIndexManifest {
diff --git a/cozo-core/src/parse/fts.rs b/cozo-core/src/parse/fts.rs
new file mode 100644
index 00000000..513ad9e8
--- /dev/null
+++ b/cozo-core/src/parse/fts.rs
@@ -0,0 +1,189 @@
+/*
+ * Copyright 2023, The Cozo Project Authors.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
+ * If a copy of the MPL was not distributed with this file,
+ * You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+
+use crate::fts::ast::{FtsExpr, FtsLiteral, FtsNear};
+use crate::parse::expr::parse_string;
+use crate::parse::{CozoScriptParser, Pair, Rule};
+use itertools::Itertools;
+use lazy_static::lazy_static;
+use miette::{IntoDiagnostic, Result};
+use pest::pratt_parser::{Op, PrattParser};
+use pest::Parser;
+use smartstring::SmartString;
+
+/// Parses a full-text-search query string into an [`FtsExpr`].
+///
+/// Several top-level expressions written without an operator between them are
+/// combined into a single `And`.
+pub(crate) fn parse_fts_query(q: &str) -> Result<FtsExpr> {
+    let mut pairs = CozoScriptParser::parse(Rule::fts_doc, q).into_diagnostic()?;
+    // A successful parse of `fts_doc` always yields exactly one top-level pair.
+    let pairs = pairs.next().unwrap().into_inner();
+    let pairs: Vec<_> = pairs
+        .filter(|r| r.as_rule() != Rule::EOI)
+        .map(parse_fts_expr)
+        .try_collect()?;
+    Ok(if pairs.len() == 1 {
+        pairs.into_iter().next().unwrap()
+    } else {
+        FtsExpr::And(pairs)
+    })
+}
+
+/// Converts one `fts_expr` pair into an [`FtsExpr`] using the Pratt parser.
+fn parse_fts_expr(pair: Pair<'_>) -> Result<FtsExpr> {
+    debug_assert!(pair.as_rule() == Rule::fts_expr);
+    let pairs = pair.into_inner();
+    PRATT_PARSER
+        .map_primary(build_term)
+        .map_infix(build_infix)
+        .parse(pairs)
+}
+
+/// Combines two operands according to the infix operator `op`.
+fn build_infix(lhs: Result<FtsExpr>, op: Pair<'_>, rhs: Result<FtsExpr>) -> Result<FtsExpr> {
+    let lhs = lhs?;
+    let rhs = rhs?;
+    Ok(match op.as_rule() {
+        Rule::fts_and => FtsExpr::And(vec![lhs, rhs]),
+        Rule::fts_or => FtsExpr::Or(vec![lhs, rhs]),
+        Rule::fts_not => FtsExpr::Not(Box::new(lhs), Box::new(rhs)),
+        _ => unreachable!("unexpected rule: {:?}", op.as_rule()),
+    })
+}
+
+/// Builds an [`FtsExpr`] from a primary term: a group, a `NEAR` clause or a phrase.
+fn build_term(pair: Pair<'_>) -> Result<FtsExpr> {
+    Ok(match pair.as_rule() {
+        Rule::fts_grouped => {
+            let collected: Vec<_> = pair.into_inner().map(parse_fts_expr).try_collect()?;
+            if collected.len() == 1 {
+                collected.into_iter().next().unwrap()
+            } else {
+                FtsExpr::And(collected)
+            }
+        }
+        Rule::fts_near => {
+            let mut literals = vec![];
+            // Distance used when the query does not specify `NEAR/<n>`.
+            let mut distance = 10;
+            for pair in pair.into_inner() {
+                match pair.as_rule() {
+                    Rule::pos_int => {
+                        // Parse straight into `u32` so that an out-of-range distance is
+                        // reported as an error instead of being silently truncated.
+                        distance = pair
+                            .as_str()
+                            .replace('_', "")
+                            .parse::<u32>()
+                            .into_diagnostic()?;
+                    }
+                    _ => literals.push(build_phrase(pair)?),
+                }
+            }
+            FtsExpr::Near(FtsNear { literals, distance })
+        }
+        Rule::fts_phrase => FtsExpr::Literal(build_phrase(pair)?),
+        r => unreachable!("unexpected rule: {:?}", r),
+    })
+}
+
+/// Builds an [`FtsLiteral`] from an `fts_phrase` pair: the phrase text followed by
+/// an optional prefix marker (`*`) and an optional booster (`^<number>`).
+fn build_phrase(pair: Pair<'_>) -> Result<FtsLiteral> {
+    let mut inner = pair.into_inner();
+    let kernel = inner.next().unwrap();
+    let core_text = match kernel.as_rule() {
+        Rule::fts_phrase_group => SmartString::from(kernel.as_str().trim()),
+        Rule::quoted_string | Rule::s_quoted_string | Rule::raw_string => parse_string(kernel)?,
+        _ => unreachable!("unexpected rule: {:?}", kernel.as_rule()),
+    };
+    let mut is_prefix = false;
+    let mut booster = 1.0;
+    for pair in inner {
+        match pair.as_rule() {
+            Rule::fts_prefix_marker => is_prefix = true,
+            Rule::fts_booster => {
+                let boosted = pair.into_inner().next().unwrap();
+                match boosted.as_rule() {
+                    Rule::dot_float => {
+                        booster = boosted
+                            .as_str()
+                            .replace('_', "")
+                            .parse::<f64>()
+                            .into_diagnostic()?;
+                    }
+                    // The grammar emits `pos_int` for integer boosters; `int` is kept
+                    // so that a grammar variant using it keeps working.
+                    Rule::pos_int | Rule::int => {
+                        let i = boosted
+                            .as_str()
+                            .replace('_', "")
+                            .parse::<i64>()
+                            .into_diagnostic()?;
+                        booster = i as f64;
+                    }
+                    _ => unreachable!("unexpected rule: {:?}", boosted.as_rule()),
+                }
+            }
+            _ => unreachable!("unexpected rule: {:?}", pair.as_rule()),
+        }
+    }
+    Ok(FtsLiteral {
+        value: core_text,
+        is_prefix,
+        booster,
+    })
+}
+
+lazy_static! {
+    /// Operator table for FTS expressions. pest's `PrattParser` treats operators added
+    /// later as binding tighter, so `NOT` binds loosest and `OR` tightest.
+    static ref PRATT_PARSER: PrattParser<Rule> = {
+        use pest::pratt_parser::Assoc::*;
+
+        PrattParser::new()
+            .op(Op::infix(Rule::fts_not, Left))
+            .op(Op::infix(Rule::fts_and, Left))
+            .op(Op::infix(Rule::fts_or, Left))
+    };
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::fts::ast::{FtsExpr, FtsLiteral, FtsNear};
+    use crate::parse::fts::parse_fts_query;
+
+    #[test]
+    fn test_parse() {
+        let src = " hello world OR bye bye world";
+        let res = parse_fts_query(src).unwrap().flatten();
+        assert!(matches!(res, FtsExpr::Or(_)));
+        let src = " hello world AND bye bye world";
+        let res = parse_fts_query(src).unwrap().flatten();
+        assert!(matches!(res, FtsExpr::And(_)));
+        let src = " hello world NOT bye bye NOT 'ok, mates'";
+        let res = parse_fts_query(src).unwrap().flatten();
+        assert!(matches!(res, FtsExpr::Not(_, _)));
+        let src = " NEAR(abc def \"ghi\"^22.8) ";
+        let res = parse_fts_query(src).unwrap().flatten();
+        assert!(matches!(res, FtsExpr::Near(FtsNear{distance: 10, ..})));
+    }
+
+    #[test]
+    fn test_parse_integer_booster() {
+        let res = parse_fts_query("hello^3").unwrap();
+        assert!(matches!(res, FtsExpr::Literal(FtsLiteral { booster, .. }) if booster == 3.0));
+    }
+
+    #[test]
+    fn test_parse_word_starting_with_keyword() {
+        let res = parse_fts_query("ORANGE ANDROID").unwrap();
+        assert!(matches!(res, FtsExpr::Literal(_)));
+    }
+}
diff --git a/cozo-core/src/parse/mod.rs b/cozo-core/src/parse/mod.rs
index ca3e1f46..18c8cd08 100644
--- a/cozo-core/src/parse/mod.rs
+++ b/cozo-core/src/parse/mod.rs
@@ -32,6 +32,7 @@ pub(crate) mod imperative;
 pub(crate) mod query;
 pub(crate) mod schema;
 pub(crate) mod sys;
+pub(crate) mod fts;
 
 #[derive(pest_derive::Parser)]
 #[grammar = "cozoscript.pest"]