main
Ziyang Hu 1 year ago
parent 4f56ebe505
commit cc0025d514

@ -241,8 +241,17 @@ loop_block = {("%mark" ~ ident)? ~ "%loop" ~ imperative_block ~ "%end"}
temp_swap = {"%swap" ~ underscore_ident ~ underscore_ident} temp_swap = {"%swap" ~ underscore_ident ~ underscore_ident}
debug_stmt = {"%debug" ~ (ident | underscore_ident)} debug_stmt = {"%debug" ~ (ident | underscore_ident)}
/* fts_doc = {SOI ~ fts_expr+ ~ EOI}
fts_phrase_simple = @{!("AND" | "OR" | "NOT" | "NEAR") ~ (XID_CONTINUE+)}
yield is no longer necessary! fts_phrase_group = {fts_phrase_simple+}
fts_prefix_marker = @{"*"}
*/ fts_booster = {"^" ~ (dot_float | pos_int)}
// A single search phrase: a bare phrase group or one of the string literal forms,
// optionally followed by a prefix marker and then a booster.
fts_phrase = {(fts_phrase_group | quoted_string | s_quoted_string | raw_string) ~ fts_prefix_marker? ~ fts_booster?}
// `NEAR`, an optional `/<pos_int>` argument, then one or more phrases in parentheses.
fts_near = {"NEAR" ~ ("/" ~ pos_int)? ~ "(" ~ fts_phrase+ ~ ")"}
// Silent rule (`_{}`): yields the matched phrase / near / grouped pair directly.
fts_term = _{fts_phrase | fts_near | fts_grouped}
// One or more expressions wrapped in parentheses.
fts_grouped = {"(" ~ fts_expr+ ~ ")"}
// A term followed by zero or more `<operator> <term>` pairs.
fts_expr = {fts_term ~ (fts_op ~ fts_term)*}
// Silent rule: yields the concrete operator pair (`fts_and`, `fts_or` or `fts_not`).
fts_op = _{fts_and | fts_or | fts_not}
// Operator keywords; matched exactly as written here.
fts_and = {"AND"}
fts_or = {"OR"}
fts_not = {"NOT"}

@ -0,0 +1,169 @@
/*
* Copyright 2023, The Cozo Project Authors.
*
* This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
* If a copy of the MPL was not distributed with this file,
* You can obtain one at https://mozilla.org/MPL/2.0/.
*/
use crate::fts::tokenizer::TextAnalyzer;
use smartstring::{LazyCompact, SmartString};
/// A single literal term of a full-text-search query.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct FtsLiteral {
/// Text of the term.
pub(crate) value: SmartString<LazyCompact>,
/// When `true`, `tokenize` keeps this literal as-is instead of running it
/// through the text analyzer.
pub(crate) is_prefix: bool,
/// Weight attached to the term; `FtsExpr::is_empty` treats a literal whose
/// booster equals zero as empty.
pub(crate) booster: f64,
}
impl FtsLiteral {
    /// Appends the tokenized form of this literal to `coll`.
    ///
    /// A prefix literal is pushed unchanged. Any other literal is run through
    /// `tokenizer`, and every token produced becomes a new non-prefix literal
    /// that inherits this literal's booster. A literal that produces no tokens
    /// contributes nothing to `coll`.
    pub(crate) fn tokenize(self, tokenizer: &TextAnalyzer, coll: &mut Vec<Self>) {
        if !self.is_prefix {
            let booster = self.booster;
            let mut stream = tokenizer.token_stream(&self.value);
            // The stream is advanced by calling `next()` until it is exhausted.
            while let Some(token) = stream.next() {
                let value = SmartString::from(&token.text);
                coll.push(Self {
                    value,
                    is_prefix: false,
                    booster,
                });
            }
        } else {
            coll.push(self);
        }
    }
}
/// A proximity (`NEAR`) clause: a list of literals plus a distance bound.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct FtsNear {
/// The literals taking part in the proximity clause.
pub(crate) literals: Vec<FtsLiteral>,
/// Distance bound of the clause.
// NOTE(review): presumably measured in token positions — confirm against the evaluator.
pub(crate) distance: u32,
}
/// Abstract syntax tree of a full-text-search query.
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum FtsExpr {
/// A single literal term.
Literal(FtsLiteral),
/// A proximity clause over several literals.
Near(FtsNear),
/// Conjunction of sub-expressions.
And(Vec<FtsExpr>),
/// Disjunction of sub-expressions.
Or(Vec<FtsExpr>),
/// Two-operand negation holding `(lhs, rhs)`.
// NOTE(review): presumably "match lhs, excluding rhs" — confirm against the evaluator.
Not(Box<FtsExpr>, Box<FtsExpr>),
}
impl FtsExpr {
    /// Reports whether this expression needs IDF information.
    ///
    /// Any `Or` needs it; literals and `Near` clauses never do; an `And` needs
    /// it when any child does; a `Not` defers to its left-hand side only.
    // NOTE(review): "IDF" presumably means inverse document frequency — confirm.
    pub(crate) fn needs_idf(&self) -> bool {
        match self {
            Self::Or(_) => true,
            Self::Literal(_) | Self::Near(_) => false,
            Self::And(children) => children.iter().any(Self::needs_idf),
            Self::Not(positive, _) => positive.needs_idf(),
        }
    }

    /// Tokenizes every literal in the tree with `tokenizer`, then flattens the
    /// result.
    pub(crate) fn tokenize(self, tokenizer: &TextAnalyzer) -> Self {
        let tokenized = self.do_tokenize(tokenizer);
        tokenized.flatten()
    }

    /// Returns `true` when this node carries nothing to search for.
    ///
    /// A literal is empty when its text is empty or its booster is zero; `Near`,
    /// `And` and `Or` are empty when they hold no elements; a `Not` is empty
    /// when its left-hand side is.
    pub(crate) fn is_empty(&self) -> bool {
        match self {
            Self::Literal(lit) => lit.value.is_empty() || lit.booster == 0.,
            Self::Near(near) => near.literals.is_empty(),
            Self::And(children) | Self::Or(children) => children.is_empty(),
            Self::Not(positive, _) => positive.is_empty(),
        }
    }

    /// Simplifies the tree.
    ///
    /// Children of an `And` (resp. `Or`) that are themselves `And` (resp. `Or`)
    /// after flattening are spliced into the parent, other empty children are
    /// dropped, and a group left with exactly one child collapses to that child.
    /// A `Not` whose right-hand side is empty collapses to its left-hand side.
    /// Literals and `Near` clauses are returned unchanged.
    pub(crate) fn flatten(self) -> Self {
        match self {
            leaf @ (Self::Literal(_) | Self::Near(_)) => leaf,
            Self::And(children) => {
                let mut kept = Vec::new();
                for child in children {
                    match child.flatten() {
                        Self::And(nested) => kept.extend(nested),
                        other if other.is_empty() => {}
                        other => kept.push(other),
                    }
                }
                match kept.len() {
                    1 => kept.pop().unwrap(),
                    _ => Self::And(kept),
                }
            }
            Self::Or(children) => {
                let mut kept = Vec::new();
                for child in children {
                    match child.flatten() {
                        Self::Or(nested) => kept.extend(nested),
                        other if other.is_empty() => {}
                        other => kept.push(other),
                    }
                }
                match kept.len() {
                    1 => kept.pop().unwrap(),
                    _ => Self::Or(kept),
                }
            }
            Self::Not(positive, negative) => {
                let positive = positive.flatten();
                let negative = negative.flatten();
                if negative.is_empty() {
                    return positive;
                }
                Self::Not(Box::new(positive), Box::new(negative))
            }
        }
    }

    /// Recursively replaces every literal with its tokenized form, without
    /// flattening.
    ///
    /// A literal that yields exactly one token stays a literal; any other token
    /// count (including zero) becomes an `And` over the tokens. All literals of
    /// a `Near` clause are tokenized into one combined list.
    fn do_tokenize(self, tokenizer: &TextAnalyzer) -> Self {
        match self {
            Self::Literal(lit) => {
                let mut pieces = Vec::new();
                lit.tokenize(tokenizer, &mut pieces);
                match pieces.len() {
                    1 => Self::Literal(pieces.pop().unwrap()),
                    _ => Self::And(pieces.into_iter().map(Self::Literal).collect()),
                }
            }
            Self::Near(FtsNear { literals, distance }) => {
                let mut pieces = Vec::new();
                literals
                    .into_iter()
                    .for_each(|lit| lit.tokenize(tokenizer, &mut pieces));
                Self::Near(FtsNear {
                    literals: pieces,
                    distance,
                })
            }
            Self::And(children) => Self::And(Self::tokenize_each(children, tokenizer)),
            Self::Or(children) => Self::Or(Self::tokenize_each(children, tokenizer)),
            Self::Not(positive, negative) => {
                let positive = positive.do_tokenize(tokenizer);
                let negative = negative.do_tokenize(tokenizer);
                Self::Not(Box::new(positive), Box::new(negative))
            }
        }
    }

    /// Applies `do_tokenize` to each expression in order.
    fn tokenize_each(exprs: Vec<FtsExpr>, tokenizer: &TextAnalyzer) -> Vec<FtsExpr> {
        let mut out = Vec::with_capacity(exprs.len());
        for expr in exprs {
            out.push(expr.do_tokenize(tokenizer));
        }
        out
    }
}

@ -25,6 +25,7 @@ use std::sync::{Arc, RwLock};
pub(crate) mod cangjie; pub(crate) mod cangjie;
pub(crate) mod tokenizer; pub(crate) mod tokenizer;
pub(crate) mod indexing; pub(crate) mod indexing;
pub(crate) mod ast;
#[derive(Debug, Clone, PartialEq, serde_derive::Serialize, serde_derive::Deserialize)] #[derive(Debug, Clone, PartialEq, serde_derive::Serialize, serde_derive::Deserialize)]
pub(crate) struct FtsIndexManifest { pub(crate) struct FtsIndexManifest {

@ -0,0 +1,163 @@
/*
* Copyright 2023, The Cozo Project Authors.
*
* This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
* If a copy of the MPL was not distributed with this file,
* You can obtain one at https://mozilla.org/MPL/2.0/.
*/
use crate::fts::ast::{FtsExpr, FtsLiteral, FtsNear};
use crate::parse::expr::parse_string;
use crate::parse::{CozoScriptParser, Pair, Rule};
use itertools::Itertools;
use lazy_static::lazy_static;
use miette::{IntoDiagnostic, Result};
use pest::pratt_parser::{Op, PrattParser};
use pest::Parser;
use smartstring::SmartString;
fn parse_fts_query(q: &str) -> Result<FtsExpr> {
let mut pairs = CozoScriptParser::parse(Rule::fts_doc, q).into_diagnostic()?;
let pairs = pairs.next().unwrap().into_inner();
let pairs: Vec<_> = pairs
.filter(|r| r.as_rule() != Rule::EOI)
.map(|r| parse_fts_expr(r))
.try_collect()?;
Ok(if pairs.len() == 1 {
pairs.into_iter().next().unwrap()
} else {
FtsExpr::And(pairs)
})
}
/// Converts an `fts_expr` pair into an [`FtsExpr`] by running its inner pairs
/// through the Pratt parser, with `build_term` handling terms and
/// `build_infix` combining operands.
fn parse_fts_expr(pair: Pair<'_>) -> Result<FtsExpr> {
    debug_assert!(matches!(pair.as_rule(), Rule::fts_expr));
    let mut parser = PRATT_PARSER
        .map_primary(build_term)
        .map_infix(build_infix);
    parser.parse(pair.into_inner())
}
/// Combines two already-built operands according to the infix operator `op`.
///
/// An error in `lhs` is reported before an error in `rhs`.
fn build_infix(lhs: Result<FtsExpr>, op: Pair<'_>, rhs: Result<FtsExpr>) -> Result<FtsExpr> {
    // Tuple fields are evaluated left to right, so `lhs` is checked first.
    let (left, right) = (lhs?, rhs?);
    match op.as_rule() {
        Rule::fts_and => Ok(FtsExpr::And(vec![left, right])),
        Rule::fts_or => Ok(FtsExpr::Or(vec![left, right])),
        Rule::fts_not => Ok(FtsExpr::Not(Box::new(left), Box::new(right))),
        other => unreachable!("unexpected rule: {:?}", other),
    }
}
/// Builds the [`FtsExpr`] for one term of an expression.
///
/// * `fts_grouped` — each inner expression is parsed; a single one is returned
///   as-is, any other count is wrapped in `FtsExpr::And`.
/// * `fts_near` — collects the phrases into an `FtsNear`; the distance is the
///   `pos_int` argument when present and `10` otherwise.
/// * `fts_phrase` — becomes a single `FtsExpr::Literal`.
///
/// # Errors
///
/// Returns an error when a nested expression or phrase fails to convert, or
/// when the `NEAR` distance does not fit in a `u32`.
///
/// # Panics
///
/// Panics if handed a pair of any other rule, which the grammar rules out.
fn build_term(pair: Pair<'_>) -> Result<FtsExpr> {
    Ok(match pair.as_rule() {
        Rule::fts_grouped => {
            let mut collected: Vec<_> = pair.into_inner().map(parse_fts_expr).try_collect()?;
            if collected.len() == 1 {
                collected.pop().unwrap()
            } else {
                FtsExpr::And(collected)
            }
        }
        Rule::fts_near => {
            let mut literals = vec![];
            let mut distance: u32 = 10;
            for pair in pair.into_inner() {
                match pair.as_rule() {
                    Rule::pos_int => {
                        // Parse straight into `u32`: an out-of-range distance is
                        // reported as an error instead of being wrapped by a cast.
                        distance = pair
                            .as_str()
                            .replace('_', "")
                            .parse::<u32>()
                            .into_diagnostic()?;
                    }
                    _ => literals.push(build_phrase(pair)?),
                }
            }
            FtsExpr::Near(FtsNear { literals, distance })
        }
        Rule::fts_phrase => FtsExpr::Literal(build_phrase(pair)?),
        r => unreachable!("unexpected rule: {:?}", r),
    })
}
/// Builds an [`FtsLiteral`] from an `fts_phrase` pair.
///
/// The first inner pair supplies the text: a bare phrase group is taken
/// verbatim (trimmed), while string literals go through `parse_string`. Any
/// following pairs are modifiers: a prefix marker sets `is_prefix`, and a
/// booster replaces the default weight of `1.0`.
///
/// # Errors
///
/// Returns an error when the string literal or the booster number cannot be
/// parsed.
///
/// # Panics
///
/// Panics if the pair contains a rule the grammar does not allow here.
fn build_phrase(pair: Pair<'_>) -> Result<FtsLiteral> {
    let mut inner = pair.into_inner();
    // The grammar requires every phrase to start with its text component.
    let kernel = inner.next().unwrap();
    let core_text = match kernel.as_rule() {
        Rule::fts_phrase_group => SmartString::from(kernel.as_str().trim()),
        Rule::quoted_string | Rule::s_quoted_string | Rule::raw_string => parse_string(kernel)?,
        _ => unreachable!("unexpected rule: {:?}", kernel.as_rule()),
    };
    let mut is_prefix = false;
    let mut booster = 1.0;
    for modifier in inner {
        match modifier.as_rule() {
            Rule::fts_prefix_marker => is_prefix = true,
            Rule::fts_booster => {
                let boosted = modifier.into_inner().next().unwrap();
                let digits = boosted.as_str().replace('_', "");
                booster = match boosted.as_rule() {
                    Rule::dot_float => digits.parse::<f64>().into_diagnostic()?,
                    // The grammar's integer alternative for a booster is
                    // `pos_int`; `int` is still accepted so a grammar that emits
                    // it keeps working.
                    Rule::pos_int | Rule::int => digits.parse::<i64>().into_diagnostic()? as f64,
                    _ => unreachable!("unexpected rule: {:?}", boosted.as_rule()),
                };
            }
            _ => unreachable!("unexpected rule: {:?}", modifier.as_rule()),
        }
    }
    Ok(FtsLiteral {
        value: core_text,
        is_prefix,
        booster,
    })
}
lazy_static! {
// Operator table for full-text-search expressions; all three operators are
// left-associative infix operators.
//
// pest assigns precedence by the order of the `.op` calls, with later calls
// binding tighter, so here `NOT` binds loosest and `OR` binds tightest.
// NOTE(review): `OR` binding tighter than `AND` is the reverse of the usual
// boolean convention — confirm this ordering is intended.
static ref PRATT_PARSER: PrattParser<Rule> = {
use pest::pratt_parser::Assoc::*;
PrattParser::new()
.op(Op::infix(Rule::fts_not, Left))
.op(Op::infix(Rule::fts_and, Left))
.op(Op::infix(Rule::fts_or, Left))
};
}
#[cfg(test)]
mod tests {
    use crate::fts::ast::{FtsExpr, FtsNear};
    use crate::parse::fts::parse_fts_query;

    /// Checks the top-level node produced for each operator and for `NEAR`.
    #[test]
    fn test_parse() {
        // Parse, then flatten, panicking on any parse error.
        let parse_flat = |src: &str| parse_fts_query(src).unwrap().flatten();

        let res = parse_flat(" hello world OR bye bye world");
        assert!(matches!(res, FtsExpr::Or(_)));

        let res = parse_flat(" hello world AND bye bye world");
        assert!(matches!(res, FtsExpr::And(_)));

        let res = parse_flat(" hello world NOT bye bye NOT 'ok, mates'");
        assert!(matches!(res, FtsExpr::Not(_, _)));

        // Without an explicit `/n`, the distance falls back to 10.
        let res = parse_flat(" NEAR(abc def \"ghi\"^22.8) ");
        assert!(matches!(res, FtsExpr::Near(FtsNear { distance: 10, .. })));
        println!("{:#?}", res);
    }
}

@ -32,6 +32,7 @@ pub(crate) mod imperative;
pub(crate) mod query; pub(crate) mod query;
pub(crate) mod schema; pub(crate) mod schema;
pub(crate) mod sys; pub(crate) mod sys;
pub(crate) mod fts;
#[derive(pest_derive::Parser)] #[derive(pest_derive::Parser)]
#[grammar = "cozoscript.pest"] #[grammar = "cozoscript.pest"]

Loading…
Cancel
Save