Use deterministic finite automaton for validation

A dual stream approach provides even more speed improvements
3 years ago · 5e4cd5be4b
parent 8cfab3f7d3
commit 5e4cd5be4b
3 changed files with 341 additions and 0 deletions
--- a/scripts/unicode.pl
+++ b/scripts/unicode.pl
@ -0,0 +1,227 @@
+#!/usr/bin/perl -w
+=pod
+All credits for the random unicode string generation logic go to Paul Sarena who released
+the original version here: https://github.com/bits/UTF-8-Unicode-Test-Documents and released
+it under the BSD 3-Clause "New" or "Revised" License 
+=cut
+use strict;
+use warnings qw( FATAL utf8 );
+use utf8;  # tell Perl parser there are non-ASCII characters in this lexical scope
+use open qw( :encoding(UTF-8) :std );  # Declare that anything that opens a filehandles within this lexical scope is to assume that that stream is encoded in UTF-8 unless you tell it otherwise
+
+use Encode;
+use HTML::Entities;
+
+# Opening part of the XHTML document that wraps a generated sequence when HTML
+# output is requested. gen_seq works on a copy of this string, so the template
+# itself is never modified.
+my $html_pre = q|<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+	<title>UTF-8 Codepoint Sequence</title>
+</head>
+<body>|;
+
+# Closing part of the XHTML document, written after the generated sequence.
+my $html_post = q|</body>
+</html>|;
+
+# Base directory for all generated files. gen_seq appends 'separated/' or
+# 'unseparated/' to it; nothing in this script creates these directories, so
+# they have to exist before the script runs.
+my $output_directory = './utf8/';
+
+# Receives the return value of every gen_seq call below; it is only read by the
+# commented-out debug print that follows the generation loops.
+my $utf8_seq;
+
+#    0000–FFFF Plane 0:      Basic Multilingual Plane
+#  10000–1FFFF Plane 1:      Supplementary Multilingual Plane
+#  20000–2FFFF Plane 2:      Supplementary Ideographic Plane
+#  30000–DFFFF Planes 3–13:  Unassigned
+#  E0000–EFFFF Plane 14:     Supplementary Special-purpose Plane
+# F0000–10FFFF Planes 15–16: Supplementary Private Use Area
+
+# Flag combinations generated for every separator/range pair, in output order.
+# Each entry is [ skip_unprintable, replace_unprintable, skip_unassigned ].
+my @variants = (
+	[ 1, 1, 1 ], # assigned, printable codepoints only
+	[ 0, 0, 1 ], # assigned, printable and unprintable codepoints as-is
+	[ 0, 1, 1 ], # assigned, printable and unprintable codepoints replaced
+	[ 0, 0, 0 ], # assigned and unassigned, printable and unprintable codepoints as-is
+	[ 0, 1, 0 ], # assigned and unassigned, printable and unprintable codepoints replaced
+);
+
+for my $separator ('', ' ') {
+	# HTML files are only written for the separated sequences
+	my $formats = $separator ? 'txt,html' : 'txt';
+
+	for my $end (0xFF, 0xFFF, 0xFFFF, 0x1FFFF, 0x2FFFF, 0x10FFFF) {
+		for my $variant (@variants) {
+			my ($skip_unprintable, $replace_unprintable, $skip_unassigned) = @{$variant};
+
+			$utf8_seq = gen_seq({
+				start => 0x00,
+				end => $end,
+				separator => $separator,
+				skip_unprintable => $skip_unprintable,
+				replace_unprintable => $replace_unprintable,
+				skip_unassigned => $skip_unassigned,
+				writefiles => $formats
+			});
+		}
+	}
+}
+
+# print Encode::encode('UTF-8', $utf8_seq), "\n";
+
+
+
+# gen_seq(\%config)
+#
+# Builds a string of consecutive Unicode codepoints and optionally writes it to
+# disk as plain text and/or wrapped in an XHTML document.
+#
+# Keys of the config hash reference (default in parentheses):
+#   start                first codepoint of the range (0x00)
+#   end                  last codepoint of the range, inclusive (0x10FFFF)
+#   skip_unassigned      omit codepoints not matching \p{Assigned} (1)
+#   skip_unprintable     omit codepoints not matching \p{IsPrint} (1)
+#   replace_unprintable  when unprintable codepoints are kept, emit a
+#                        placeholder string instead of the codepoint (1)
+#   separator            string appended after every emitted codepoint (' ')
+#   newlines_every       when a separator is set, end the line after this many
+#                        codepoints; a false value disables line breaks (50)
+#   writefiles           which files to write: matches 'txt'/'text' and/or
+#                        'html'; a false value writes nothing ('text,html')
+#
+# Surrogates and non-characters are always left out. Missing keys are filled in
+# with their defaults directly in the caller's hash.
+#
+# Returns the generated string. Dies when an output file cannot be opened or
+# closed.
+sub gen_seq{
+	my $config = shift;
+
+	$config->{start}               = 0x00        unless defined $config->{start};
+	$config->{end}                 = 0x10FFFF    unless defined $config->{end};
+	$config->{skip_unassigned}     = 1           unless defined $config->{skip_unassigned};
+	$config->{skip_unprintable}    = 1           unless defined $config->{skip_unprintable};
+	$config->{replace_unprintable} = 1           unless defined $config->{replace_unprintable};
+	$config->{separator}           = ' '         unless defined $config->{separator};
+	$config->{newlines_every}      = 50          unless defined $config->{newlines_every};
+	$config->{writefiles}          = 'text,html' unless defined $config->{writefiles};
+
+	# start from an empty string so that a range without any emitted codepoint
+	# still yields a defined value for upgrade/print/return
+	my $utf8_seq = '';
+	my $codepoints_this_line = 0;
+	my $codepoints_printed = 0;
+
+	for my $i ($config->{start} .. $config->{end}) {
+
+		# these checks run before chr() so that no illegal codepoint is ever created
+		next if ($i >= 0xD800 && $i <= 0xDFFF); # high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) are not legal Unicode values, and the UTF-8 encoding of them is an invalid byte sequence
+		next if ($i >= 0xFDD0 && $i <= 0xFDEF); # Non-characters
+		next if (($i & 0xFFFE) == 0xFFFE);      # Non-characters: the last two codepoints (xFFFE and xFFFF) of every plane
+
+		my $codepoint = chr($i);
+
+		# skip unassigned codepoints
+		next if $config->{skip_unassigned} && $codepoint !~ /^\p{Assigned}/;
+
+		if ( $codepoint =~ /^\p{IsPrint}/ ) {
+			$utf8_seq .= $codepoint;
+		} else { # not printable
+			next if $config->{skip_unprintable};
+			# include unprintable or replace it
+			# NOTE(review): this placeholder literal may be a mangled U+FFFD REPLACEMENT
+			# CHARACTER - confirm against the upstream script before changing it
+			$utf8_seq .= $config->{replace_unprintable} ? '<27>' : $codepoint;
+		}
+
+		$codepoints_printed++;
+
+		if ($config->{separator}) {
+			# pre-increment so that a line ends after exactly newlines_every codepoints
+			if ($config->{newlines_every} && ++$codepoints_this_line >= $config->{newlines_every}) {
+				$utf8_seq .= "\n";
+				$codepoints_this_line = 0;
+			} else {
+				$utf8_seq .= $config->{separator};
+			}
+		}
+	}
+
+	utf8::upgrade($utf8_seq);
+
+
+	if ($config->{writefiles}) {
+
+		# the file name encodes the range and every option that shaped the sequence
+		my $filebasename = 'utf8_sequence_' .
+			(sprintf '%#x', $config->{start}) .
+			'-' .
+			(sprintf '%#x', $config->{end}) .
+			($config->{skip_unassigned} ? '_assigned' : '_including-unassigned') .
+			($config->{skip_unprintable} ? '_printable' : '_including-unprintable') .
+			(!$config->{skip_unprintable} ?
+				($config->{replace_unprintable} ? '-replaced' : '-asis') :
+				''
+			) .
+			($config->{separator} ?
+				($config->{newlines_every} ? '' : '_without-newlines') :
+				'_unseparated'
+			);
+
+
+		# human readable description, used as HTML title and in the progress message
+		my $title = 'UTF-8 codepoint sequence' .
+			($config->{skip_unassigned} ? ' of assigned' : ' of assinged and unassigned') .
+			($config->{skip_unprintable} ? ', printable' : ', with unprintable') .
+			(!$config->{skip_unprintable} ?
+				($config->{replace_unprintable} ? ' codepoints replaced' : ' codepoints as-is') :
+				' codepoints'
+			) .
+			' in the range ' .
+			(sprintf '%#x', $config->{start}) .
+			'-' .
+			(sprintf '%#x', $config->{end}) .
+			($config->{newlines_every} ? '' : ', as a long string without newlines');
+
+		# the <title> in $html_pre is spelled "UTF-8 Codepoint Sequence", so the
+		# placeholder has to be matched case-insensitively to be replaced at all
+		my $html_pre_custom = $html_pre;
+		$html_pre_custom =~ s|UTF\-8 codepoint sequence|$title|i;
+
+
+		my $filename = ${output_directory} . ($config->{separator} ? '' : 'un') . 'separated/' . ${filebasename};
+
+		if ($config->{writefiles} =~ /te?xt/) {
+			my $path = "${filename}.txt";
+			open(my $fh, '>', $path) or die "cannot open $path: $!";
+			print {$fh} $utf8_seq;
+			close($fh) or die "cannot close $path: $!";
+		}
+
+		if ($config->{writefiles} =~ /html/) {
+			my $path = "${filename}_unescaped.html";
+			open(my $fh, '>', $path) or die "cannot open $path: $!";
+			print {$fh} $html_pre_custom, $utf8_seq, $html_post;
+			close($fh) or die "cannot close $path: $!";
+		}
+
+		# open FH, ">${output_directory}${filebasename}_escaped.html";
+		# print FH $html_pre_custom, HTML::Entities::encode_entities($utf8_seq), $html_post;
+		# close FH;
+
+		print "Output $title ($codepoints_printed codepoints)\n";
+	}
+
+	return $utf8_seq;
+}
--- a/server/src/kvengine/encoding.rs
+++ b/server/src/kvengine/encoding.rs
@ -0,0 +1,113 @@
+/*
+ * Created on Thu Jul 01 2021
+ *
+ * This file is a part of Skytable
+ * Skytable (formerly known as TerrabaseDB or Skybase) is a free and open-source
+ * NoSQL database written by Sayan Nandan ("the Author") with the
+ * vision to provide flexibility in data modelling without compromising
+ * on performance, queryability or scalability.
+ *
+ * Copyright (c) 2021, Sayan Nandan <ohsayan@outlook.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+*/
+
+/*
+ This cannot be the work of a single person! A big thanks to:
+ - Professor Lemire: https://scholar.google.com/citations?user=q1ja-G8AAAAJ
+ - Travis Downs: https://github.com/travisdowns
+*/
+
+/// Maps every possible byte value to one of twelve character classes (`0..=11`).
+///
+/// Classifying bytes first keeps the transition table small: it needs one column per
+/// class instead of one column per byte value. The table is indexed by the byte itself
+/// and assigns the classes as follows:
+///
+/// | Bytes                           | Class |
+/// |---------------------------------|-------|
+/// | `0x00..=0x7F`                   | 0     |
+/// | `0x80..=0x8F`                   | 1     |
+/// | `0x90..=0x9F`                   | 9     |
+/// | `0xA0..=0xBF`                   | 7     |
+/// | `0xC0..=0xC1`, `0xF5..=0xFF`    | 8     |
+/// | `0xC2..=0xDF`                   | 2     |
+/// | `0xE0`                          | 10    |
+/// | `0xE1..=0xEC`, `0xEE..=0xEF`    | 3     |
+/// | `0xED`                          | 4     |
+/// | `0xF0`                          | 11    |
+/// | `0xF1..=0xF3`                   | 6     |
+/// | `0xF4`                          | 5     |
+pub const UTF8_MAP_BYTE_TO_CHAR_CLASS: [u8; 256] = [
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8,
+];
+
+/// Transition table of the automaton, flattened into nine rows of twelve entries
+/// (one row per state, one column per character class).
+///
+/// States are stored pre-multiplied by the row width, i.e. as `0, 12, 24, ..., 96`, so
+/// the successor of `state` for a byte of class `class` is simply
+/// `UTF8_TRANSITION_MAP[state + class]`; the largest index that can occur is
+/// `96 + 11 = 107`.
+///
+/// - State `0` is the start state. It is also the only accepting state: [`is_utf8`]
+///   requires the automaton to end in it.
+/// - State `12` is the rejecting state: every entry of its row is `12`, so it can never
+///   be left again once entered.
+pub const UTF8_TRANSITION_MAP: [u8; 108] = [
+    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
+    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+];
+
+/// Returns `true` if `bytes` is well-formed UTF-8.
+///
+/// The input is validated with a dual-stream deterministic finite automaton
+/// [(DFA)](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) driven by
+/// [`UTF8_MAP_BYTE_TO_CHAR_CLASS`] and [`UTF8_TRANSITION_MAP`]: the input is cut in two
+/// near its middle and both parts are fed through their own copy of the automaton
+/// within the same loop. The two chains of table lookups do not depend on each other.
+///
+/// The cut is moved backwards past continuation bytes (`0x80..=0xBF`) so that the second
+/// stream never begins in the middle of an encoded character. If everything before the
+/// middle is a continuation byte, the cut ends up at index `0` and the second stream
+/// simply validates the whole input.
+///
+/// Empty input is valid UTF-8 and yields `true`.
+///
+/// According to the original note on this function, it gives as much as a ~300%
+/// improvement over std's validation algorithm.
+pub fn is_utf8(bytes: impl AsRef<[u8]>) -> bool {
+    /// Advances the automaton from `state` by one input byte
+    #[inline(always)]
+    fn step(state: u8, byte: u8) -> u8 {
+        let class = UTF8_MAP_BYTE_TO_CHAR_CLASS[byte as usize];
+        // states are at most 96 and classes at most 11, so the sum cannot overflow a u8
+        UTF8_TRANSITION_MAP[(state + class) as usize]
+    }
+
+    let bytes = bytes.as_ref();
+    let mut half = bytes.len() / 2;
+    // `half > 0` has to be tested first: for empty input `bytes[0]` does not exist
+    while half > 0 && (0x80..=0xBF).contains(&bytes[half]) {
+        half -= 1;
+    }
+    // `half <= len / 2`, hence `rest` always holds at least `half` bytes
+    let (first, rest) = bytes.split_at(half);
+    let (second, tail) = rest.split_at(half);
+
+    let (mut fsm_state_1, mut fsm_state_2) = (0u8, 0u8);
+    // both streams advance in lockstep over `half` bytes each
+    for (&byte_1, &byte_2) in first.iter().zip(second) {
+        fsm_state_1 = step(fsm_state_1, byte_1);
+        fsm_state_2 = step(fsm_state_2, byte_2);
+    }
+    // whatever is left over belongs to the second stream
+    for &byte in tail {
+        fsm_state_2 = step(fsm_state_2, byte);
+    }
+    // only state 0 means that a stream ended on a character boundary without errors
+    fsm_state_1 == 0 && fsm_state_2 == 0
+}
+
+/// Checks [`is_utf8`] in both directions: every generated fixture has to be accepted and
+/// a set of well-known malformed sequences has to be rejected. Without the second half a
+/// validator that accepts everything would pass this test.
+#[test]
+fn test_utf8_verity() {
+    let unicode = gen_unicode();
+    assert!(unicode.into_iter().all(self::is_utf8));
+
+    let malformed: [&[u8]; 6] = [
+        &[0x80],                   // lone continuation byte
+        &[0xC0, 0x80],             // overlong encoding of U+0000
+        &[0xE2, 0x82],             // three-byte sequence cut short
+        &[0xED, 0xA0, 0x80],       // UTF-16 surrogate U+D800
+        &[0xF4, 0x90, 0x80, 0x80], // beyond U+10FFFF
+        &[0xFF],                   // byte that never occurs in UTF-8
+    ];
+    assert!(malformed.iter().all(|bytes| !is_utf8(bytes)));
+}
+
+/// Generates the UTF-8 fixtures by running `scripts/unicode.pl` and returns the content
+/// of every file the script wrote.
+///
+/// The script is looked up below the directory named by the `ROOT_DIR` environment
+/// variable and writes into `utf8/separated` and `utf8/unseparated` relative to the
+/// current working directory. Both directories are created up front and the whole `utf8`
+/// directory is removed again once the files have been read.
+///
+/// # Panics
+///
+/// Panics if `ROOT_DIR` is unset, if `perl` cannot be started, if any file system
+/// operation fails, if the script exits unsuccessfully or if it produced no files at
+/// all. The last two checks keep the calling test from passing on an empty fixture set.
+#[cfg(test)]
+fn gen_unicode() -> Vec<String> {
+    use std::env;
+    use std::fs;
+    use std::path::Path;
+    use std::process::Command;
+    const FIXTURE_DIRS: [&str; 2] = ["utf8/separated", "utf8/unseparated"];
+
+    let root = env::var("ROOT_DIR").expect("ROOT_DIR unset");
+    let script = Path::new(&root).join("scripts").join("unicode.pl");
+    for dir in FIXTURE_DIRS.iter() {
+        fs::create_dir_all(dir).unwrap();
+    }
+    let output = Command::new("perl")
+        .arg("-w")
+        .arg(&script)
+        .output()
+        .expect("failed to run perl");
+    let mut strings = vec![];
+    for dir in FIXTURE_DIRS.iter() {
+        for file in fs::read_dir(dir).unwrap() {
+            strings.push(fs::read_to_string(file.unwrap().path()).unwrap());
+        }
+    }
+    // clean up before asserting so that a failed run does not leave the fixtures behind
+    fs::remove_dir_all("utf8").unwrap();
+    assert!(
+        output.status.success(),
+        "{} failed: {}",
+        script.display(),
+        String::from_utf8_lossy(&output.stderr)
+    );
+    assert!(!strings.is_empty(), "unicode.pl did not generate any files");
+    strings
+}
--- a/server/src/kvengine/mod.rs
+++ b/server/src/kvengine/mod.rs
@ -33,6 +33,7 @@ use crate::coredb::htable::MapSingleReference;
 use crate::coredb::htable::SharedValue;
 use core::sync::atomic::AtomicBool;
 use core::sync::atomic::Ordering;
+mod encoding;

 const ORD_RELAXED: Ordering = Ordering::Relaxed;