Use deterministic finite automaton for validation

A dual stream approach provides even more speed improvements
next
Sayan Nandan 3 years ago
parent 8cfab3f7d3
commit 5e4cd5be4b

@ -0,0 +1,227 @@
#!/usr/bin/perl -w
=pod
All credits for the random unicode string generation logic go to Paul Sarena who released
the original version here: https://github.com/bits/UTF-8-Unicode-Test-Documents and released
it under the BSD 3-Clause "New" or "Revised" License
=cut
use strict;
use warnings qw( FATAL utf8 );
use utf8; # tell Perl parser there are non-ASCII characters in this lexical scope
use open qw( :encoding(UTF-8) :std ); # Declare that anything that opens a filehandles within this lexical scope is to assume that that stream is encoded in UTF-8 unless you tell it otherwise
use Encode;
use HTML::Entities;
my $html_pre = q|<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>UTF-8 Codepoint Sequence</title>
</head>
<body>|;
my $html_post = q|</body>
</html>|;
my $output_directory = './utf8/';
my $utf8_seq;
# 0000FFFF Plane 0: Basic Multilingual Plane
# 100001FFFF Plane 1: Supplementary Multilingual Plane
# 200002FFFF Plane 2: Supplementary Ideographic Plane
# 30000DFFFF Planes 313: Unassigned
# E0000EFFFF Plane 14: Supplement­ary Special-purpose Plane
# F000010FFFF Planes 1516: Supplement­ary Private Use Area
foreach my $separator ('', ' ') {
foreach my $end (0xFF, 0xFFF, 0xFFFF, 0x1FFFF, 0x2FFFF, 0x10FFFF) {
# UTF-8 codepoint sequence of assigned, printable codepoints
$utf8_seq = gen_seq({
start => 0x00,
end => $end,
separator => $separator,
skip_unprintable => 1,
replace_unprintable => 1,
skip_unassigned => 1,
writefiles => ($separator ? 'txt,html' : 'txt')
});
# UTF-8 codepoint sequence of assigned, printable and unprintable codepoints as-is
$utf8_seq = gen_seq({
start => 0x00,
end => $end,
separator => $separator,
skip_unprintable => 0,
replace_unprintable => 0,
skip_unassigned => 1,
writefiles => ($separator ? 'txt,html' : 'txt')
});
# UTF-8 codepoint sequence of assigned, printable and unprintable codepoints replaced
$utf8_seq = gen_seq({
start => 0x00,
end => $end,
separator => $separator,
skip_unprintable => 0,
replace_unprintable => 1,
skip_unassigned => 1,
writefiles => ($separator ? 'txt,html' : 'txt')
});
# UTF-8 codepoint sequence of assinged and unassigned, printable and unprintable codepoints as-is
$utf8_seq = gen_seq({
start => 0x00,
end => $end,
separator => $separator,
skip_unprintable => 0,
replace_unprintable => 0,
skip_unassigned => 0,
writefiles => ($separator ? 'txt,html' : 'txt')
});
# UTF-8 codepoint sequence of assinged and unassigned, printable and unprintable codepoints replaced
$utf8_seq = gen_seq({
start => 0x00,
end => $end,
separator => $separator,
skip_unprintable => 0,
replace_unprintable => 1,
skip_unassigned => 0,
writefiles => ($separator ? 'txt,html' : 'txt')
});
}
}
# print Encode::encode('UTF-8', $utf8_seq), "\n";
sub gen_seq{
my $config = shift;
$config->{start} = 0x00 unless defined $config->{start};
$config->{end} = 0x10FFFF unless defined $config->{end};
$config->{skip_unassigned} = 1 unless defined $config->{skip_unassigned};
$config->{skip_unprintable} = 1 unless defined $config->{skip_unprintable};
$config->{replace_unprintable} = 1 unless defined $config->{replace_unprintable};
$config->{separator} = ' ' unless defined $config->{separator};
$config->{newlines_every} = 50 unless defined $config->{newlines_every};
$config->{writefiles} = 'text,html' unless defined $config->{writefiles};
my $utf8_seq;
my $codepoints_this_line = 0;
my $codepoints_printed = 0;
for my $i ($config->{start} .. $config->{end}) {
next if ($i >= 0xD800 && $i <= 0xDFFF); # high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) are not legal Unicode values, and the UTF-8 encoding of them is an invalid byte sequence
next if ($i >= 0xFDD0 && $i <= 0xFDEF); # Non-characters
next if ( # Non-characters
$i == 0xFFFE || $i == 0xFFFF ||
$i == 0x1FFFE || $i == 0x1FFFF ||
$i == 0x2FFFE || $i == 0x2FFFF ||
$i == 0x3FFFE || $i == 0x3FFFF ||
$i == 0x4FFFE || $i == 0x4FFFF ||
$i == 0x5FFFE || $i == 0x5FFFF ||
$i == 0x6FFFE || $i == 0x6FFFF ||
$i == 0x7FFFE || $i == 0x7FFFF ||
$i == 0x8FFFE || $i == 0x8FFFF ||
$i == 0x9FFFE || $i == 0x9FFFF ||
$i == 0xaFFFE || $i == 0xAFFFF ||
$i == 0xbFFFE || $i == 0xBFFFF ||
$i == 0xcFFFE || $i == 0xCFFFF ||
$i == 0xdFFFE || $i == 0xDFFFF ||
$i == 0xeFFFE || $i == 0xEFFFF ||
$i == 0xfFFFE || $i == 0xFFFFF ||
$i == 0x10FFFE || $i == 0x10FFFF
);
my $codepoint = chr($i);
# skip unassiggned codepoints
next if $config->{skip_unassigned} && $codepoint !~ /^\p{Assigned}/o;
if ( $codepoint =~ /^\p{IsPrint}/o ) {
$utf8_seq .= $codepoint;
} else { # not printable
next if $config->{skip_unprintable};
# include unprintable or replace it
$utf8_seq .= $config->{replace_unprintable} ? '<27>' : $codepoint;
}
$codepoints_printed++;
if ($config->{separator}) {
if ($config->{newlines_every} && $codepoints_this_line++ == $config->{newlines_every}) {
$utf8_seq .= "\n";
$codepoints_this_line = 0;
} else {
$utf8_seq .= $config->{separator};
}
}
}
utf8::upgrade($utf8_seq);
if ($config->{writefiles}) {
my $filebasename = 'utf8_sequence_' .
(sprintf '%#x', $config->{start}) .
'-' .
(sprintf '%#x', $config->{end}) .
($config->{skip_unassigned} ? '_assigned' : '_including-unassigned') .
($config->{skip_unprintable} ? '_printable' : '_including-unprintable') .
(!$config->{skip_unprintable} ?
($config->{replace_unprintable} ? '-replaced' : '-asis') :
''
) .
($config->{separator} ?
($config->{newlines_every} ? '' : '_without-newlines') :
'_unseparated'
);
my $title = 'UTF-8 codepoint sequence' .
($config->{skip_unassigned} ? ' of assigned' : ' of assinged and unassigned') .
($config->{skip_unprintable} ? ', printable' : ', with unprintable') .
(!$config->{skip_unprintable} ?
($config->{replace_unprintable} ? ' codepoints replaced' : ' codepoints as-is') :
' codepoints'
) .
' in the range ' .
(sprintf '%#x', $config->{start}) .
'-' .
(sprintf '%#x', $config->{end}) .
($config->{newlines_every} ? '' : ', as a long string without newlines');
my $html_pre_custom = $html_pre;
$html_pre_custom =~ s|UTF\-8 codepoint sequence|$title|;
my $filename = ${output_directory} . ($config->{separator} ? '' : 'un') . 'separated/' . ${filebasename};
if ($config->{writefiles} =~ /te?xt/) {
open FH, ">${filename}.txt" or die "cannot open $filename: $!";
print FH $utf8_seq;
close FH;
}
if ($config->{writefiles} =~ /html/) {
open FH, ">${filename}_unescaped.html" or die "cannot open $filename: $!";
print FH $html_pre_custom, $utf8_seq, $html_post;
close FH;
}
# open FH, ">${output_directory}${filebasename}_escaped.html";
# print FH $html_pre_custom, HTML::Entities::encode_entities($utf8_seq), $html_post;
# close FH;
print "Output $title ($codepoints_printed codepoints)\n";
}
return $utf8_seq;
}

@ -0,0 +1,113 @@
/*
* Created on Thu Jul 01 2021
*
* This file is a part of Skytable
* Skytable (formerly known as TerrabaseDB or Skybase) is a free and open-source
* NoSQL database written by Sayan Nandan ("the Author") with the
* vision to provide flexibility in data modelling without compromising
* on performance, queryability or scalability.
*
* Copyright (c) 2021, Sayan Nandan <ohsayan@outlook.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
/*
This cannot be the work of a single person! A big thanks to:
- Professor Lemire: https://scholar.google.com/citations?user=q1ja-G8AAAAJ
- Travis Downs: https://github.com/travisdowns
*/
/// This table maps bytes to character classes that helps us reduce the size of the
/// transition table and generate bitmasks
pub const UTF8_MAP_BYTE_TO_CHAR_CLASS: [u8; 256] = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8,
];
/// This table is a transition table that maps the combination of a state of the
/// automaton and a char class to a state
pub const UTF8_TRANSITION_MAP: [u8; 108] = [
0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
];
/// This method uses a dual-stream deterministic finite automaton
/// [(DFA)](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) that is used to validate
/// UTF-8 bytes that use the encoded finite state machines defined in this module.
///
/// This function gives us as much as a ~300% improvement over std's validation algorithm
pub fn is_utf8(bytes: impl AsRef<[u8]>) -> bool {
let bytes = bytes.as_ref();
let mut half = bytes.len() / 2;
while bytes[half] <= 0xBF && bytes[half] >= 0x80 && half > 0 {
half -= 1;
}
let (mut fsm_state_1, mut fsm_state_2) = (0u8, 0u8);
let mut i = 0usize;
let mut j = half;
while i < half {
fsm_state_1 = UTF8_TRANSITION_MAP
[(fsm_state_1 + (UTF8_MAP_BYTE_TO_CHAR_CLASS[(bytes[i]) as usize])) as usize];
fsm_state_2 = UTF8_TRANSITION_MAP
[(fsm_state_2 + (UTF8_MAP_BYTE_TO_CHAR_CLASS[(bytes[j]) as usize])) as usize];
i += 1;
j += 1;
}
let mut j = half * 2;
while j < bytes.len() {
fsm_state_2 = UTF8_TRANSITION_MAP
[(fsm_state_2 + (UTF8_MAP_BYTE_TO_CHAR_CLASS[(bytes[j]) as usize])) as usize];
j += 1;
}
fsm_state_1 == 0 && fsm_state_2 == 0
}
#[test]
fn test_utf8_verity() {
let unicode = gen_unicode();
assert!(unicode.into_iter().all(self::is_utf8));
}
#[cfg(test)]
fn gen_unicode() -> Vec<String> {
use std::env;
use std::fs;
use std::process::Command;
let mut path = env::var("ROOT_DIR").expect("ROOT_DIR unset");
path.push_str("/scripts/unicode.pl");
fs::create_dir_all("./utf8/separated").unwrap();
fs::create_dir_all("./utf8/unseparated").unwrap();
let _cmd = Command::new("perl").arg("-w").arg(path).output().unwrap();
let mut strings = vec![];
for file in fs::read_dir("utf8/separated").unwrap() {
strings.push(fs::read_to_string(file.unwrap().path()).unwrap());
}
for file in fs::read_dir("utf8/unseparated").unwrap() {
strings.push(fs::read_to_string(file.unwrap().path()).unwrap());
}
fs::remove_dir_all("utf8").unwrap();
strings
}

@ -33,6 +33,7 @@ use crate::coredb::htable::MapSingleReference;
use crate::coredb::htable::SharedValue;
use core::sync::atomic::AtomicBool;
use core::sync::atomic::Ordering;
mod encoding;
const ORD_RELAXED: Ordering = Ordering::Relaxed;

Loading…
Cancel
Save