skytable/scripts/unicode.pl

#!/usr/bin/perl -w
=pod
All credits for the random unicode string generation logic go to Paul Sarena who released
the original version here: https://github.com/bits/UTF-8-Unicode-Test-Documents and released
it under the BSD 3-Clause "New" or "Revised" License 
=cut
use strict;
use warnings qw( FATAL utf8 );
use utf8;  # tell Perl parser there are non-ASCII characters in this lexical scope
use open qw( :encoding(UTF-8) :std );  # Declare that anything that opens a filehandles within this lexical scope is to assume that that stream is encoded in UTF-8 unless you tell it otherwise

use Encode;
use HTML::Entities;

my $html_pre = q|<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
	<title>UTF-8 Codepoint Sequence</title>
</head>
<body>|;

my $html_post = q|</body>
</html>|;

my $output_directory = './utf8/';

my $utf8_seq;

#    0000–FFFF Plane 0:      Basic Multilingual Plane
#  10000–1FFFF Plane 1:      Supplementary Multilingual Plane
#  20000–2FFFF Plane 2:      Supplementary Ideographic Plane
#  30000–DFFFF Planes 3–13:  Unassigned
#  E0000–EFFFF Plane 14:     Supplementary Special-purpose Plane
# F0000–10FFFF Planes 15–16: Supplementary Private Use Area

foreach my $separator ('', ' ') {
	foreach my $end (0xFF, 0xFFF, 0xFFFF, 0x1FFFF, 0x2FFFF, 0x10FFFF) {

		# UTF-8 codepoint sequence of assigned, printable codepoints
		$utf8_seq = gen_seq({
			start => 0x00,
			end => $end,
			separator => $separator,
			skip_unprintable => 1,
				replace_unprintable => 1,
			skip_unassigned => 1,
			writefiles => ($separator ? 'txt,html' : 'txt')
		});


		# UTF-8 codepoint sequence of assigned, printable and unprintable codepoints as-is
		$utf8_seq = gen_seq({
			start => 0x00,
			end => $end,
			separator => $separator,
			skip_unprintable => 0,
				replace_unprintable => 0,
			skip_unassigned => 1,
			writefiles => ($separator ? 'txt,html' : 'txt')
		});
		# UTF-8 codepoint sequence of assigned, printable and unprintable codepoints replaced
		$utf8_seq = gen_seq({
			start => 0x00,
			end => $end,
			separator => $separator,
			skip_unprintable => 0,
				replace_unprintable => 1,
			skip_unassigned => 1,
			writefiles => ($separator ? 'txt,html' : 'txt')
		});


		# UTF-8 codepoint sequence of assinged and unassigned, printable and unprintable codepoints as-is
		$utf8_seq = gen_seq({
			start => 0x00,
			end => $end,
			separator => $separator,
			skip_unprintable => 0,
				replace_unprintable => 0,
			skip_unassigned => 0,
			writefiles => ($separator ? 'txt,html' : 'txt')
		});
		# UTF-8 codepoint sequence of assinged and unassigned, printable and unprintable codepoints replaced
		$utf8_seq = gen_seq({
			start => 0x00,
			end => $end,
			separator => $separator,
			skip_unprintable => 0,
				replace_unprintable => 1,
			skip_unassigned => 0,
			writefiles => ($separator ? 'txt,html' : 'txt')
		});

	}
}

# print Encode::encode('UTF-8', $utf8_seq), "\n";


sub gen_seq{
	my $config = shift;

	$config->{start}               = 0x00        unless defined $config->{start};
	$config->{end}                 = 0x10FFFF    unless defined $config->{end};
	$config->{skip_unassigned}     = 1           unless defined $config->{skip_unassigned};
	$config->{skip_unprintable}    = 1           unless defined $config->{skip_unprintable};
	$config->{replace_unprintable} = 1           unless defined $config->{replace_unprintable};
	$config->{separator}           = ' '         unless defined $config->{separator};
	$config->{newlines_every}      = 50          unless defined $config->{newlines_every};
	$config->{writefiles}          = 'text,html' unless defined $config->{writefiles};

	my $utf8_seq;
	my $codepoints_this_line = 0;
	my $codepoints_printed = 0;

	for my $i ($config->{start} .. $config->{end}) {

		next if ($i >= 0xD800 && $i <= 0xDFFF); # high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) are not legal Unicode values, and the UTF-8 encoding of them is an invalid byte sequence
		next if ($i >= 0xFDD0 && $i <= 0xFDEF); # Non-characters
		next if ( # Non-characters
			$i ==   0xFFFE || $i ==   0xFFFF ||
			$i ==  0x1FFFE || $i ==  0x1FFFF ||
			$i ==  0x2FFFE || $i ==  0x2FFFF ||
			$i ==  0x3FFFE || $i ==  0x3FFFF ||
			$i ==  0x4FFFE || $i ==  0x4FFFF ||
			$i ==  0x5FFFE || $i ==  0x5FFFF ||
			$i ==  0x6FFFE || $i ==  0x6FFFF ||
			$i ==  0x7FFFE || $i ==  0x7FFFF ||
			$i ==  0x8FFFE || $i ==  0x8FFFF ||
			$i ==  0x9FFFE || $i ==  0x9FFFF ||
			$i ==  0xaFFFE || $i ==  0xAFFFF ||
			$i ==  0xbFFFE || $i ==  0xBFFFF ||
			$i ==  0xcFFFE || $i ==  0xCFFFF ||
			$i ==  0xdFFFE || $i ==  0xDFFFF ||
			$i ==  0xeFFFE || $i ==  0xEFFFF ||
			$i ==  0xfFFFE || $i ==  0xFFFFF ||
			$i == 0x10FFFE || $i == 0x10FFFF
		);

		my $codepoint = chr($i);

		# skip unassiggned codepoints
		next if $config->{skip_unassigned} && $codepoint !~ /^\p{Assigned}/o;

		if ( $codepoint =~ /^\p{IsPrint}/o ) {
			$utf8_seq .= $codepoint;
		} else { # not printable
			next if $config->{skip_unprintable};
			# include unprintable or replace it
			$utf8_seq .= $config->{replace_unprintable} ? '<27>' : $codepoint;
		}

		$codepoints_printed++;

		if ($config->{separator}) {
			if ($config->{newlines_every} && $codepoints_this_line++ == $config->{newlines_every}) {
				$utf8_seq .= "\n";
				$codepoints_this_line = 0;
			} else {
				$utf8_seq .= $config->{separator};
			}
		}
	}

	utf8::upgrade($utf8_seq);


	if ($config->{writefiles}) {

		my $filebasename = 'utf8_sequence_' .
			(sprintf '%#x', $config->{start}) .
			'-' .
			(sprintf '%#x', $config->{end}) .
			($config->{skip_unassigned} ? '_assigned' : '_including-unassigned') .
			($config->{skip_unprintable} ? '_printable' : '_including-unprintable') .
			(!$config->{skip_unprintable} ?
				($config->{replace_unprintable} ? '-replaced' : '-asis') :
				''
			) .
			($config->{separator} ?
				($config->{newlines_every} ? '' : '_without-newlines') :
				'_unseparated'
			);


		my $title = 'UTF-8 codepoint sequence' .
			($config->{skip_unassigned} ? ' of assigned' : ' of assinged and unassigned') .
			($config->{skip_unprintable} ? ', printable' : ', with unprintable') .
			(!$config->{skip_unprintable} ?
				($config->{replace_unprintable} ? ' codepoints replaced' : ' codepoints as-is') :
				' codepoints'
			) .
			' in the range ' .
			(sprintf '%#x', $config->{start}) .
			'-' .
			(sprintf '%#x', $config->{end}) .
			($config->{newlines_every} ? '' : ', as a long string without newlines');

		my $html_pre_custom = $html_pre;
		$html_pre_custom =~ s|UTF\-8 codepoint sequence|$title|;


		my $filename = ${output_directory} . ($config->{separator} ? '' : 'un') . 'separated/' . ${filebasename};

		if ($config->{writefiles} =~ /te?xt/) {
			open FH, ">${filename}.txt" or die "cannot open $filename: $!";
			print FH $utf8_seq;
			close FH;
		}

		if ($config->{writefiles} =~ /html/) {
			open FH, ">${filename}_unescaped.html" or die "cannot open $filename: $!";
			print FH $html_pre_custom, $utf8_seq, $html_post;
			close FH;
		}

		# open FH, ">${output_directory}${filebasename}_escaped.html";
		# print FH $html_pre_custom, HTML::Entities::encode_entities($utf8_seq), $html_post;
		# close FH;

		print "Output $title ($codepoints_printed codepoints)\n";
	}

	return $utf8_seq;
}