You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

227 lines
7.3 KiB
Perl

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/perl -w
=pod
All credits for the random unicode string generation logic go to Paul Sarena who released
the original version here: https://github.com/bits/UTF-8-Unicode-Test-Documents and released
it under the BSD 3-Clause "New" or "Revised" License
=cut
use strict;
use warnings qw( FATAL utf8 );
use utf8; # tell Perl parser there are non-ASCII characters in this lexical scope
use open qw( :encoding(UTF-8) :std ); # Declare that anything that opens a filehandles within this lexical scope is to assume that that stream is encoded in UTF-8 unless you tell it otherwise
use Encode;
use HTML::Entities;
my $html_pre = q|<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>UTF-8 Codepoint Sequence</title>
</head>
<body>|;
my $html_post = q|</body>
</html>|;
my $output_directory = './utf8/';
my $utf8_seq;
# 0000FFFF Plane 0: Basic Multilingual Plane
# 100001FFFF Plane 1: Supplementary Multilingual Plane
# 200002FFFF Plane 2: Supplementary Ideographic Plane
# 30000DFFFF Planes 313: Unassigned
# E0000EFFFF Plane 14: Supplement­ary Special-purpose Plane
# F000010FFFF Planes 1516: Supplement­ary Private Use Area
foreach my $separator ('', ' ') {
foreach my $end (0xFF, 0xFFF, 0xFFFF, 0x1FFFF, 0x2FFFF, 0x10FFFF) {
# UTF-8 codepoint sequence of assigned, printable codepoints
$utf8_seq = gen_seq({
start => 0x00,
end => $end,
separator => $separator,
skip_unprintable => 1,
replace_unprintable => 1,
skip_unassigned => 1,
writefiles => ($separator ? 'txt,html' : 'txt')
});
# UTF-8 codepoint sequence of assigned, printable and unprintable codepoints as-is
$utf8_seq = gen_seq({
start => 0x00,
end => $end,
separator => $separator,
skip_unprintable => 0,
replace_unprintable => 0,
skip_unassigned => 1,
writefiles => ($separator ? 'txt,html' : 'txt')
});
# UTF-8 codepoint sequence of assigned, printable and unprintable codepoints replaced
$utf8_seq = gen_seq({
start => 0x00,
end => $end,
separator => $separator,
skip_unprintable => 0,
replace_unprintable => 1,
skip_unassigned => 1,
writefiles => ($separator ? 'txt,html' : 'txt')
});
# UTF-8 codepoint sequence of assinged and unassigned, printable and unprintable codepoints as-is
$utf8_seq = gen_seq({
start => 0x00,
end => $end,
separator => $separator,
skip_unprintable => 0,
replace_unprintable => 0,
skip_unassigned => 0,
writefiles => ($separator ? 'txt,html' : 'txt')
});
# UTF-8 codepoint sequence of assinged and unassigned, printable and unprintable codepoints replaced
$utf8_seq = gen_seq({
start => 0x00,
end => $end,
separator => $separator,
skip_unprintable => 0,
replace_unprintable => 1,
skip_unassigned => 0,
writefiles => ($separator ? 'txt,html' : 'txt')
});
}
}
# print Encode::encode('UTF-8', $utf8_seq), "\n";
sub gen_seq{
my $config = shift;
$config->{start} = 0x00 unless defined $config->{start};
$config->{end} = 0x10FFFF unless defined $config->{end};
$config->{skip_unassigned} = 1 unless defined $config->{skip_unassigned};
$config->{skip_unprintable} = 1 unless defined $config->{skip_unprintable};
$config->{replace_unprintable} = 1 unless defined $config->{replace_unprintable};
$config->{separator} = ' ' unless defined $config->{separator};
$config->{newlines_every} = 50 unless defined $config->{newlines_every};
$config->{writefiles} = 'text,html' unless defined $config->{writefiles};
my $utf8_seq;
my $codepoints_this_line = 0;
my $codepoints_printed = 0;
for my $i ($config->{start} .. $config->{end}) {
next if ($i >= 0xD800 && $i <= 0xDFFF); # high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) are not legal Unicode values, and the UTF-8 encoding of them is an invalid byte sequence
next if ($i >= 0xFDD0 && $i <= 0xFDEF); # Non-characters
next if ( # Non-characters
$i == 0xFFFE || $i == 0xFFFF ||
$i == 0x1FFFE || $i == 0x1FFFF ||
$i == 0x2FFFE || $i == 0x2FFFF ||
$i == 0x3FFFE || $i == 0x3FFFF ||
$i == 0x4FFFE || $i == 0x4FFFF ||
$i == 0x5FFFE || $i == 0x5FFFF ||
$i == 0x6FFFE || $i == 0x6FFFF ||
$i == 0x7FFFE || $i == 0x7FFFF ||
$i == 0x8FFFE || $i == 0x8FFFF ||
$i == 0x9FFFE || $i == 0x9FFFF ||
$i == 0xaFFFE || $i == 0xAFFFF ||
$i == 0xbFFFE || $i == 0xBFFFF ||
$i == 0xcFFFE || $i == 0xCFFFF ||
$i == 0xdFFFE || $i == 0xDFFFF ||
$i == 0xeFFFE || $i == 0xEFFFF ||
$i == 0xfFFFE || $i == 0xFFFFF ||
$i == 0x10FFFE || $i == 0x10FFFF
);
my $codepoint = chr($i);
# skip unassiggned codepoints
next if $config->{skip_unassigned} && $codepoint !~ /^\p{Assigned}/o;
if ( $codepoint =~ /^\p{IsPrint}/o ) {
$utf8_seq .= $codepoint;
} else { # not printable
next if $config->{skip_unprintable};
# include unprintable or replace it
$utf8_seq .= $config->{replace_unprintable} ? '<27>' : $codepoint;
}
$codepoints_printed++;
if ($config->{separator}) {
if ($config->{newlines_every} && $codepoints_this_line++ == $config->{newlines_every}) {
$utf8_seq .= "\n";
$codepoints_this_line = 0;
} else {
$utf8_seq .= $config->{separator};
}
}
}
utf8::upgrade($utf8_seq);
if ($config->{writefiles}) {
my $filebasename = 'utf8_sequence_' .
(sprintf '%#x', $config->{start}) .
'-' .
(sprintf '%#x', $config->{end}) .
($config->{skip_unassigned} ? '_assigned' : '_including-unassigned') .
($config->{skip_unprintable} ? '_printable' : '_including-unprintable') .
(!$config->{skip_unprintable} ?
($config->{replace_unprintable} ? '-replaced' : '-asis') :
''
) .
($config->{separator} ?
($config->{newlines_every} ? '' : '_without-newlines') :
'_unseparated'
);
my $title = 'UTF-8 codepoint sequence' .
($config->{skip_unassigned} ? ' of assigned' : ' of assinged and unassigned') .
($config->{skip_unprintable} ? ', printable' : ', with unprintable') .
(!$config->{skip_unprintable} ?
($config->{replace_unprintable} ? ' codepoints replaced' : ' codepoints as-is') :
' codepoints'
) .
' in the range ' .
(sprintf '%#x', $config->{start}) .
'-' .
(sprintf '%#x', $config->{end}) .
($config->{newlines_every} ? '' : ', as a long string without newlines');
my $html_pre_custom = $html_pre;
$html_pre_custom =~ s|UTF\-8 codepoint sequence|$title|;
my $filename = ${output_directory} . ($config->{separator} ? '' : 'un') . 'separated/' . ${filebasename};
if ($config->{writefiles} =~ /te?xt/) {
open FH, ">${filename}.txt" or die "cannot open $filename: $!";
print FH $utf8_seq;
close FH;
}
if ($config->{writefiles} =~ /html/) {
open FH, ">${filename}_unescaped.html" or die "cannot open $filename: $!";
print FH $html_pre_custom, $utf8_seq, $html_post;
close FH;
}
# open FH, ">${output_directory}${filebasename}_escaped.html";
# print FH $html_pre_custom, HTML::Entities::encode_entities($utf8_seq), $html_post;
# close FH;
print "Output $title ($codepoints_printed codepoints)\n";
}
return $utf8_seq;
}