mirror of
synced 2025-03-04 21:41:01 +01:00
Consolidate c{99,11,23}std.pl into cstd.pl
This commit is contained in:
@ -1,272 +0,0 @@
#!/usr/bin/env perl
# SPDX-FileCopyrightText: 2021 Pragmatic Software <pragma78@gmail.com>
# SPDX-License-Identifier: MIT
use warnings;
use strict;
# Option handling for the C23 (n3047) lookup script. All arguments are joined
# into one string and every recognised option is removed from that string by a
# substitution, so whatever is left over becomes the free-text search.
# NOTE(review): no closing braces are visible in this listing and the usage
# string below has no visible `print`, so some lines appear to be missing from
# this view — confirm against the real file before editing.
# Verbosity level consulted by the debug prints further down.
my $debug = 0;
# for paragraphs
my $search = join ' ', @ARGV;
# No arguments at all: usage text, then exit with status 0.
if (not length $search) {
"Usage: c23std [-list] [-n#] [-section <section>] [search text] [-text <regex>] -- `section` must be in the form of `X.Y[pZ]` where `X` and `Y` are section/chapter and, optionally, `pZ` is paragraph. If both `section` and `search text` are specified, then the search space will be within the specified section. Use `-n <n>` to skip to the nth match. To list only the section numbers containing 'search text', add -list. To display specific text, use `-text <regex>`.\n";
exit 0;
my ($section, $paragraph, $section_specified, $paragraph_specified, $match, $list_only, $list_titles, $match_text);
$section_specified = 0;
$paragraph_specified = 0;
# The section comes from `-section <value>` or, failing that, from the first
# bare token shaped like `X.Y...`; either way the matched text leaves $search.
if ($search =~ s/-section\s*([A-Z0-9\.p]+)//i or $search =~ s/\b([A-Z0-9]+\.[0-9\.p]+)//i) {
$section = $1;
# A `p<digits>` part selects a paragraph and is stripped from $section.
if ($section =~ s/p(\d+)//i) {
$paragraph = $1;
# NOTE(review): $USER_SPECIFIED is not declared in the lines visible here even
# though `use strict` is in effect — confirm where it is defined.
$paragraph_specified = $USER_SPECIFIED;
} else {
$paragraph = 1;
# A section with no dot at all (e.g. "6") gets a trailing dot appended.
$section = "$section." if $section =~ m/^[A-Z0-9]+$/i;
$section_specified = 1;
# `-n <digits>` picks which match to display; the default is the first.
if ($search =~ s/-n\s*(\d+)//) {
$match = $1;
} else {
$match = 1;
# `-list` and `-titles` set the same two flags.
if ($search =~ s/-list//i) {
$list_only = 1;
$list_titles = 1; # Added here instead of removing -titles option
if ($search =~ s/-titles//i) {
$list_only = 1;
$list_titles = 1;
# `-text` captures one space-free token to use as the display regex.
if ($search =~ s/-text ([^ ]+)//) {
$match_text = $1;
# Trim whatever remains of the search text.
$search =~ s/^\s+//;
$search =~ s/\s+$//;
# With no section given, fall back to section "1.", paragraph 1.
if (not defined $section) {
$section = "1.";
$paragraph = 1;
# -list needs something to search for.
if ($list_only and not length $search) {
print "You must specify some search text to use with -list.\n";
exit 0;
# Read the whole of n3047.out into $text and set up the state used by the scan.
open FH, "<n3047.out" or die "Could not open n3047: $!";
my @contents = <FH>;
close FH;
my $text = join '', @contents;
# Drop carriage returns so the patterns below only have to deal with "\n".
$text =~ s/\r//g;
my $result;
my $found_section = "";
my $found_section_title = "";
my $section_title;
my $found_paragraph;
my $found = 0;
my $matches = 0;
my $this_section;
# Separator placed before each -list entry; it is still "" when nothing has
# been listed, which later code uses as a "no list output" test.
my $comma = "";
if ($list_only) { $result = "Sections containing '$search':\n "; }
# Build the search pattern: escape regex metacharacters, undo the escaping of
# spaces, then let any whitespace run in the query match any whitespace run in
# the text.
my $qsearch = quotemeta $search;
$qsearch =~ s/\\ / /g;
$qsearch =~ s/\s+/\\s+/g;
# Main scan over $text. Each iteration finds the next heading-like line start
# (up to four spaces, then letters/digits, a dot and optional digits/dots).
# NOTE(review): closing braces and possibly other short statements are not
# visible in this listing, so the exact nesting below cannot be read off here.
while ($text =~ m/^\s{0,4}([0-9A-Z]+\.[0-9\.]*)/msg) {
$this_section = $1;
print "----------------------------------\n" if $debug >= 2;
print "Processing section [$this_section]\n" if $debug;
# $section is interpolated as a pattern, so its dots match any character.
if ($section_specified and $this_section !~ m/^$section/i) {
print "No section match, skipping.\n" if $debug >= 4;
my $section_text;
# Both matches use /g on $text, so this one resumes where the heading match
# stopped and captures everything up to the next non-FOOTNOTE heading.
if ($text =~ m/(.*?)^(?=\s{0,4}(?!FOOTNOTE)[0-9A-Z]+\.)/msg) { $section_text = $1; }
else {
print "No section text, end of file marker found.\n" if $debug >= 4;
# When a FOOTNOTE section was requested, trim the captured text.
if ($section =~ /FOOTNOTE/i) {
$section_text =~ s/^\s{4}//ms;
$section_text =~ s/^\s{4}FOOTNOTE.*//msi;
$section_text =~ s/^\d.*//ms;
# Otherwise the first line of the captured text is taken as the title; an
# empty first line leaves the previous title in place.
} elsif ($section_text =~ m/(.*?)$/msg) {
$section_title = $1 if length $1;
$section_title =~ s/^\s+//;
$section_title =~ s/\s+$//;
print "$this_section [$section_title]\n" if $debug >= 2;
# Paragraphs: digits at a line start followed by one whitespace character,
# running up to the next line that starts with a digit; the second pattern
# picks up a final paragraph that has no successor.
while ($section_text =~ m/^(\d+)\s(.*?)^(?=\d)/msgic or $section_text =~ m/^(\d+)\s(.*)/msgi) {
my $p = $1;
my $t = $2;
print "paragraph $p: [$t]\n" if $debug >= 3;
# Explicit paragraph request without search text: take that paragraph.
if ($paragraph_specified == $USER_SPECIFIED and not length $search and $p == $paragraph) {
$result = $t if not $found;
$found_paragraph = $p;
$found_section = $this_section;
$found_section_title = $section_title;
$found = 1;
# Text search; the eval traps errors raised while matching the pattern.
if (length $search) {
eval {
if ($t =~ m/\b$qsearch\b/mis or $section_title =~ m/\b$qsearch\b/mis) {
if ($matches >= $match) {
if ($list_only) {
$result .= sprintf("%s%-15s", $comma, $this_section . "p" . $p);
$result .= " $section_title" if $list_titles;
$comma = ",\n ";
} else {
if (not $found) {
$result = $t;
$found_section = $this_section;
$found_section_title = $section_title;
$found_paragraph = $p;
$paragraph_specified = $RESULTS_SPECIFIED;
$found = 1;
if ($@) {
print "Error in search regex; you may need to escape characters such as *, ?, ., etc.\n";
exit 0;
last if $found && $paragraph_specified == $USER_SPECIFIED;
# A specific paragraph was requested but nothing was found for it.
if ($paragraph_specified == $USER_SPECIFIED) {
if (length $search) { print "No such text '$search' in paragraph $paragraph of section $section of n3047.\n"; }
else { print "No such paragraph $paragraph in section $section of n3047.\n"; }
exit 0;
# NOTE(review): $section_specified is initialised to 0 earlier in the script,
# so `defined` is always true here and this test effectively depends only on
# the search text being empty — confirm that is intended.
if (defined $section_specified and not length $search) {
$found = 1;
$found_section = $this_section;
$found_section_title = $section_title;
$found_paragraph = $paragraph;
$result = $section_text;
# Nothing found and nothing listed: report what was missing and exit.
if (not $found and $comma eq "") {
$search =~ s/\\s\+/ /g;
if ($section_specified) {
print "No such text '$search' found within section '$section' in C23 Draft Standard (n3047).\n" if length $search;
print "No such section '$section' in C23 Draft Standard (n3047).\n" if not length $search;
exit 0;
print "No such section '$section' in C23 Draft Standard (n3047).\n" if not length $search;
print "No such text '$search' found in C23 Draft Standard (n3047).\n" if length $search;
exit 0;
# Format and print the result.
# NOTE(review): the title is interpolated as a pattern without quoting, so
# regex metacharacters in a title would be interpreted — confirm acceptable.
$result =~ s/$found_section_title// if length $found_section_title;
# Trim the ends and collapse whitespace runs to single spaces.
$result =~ s/^\s+//;
$result =~ s/\s+$//;
$result =~ s/\s+/ /g;
$result =~ s/[\n\r]/ /g;
if ($matches > 1 and not $list_only) { print "Displaying $match of $matches matches: "; }
# $comma is still "" when no -list entries were produced; in that case print
# the section id, optional paragraph, the URL and the bracketed title.
if ($comma eq "") {
print $found_section;
print "p" . $found_paragraph if $paragraph_specified;
print "http://www.iso-9899.info/n3047.html\#$found_section";
print "p" . $found_paragraph if $paragraph_specified;
print "\n\n";
print "[", $found_section_title, "]\n\n" if length $found_section_title;
# Remove a subsection heading word left dangling at the end of the text.
$result =~ s/\s*Constraints\s*$//;
$result =~ s/\s*Semantics\s*$//;
$result =~ s/\s*Description\s*$//;
$result =~ s/\s*Returns\s*$//;
$result =~ s/\s*Runtime-constraints\s*$//;
$result =~ s/\s*Recommended practice\s*$//;
# -text: show only the part of the result matched by the user's regex.
if (length $match_text) {
my $match_result = $result;
$match_result =~ s/\s+/ /g;
# The eval traps errors from the user-supplied pattern; /p makes
# ${^PREMATCH} and ${^POSTMATCH} available after the match.
my $match = eval {
my @matches = ($match_result =~ m/($match_text)/msp);
# More than one element means the user's pattern has its own capture groups:
# drop the whole-match element and keep only the non-empty groups.
if (@matches > 1) {
shift @matches;
@matches = grep { length $_ } @matches;
return [${^PREMATCH}, join (' ... ', @matches), ${^POSTMATCH}];
if ($@) {
print "Error in -text option: $@\n";
exit 1;
# Rebuild the result as "... <match> ...", with an ellipsis only on a side
# that actually had surrounding text.
$result = '';
if (length $match->[0]) {
$result = '... ';
if (length $match->[1]) {
$result .= $match->[1];
} else {
$result = "No text found for `$match_text`.";
if (length $match->[2]) {
$result .= ' ...';
print "$result\n";
@ -1,272 +0,0 @@
#!/usr/bin/env perl
# SPDX-FileCopyrightText: 2021 Pragmatic Software <pragma78@gmail.com>
# SPDX-License-Identifier: MIT
use warnings;
use strict;
# Option handling for the C99 (n1256) lookup script. All arguments are joined
# into one string and every recognised option is removed from that string by a
# substitution, so whatever is left over becomes the free-text search.
# NOTE(review): no closing braces are visible in this listing and the usage
# string below has no visible `print`, so some lines appear to be missing from
# this view — confirm against the real file before editing.
# Verbosity level consulted by the debug prints further down.
my $debug = 0;
# for paragraphs
my $search = join ' ', @ARGV;
# No arguments at all: usage text, then exit with status 0.
if (not length $search) {
"Usage: c99std [-list] [-n#] [-section <section>] [search text] [-text <regex>] -- `section` must be in the form of `X.Y[pZ]` where `X` and `Y` are section/chapter and, optionally, `pZ` is paragraph. If both `section` and `search text` are specified, then the search space will be within the specified section. Use `-n <n>` to skip to the nth match. To list only the section numbers containing 'search text', add -list. To display specific text, use `-text <regex>`.\n";
exit 0;
my ($section, $paragraph, $section_specified, $paragraph_specified, $match, $list_only, $list_titles, $match_text);
$section_specified = 0;
$paragraph_specified = 0;
# The section comes from `-section <value>` or, failing that, from the first
# bare token shaped like `X.Y...`; either way the matched text leaves $search.
if ($search =~ s/-section\s*([A-Z0-9\.p]+)//i or $search =~ s/\b([A-Z0-9]+\.[0-9\.p]+)//i) {
$section = $1;
# A `p<digits>` part selects a paragraph and is stripped from $section.
if ($section =~ s/p(\d+)//i) {
$paragraph = $1;
# NOTE(review): $USER_SPECIFIED is not declared in the lines visible here even
# though `use strict` is in effect — confirm where it is defined.
$paragraph_specified = $USER_SPECIFIED;
} else {
$paragraph = 1;
# A section with no dot at all (e.g. "6") gets a trailing dot appended.
$section = "$section." if $section =~ m/^[A-Z0-9]+$/i;
$section_specified = 1;
# `-n <digits>` picks which match to display; the default is the first.
if ($search =~ s/-n\s*(\d+)//) {
$match = $1;
} else {
$match = 1;
# `-list` and `-titles` set the same two flags.
if ($search =~ s/-list//i) {
$list_only = 1;
$list_titles = 1; # Added here instead of removing -titles option
if ($search =~ s/-titles//i) {
$list_only = 1;
$list_titles = 1;
# `-text` captures one space-free token to use as the display regex.
if ($search =~ s/-text ([^ ]+)//) {
$match_text = $1;
# Trim whatever remains of the search text.
$search =~ s/^\s+//;
$search =~ s/\s+$//;
# With no section given, fall back to section "1.", paragraph 1.
if (not defined $section) {
$section = "1.";
$paragraph = 1;
# -list needs something to search for.
if ($list_only and not length $search) {
print "You must specify some search text to use with -list.\n";
exit 0;
# Read the whole of n1256.out into $text and set up the state used by the scan.
open FH, "<n1256.out" or die "Could not open n1256: $!";
my @contents = <FH>;
close FH;
my $text = join '', @contents;
# Drop carriage returns so the patterns below only have to deal with "\n".
$text =~ s/\r//g;
my $result;
my $found_section = "";
my $found_section_title = "";
my $section_title;
my $found_paragraph;
my $found = 0;
my $matches = 0;
my $this_section;
# Separator placed before each -list entry; it is still "" when nothing has
# been listed, which later code uses as a "no list output" test.
my $comma = "";
if ($list_only) { $result = "Sections containing '$search':\n "; }
# Build the search pattern: escape regex metacharacters, undo the escaping of
# spaces, then let any whitespace run in the query match any whitespace run in
# the text.
my $qsearch = quotemeta $search;
$qsearch =~ s/\\ / /g;
$qsearch =~ s/\s+/\\s+/g;
# Main scan over $text. Each iteration finds the next heading-like line start
# (up to four spaces, then letters/digits, a dot and optional digits/dots).
# NOTE(review): closing braces and possibly other short statements are not
# visible in this listing, so the exact nesting below cannot be read off here.
while ($text =~ m/^\s{0,4}([0-9A-Z]+\.[0-9\.]*)/msg) {
$this_section = $1;
print "----------------------------------\n" if $debug >= 2;
print "Processing section [$this_section]\n" if $debug;
# $section is interpolated as a pattern, so its dots match any character.
if ($section_specified and $this_section !~ m/^$section/i) {
print "No section match, skipping.\n" if $debug >= 4;
my $section_text;
# Both matches use /g on $text, so this one resumes where the heading match
# stopped and captures everything up to the next non-FOOTNOTE heading.
if ($text =~ m/(.*?)^(?=\s{0,4}(?!FOOTNOTE)[0-9A-Z]+\.)/msg) { $section_text = $1; }
else {
print "No section text, end of file marker found.\n" if $debug >= 4;
# When a FOOTNOTE section was requested, trim the captured text.
if ($section =~ /FOOTNOTE/i) {
$section_text =~ s/^\s{4}//ms;
$section_text =~ s/^\s{4}FOOTNOTE.*//msi;
$section_text =~ s/^\d.*//ms;
# Otherwise the first line of the captured text is taken as the title; an
# empty first line leaves the previous title in place.
} elsif ($section_text =~ m/(.*?)$/msg) {
$section_title = $1 if length $1;
$section_title =~ s/^\s+//;
$section_title =~ s/\s+$//;
print "$this_section [$section_title]\n" if $debug >= 2;
# Paragraphs: digits at a line start followed by one whitespace character,
# running up to the next line that starts with a digit; the second pattern
# picks up a final paragraph that has no successor.
while ($section_text =~ m/^(\d+)\s(.*?)^(?=\d)/msgic or $section_text =~ m/^(\d+)\s(.*)/msgi) {
my $p = $1;
my $t = $2;
print "paragraph $p: [$t]\n" if $debug >= 3;
# Explicit paragraph request without search text: take that paragraph.
if ($paragraph_specified == $USER_SPECIFIED and not length $search and $p == $paragraph) {
$result = $t if not $found;
$found_paragraph = $p;
$found_section = $this_section;
$found_section_title = $section_title;
$found = 1;
# Text search; the eval traps errors raised while matching the pattern.
if (length $search) {
eval {
if ($t =~ m/\b$qsearch\b/mis or $section_title =~ m/\b$qsearch\b/mis) {
if ($matches >= $match) {
if ($list_only) {
$result .= sprintf("%s%-15s", $comma, $this_section . "p" . $p);
$result .= " $section_title" if $list_titles;
$comma = ",\n ";
} else {
if (not $found) {
$result = $t;
$found_section = $this_section;
$found_section_title = $section_title;
$found_paragraph = $p;
$paragraph_specified = $RESULTS_SPECIFIED;
$found = 1;
if ($@) {
print "Error in search regex; you may need to escape characters such as *, ?, ., etc.\n";
exit 0;
last if $found && $paragraph_specified == $USER_SPECIFIED;
# A specific paragraph was requested but nothing was found for it.
if ($paragraph_specified == $USER_SPECIFIED) {
if (length $search) { print "No such text '$search' in paragraph $paragraph of section $section of n1256.\n"; }
else { print "No such paragraph $paragraph in section $section of n1256.\n"; }
exit 0;
# NOTE(review): $section_specified is initialised to 0 earlier in the script,
# so `defined` is always true here and this test effectively depends only on
# the search text being empty — confirm that is intended.
if (defined $section_specified and not length $search) {
$found = 1;
$found_section = $this_section;
$found_section_title = $section_title;
$found_paragraph = $paragraph;
$result = $section_text;
# Nothing found and nothing listed: report what was missing and exit.
if (not $found and $comma eq "") {
$search =~ s/\\s\+/ /g;
if ($section_specified) {
print "No such text '$search' found within section '$section' in C99 Draft Standard (n1256).\n" if length $search;
print "No such section '$section' in C99 Draft Standard (n1256).\n" if not length $search;
exit 0;
print "No such section '$section' in C99 Draft Standard (n1256).\n" if not length $search;
print "No such text '$search' found in C99 Draft Standard (n1256).\n" if length $search;
exit 0;
# Format and print the result.
# NOTE(review): the title is interpolated as a pattern without quoting, so
# regex metacharacters in a title would be interpreted — confirm acceptable.
$result =~ s/$found_section_title// if length $found_section_title;
# Trim the ends and collapse whitespace runs to single spaces.
$result =~ s/^\s+//;
$result =~ s/\s+$//;
$result =~ s/\s+/ /g;
$result =~ s/[\n\r]/ /g;
if ($matches > 1 and not $list_only) { print "Displaying $match of $matches matches: "; }
# $comma is still "" when no -list entries were produced; in that case print
# the section id, optional paragraph, the URL and the bracketed title.
if ($comma eq "") {
print $found_section;
print "p" . $found_paragraph if $paragraph_specified;
print "http://www.iso-9899.info/n1256.html\#$found_section";
print "p" . $found_paragraph if $paragraph_specified;
print "\n\n";
print "[", $found_section_title, "]\n\n" if length $found_section_title;
# Remove a subsection heading word left dangling at the end of the text.
$result =~ s/\s*Constraints\s*$//;
$result =~ s/\s*Semantics\s*$//;
$result =~ s/\s*Description\s*$//;
$result =~ s/\s*Returns\s*$//;
$result =~ s/\s*Runtime-constraints\s*$//;
$result =~ s/\s*Recommended practice\s*$//;
# -text: show only the part of the result matched by the user's regex.
if (length $match_text) {
my $match_result = $result;
$match_result =~ s/\s+/ /g;
# The eval traps errors from the user-supplied pattern; /p makes
# ${^PREMATCH} and ${^POSTMATCH} available after the match.
my $match = eval {
my @matches = ($match_result =~ m/($match_text)/msp);
# More than one element means the user's pattern has its own capture groups:
# drop the whole-match element and keep only the non-empty groups.
if (@matches > 1) {
shift @matches;
@matches = grep { length $_ } @matches;
return [${^PREMATCH}, join (' ... ', @matches), ${^POSTMATCH}];
if ($@) {
print "Error in -text option: $@\n";
exit 1;
# Rebuild the result as "... <match> ...", with an ellipsis only on a side
# that actually had surrounding text.
$result = '';
if (length $match->[0]) {
$result = '... ';
if (length $match->[1]) {
$result .= $match->[1];
} else {
$result = "No text found for `$match_text`.";
if (length $match->[2]) {
$result .= ' ...';
print "$result\n";
applets/c11std.pl → applets/cstd.pl
applets/c11std.pl → applets/cstd.pl
@ -6,80 +6,117 @@
use warnings;
use strict;
# Start-up and option handling of the consolidated cstd.pl script.
# NOTE(review): this region reads like a diff hunk shown without its +/-
# markers: superseded and replacement statements sit next to each other (two
# declarations of $section/$match/..., `$USER_SPECIFIED` beside
# `USER_SPECIFIED`, `exit 0` beside `exit 1`, two `open` calls), and closing
# braces and some call wrappers are not visible. Confirm against the real
# file which lines are live before changing anything.
my $debug = 0;
use Getopt::Long qw/GetOptionsFromArray/;
use Encode;
# for paragraphs
# Maps each accepted -std value to the converted text file that is searched.
my %standards = (
C99 => 'n1256.out',
C11 => 'n1570.out',
C23 => 'n3047.out',
my $search = join ' ', @ARGV;
# Decode the arguments as UTF-8; the CHECK argument 1 is presumably Encode's
# FB_CROAK (die on malformed input) — verify against the Encode docs.
@ARGV = map { decode('UTF-8', $_, 1) } @ARGV;
if (not length $search) {
"Usage: c11std [-list] [-n#] [-section <section>] [search text] [-text <regex>] -- `section` must be in the form of `X.Y[pZ]` where `X` and `Y` are section/chapter and, optionally, `pZ` is paragraph. If both `section` and `search text` are specified, then the search space will be within the specified section. Use `-n <n>` to skip to the nth match. To list only the section numbers containing 'search text', add -list. To display specific text, use `-text <regex>`.\n";
exit 0;
my ($std, $search, $section, $paragraph, $debug);
my ($match, $list_only, $match_text);
# Warnings are captured into $opt_error instead of being printed; it is
# reported together with the usage text further down. Presumably these are
# the warnings Getopt::Long emits for bad options — the call itself is not
# visible here.
my $opt_error;
local $SIG{__WARN__} = sub {
$opt_error = shift;
chomp $opt_error;
# Option specifications and the variables they fill.
'std=s' => \$std,
'section|s=s' => \$section,
'num|n=i' => \$match,
'text|t=s' => \$match_text,
'list|l' => \$list_only,
'debug|d=i' => \$debug,
# Defaults for options that were not supplied.
$std //= 'C99';
$section //= '';
$match //= 1;
$list_only //= 0;
$debug //= 0;
# -std is matched case-insensitively against the keys of %standards.
$std = uc $std;
if (not exists $standards{$std}) {
print "Invalid -std=$std selected. Valid -std= values are: ", join(', ', sort keys %standards), "\n";
exit 1;
my $usage = "Usage: $std [-list] [-n#] [-section <section>] [search text] [-text <regex>] -- `section` must be in the form of `X.Y[pZ]` where `X` and `Y` are section/chapter and, optionally, `pZ` is paragraph. If both `section` and `search text` are specified, then the search space will be within the specified section. Use `-n <n>` to skip to the nth match. To list only the section numbers containing 'search text', add -list. To display specific text, use `-text <regex>`.\n";
if ($opt_error) {
print "$opt_error: $usage\n";
exit 1;
# Whatever is left in @ARGV is the free-text search.
$search = "@ARGV";
if (!length $section && !length $search) {
print $usage;
exit 1;
my ($section, $paragraph, $section_specified, $paragraph_specified, $match, $list_only, $list_titles, $match_text);
# for paragraphs
use constant {
$section_specified = 0;
$paragraph_specified = 0;
my $section_specified = length $section ? 1 : 0;
my $paragraph_specified = 0;
# A bare token shaped like `X.Y...` in the search text is taken as the section
# and removed from $search.
if ($search =~ s/-section\s*([A-Z0-9\.p]+)//i or $search =~ s/\b([A-Z0-9]+\.[0-9\.p]+)//i) {
if ($search =~ s/\b([A-Z0-9]+\.[0-9.p]*)//i) {
$section = $1;
# A `p<digits>` part selects a paragraph and is stripped from $section.
if ($section =~ s/p(\d+)//i) {
$paragraph = $1;
$paragraph_specified = $USER_SPECIFIED;
$paragraph_specified = USER_SPECIFIED;
} else {
$paragraph = 1;
$section = "$section." if $section =~ m/^[A-Z0-9]+$/i;
$section_specified = 1;
if ($search =~ s/-n\s*(\d+)//) {
$match = $1;
} else {
$match = 1;
if ($search =~ s/-list//i) {
$list_only = 1;
$list_titles = 1; # Added here instead of removing -titles option
if ($search =~ s/-titles//i) {
$list_only = 1;
$list_titles = 1;
if ($search =~ s/-text ([^ ]+)//) {
$match_text = $1;
# add trailing dot if missing
if ($section =~ /^[A-Z0-9]+$/i) {
$section .= '.';
$search =~ s/^\s+//;
$search =~ s/\s+$//;
# With no section given, fall back to section "1.", paragraph 1.
if (not defined $section) {
if (not length $section) {
$section = "1.";
$paragraph = 1;
if ($list_only and not length $search) {
print "You must specify some search text to use with -list.\n";
exit 0;
exit 1;
open FH, "<n1570.out" or die "Could not open n1570: $!";
open FH, "<:encoding(UTF-8)", $standards{$std} or die "Could not open $standards{$std}: $!";
my @contents = <FH>;
close FH;
my $text = join '', @contents;
$text =~ s/\r//g;
# $std_name is the data file name without its final extension
# (e.g. 'n1570.out' becomes 'n1570'); it is used in messages and URLs.
my $std_name = $standards{$std};
$std_name =~ s/(.*)\..*$/$1/;
my $result;
my $found_section = "";
my $found_section_title = "";
@ -96,7 +133,7 @@ my $qsearch = quotemeta $search;
$qsearch =~ s/\\ / /g;
$qsearch =~ s/\s+/\\s+/g;
while ($text =~ m/^\s{0,4}([0-9A-Z]+\.[0-9\.]*)/msg) {
while ($text =~ m/^([0-9A-Z]+\.[0-9.]*)/msg) {
$this_section = $1;
print "----------------------------------\n" if $debug >= 2;
@ -109,15 +146,16 @@ while ($text =~ m/^\s{0,4}([0-9A-Z]+\.[0-9\.]*)/msg) {
my $section_text;
if ($text =~ m/(.*?)^(?=\s{0,4}(?!FOOTNOTE)[0-9A-Z]+\.)/msg) { $section_text = $1; }
else {
if ($text =~ m/(.*?)^(?=\s{0,4}(?!Footnote)[0-9A-Z]+\.)/msg) {
$section_text = $1;
} else {
print "No section text, end of file marker found.\n" if $debug >= 4;
if ($section =~ /FOOTNOTE/i) {
if ($section =~ /Footnote/i) {
$section_text =~ s/^\s{4}//ms;
$section_text =~ s/^\s{4}FOOTNOTE.*//msi;
$section_text =~ s/^\s{4}Footnote.*//msi;
$section_text =~ s/^\d.*//ms;
} elsif ($section_text =~ m/(.*?)$/msg) {
$section_title = $1 if length $1;
@ -133,7 +171,7 @@ while ($text =~ m/^\s{0,4}([0-9A-Z]+\.[0-9\.]*)/msg) {
print "paragraph $p: [$t]\n" if $debug >= 3;
if ($paragraph_specified == $USER_SPECIFIED and not length $search and $p == $paragraph) {
if ($paragraph_specified == USER_SPECIFIED and not length $search and $p == $paragraph) {
$result = $t if not $found;
$found_paragraph = $p;
$found_section = $this_section;
@ -149,7 +187,7 @@ while ($text =~ m/^\s{0,4}([0-9A-Z]+\.[0-9\.]*)/msg) {
if ($matches >= $match) {
if ($list_only) {
$result .= sprintf("%s%-15s", $comma, $this_section . "p" . $p);
$result .= " $section_title" if $list_titles;
$result .= " $section_title";
$comma = ",\n ";
} else {
if (not $found) {
@ -157,7 +195,7 @@ while ($text =~ m/^\s{0,4}([0-9A-Z]+\.[0-9\.]*)/msg) {
$found_section = $this_section;
$found_section_title = $section_title;
$found_paragraph = $p;
$paragraph_specified = $RESULTS_SPECIFIED;
$paragraph_specified = RESULTS_SPECIFIED;
$found = 1;
@ -165,19 +203,23 @@ while ($text =~ m/^\s{0,4}([0-9A-Z]+\.[0-9\.]*)/msg) {
if ($@) {
print "Error in search regex; you may need to escape characters such as *, ?, ., etc.\n";
if (my $err = $@) {
$err =~ s/.* at .*$//;
print "Error in search regex: $err\n";
exit 0;
last if $found && $paragraph_specified == $USER_SPECIFIED;
last if $found && $paragraph_specified == USER_SPECIFIED;
if ($paragraph_specified == $USER_SPECIFIED) {
if (length $search) { print "No such text '$search' in paragraph $paragraph of section $section of n1570.\n"; }
else { print "No such paragraph $paragraph in section $section of n1570.\n"; }
exit 0;
if ($paragraph_specified == USER_SPECIFIED) {
if (length $search) {
print "No such text '$search' in paragraph $paragraph of section $section of $std_name.\n";
} else {
print "No such paragraph $paragraph in section $section of $std_name.\n";
exit 1;
if (defined $section_specified and not length $search) {
@ -192,15 +234,17 @@ while ($text =~ m/^\s{0,4}([0-9A-Z]+\.[0-9\.]*)/msg) {
if (not $found and $comma eq "") {
$search =~ s/\\s\+/ /g;
if ($section_specified) {
print "No such text '$search' found within section '$section' in C11 Draft Standard (n1570).\n" if length $search;
print "No such section '$section' in C11 Draft Standard (n1570).\n" if not length $search;
exit 0;
if (length $search) {
print "No such text '$search' found ";
print "No such section '$section' in C11 Draft Standard (n1570).\n" if not length $search;
print "No such text '$search' found in C11 Draft Standard (n1570).\n" if length $search;
exit 0;
if ($section_specified) {
print "within section '$section' ";
} else {
print "No such section '$section' ";
print "in $std Draft Standard ($std_name).\n";
exit 1;
$result =~ s/$found_section_title// if length $found_section_title;
@ -221,7 +265,7 @@ if ($comma eq "") {
print "p" . $found_paragraph if $paragraph_specified;
print "http://www.iso-9899.info/n1570.html\#$found_section";
print "http://www.iso-9899.info/$std_name.html\#$found_section";
print "p" . $found_paragraph if $paragraph_specified;
print "\n\n";
print "[", $found_section_title, "]\n\n" if length $found_section_title;
@ -5,234 +5,260 @@
# ugly and hacked together
# n1256: pdftotext -layout -y 75 -H 650 -W 1000 n1256.pdf n1256.in
# n1570: pdftotext -layout -y 80 -H 650 -W 1000 n1570.pdf n1570.in
# n3047: pdftotext -layout -y 75 -H 700 -W 1000 n3047.pdf n3047.in
use warnings;
use strict;
use HTML::Entities;
use Data::Dumper;
# Start-up of the generator script: debug level, forward declarations, input
# loading and the shared state filled in by gen_data.
# NOTE(review): several variables are declared twice here ($debug,
# $section_title, $this_section, $last_section) and there are two `open`
# calls, which suggests removed and added diff lines are shown together —
# confirm which ones are live.
my $debug = 1000;
my $debug = 100;
sub gen_data;
sub gen_txt;
sub gen_html;
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
open FH, "<n1256.txt" or die "Could not open n1256.txt: $!";
# The input file name is taken from the command-line arguments.
my $input = "@ARGV";
#open FH, "<n1570.txt" or die "Could not open n1570.txt: $!";
if (not length $input) {
print STDERR "Usage: $0 <input .txt file>\n";
exit 1;
open FH, "<:encoding(UTF-8)", $input or die "Could not open $input: $!";
my @contents = <FH>;
close FH;
my $text = join '', @contents;
# Drop carriage returns so the patterns only have to deal with "\n".
$text =~ s/\r//g;
my ($section_title, $this_section);
my $section_title;
my $this_section = '';
# Parsed output: keyed by section id, and by section id + "p<N>" for
# paragraphs (see the assignments in gen_data).
my %sections;
my $last_section_number = 0;
my $section_number = 0;
my $last_section;
my @last_section_number;
my @section_number;
my $last_section = '';
# Footnote texts indexed by footnote number, plus sequence-check counters.
my @footnotes;
my $footnote = 0;
my $last_footnote = 0;
# gen_data: walks $text heading by heading and fills %sections (title/text per
# section, text per "<section>p<N>" paragraph) and @footnotes.
# NOTE(review): this listing shows two versions of many statements side by
# side (for example two `while ($text =~ ...)` headers and two footnote
# loops) and no closing braces, so it looks like an unmarked diff; the notes
# below describe individual statements only, not the overall control flow.
sub gen_data {
while ($text =~ m/^\s{0,5}([0-9A-Z]+\.[0-9\.]*)/msg) {
$last_section_number = $section_number;
$last_section = $this_section;
$this_section = $1;
# This variant of the heading pattern also accepts a leading form feed.
while ($text =~ m/^\f?\s{0,5}([0-9A-Z]+\.[0-9\.]*)/msg) {
$last_section = $this_section;
$this_section = $1;
($section_number) = $this_section =~ /([^.]+)\./;
@last_section_number = @section_number;
@section_number = split /\./, $this_section;
print STDERR "----------------------------------\n" if $debug;
print STDERR "Processing section [$this_section]; number [$section_number]\n" if $debug;
print STDERR "----------------------------------\n" if $debug;
print STDERR "Processing section [$this_section]\n" if $debug;
print STDERR "this_section: [$this_section]; last_section: [$last_section]\n" if $debug >= 2;
print STDERR "Section diff: ", ($this_section - $last_section), "\n" if $debug >= 2;
my $diff = $section_number - $last_section_number;
print STDERR "Diff: $diff\n" if $debug >= 2;
if ($section_number > 0 and $diff < 0 or $diff > 1) {
print STDERR "Diff out of bounds: $diff\n";
my $section_text;
# Both matches use /g on $text, so this one resumes after the heading and
# captures everything up to the next heading-like line.
if ($text =~ m/(.*?)^(?=\s{0,4}[0-9A-Z]+\.)/msg) { $section_text = $1; }
else {
if ($text =~ m/(.*?)^(?=\f?\s{0,4}[0-9A-Z]+\.)/msg) {
$section_text = $1;
} else {
print STDERR "No section text, end of file marker found.\n";
# The first line of the captured text is taken as the section title.
if ($section_text =~ m/(.*?)$/msg) {
$section_title = $1 if length $1;
$section_title =~ s/^\s+//;
$section_title =~ s/\s+$//;
if (length $1) {
$section_title = $1;
$section_title =~ s/^\s+//;
$section_title =~ s/\s+$//;
print STDERR "+++ set new section title: [$section_title]\n" if $debug;
} else {
print STDERR "--- no length for section title\n" if $debug;
} else {
print STDERR "--- no new section title\n" if $debug;
print STDERR "$this_section [$section_title]\n" if $debug >= 2;
$sections{$this_section}{title} = $section_title;
print STDERR "section text: [$section_text]\n" if $debug >= 2;
($section_text) = $section_text =~ m/\s*(.*)/msg;
if (not $section_text =~ m/^(?=\d+\s)/msg) { $sections{$this_section}{text} = $section_text; }
else {
print STDERR "+++ $this_section [$section_title]\n" if $debug >= 2;
print STDERR "+++ section text: [$section_text]\n" if $debug >= 2;
# No line starts with "<digits><space>": the section has no numbered
# paragraphs and its text is stored as a whole. The `~~` and `ZZZ` strings
# are removed first — presumably placeholder markers in the input; verify.
if (not $section_text =~ m/^(?=\d+\s)/msg) {
print STDERR "??? no paragraphs in section\n" if $debug;
$section_text =~ s/~~//msg;
$section_text =~ s/ZZZ//msg;
$sections{$this_section}{text} = $section_text;
} else {
my $last_p = 0;
my $p = 0;
while ($section_text =~ m/^(\d+)\s(.*?)^(?=\d)/msgc or $section_text =~ m/^(\d+)\s(.*)/msg) {
print STDERR "+++ getting paragraphs for $this_section\n" if $debug;
# Text that precedes the first numbered paragraph is kept in $pretext and
# prepended to the first paragraph below.
my $pretext;
if ($section_text =~ m/^(?!\f?\d+\s)/) {
($pretext) = $section_text =~ m/^(.*?)^(?=\f?\d+\s)/ms;
print STDERR "pretext captured: [$pretext]\n";
while ($section_text =~ m/^\f?(\d+)\s(.*?)^(?=\f?\d)/msgc or $section_text =~ m/^\f?(\d+)\s(.*)/msg) {
$last_p = $p;
$p = $1;
my $t = $2;
my $t = $2;
if (length $pretext) {
$t = "$pretext $t";
$pretext = '';
print STDERR "paragraph $p: [$t]\n" if $debug >= 3;
# Paragraph numbers must go up by exactly one. The second form exempts
# sections matching K.3.9.2 / K.3.9.3 when the input is 'n1570.in'; note the
# dots in that pattern are unescaped and so match any character.
if (($last_p - $p) != -1) { die "Paragraph diff invalid"; }
if ($p - $last_p != 1) {
die "Paragraph diff invalid" unless ($input eq 'n1570.in' && $this_section =~ /^(?:K.3.9.(?:2|3))/);
# check for footnotes
my @new_footnotes;
# A footnote starts on a line of the form "<digits>) text".
while ($t =~ m/^\s*(\d+)\)\s*(.*?)$/mgc) {
$footnote = $1;
my $footnote_text = "$2\n";
print STDERR "processing 1st footnote $footnote [last: $last_footnote]\n" if $debug;
print STDERR "footnote text [$footnote_text]\n" if $debug;
while ($t =~ m/^(\s*)(\d+)\)(\s*)(.*?)$/msg) {
my $leading_spaces = $1;
$footnote = $2;
my $middle_spaces = $3;
my $footnote_text = "$4\n";
print STDERR "1st footnote\n" if $debug;
print STDERR "processing footnote $footnote [last: $last_footnote]\n" if $debug >= 2;
# Footnote numbers must be consecutive across the whole document.
if ($last_footnote - $footnote != -1) {
print STDERR "footnotes dump: \n" if $debug > 5;
shift @footnotes;
my $dump = Dumper(@footnotes) if $debug > 5;
#print STDERR "$dump\n";
die "Footnote diff invalid";
$last_footnote = $footnote;
# Column at which the footnote body starts; used below to decide whether a
# following line still belongs to this footnote.
my $indent = (length $leading_spaces) + (length $footnote) + (length ')') + (length $middle_spaces);
push @new_footnotes, $footnote;
print STDERR "footnote $footnote text [indent=$indent]: [$footnote_text]\n" if $debug >= 4;
print STDERR "footnote $footnote text: [$footnote_text]\n" if $debug >= 4;
# Continue line by line after the footnote's first line.
while ($t =~ m/^(.*?)$/msgc) {
while ($t =~ m/^(.*?)$/mgc) {
my $line = $1;
print STDERR "processing [$line]\n" if $debug;
if ($line =~ m/^(\s*)(\d+)\)(\s*)(.*?)$/msg) {
print STDERR "----------------\n" if $debug >= 1;
print STDERR "footnote $footnote: [$footnote_text]\n" if $debug >= 1;
$footnotes[$footnote] = $footnote_text;
print STDERR "----------------\n" if $debug >= 1;
if ($line =~ m/^\f/mg) {
print STDERR "end of footnote $footnote\n";
$leading_spaces = $1;
$footnote = $2;
$middle_spaces = $3;
$footnote_text = "$4\n";
print STDERR "2nd footnote\n" if $debug >= 2;
print STDERR "processing footnote $footnote [last: $last_footnote]\n" if $debug >= 2;
if ($last_footnote - $footnote != -1) {
print STDERR "footnotes dump: \n";
shift @footnotes;
my $dump = Dumper(@footnotes);
print STDERR "$dump\n" if $debug >= 3;
die "Footnote diff invalid";
$last_footnote = $footnote;
my $indent = (length $leading_spaces) + (length $footnote) + (length ')') + (length $middle_spaces);
print STDERR "footnote $footnote text [indent=$indent]: [$footnote_text]\n" if $debug >= 4;
if (not length $line or $line =~ m/^\s+$/) {
print STDERR "skipping empty line\n";
if (not $line =~ m/^\s{$indent}/msg) {
# A new "<digits>)" line closes the current footnote and opens the next.
if ($line =~ m/^\s*(\d+)\)\s*(.*?)$/mg) {
print STDERR "----------------\n" if $debug >= 1;
print STDERR "+++ added footnote $footnote: [$footnote_text]\n" if $debug >= 1;
$footnotes[$footnote] = $footnote_text;
print STDERR "----------------\n" if $debug >= 1;
$footnote = $1;
$footnote_text = "$2\n";
print STDERR "processing 2nd footnote $footnote [last: $last_footnote]\n" if $debug;
if ($last_footnote - $footnote != -1) {
die "Footnote diff invalid";
$last_footnote = $footnote;
push @new_footnotes, $footnote;
print STDERR "footnote $footnote text: [$footnote_text]\n" if $debug >= 4;
if (not length $line or $line =~ m/^\s+$/) {
print STDERR "footnote $footnote: skipping empty line\n";
} else {
$footnote_text .= "$line\n";
print STDERR "footnote $footnote text: appending [$line]\n" if $debug >= 3;
$footnote_text .= "$line\n";
print STDERR "footnote $footnote text: appending [$line]\n" if $debug >= 3;
print STDERR "----------------\n" if $debug >= 1;
print STDERR "footnote $footnote: [$footnote_text]\n" if $debug >= 1;
print STDERR "----------------\n" if $debug >= 1;
print STDERR "+++ added footnote $footnote: [$footnote_text]\n" if $debug >= 1;
$footnotes[$footnote] = $footnote_text;
print STDERR "----------------\n" if $debug >= 1;
# strip footnotes from section text
foreach my $fn (@new_footnotes) {
# Escape the stored footnote text, then let each run of escaped spaces match
# any amount of whitespace, and cut "<n>) <text>" out of the paragraph.
my $sub = quotemeta $footnotes[$fn];
$sub =~ s/(\\ )+/\\s*/g;
#print STDERR "subbing out [$footnote) $sub]\n";
$t =~ s/^\s*$fn\)\s*$sub//ms;
# Remove form feeds and the `~~` / `ZZZ` strings from the paragraph text.
$t =~ s/\f//g;
$t =~ s/~~//msg;
$t =~ s/ZZZ//msg;
# Paragraphs are stored under "<section>p<N>", text prefixed by the number.
$sections{$this_section . "p$p"}{text} = "$p $t";
print STDERR "+++ added ${this_section}p$p:\n$p $t\n" if $debug;
print STDERR "+++ paragraphs done\n" if $debug;
# bysection: sort comparator (uses $a and $b) for %sections keys such as
# "6.5.2" or "6.5.2p3". Keys are split into dotted section components and an
# optional paragraph number; components are compared left to right and the
# paragraph number is compared last.
# NOTE(review): some statements appear twice in slightly different forms
# (the swap of @k1/@k2, the defaults for $p1/$p2, the final return) and no
# closing braces are visible, which suggests old and new diff lines are
# shown together — confirm which are live.
sub bysection {
my $inverse = 1;
print STDERR "section cmp $a <=> $b\n" if $debug > 10;
# Split "<section>p<paragraph>"; a key without a paragraph gets 0.
my ($a1, $p1) = split /p/, $a;
my ($b1, $p2) = split /p/, $b;
$p1 = 0 if not defined $p1;
$p2 = 0 if not defined $p2;
$p1 //= 0;
$p2 //= 0;
my @k1 = split /\./, $a1;
my @k2 = split /\./, $b1;
my @r;
# Make @k1 the longer list; when the operands are swapped, $inverse flips
# the sign of the final result to compensate.
if ($#k2 > $#k1) {
my @t = @k1;
@k1 = @k2;
@k2 = @t;
my @tk = @k1;
@k1 = @k2;
@k2 = @tk;
my $tp = $p1;
$p1 = $p2;
$p2 = $tp;
$p1 = $p2;
$p2 = $tp;
$inverse = -1;
} else {
$inverse = 1;
print STDERR "k1 vals:\n";
print STDERR Dumper(@k1), "\n";
print STDERR "p1: $p1\n";
print STDERR "k2 vals:\n";
print STDERR Dumper(@k2), "\n";
print STDERR "p2: $p2\n";
my $i = 0;
for (; $i < $#k1 + 1; $i++) {
# A component missing on the shorter side sorts first.
if (not defined $k2[$i]) { $r[$i] = 1; }
else {
print STDERR " cmp k1[$i] ($k1[$i]) vs k2[$i] ($k2[$i])\n" if $debug >= 5;
# The first component is compared as a string, later ones as numbers.
if ($i == 0) { $r[$i] = $k1[$i] cmp $k2[$i]; }
else { $r[$i] = $k1[$i] <=> $k2[$i]; }
print STDERR " r[$i] = $r[$i]\n" if $debug >= 5;
# The paragraph comparison goes into the slot after the last component.
$r[$i] = ($p1 <=> $p2);
print STDERR " $p1 <=> $p2 => r[$i] = $r[$i]\n" if $debug >= 5;
my $ret = 0;
foreach my $rv (@r) {
print STDERR " checking r: $rv\n" if $debug >= 5;
if ($rv != 0) {
$ret = $rv;
$ret = $ret * $inverse;
print STDERR "ret $ret\n" if $debug >= 5;
return $ret;
return $ret * $inverse;
sub gen_txt {
@ -251,14 +277,6 @@ sub gen_txt {
my $section_text = $sections{$this_section}{text};
for ($footnote = 1; $footnote < $#footnotes; $footnote++) {
my $sub = quotemeta $footnotes[$footnote];
$sub =~ s/(\\ )+/\\s*/g;
#print STDERR "subbing out [$footnote) $sub]\n";
$section_text =~ s/^\s*$footnote\)\s*$sub//ms;
while ($section_text =~ m/^(.*?)$/msg) {
my $line = $1;
@ -279,7 +297,7 @@ sub gen_txt {
if ($paren == -1) {
if (length $number and defined $footnotes[$number]) {
print STDERR "Got footnote $number here!\n" if $debug;
$footer .= " FOOTNOTE.$number\n $footnotes[$number]\n";
$footer .= "\nFOOTNOTE.$number) $footnotes[$number]\n";
$paren = 0;
@ -299,72 +317,238 @@ sub gen_txt {
# Wrap a section identifier in an in-page anchor link when that identifier is
# a key of %sections; any other text is handed back unchanged.
sub make_link {
    my ($label) = @_;
    return $label unless exists $sections{$label};
    return "<a href='#$label'>$label</a>";
}
# Run make_link over every section-like reference in the given text: a single
# capital letter or non-zero digit, a dot, then any number of digit groups
# each optionally preceded by a dot. Returns the rewritten text.
sub linkify {
    my ($content) = @_;
    $content =~ s/\b((?:[A-Z]|[1-9])\.(?:\.?[0-9]+)*)\b/make_link($1)/ge;
    return $content;
}
# gen_html: prints the HTML document for the parsed sections to STDOUT.
# NOTE(review): this listing appears to mix two versions of the sub — one that
# renders every section inline, and one that deletes front-matter sections
# and then loops over the remaining keys — and no closing braces are visible.
# The notes below describe individual statements only.
sub gen_html {
print "<html>\n<body>\n";
my $footer = "";
my $paren = 0;
foreach my $this_section (sort bysection keys %sections) {
print STDERR "writing section $this_section\n" if $debug;
print "<a name='", encode_entities $this_section, "'>\n";
# Only keys without a "p" (whole sections, not paragraphs) get a heading.
print "<hr>\n<h3>", encode_entities $this_section, " [", encode_entities $sections{$this_section}{title}, "]</h3>\n" if not $this_section =~ m/p/;
my $section_text = $sections{$this_section}{text};
# Cut each known footnote's "<n>) <text>" out of the section text.
# NOTE(review): the bound `< $#footnotes` stops before the last index, so the
# final footnote is not processed by this loop — confirm whether intended.
for ($footnote = 1; $footnote < $#footnotes; $footnote++) {
my $sub = quotemeta $footnotes[$footnote];
$sub =~ s/(\\ )+/\\s*/g;
#print STDERR "subbing out [$footnote) $sub]\n";
$section_text =~ s/^\s*$footnote\)\s*$sub//ms;
$section_text = encode_entities $section_text;
# Scan each line character by character, collecting digit runs in $number so
# that "<number>)" can be recognised as a footnote reference.
while ($section_text =~ m/^(.*?)$/msg) {
my $line = $1;
print STDERR "paren reset, line [$line]\n" if $debug >= 8;
my $number = "";
while ($line =~ m/(.)/g) {
my $c = $1;
if ($c =~ m/[0-9]/) { $number .= $c; }
elsif ($c eq ' ') { $number = ""; }
elsif ($c eq '(') {
print STDERR "got $paren (\n" if $debug >= 8;
} elsif ($c eq ')') {
print STDERR "got $paren )\n" if $debug >= 8;
if ($paren == -1) {
# A known footnote number: turn the reference into a superscript and queue
# the footnote body for output after the section.
if (length $number and defined $footnotes[$number]) {
print STDERR "Got footnote $number here!\n" if $debug;
$section_text =~ s/$number\)/<sup>[$number]<\/sup>/;
$footer .= "<a name='FOOTNOTE.$number'>\n<pre><i><b>Footnote $number)</b> ", encode_entities $footnotes[$number], "</i></pre>\n</a>\n";
$paren = 0;
} else {
$number = "";
# Remove the listed front-matter sections and every key that starts with one
# of their names from %sections.
foreach my $section (qw/ABSTRACT. CONTENTS. FOREWORD. INTRO./) {
foreach my $paragraph (sort bysection keys %sections) {
if ($paragraph =~ m/^$section/) {
delete $sections{$paragraph};
delete $sections{$section};
# Parenthesised dotted numbers such as "(6.5.2)" become in-page links.
$section_text =~ s/\(([0-9.]+)\)/(<a href="#$1">$1<\/a>)/g;
$footer =~ s/\(([0-9.]+)\)/(<a href="#$1">$1<\/a>)/g;
print "<pre>", $section_text, "</pre>\n";
print "</a>\n";
if (length $footer) {
print $footer;
$footer = "";
foreach my $section (sort bysection keys %sections) {
print "\n</body>\n</html>\n";
# Print the HTML for one entry of %sections to STDOUT.
#
# Parameters:
#   $this_section - key into %sections (a key containing "p" is treated as
#                   a paragraph entry and gets no <h3> heading).
#
# Output, in order: a named anchor for the key, an <hr>/<h3> heading with
# the title (non-paragraph keys only), the entity-encoded text inside
# <pre>, and finally any footnotes collected while scanning the text.
#
# NOTE(review): this copy of the file shows no closing braces, so the block
# nesting described in the comments below is inferred -- confirm against
# the upstream source.
sub write_html_section {
my ($this_section) = @_;
my $footer = "";
my $paren = 0;
print STDERR "writing section [$this_section]\n" if $debug;
print "<a name='", encode_entities($this_section), "'></a>\n";
if (not $this_section =~ m/p/) {
print "<hr>\n<h3>", encode_entities($this_section), " [", encode_entities($sections{$this_section}{title}), "]</h3>\n";
my $section_text = $sections{$this_section}{text};
# NOTE(review): no loop is visible inside this sub at this point, so this
# `next` would act on a loop in the caller (Perl warns "Exiting subroutine
# via next"); `return` looks like the intent -- confirm.
next if not length $section_text;
$section_text = encode_entities $section_text;
# Scan the text line by line, then character by character, accumulating the
# digits that precede a ')' in $number.
while ($section_text =~ m/^(.*?)$/msg) {
my $line = $1;
print STDERR "paren reset, line [$line]\n" if $debug >= 8;
my $number = "";
while ($line =~ m/(.)/g) {
my $c = $1;
if ($c =~ m/[0-9]/) { $number .= $c; }
elsif ($c eq ' ') { $number = ""; }
elsif ($c eq '(') {
print STDERR "got $paren (\n" if $debug >= 8;
} elsif ($c eq ')') {
print STDERR "got $paren )\n" if $debug >= 8;
# NOTE(review): $paren is only ever assigned 0 in the visible lines, yet it
# is tested against -1 here; presumably increment/decrement statements are
# missing from this copy -- confirm.
if ($paren == -1) {
# "N)" where N indexes a known footnote: turn the marker into a superscript
# link and queue the footnote body for output after the section text.
if (length $number and defined $footnotes[$number]) {
print STDERR "Got footnote $number here!\n" if $debug;
$section_text =~ s/$number\)/<a href='#FOOTNOTE.$number'><sup>[$number]<\/sup><\/a>/;
$footer .= "<a name='FOOTNOTE.$number'>\n<pre><i><b>Footnote $number)</b> ".encode_entities($footnotes[$number])."</i></pre>\n</a>\n";
$paren = 0;
} else {
$number = "";
# Cross-link section numbers in both the body and the queued footnotes.
$section_text = linkify($section_text);
$footer = linkify($footer);
# The table of contents gets extra links for annexes, top-level numbered
# sections, the foreword and the introduction.
if ($this_section eq 'CONTENTS.') {
$section_text =~ s/Annex ([A-Z])/<a href='#$1.'>Annex $1<\/a>/mg;
$section_text =~ s/^(\d+\.)/<a href='#$1'>$1<\/a>/mg;
$section_text =~ s/^Foreword/<a href='#FOREWORD.'>Foreword<\/a>/mg;
$section_text =~ s/^Introduction/<a href='#INTRO.'>Introduction<\/a>/mg;
print "<pre>", $section_text, "</pre>\n";
if (length $footer) {
print $footer;
$footer = '';
# Sanity-check that the section number just parsed (@section_number) directly
# follows the previous one (@last_section_number), so that a heading skipped
# by a misparse is noticed.
#
# Takes no arguments; reads the two file-level arrays. Nothing is checked
# when there is no previous section or when its first component matches
# ABSTRACT, CONTENTS, FOREWORD or INTRO. All diagnostics go to STDERR and are
# not gated by $debug.
#
# Three cases are handled: the new number is deeper than the previous one,
# shallower than it, or at the same depth.
#
# NOTE(review): the only visible reaction to a failed check is the
# "difference too great" message, and no closing braces are visible in this
# copy, so the nesting is inferred -- confirm both against the upstream
# source.
sub validate_section_difference {
if (@last_section_number && $last_section_number[0] !~ /(?:ABSTRACT|CONTENTS|FOREWORD|INTRO)/) {
my $fail = 0;
my $skip = 0;
print STDERR "comparing last section ", join('.', @last_section_number), " vs ", join('.', @section_number), "\n";
# Case 1: the new number has more components. It may be only one level
# deeper, must share every component of the previous number, and its final
# component is expected to be 1.
if (@section_number > @last_section_number) {
if (@section_number - @last_section_number != 1) {
$fail = 1;
print STDERR "size difference too great\n";
unless ($fail) {
if ($section_number[0] =~ /^[A-Z]+$/) {
if ($last_section_number[0] =~ /^[A-Z]+$/) {
for (my $i = 0; $i < @last_section_number; $i++) {
if ($section_number[$i] ne $last_section_number[$i]) {
$fail = 1;
print STDERR "digits different\n";
} else {
print STDERR "disregarding section namespace change from number to alphabet\n";
$skip = 1;
} else {
for (my $i = 0; $i < @last_section_number; $i++) {
if ($section_number[$i] ne $last_section_number[$i]) {
$fail = 1;
print STDERR "digits different\n";
if (!$skip && ($fail || $section_number[$#section_number] != 1)) {
print STDERR "difference too great ", join('.', @last_section_number), " vs ", join('.', @section_number), "\n";
# Case 2: the new number has fewer components. Leading components may each
# advance by at most one (letters are compared by character code) and the
# final component is expected to be exactly one greater.
} elsif (@last_section_number > @section_number) {
if ($section_number[0] =~ /^[A-Z]+$/) {
if ($last_section_number[0] =~ /^[A-Z]+$/) {
if ($section_number[0] ne $last_section_number[0]) {
if (ord($section_number[0]) - ord($last_section_number[0]) != 1) {
$fail = 1;
print STDERR "letter difference too great\n";
} else {
$skip = 1;
print STDERR "letter difference good\n";
unless ($fail) {
for (my $i = 1; $i < @section_number - 1; $i++) {
if ($section_number[$i] != $last_section_number[$i]) {
if ($section_number[$i] - $last_section_number[$i] != 1) {
print STDERR "digit difference too great\n";
$fail = 1;
} else {
print STDERR "disregarding section namespace change from number to alphabet\n";
$skip = 1;
} else {
for (my $i = 0; $i < @section_number - 1; $i++) {
if ($section_number[$i] != $last_section_number[$i]) {
if ($section_number[$i] - $last_section_number[$i] != 1) {
print STDERR "digit difference too great\n";
$fail = 1;
if (!$skip && ($fail || $section_number[$#section_number] - $last_section_number[$#section_number] != 1)) {
print STDERR "difference too great ", join('.', @last_section_number), " vs ", join('.', @section_number), "\n";
# Case 3: same number of components. Both arrays are reversed, so the last
# element of each reversed array is the first component of the section
# number. Here a component fails only when it grows by more than one,
# whereas the other cases test for "!= 1".
} else {
my @rev_last = reverse @last_section_number;
my @rev_curr = reverse @section_number;
if ($rev_curr[$#rev_curr] =~ /^[A-Z]+$/) {
if ($rev_last[$#rev_last] =~ /^[A-Z]+$/) {
if ($rev_curr[$#rev_curr] ne $rev_last[$#rev_last]) {
if (ord($rev_curr[$#rev_curr]) - ord($rev_last[$#rev_last]) != 1) {
$fail = 1;
print STDERR "letter difference too great\n";
for (my $i = 1; $i < @rev_curr; $i++) {
if ($rev_curr[$i] != $rev_last[$i]) {
if ($rev_curr[$i] - $rev_last[$i] > 1) {
$fail = 1;
} else {
print STDERR "disregarding section namespace change from number to alphabet\n";
$skip = 1;
} else {
for (my $i = 0; $i < @rev_curr; $i++) {
if ($rev_curr[$i] != $rev_last[$i]) {
if ($rev_curr[$i] - $rev_last[$i] > 1) {
$fail = 1;
if (!$skip && $fail) {
print STDERR "difference too great ", join('.', @last_section_number), " vs ", join('.', @section_number), "\n";
@ -1,554 +0,0 @@
#!/usr/bin/env perl
# SPDX-FileCopyrightText: 2021 Pragmatic Software <pragma78@gmail.com>
# SPDX-License-Identifier: MIT
# ugly and hacked together
# n1256: pdftotext -layout -y 75 -H 650 -W 1000 n1256.pdf n1256.in
# n1570: pdftotext -layout -y 80 -H 650 -W 1000 n1570.pdf n1570.in
# n3047: pdftotext -layout -y 75 -H 700 -W 1000 n3047.pdf n3047.in
use warnings;
use strict;
use HTML::Entities;
use Data::Dumper;
# Verbosity of the STDERR trace; the "if $debug" / "if $debug >= N" guards
# throughout the file compare against this value.
my $debug = 100;

binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# All command-line arguments, joined with single spaces, form the input path.
my $input = "@ARGV";

if (not length $input) {
    print STDERR "Usage: $0 <input .txt file>\n";
    exit 1;
}

# Slurp the input as UTF-8 through a lexical handle, so no package-global
# bareword filehandle is left behind.
open my $input_fh, '<:encoding(UTF-8)', $input or die "Could not open $input: $!";
my @contents = <$input_fh>;
close $input_fh;

# The whole document as a single string, with carriage returns removed.
my $text = join '', @contents;
$text =~ s/\r//g;

# File-level parser state, read and written by the subs in this file.
my $section_title;           # most recently captured section title
my $this_section = '';       # section key currently being processed
my %sections;                # section key => { title => ..., text => ... }
my @last_section_number;     # components of the previous section key
my @section_number;          # components of the current section key
my $last_section = '';       # previous section key
my @footnotes;               # footnote number => footnote text
my $footnote = 0;            # footnote number currently being processed
my $last_footnote = 0;       # previous footnote number, for sequence checks
# Parse the file-level $text into %sections and @footnotes.
#
# Takes no arguments and works entirely on file-level variables:
#   - each heading match sets $this_section / @section_number (and the
#     corresponding "last" variables);
#   - $sections{KEY}{title} receives the section title;
#   - a section without numbered paragraphs gets $sections{KEY}{text};
#   - otherwise each paragraph N is stored as $sections{"KEYpN"}{text};
#   - footnotes found inside paragraphs are stored in $footnotes[N] and
#     removed from the paragraph text.
#
# Dies with "Paragraph diff invalid" or "Footnote diff invalid" when
# paragraph or footnote numbers do not increase by exactly one.
#
# NOTE(review): no closing braces (and possibly other short statements) are
# visible in this copy, so the nesting described below is inferred --
# confirm against the upstream source.
sub gen_data {
# One iteration per heading: an identifier of digits/uppercase letters, a
# dot, then optional dotted digits, at most five whitespace characters into
# a line and optionally preceded by a form feed.
while ($text =~ m/^\f?\s{0,5}([0-9A-Z]+\.[0-9\.]*)/msg) {
$last_section = $this_section;
$this_section = $1;
@last_section_number = @section_number;
@section_number = split /\./, $this_section;
print STDERR "----------------------------------\n" if $debug;
print STDERR "Processing section [$this_section]\n" if $debug;
my $section_text;
# Second /g match on the same $text: lazily capture everything up to the
# next line that looks like a heading.
if ($text =~ m/(.*?)^(?=\f?\s{0,4}[0-9A-Z]+\.)/msg) {
$section_text = $1;
} else {
print STDERR "No section text, end of file marker found.\n";
# The first line of the captured text becomes the title when it is
# non-empty; otherwise $section_title keeps its previous value.
if ($section_text =~ m/(.*?)$/msg) {
if (length $1) {
$section_title = $1;
$section_title =~ s/^\s+//;
$section_title =~ s/\s+$//;
print STDERR "+++ set new section title: [$section_title]\n" if $debug;
} else {
print STDERR "--- no length for section title\n" if $debug;
} else {
print STDERR "--- no new section title\n" if $debug;
$sections{$this_section}{title} = $section_title;
# presumably drops the title line, since the /g match above left the match
# position just after it -- TODO confirm
($section_text) = $section_text =~ m/\s*(.*)/msg;
print STDERR "+++ $this_section [$section_title]\n" if $debug >= 2;
print STDERR "+++ section text: [$section_text]\n" if $debug >= 2;
# No line starting with "<digits><whitespace>": store the text as a whole
# after removing the "~~" and "ZZZ" marker strings.
if (not $section_text =~ m/^(?=\d+\s)/msg) {
print STDERR "??? no paragraphs in section\n" if $debug;
$section_text =~ s/~~//msg;
$section_text =~ s/ZZZ//msg;
$sections{$this_section}{text} = $section_text;
} else {
my $last_p = 0;
my $p = 0;
print STDERR "+++ getting paragraphs for $this_section\n" if $debug;
# Text that precedes the first numbered paragraph is kept aside and
# prepended to the first paragraph below.
my $pretext;
if ($section_text =~ m/^(?!\f?\d+\s)/) {
($pretext) = $section_text =~ m/^(.*?)^(?=\f?\d+\s)/ms;
print STDERR "pretext captured: [$pretext]\n";
# Each paragraph starts with its number at the beginning of a line; the
# second alternative picks up the final paragraph of the section.
while ($section_text =~ m/^\f?(\d+)\s(.*?)^(?=\f?\d)/msgc or $section_text =~ m/^\f?(\d+)\s(.*)/msg) {
$last_p = $p;
$p = $1;
my $t = $2;
if (length $pretext) {
$t = "$pretext $t";
$pretext = '';
print STDERR "paragraph $p: [$t]\n" if $debug >= 3;
# Paragraph numbers must be consecutive, except for input 'n1570.in' in the
# sections matched below.
# NOTE(review): the dots in that pattern are unescaped, so each matches any
# character -- confirm this is acceptable.
if ($p - $last_p != 1) {
die "Paragraph diff invalid" unless ($input eq 'n1570.in' && $this_section =~ /^(?:K.3.9.(?:2|3))/);
# check for footnotes
my @new_footnotes;
# A footnote starts on a line of the form "N) text"; its number must be
# exactly one greater than the last footnote seen.
while ($t =~ m/^\s*(\d+)\)\s*(.*?)$/mgc) {
$footnote = $1;
my $footnote_text = "$2\n";
print STDERR "processing 1st footnote $footnote [last: $last_footnote]\n" if $debug;
print STDERR "footnote text [$footnote_text]\n" if $debug;
if ($last_footnote - $footnote != -1) {
die "Footnote diff invalid";
$last_footnote = $footnote;
push @new_footnotes, $footnote;
print STDERR "footnote $footnote text: [$footnote_text]\n" if $debug >= 4;
# Walk the following lines: a form feed ends the footnote, another "N)"
# line saves the current footnote and starts the next, and any other
# non-blank line is appended to the current footnote text.
while ($t =~ m/^(.*?)$/mgc) {
my $line = $1;
print STDERR "processing [$line]\n" if $debug;
if ($line =~ m/^\f/mg) {
print STDERR "end of footnote $footnote\n";
if (not length $line or $line =~ m/^\s+$/) {
print STDERR "skipping empty line\n";
if ($line =~ m/^\s*(\d+)\)\s*(.*?)$/mg) {
print STDERR "----------------\n" if $debug >= 1;
print STDERR "+++ added footnote $footnote: [$footnote_text]\n" if $debug >= 1;
$footnotes[$footnote] = $footnote_text;
print STDERR "----------------\n" if $debug >= 1;
$footnote = $1;
$footnote_text = "$2\n";
print STDERR "processing 2nd footnote $footnote [last: $last_footnote]\n" if $debug;
if ($last_footnote - $footnote != -1) {
die "Footnote diff invalid";
$last_footnote = $footnote;
push @new_footnotes, $footnote;
print STDERR "footnote $footnote text: [$footnote_text]\n" if $debug >= 4;
if (not length $line or $line =~ m/^\s+$/) {
print STDERR "footnote $footnote: skipping empty line\n";
} else {
$footnote_text .= "$line\n";
print STDERR "footnote $footnote text: appending [$line]\n" if $debug >= 3;
print STDERR "----------------\n" if $debug >= 1;
print STDERR "+++ added footnote $footnote: [$footnote_text]\n" if $debug >= 1;
$footnotes[$footnote] = $footnote_text;
print STDERR "----------------\n" if $debug >= 1;
# strip footnotes from section text
# The footnote body is quoted literally, with runs of spaces relaxed to
# \s* so that differences in spacing do not prevent the removal.
foreach my $fn (@new_footnotes) {
my $sub = quotemeta $footnotes[$fn];
$sub =~ s/(\\ )+/\\s*/g;
#print STDERR "subbing out [$footnote) $sub]\n";
$t =~ s/^\s*$fn\)\s*$sub//ms;
# Remove form feeds and the "~~" / "ZZZ" marker strings before storing.
$t =~ s/\f//g;
$t =~ s/~~//msg;
$t =~ s/ZZZ//msg;
$sections{$this_section . "p$p"}{text} = "$p $t";
print STDERR "+++ added ${this_section}p$p:\n$p $t\n" if $debug;
print STDERR "+++ paragraphs done\n" if $debug;
# Sort comparator for section keys, for use as `sort bysection LIST`.
#
# Keys look like "6.", "6.2.1", "K.3.9.2" or "6.2.1p4", where a trailing
# "pN" selects paragraph N of the section. The keys are read from the
# sort variables $a and $b.
#
# Ordering rules:
#   - dot-separated components are compared left to right and the first
#     difference decides;
#   - the first component is compared numerically when both sides are all
#     digits (so "9." sorts before "10."), otherwise as a string, which
#     places numbered sections before lettered ones;
#   - all later components are compared numerically;
#   - when one key is a prefix of the other, the shorter key (the parent
#     section) sorts first, regardless of paragraph numbers;
#   - otherwise the paragraph number decides, a key without "pN" counting
#     as paragraph 0.
#
# Returns -1, 0 or 1.
sub bysection {
    my ($left_section,  $left_par)  = split /p/, $a;
    my ($right_section, $right_par) = split /p/, $b;
    $left_par  //= 0;
    $right_par //= 0;

    my @left  = split /\./, $left_section;
    my @right = split /\./, $right_section;

    my $shared = @left < @right ? scalar @left : scalar @right;

    for my $i (0 .. $shared - 1) {
        my $order;
        if ($i == 0 and ($left[0] !~ /^[0-9]+$/ or $right[0] !~ /^[0-9]+$/)) {
            # At least one side is lettered: fall back to string order.
            $order = $left[0] cmp $right[0];
        }
        else {
            $order = $left[$i] <=> $right[$i];
        }

        # The first differing component wins.
        return $order if $order != 0;
    }

    # Common prefix is identical: the key with fewer components goes first.
    return scalar(@left) <=> scalar(@right) if @left != @right;

    return $left_par <=> $right_par;
}
# Print the plain-text rendering of every entry in %sections to STDOUT.
#
# Takes no arguments. Keys are visited in bysection order. A key without
# "p" gets a heading line made of the key and its title. The text is then
# scanned character by character for footnote markers of the form "N)";
# matching footnotes from @footnotes are queued in $footer and printed
# after the section text.
#
# NOTE(review): no closing braces are visible in this copy, so the nesting
# described here is inferred -- confirm against the upstream source.
sub gen_txt {
my $footer = "";
my $paren = 0;
# NOTE(review): $section_head and the local $section_title are assigned
# below but never read in the visible lines.
my $section_head;
my $section_title;
foreach my $this_section (sort bysection keys %sections) {
print STDERR "writing section $this_section\n" if $debug;
if (not $this_section =~ m/p/) {
print " $this_section $sections{$this_section}{title}\n";
$section_head = $this_section;
$section_title = $sections{$this_section}{title};
my $section_text = $sections{$this_section}{text};
# Accumulate the digits that immediately precede a ')' in $number; a space
# or any other character resets it.
while ($section_text =~ m/^(.*?)$/msg) {
my $line = $1;
print STDERR "paren reset, line [$line]\n" if $debug >= 8;
my $number = "";
while ($line =~ m/(.)/g) {
my $c = $1;
if ($c =~ m/[0-9]/) { $number .= $c; }
elsif ($c eq ' ') { $number = ""; }
elsif ($c eq '(') {
print STDERR "got $paren (\n" if $debug >= 8;
} elsif ($c eq ')') {
print STDERR "got $paren )\n" if $debug >= 8;
# NOTE(review): $paren is only ever assigned 0 in the visible lines, yet it
# is tested against -1 here; presumably increment/decrement statements are
# missing from this copy -- confirm.
if ($paren == -1) {
if (length $number and defined $footnotes[$number]) {
print STDERR "Got footnote $number here!\n" if $debug;
$footer .= "\nFOOTNOTE.$number) $footnotes[$number]\n";
$paren = 0;
} else {
$number = "";
print "$section_text\n";
if (length $footer) {
print $footer;
$footer = "";
# Wrap a section identifier in an intra-document hyperlink.
#
# Parameters:
#   $text - candidate section key (for example "6.2.1").
#
# Returns "<a href='#KEY'>KEY</a>" when KEY is a key of %sections;
# otherwise returns $text unchanged, so text that merely looks like a
# section number is never turned into a dangling link.
sub make_link {
    my ($text) = @_;

    return $text if not exists $sections{$text};
    return "<a href='#$text'>$text</a>";
}
# Replace every token in $text that looks like a section number with the
# result of make_link() for that token.
#
# A token is one uppercase letter or one digit 1-9, followed by a dot and
# then any number of (optionally dot-separated) digit groups, delimited by
# word boundaries.
#
# Parameters:
#   $text - text to scan.
#
# Returns the text with the substitutions applied; the caller's string is
# not modified because the work is done on a copy.
sub linkify {
    my ($text) = @_;

    $text =~ s/\b((?:[A-Z]|[1-9])\.(?:\.?[0-9]+)*)\b/make_link($1)/ge;

    return $text;
}
# Print the HTML document to STDOUT: the opening <html>/<body> tags, the
# sections, and the closing tags.
#
# Takes no arguments. The front-matter names ABSTRACT., CONTENTS.,
# FOREWORD. and INTRO. are handled first, in that fixed order: every key of
# %sections that matches the name at its start is deleted, then the name
# itself is deleted. The remaining keys are then visited in bysection order.
#
# NOTE(review): the loop bodies visible in this copy contain no statement
# that writes a section, and no closing braces are visible; presumably
# short lines are missing -- confirm against the upstream source.
sub gen_html {
print "<html>\n<body>\n";
foreach my $section (qw/ABSTRACT. CONTENTS. FOREWORD. INTRO./) {
foreach my $paragraph (sort bysection keys %sections) {
# NOTE(review): $section is interpolated unescaped, so its trailing "."
# matches any character.
if ($paragraph =~ m/^$section/) {
delete $sections{$paragraph};
delete $sections{$section};
foreach my $section (sort bysection keys %sections) {
print "\n</body>\n</html>\n";
# Print the HTML for one entry of %sections to STDOUT.
#
# Parameters:
#   $this_section - key into %sections (a key containing "p" is treated as
#                   a paragraph entry and gets no <h3> heading).
#
# Output, in order: a named anchor for the key, an <hr>/<h3> heading with
# the title (non-paragraph keys only), the entity-encoded text inside
# <pre>, and finally any footnotes collected while scanning the text.
#
# NOTE(review): this copy of the file shows no closing braces, so the block
# nesting described in the comments below is inferred -- confirm against
# the upstream source.
sub write_html_section {
my ($this_section) = @_;
my $footer = "";
my $paren = 0;
print STDERR "writing section [$this_section]\n" if $debug;
print "<a name='", encode_entities($this_section), "'></a>\n";
if (not $this_section =~ m/p/) {
print "<hr>\n<h3>", encode_entities($this_section), " [", encode_entities($sections{$this_section}{title}), "]</h3>\n";
my $section_text = $sections{$this_section}{text};
# NOTE(review): no loop is visible inside this sub at this point, so this
# `next` would act on a loop in the caller (Perl warns "Exiting subroutine
# via next"); `return` looks like the intent -- confirm.
next if not length $section_text;
$section_text = encode_entities $section_text;
# Scan the text line by line, then character by character, accumulating the
# digits that precede a ')' in $number.
while ($section_text =~ m/^(.*?)$/msg) {
my $line = $1;
print STDERR "paren reset, line [$line]\n" if $debug >= 8;
my $number = "";
while ($line =~ m/(.)/g) {
my $c = $1;
if ($c =~ m/[0-9]/) { $number .= $c; }
elsif ($c eq ' ') { $number = ""; }
elsif ($c eq '(') {
print STDERR "got $paren (\n" if $debug >= 8;
} elsif ($c eq ')') {
print STDERR "got $paren )\n" if $debug >= 8;
# NOTE(review): $paren is only ever assigned 0 in the visible lines, yet it
# is tested against -1 here; presumably increment/decrement statements are
# missing from this copy -- confirm.
if ($paren == -1) {
# "N)" where N indexes a known footnote: turn the marker into a superscript
# link and queue the footnote body for output after the section text.
if (length $number and defined $footnotes[$number]) {
print STDERR "Got footnote $number here!\n" if $debug;
$section_text =~ s/$number\)/<a href='#FOOTNOTE.$number'><sup>[$number]<\/sup><\/a>/;
$footer .= "<a name='FOOTNOTE.$number'>\n<pre><i><b>Footnote $number)</b> ".encode_entities($footnotes[$number])."</i></pre>\n</a>\n";
$paren = 0;
} else {
$number = "";
# Cross-link section numbers in both the body and the queued footnotes.
$section_text = linkify($section_text);
$footer = linkify($footer);
# The table of contents gets extra links for annexes, top-level numbered
# sections, the foreword and the introduction.
if ($this_section eq 'CONTENTS.') {
$section_text =~ s/Annex ([A-Z])/<a href='#$1.'>Annex $1<\/a>/mg;
$section_text =~ s/^(\d+\.)/<a href='#$1'>$1<\/a>/mg;
$section_text =~ s/^Foreword/<a href='#FOREWORD.'>Foreword<\/a>/mg;
$section_text =~ s/^Introduction/<a href='#INTRO.'>Introduction<\/a>/mg;
print "<pre>", $section_text, "</pre>\n";
if (length $footer) {
print $footer;
$footer = '';
# Sanity-check that the section number just parsed (@section_number) directly
# follows the previous one (@last_section_number), so that a heading skipped
# by a misparse is noticed.
#
# Takes no arguments; reads the two file-level arrays. Nothing is checked
# when there is no previous section or when its first component matches
# ABSTRACT, CONTENTS, FOREWORD or INTRO. All diagnostics go to STDERR and are
# not gated by $debug.
#
# Three cases are handled: the new number is deeper than the previous one,
# shallower than it, or at the same depth.
#
# NOTE(review): the only visible reaction to a failed check is the
# "difference too great" message, and no closing braces are visible in this
# copy, so the nesting is inferred -- confirm both against the upstream
# source.
sub validate_section_difference {
if (@last_section_number && $last_section_number[0] !~ /(?:ABSTRACT|CONTENTS|FOREWORD|INTRO)/) {
my $fail = 0;
my $skip = 0;
print STDERR "comparing last section ", join('.', @last_section_number), " vs ", join('.', @section_number), "\n";
# Case 1: the new number has more components. It may be only one level
# deeper, must share every component of the previous number, and its final
# component is expected to be 1.
if (@section_number > @last_section_number) {
if (@section_number - @last_section_number != 1) {
$fail = 1;
print STDERR "size difference too great\n";
unless ($fail) {
if ($section_number[0] =~ /^[A-Z]+$/) {
if ($last_section_number[0] =~ /^[A-Z]+$/) {
for (my $i = 0; $i < @last_section_number; $i++) {
if ($section_number[$i] ne $last_section_number[$i]) {
$fail = 1;
print STDERR "digits different\n";
} else {
print STDERR "disregarding section namespace change from number to alphabet\n";
$skip = 1;
} else {
for (my $i = 0; $i < @last_section_number; $i++) {
if ($section_number[$i] ne $last_section_number[$i]) {
$fail = 1;
print STDERR "digits different\n";
if (!$skip && ($fail || $section_number[$#section_number] != 1)) {
print STDERR "difference too great ", join('.', @last_section_number), " vs ", join('.', @section_number), "\n";
# Case 2: the new number has fewer components. Leading components may each
# advance by at most one (letters are compared by character code) and the
# final component is expected to be exactly one greater.
} elsif (@last_section_number > @section_number) {
if ($section_number[0] =~ /^[A-Z]+$/) {
if ($last_section_number[0] =~ /^[A-Z]+$/) {
if ($section_number[0] ne $last_section_number[0]) {
if (ord($section_number[0]) - ord($last_section_number[0]) != 1) {
$fail = 1;
print STDERR "letter difference too great\n";
} else {
$skip = 1;
print STDERR "letter difference good\n";
unless ($fail) {
for (my $i = 1; $i < @section_number - 1; $i++) {
if ($section_number[$i] != $last_section_number[$i]) {
if ($section_number[$i] - $last_section_number[$i] != 1) {
print STDERR "digit difference too great\n";
$fail = 1;
} else {
print STDERR "disregarding section namespace change from number to alphabet\n";
$skip = 1;
} else {
for (my $i = 0; $i < @section_number - 1; $i++) {
if ($section_number[$i] != $last_section_number[$i]) {
if ($section_number[$i] - $last_section_number[$i] != 1) {
print STDERR "digit difference too great\n";
$fail = 1;
if (!$skip && ($fail || $section_number[$#section_number] - $last_section_number[$#section_number] != 1)) {
print STDERR "difference too great ", join('.', @last_section_number), " vs ", join('.', @section_number), "\n";
# Case 3: same number of components. Both arrays are reversed, so the last
# element of each reversed array is the first component of the section
# number. Here a component fails only when it grows by more than one,
# whereas the other cases test for "!= 1".
} else {
my @rev_last = reverse @last_section_number;
my @rev_curr = reverse @section_number;
if ($rev_curr[$#rev_curr] =~ /^[A-Z]+$/) {
if ($rev_last[$#rev_last] =~ /^[A-Z]+$/) {
if ($rev_curr[$#rev_curr] ne $rev_last[$#rev_last]) {
if (ord($rev_curr[$#rev_curr]) - ord($rev_last[$#rev_last]) != 1) {
$fail = 1;
print STDERR "letter difference too great\n";
for (my $i = 1; $i < @rev_curr; $i++) {
if ($rev_curr[$i] != $rev_last[$i]) {
if ($rev_curr[$i] - $rev_last[$i] > 1) {
$fail = 1;
} else {
print STDERR "disregarding section namespace change from number to alphabet\n";
$skip = 1;
} else {
for (my $i = 0; $i < @rev_curr; $i++) {
if ($rev_curr[$i] != $rev_last[$i]) {
if ($rev_curr[$i] - $rev_last[$i] > 1) {
$fail = 1;
if (!$skip && $fail) {
print STDERR "difference too great ", join('.', @last_section_number), " vs ", join('.', @section_number), "\n";
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -25,8 +25,8 @@ use PBot::Imports;
# These are set by the /misc/update_version script
use constant {
BUILD_DATE => "2022-08-08",
BUILD_DATE => "2022-08-12",
# No-op: the body is empty, so calling this does nothing.
sub initialize {}
Reference in New Issue
Block a user