From 83d0d862f53f8dff9460aa1630daca7d158ebd3c Mon Sep 17 00:00:00 2001 From: Pragmatic Software Date: Tue, 10 Jun 2014 14:29:16 +0000 Subject: [PATCH] c2english.pl: Improve code parsing/function extraction --- PBot/VERSION.pm | 2 +- modules/c2english.pl | 317 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 274 insertions(+), 45 deletions(-) diff --git a/PBot/VERSION.pm b/PBot/VERSION.pm index 2b580a8a..b3e68cd1 100644 --- a/PBot/VERSION.pm +++ b/PBot/VERSION.pm @@ -13,7 +13,7 @@ use warnings; # These are set automatically by the build/commit script use constant { BUILD_NAME => "PBot", - BUILD_REVISION => 625, + BUILD_REVISION => 626, BUILD_DATE => "2014-06-10", }; diff --git a/modules/c2english.pl b/modules/c2english.pl index 078350cd..82161277 100755 --- a/modules/c2english.pl +++ b/modules/c2english.pl @@ -3,7 +3,12 @@ use warnings; use strict; -use Text::Balanced qw(extract_codeblock extract_delimited); +use Text::Balanced qw(extract_codeblock extract_delimited extract_bracketed); + +use feature 'switch'; +no if $] >= 5.018, warnings => 'experimental::smartmatch'; + +my $debug = 0; my $code = join ' ', @ARGV; my $output; @@ -13,59 +18,275 @@ if($code =~ s/^-f\s+//) { $force = 1; } -$code =~ s/#include <([^>]+)>/\n#include <$1>\n/g; -$code =~ s/#([^ ]+) (.*?)\\n/\n#$1 $2\n/g; -$code =~ s/#([\w\d_]+)\\n/\n#$1\n/g; - -my $original_code = $code; - -my $precode = $code; -$code = ''; - -my ($has_function, $has_main); - +my ($has_function, $has_main, $got_nomain); my $prelude_base = "#define _XOPEN_SOURCE 9001\n#define __USE_XOPEN\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n#include \n\n"; my $prelude = $prelude_base; -$prelude .= "$1$2" if $precode =~ s/^\s*(#.*)(#.*?[>\n])//s; -my $preprecode = $precode; +print "code before: [$code]\n" if $debug; -while($preprecode =~ s/([ a-zA-Z0-9\_\*\[\]]+)\s+([a-zA-Z0-9_*]+)\s*\((.*?)\)\s*({.*)//) { - my ($ret, $ident, $params, $potential_body) = ($1, $2, $3, $4); +# replace \n outside of quotes with literal newline +my $new_code = ""; - $ret =~ s/^\s+//; - $ret =~ s/\s+$//; +use constant { + NORMAL => 0, + DOUBLE_QUOTED => 1, + SINGLE_QUOTED => 2, +}; - if($ret eq "else" or $ret eq "while") { - $precode .= "$ret $ident ($params) $potential_body"; - next; - } else { - $precode =~ s/([ a-zA-Z0-9\_\*\[\]]+)\s+([a-zA-Z0-9_*]+)\s*\((.*?)\)\s*({.*)//; +my $state = NORMAL; +my $escaped = 0; + +while($code =~ m/(.)/gs) { + my $ch = $1; + + given ($ch) { + when ('\\') { + if($escaped == 0) { + $escaped = 1; + next; + } + } + + if($state == NORMAL) { + when ($_ eq '"' and not $escaped) { + $state = DOUBLE_QUOTED; + } + + when ($_ eq "'" and not $escaped) { + $state = SINGLE_QUOTED; + } + + when ($_ eq 'n' and $escaped == 1) { + $ch = "\n"; + $escaped = 0; + } + } + + if($state == DOUBLE_QUOTED) { + when ($_ eq '"' and not $escaped) { + $state = NORMAL; + } + } + + if($state == SINGLE_QUOTED) { + when ($_ eq "'" and not $escaped) { + $state = NORMAL; + } + } } - my @extract = extract_codeblock($potential_body, '{}'); - my $body; - if(not defined $extract[0]) { - $output = "error: unmatched brackets for function '$ident';\n"; - $body = $extract[1]; - } else { - $body = $extract[0]; - $preprecode .= $extract[1]; - $precode .= $extract[1]; - } - $code .= "$ret $ident($params) $body\n\n"; - $has_function = 1; - $has_main = 1 if $ident eq 'main'; + $new_code .= '\\' and $escaped = 0 if $escaped; + $new_code .= $ch; } -$precode =~ s/^\s+//; -$precode =~ s/\s+$//; +$code = $new_code; -if(not $has_main) { - $code = "$prelude\n\n$code\n\nint main(void) {\n$precode\n;\nreturn 0;}\n"; +print "code after \\n replacement: [$code]\n" if $debug; + +my $single_quote = 0; +my $double_quote = 0; +my $parens = 0; +$escaped = 0; +my $cpp = 0; # preprocessor + +while($code =~ m/(.)/msg) { + my $ch = $1; + my $pos = pos $code; + + print "adding newlines, ch = [$ch], parens: $parens, cpp: $cpp, single: $single_quote, double: $double_quote, escaped: $escaped, pos: $pos\n" if $debug >= 10; + + if($ch eq '\\') { + $escaped = not $escaped; + } elsif($ch eq '#' and not $cpp and not $escaped and not $single_quote and not $double_quote) { + $cpp = 1; + + if($code =~ m/include\s*[<"]([^>"]*)[>"]/msg) { + my $match = $1; + $pos = pos $code; + substr ($code, $pos, 0) = "\n"; + pos $code = $pos; + $cpp = 0; + } else { + pos $code = $pos; + } + } elsif($ch eq '"') { + $double_quote = not $double_quote unless $escaped or $single_quote; + $escaped = 0; + } elsif($ch eq '(' and not $single_quote and not $double_quote) { + $parens++; + } elsif($ch eq ')' and not $single_quote and not $double_quote) { + $parens--; + $parens = 0 if $parens < 0; + } elsif($ch eq ';' and not $cpp and not $single_quote and not $double_quote and $parens == 0) { + if(not substr($code, $pos, 1) =~ m/[\n\r]/) { + substr ($code, $pos, 0) = "\n"; + pos $code = $pos + 1; + } + } elsif($ch eq "'") { + $single_quote = not $single_quote unless $escaped or $double_quote; + $escaped = 0; + } elsif($ch eq 'n' and $escaped) { + if(not $single_quote and not $double_quote) { + print "added newline\n" if $debug >= 10; + substr ($code, $pos - 2, 2) = "\n"; + pos $code = $pos; + $cpp = 0; + } + $escaped = 0; + } elsif($ch eq '{' and not $cpp and not $single_quote and not $double_quote) { + if(not substr($code, $pos, 1) =~ m/[\n\r]/) { + substr ($code, $pos, 0) = "\n"; + pos $code = $pos + 1; + } + } elsif($ch eq '}' and not $cpp and not $single_quote and not $double_quote) { + if(not substr($code, $pos, 1) =~ m/[\n\r;]/) { + substr ($code, $pos, 0) = "\n"; + pos $code = $pos + 1; + } + } elsif($ch eq "\n" and $cpp and not $single_quote and not $double_quote) { + $cpp = 0; + } else { + $escaped = 0; + } +} + +print "code after \\n additions: [$code]\n" if $debug; + +# white-out contents of quoted literals +my $white_code = $code; +$white_code =~ s/(?:\"((?:\\\"|(?!\").)*)\")/'"' . ('-' x length $1) . '"'/ge; +$white_code =~ s/(?:\'((?:\\\'|(?!\').)*)\')/"'" . ('-' x length $1) . "'"/ge; + +my $precode; +if($white_code =~ m/#include/) { + $precode = $code; } else { - $code = "$prelude\n\n$precode\n\n$code\n"; + $precode = $prelude . $code; } +$code = ''; +my $warn_unterminated_define = 0; + +print "--- precode: [$precode]\n" if $debug; + +my $lang = 'C89'; + +if($lang eq 'C89' or $lang eq 'C99' or $lang eq 'C11' or $lang eq 'C++') { + my $has_main = 0; + + my $prelude = ''; + while($precode =~ s/^\s*(#.*\n{1,2})//g) { + $prelude .= $1; + } + + if($precode =~ m/^\s*(#.*)/ms) { + my $line = $1; + + if($line !~ m/\n/) { + $warn_unterminated_define = 1; + } + } + + print "*** prelude: [$prelude]\n precode: [$precode]\n" if $debug; + + my $preprecode = $precode; + + # white-out contents of quoted literals + $preprecode =~ s/(?:\"((?:\\\"|(?!\").)*)\")/'"' . ('-' x length $1) . '"'/ge; + $preprecode =~ s/(?:\'((?:\\\'|(?!\').)*)\')/"'" . ('-' x length $1) . "'"/ge; + + # strip C and C++ style comments + if($lang eq 'C89') { + $preprecode =~ s#/\*[^*]*\*+([^/*][^*]*\*+)*/# #gs; + $preprecode =~ s#|//([^\\]|[^\n][\n]?)*?\n|("(\\.|[^"\\])*"|'(\\.|[^'\\])*'|.[^/"'\\]*)#defined $2 ? $2 : ""#gse; + } else { + $preprecode =~ s#|//([^\\]|[^\n][\n]?)*?\n|("(\\.|[^"\\])*"|'(\\.|[^'\\])*'|.[^/"'\\]*)#defined $2 ? $2 : ""#gse; + $preprecode =~ s#/\*[^*]*\*+([^/*][^*]*\*+)*/# #gs; + } + + print "preprecode: [$preprecode]\n" if $debug; + + print "looking for functions, has main: $has_main\n" if $debug >= 2; + + my $func_regex = qr/^([ *\w]+)\s+([ ()*\w]+)\s*\(([^;{]*)\s*\)\s*({.*|<%.*|\?\?<.*)/ims; + + # look for potential functions to extract + while($preprecode =~ /$func_regex/ms) { + my ($pre_ret, $pre_ident, $pre_params, $pre_potential_body) = ($1, $2, $3, $4); + + print "looking for functions, found [$pre_ret][$pre_ident][$pre_params][$pre_potential_body], has main: $has_main\n" if $debug >= 1; + + # find the pos at which this function lives, for extracting from precode + $preprecode =~ m/(\Q$pre_ret\E\s+\Q$pre_ident\E\s*\(\s*\Q$pre_params\E\s*\)\s*\Q$pre_potential_body\E)/g; + my $extract_pos = (pos $preprecode) - (length $1); + + # now that we have the pos, substitute out the extracted potential function from preprecode + $preprecode =~ s/$func_regex//ms; + + # create tmpcode object that starts from extract pos, to skip any quoted code + my $tmpcode = substr($precode, $extract_pos); + print "tmpcode: [$tmpcode]\n" if $debug; + + $precode = substr($precode, 0, $extract_pos); + print "precode: [$precode]\n" if $debug; + + $tmpcode =~ m/$func_regex/ms; + my ($ret, $ident, $params, $potential_body) = ($1, $2, $3, $4); + + print "1st extract: [$ret][$ident][$params][$potential_body]\n" if $debug; + + $ret =~ s/^\s+//; + $ret =~ s/\s+$//; + + if(not length $ret or $ret eq "else" or $ret eq "while" or $ret eq "if" or $ret eq "for" or $ident eq "for" or $ident eq "while" or $ident eq "if") { + $precode .= "$ret $ident ($params) $potential_body"; + next; + } else { + $tmpcode =~ s/$func_regex//ms; + } + + $potential_body =~ s/^\s*<%/{/ms; + $potential_body =~ s/%>\s*$/}/ms; + $potential_body =~ s/^\s*\?\?$/}/ms; + + my @extract = extract_bracketed($potential_body, '{}'); + my $body; + if(not defined $extract[0]) { + if($debug == 0) { + print "error: unmatched brackets\n"; + } else { + print "error: unmatched brackets for function '$ident';\n"; + print "body: [$potential_body]\n"; + } + exit; + } else { + $body = $extract[0]; + $preprecode .= $extract[1]; + $precode .= $extract[1]; + } + + print "final extract: [$ret][$ident][$params][$body]\n" if $debug; + $code .= "$ret $ident($params) $body\n\n"; + $has_main = 1 if $ident =~ m/^\s*\(?\s*main\s*\)?\s*$/; + $has_function = 1; + } + + $precode =~ s/^\s+//; + $precode =~ s/\s+$//; + + $precode =~ s/^{(.*)}$/$1/s; + + if(not $has_main and not $got_nomain) { + $code = "$prelude\n$code" . "int main(void) {\n$precode\n;\nreturn 0;\n}\n"; + } else { + print "code: [$code]; precode: [$precode]\n" if $debug; + $code = "$prelude\n$precode\n\n$code\n"; + } +} else { + $code = $precode; +} + +print "after func extract, code: [$code]\n" if $debug; $code =~ s/\|n/\n/g; $code =~ s/^\s+//; @@ -74,6 +295,9 @@ $code =~ s/;\s*;\n/;\n/gs; $code =~ s/;(\s*\/\*.*?\*\/\s*);\n/;$1/gs; $code =~ s/;(\s*\/\/.*?\s*);\n/;$1/gs; $code =~ s/({|})\n\s*;\n/$1\n/gs; +$code =~ s/(?:\n\n)+/\n\n/g; + +print "final code: [$code]\n" if $debug; chdir "c2english" or die "Could not chdir: $!"; @@ -142,6 +366,10 @@ if(not $force and $ret != 0) { $output =~ s/^\s+//; $output =~ s/\s+$//; $output =~ s/error: ISO C forbids nested functions\s+//g; + $output =~ s/\s*note: this is the location of the previous definition//g; + $output =~ s/\s*note: use option -std=c99 or -std=gnu99 to compile your code//g; + $output =~ s/\s*\(declared at .*?\)//g; + $output =~ s/, note: declared here//g; # don't error about undeclared objects $output =~ s/error: '[^']+' undeclared\s*//g; @@ -156,18 +384,19 @@ if(not $force and $ret != 0) { $code =~ s/^\Q$prelude_base\E\s*//; -open my $fh, '>', 'code2eng.c' or die "Could not write code: $!"; +open $fh, '>', 'code2eng.c' or die "Could not write code: $!"; print $fh $code; close $fh; $output = `./c2eng.pl code2eng.c` if not defined $output; if(not $has_function and not $has_main) { - $output =~ s/Let .main. be a function taking no parameters and returning int.\s*To perform the function:\s*//; + $output =~ s/Let .main. be a function taking no parameters and returning int.\s*To perform the function.\s*//; $output =~ s/\s*Return 0.\s*$//; $output =~ s/\s*Return 0.\s*End of function .main..\s*//; $output =~ s/\s*Do nothing.\s*$//; $output =~ s/^\s*(.)/\U$1/; + $output =~ s/\.\s+(\S)/. \U$1/g; } $output =~ s/\s+/ /; @@ -175,7 +404,7 @@ if(not $output) { $output = "Does not compute. I only know about C89 and valid code.\n"; } -print "[Note: Work-in-progress; may be issues!] $output\n"; +print "[Work-in-progress; there will be issues!] $output\n"; sub execute { my $timeout = shift @_;