#!/usr/bin/env perl

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

use warnings;
use strict;

use Text::Balanced qw( extract_codeblock extract_delimited extract_bracketed );
use feature 'switch';
no if $] >= 5.018, warnings => 'experimental::smartmatch';

# Debug verbosity: 0 is silent; higher values print each rewriting stage.
my $debug = 0;

# The C snippet to translate is the whole command line joined by spaces.
my $code = join ' ', @ARGV;

unless (length $code) {
    print "Usage: english <any C11 code>\n";
    exit;
}

# Holds either the filtered compiler diagnostics or the English translation.
my $output;

# A leading "-f" means: keep going even if the gcc syntax check fails.
my $force;
$force = 1 if $code =~ s/^-f\s+//;

# Flags filled in while functions are extracted from the snippet.
my ($has_function, $has_main, $got_nomain);

# Default headers and defines placed in front of snippets that have no
# #include of their own.
my $prelude_base = "#define _XOPEN_SOURCE 9001\n#define __USE_XOPEN\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <unistd.h>\n#include <math.h>\n#include <limits.h>\n#include <sys/types.h>\n#include <stdint.h>\n#include <errno.h>\n#include <ctype.h>\n#include <assert.h>\n#include <stdnoreturn.h>\n#include <stdbool.h>\n#include <stdalign.h>\n#include <time.h>\n#include <stddef.h>\n#include <uchar.h>\n#define _Atomic\n#define _Static_assert(a, b)\n\n";
my $prelude      = $prelude_base;

print "code before: [$code]\n" if $debug;

# replace \n outside of quotes with literal newline
#
# The snippet arrives on a single command line, so a two-character "\n"
# written outside of a string/character literal stands for a line break.
# The scanner below copies $code into $new_code one character at a time,
# tracking whether it is inside a "..." or '...' literal and whether the
# previous character was an unconsumed backslash.
#
# Written as a plain if/elsif chain so the scanner does not depend on the
# experimental given/when smartmatch feature.
my $new_code = "";

use constant {
    NORMAL        => 0,
    DOUBLE_QUOTED => 1,
    SINGLE_QUOTED => 2,
};

my $state   = NORMAL;
my $escaped = 0;

while ($code =~ m/(.)/gs) {
    my $ch = $1;

    if ($ch eq '\\') {
        if (not $escaped) {
            # First backslash of a pair: remember it, emit nothing yet.
            $escaped = 1;
            next;
        }
        # An escaped backslash changes no state; both backslashes are
        # emitted by the flush below.
    }
    elsif ($state == NORMAL) {
        if ($ch eq '"' and not $escaped) {
            $state = DOUBLE_QUOTED;
        }
        elsif ($ch eq "'" and not $escaped) {
            $state = SINGLE_QUOTED;
        }
        elsif ($ch eq 'n' and $escaped) {
            # "\n" outside of any literal becomes a real newline and the
            # pending backslash is dropped.
            $ch      = "\n";
            $escaped = 0;
        }
    }
    elsif ($state == DOUBLE_QUOTED) {
        $state = NORMAL if $ch eq '"' and not $escaped;
    }
    elsif ($state == SINGLE_QUOTED) {
        $state = NORMAL if $ch eq "'" and not $escaped;
    }

    # Flush a pending backslash that was not consumed above.
    if ($escaped) {
        $new_code .= '\\';
        $escaped = 0;
    }

    $new_code .= $ch;
}

$code = $new_code;
print "code after \\n replacement: [$code]\n" if $debug;
# Second pass over $code: insert real newlines after ';', '{' and '}' (when
# outside literals and not on a preprocessor line) and after #include
# directives.  pos($code) is adjusted by hand after every insertion because
# the string is edited while the /g match is still walking it.
my $single_quote = 0;
my $double_quote = 0;
my $parens       = 0;
$escaped = 0;
my $cpp = 0;    # preprocessor

# Insert "\n" at $offset unless the character already there matches $skip,
# then resume scanning just past the inserted newline.
my $break_line_at = sub {
    my ($offset, $skip) = @_;
    if (not substr($code, $offset, 1) =~ $skip) {
        substr($code, $offset, 0) = "\n";
        pos($code) = $offset + 1;
    }
};

while ($code =~ m/(.)/msg) {
    my $ch  = $1;
    my $pos = pos $code;

    print "adding newlines, ch = [$ch], parens: $parens, cpp: $cpp, single: $single_quote, double: $double_quote, escaped: $escaped, pos: $pos\n" if $debug >= 10;

    # True while the scanner is inside a string or character literal.
    my $quoted = ($single_quote || $double_quote);

    if ($ch eq '\\') {
        $escaped = not $escaped;
    }
    elsif ($ch eq '#' and not $cpp and not $escaped and not $quoted) {
        $cpp = 1;

        # The /g match continues from the current position; when it finds an
        # include directive a newline is placed right after it.
        if ($code =~ m/include\s*[<"]([^>"]*)[>"]/msg) {
            $pos = pos $code;
            substr($code, $pos, 0) = "\n";
            pos($code) = $pos;
            $cpp = 0;
        }
        else {
            # The failed match reset pos(); restore it.
            pos($code) = $pos;
        }
    }
    elsif ($ch eq '"') {
        $double_quote = not $double_quote unless $escaped or $single_quote;
        $escaped = 0;
    }
    elsif ($ch eq '(' and not $quoted) {
        $parens++;
    }
    elsif ($ch eq ')' and not $quoted) {
        $parens--;
        $parens = 0 if $parens < 0;
    }
    elsif ($ch eq ';' and not $cpp and not $quoted and $parens == 0) {
        $break_line_at->($pos, qr/[\n\r]/);
    }
    elsif ($ch eq "'") {
        $single_quote = not $single_quote unless $escaped or $double_quote;
        $escaped = 0;
    }
    elsif ($ch eq 'n' and $escaped) {
        if (not $quoted) {
            print "added newline\n" if $debug >= 10;
            substr($code, $pos - 2, 2) = "\n";
            pos($code) = $pos;
            $cpp = 0;
        }
        $escaped = 0;
    }
    elsif ($ch eq '{' and not $cpp and not $quoted) {
        $break_line_at->($pos, qr/[\n\r]/);
    }
    elsif ($ch eq '}' and not $cpp and not $quoted) {
        $break_line_at->($pos, qr/[\n\r;]/);
    }
    elsif ($ch eq "\n" and $cpp and not $quoted) {
        # End of the preprocessor line.
        $cpp = 0;
    }
    else {
        $escaped = 0;
    }
}

print "code after \\n additions: [$code]\n" if $debug;
# white-out contents of quoted literals
# (so that a "#include" appearing inside a string does not count below)
my $white_code = $code;
$white_code =~ s/(?:\"((?:\\\"|(?!\").)*)\")/'"' . ('-' x length $1) . '"'/ge;
$white_code =~ s/(?:\'((?:\\\'|(?!\').)*)\')/"'" . ('-' x length $1) . "'"/ge;

# Snippets with their own #include are used as they are; everything else
# gets the default prelude prepended.
my $precode = $white_code =~ m/#include/
    ? $code
    : $prelude . $code;

# $code is rebuilt from scratch by the function-extraction step.
$code = '';

my $warn_unterminated_define = 0;

print "--- precode: [$precode]\n" if $debug;
my $lang = 'C89';

if ($lang eq 'C89' or $lang eq 'C99' or $lang eq 'C11' or $lang eq 'C++') {
    # Local prelude: the preprocessor lines peeled off the front of $precode.
    my $prelude = '';

    while ($precode =~ s/^\s*(#.*\n{1,2})//g) {
        $prelude .= $1;
    }

    # A remaining leading directive with no newline after it is noted as an
    # unterminated define.
    if ($precode =~ m/^\s*(#.*)/ms) {
        my $line = $1;
        $warn_unterminated_define = 1 if $line !~ m/\n/;
    }

    print "*** prelude: [$prelude]\n precode: [$precode]\n" if $debug;

    # $preprecode is a scratch copy used only for locating functions.
    my $preprecode = $precode;

    # white-out contents of quoted literals
    $preprecode =~ s/(?:\"((?:\\\"|(?!\").)*)\")/'"' . ('-' x length $1) . '"'/ge;
    $preprecode =~ s/(?:\'((?:\\\'|(?!\').)*)\')/"'" . ('-' x length $1) . "'"/ge;

    # strip C and C++ style comments
    if ($lang eq 'C89') {
        $preprecode =~ s#/\*[^*]*\*+([^/*][^*]*\*+)*/# #gs;
        $preprecode =~ s#|//([^\\]|[^\n][\n]?)*?\n|("(\\.|[^"\\])*"|'(\\.|[^'\\])*'|.[^/"'\\]*)#defined $2 ? $2 : ""#gse;
    }
    else {
        $preprecode =~ s#|//([^\\]|[^\n][\n]?)*?\n|("(\\.|[^"\\])*"|'(\\.|[^'\\])*'|.[^/"'\\]*)#defined $2 ? $2 : ""#gse;
        $preprecode =~ s#/\*[^*]*\*+([^/*][^*]*\*+)*/# #gs;
    }

    print "preprecode: [$preprecode]\n" if $debug;
    print "looking for functions, has main: $has_main\n" if $debug >= 2;

    # return-type, identifier, parameter list, then a body opening with
    # "{" or its digraph/trigraph spellings.
    my $func_regex = qr/^([ *\w]+)\s+([ ()*\w]+)\s*\(([^;{]*)\s*\)\s*({.*|<%.*|\?\?<.*)/ims;

    # look for potential functions to extract
    while ($preprecode =~ /$func_regex/ms) {
        my ($pre_ret, $pre_ident, $pre_params, $pre_potential_body) = ($1, $2, $3, $4);
        print "looking for functions, found [$pre_ret][$pre_ident][$pre_params][$pre_potential_body], has main: $has_main\n" if $debug >= 1;

        # find the pos at which this function lives, for extracting from precode
        $preprecode =~ m/(\Q$pre_ret\E\s+\Q$pre_ident\E\s*\(\s*\Q$pre_params\E\s*\)\s*\Q$pre_potential_body\E)/g;
        my $extract_pos = (pos $preprecode) - (length $1);

        # now that we have the pos, substitute out the extracted potential function from preprecode
        $preprecode =~ s/$func_regex//ms;

        # create tmpcode object that starts from extract pos, to skip any quoted code
        my $tmpcode = substr($precode, $extract_pos);
        print "tmpcode: [$tmpcode]\n" if $debug;

        $precode = substr($precode, 0, $extract_pos);
        print "precode: [$precode]\n" if $debug;

        $tmpcode =~ m/$func_regex/ms;
        my ($ret, $ident, $params, $potential_body) = ($1, $2, $3, $4);
        print "1st extract: [$ret][$ident][$params][$potential_body]\n" if $debug;

        $ret =~ s/^\s+//;
        $ret =~ s/\s+$//;

        # Control-flow statements look like functions to $func_regex; put
        # them back into $precode untouched.
        if (not length $ret or $ret eq "else" or $ret eq "while" or $ret eq "if" or $ret eq "for" or $ident eq "for" or $ident eq "while" or $ident eq "if") {
            $precode .= "$ret $ident ($params) $potential_body";
            next;
        }

        $tmpcode =~ s/$func_regex//ms;

        # Normalise digraph and trigraph braces around the body.
        $potential_body =~ s/^\s*<%/{/ms;
        $potential_body =~ s/%>\s*$/}/ms;
        $potential_body =~ s/^\s*\?\?</{/ms;
        $potential_body =~ s/\?\?>$/}/ms;

        my @extract = extract_bracketed($potential_body, '{}');

        if (not defined $extract[0]) {
            if ($debug == 0) {
                print "error: unmatched brackets\n";
            }
            else {
                print "error: unmatched brackets for function '$ident';\n";
                print "body: [$potential_body]\n";
            }
            exit;
        }

        # $extract[0] is the balanced body; $extract[1] is whatever followed
        # it, which goes back into both working copies.
        my $body = $extract[0];
        $preprecode .= $extract[1];
        $precode    .= $extract[1];

        print "final extract: [$ret][$ident][$params][$body]\n" if $debug;
        $code .= "$ret $ident($params) $body\n\n";

        $has_main     = 1 if $ident =~ m/^\s*\(?\s*main\s*\)?\s*$/;
        $has_function = 1;
    }

    $precode =~ s/^\s+//;
    $precode =~ s/\s+$//;
    $precode =~ s/^{(.*)}$/$1/s;

    if (not $has_main and not $got_nomain) {
        # No main() in the snippet: wrap the loose statements in one.
        $code = "$prelude\n$code" . "int main(void) {\n$precode\n;\n}\n";
    }
    else {
        print "code: [$code]; precode: [$precode]\n" if $debug;
        $code = "$prelude\n$precode\n\n$code\n";
    }
}
else {
    $code = $precode;
}

print "after func extract, code: [$code]\n" if $debug;

# Final clean-up of the assembled C source.  $code is aliased to $_ so each
# substitution below edits it in place, in this order.
for ($code) {
    s/\|n/\n/g;                                   # "|n" is a newline marker
    s/^\s+//;
    s/\s+$//;
    s/;\s*;\n/;\n/gs;                             # collapse doubled semicolons
    s/(;)?(\s*\/\*.*?\*\/\s*);\n/$1$2/gs;         # drop ';' that trails a /* */ comment
    s/(;)?(\s*\/\/.*?\s*);\n/$1$2/gs;             # drop ';' that trails a // comment
    s/({|})\n\s*;\n/$1\n/gs;                      # drop lone ';' line after a brace
    s/(?:\n\n)+/\n\n/g;                           # squeeze blank lines
}

print "final code: [$code]\n" if $debug;

# All generated files live in the c2english directory.
chdir "c2english" or die "Could not chdir: $!";

# Write the assembled program to code.c.  print and close are checked as
# well as open: buffered write errors only surface there, and an unnoticed
# failure would make gcc diagnose a truncated file.
open my $fh, '>', 'code.c' or die "Could not write code: $!";
print {$fh} $code or die "Could not write code: $!";
close $fh or die "Could not write code: $!";

#my ($ret, $result) = execute(10, "gcc -std=c89 -pedantic -Werror -Wno-unused -fsyntax-only -fno-diagnostics-show-option -fno-diagnostics-show-caret code.c");

# Syntax-check only, with a 10 second limit; $ret is the command's exit
# status and $result its combined stdout/stderr.
my ($ret, $result) = execute(10, "gcc -std=c11 -pedantic -Werror -Wno-implicit -Wno-unused -fsyntax-only -fno-diagnostics-show-option -fno-diagnostics-show-caret code.c");

# When the syntax check failed (and -f was not given), reduce the gcc
# output to the bare diagnostics, print them and stop.  If nothing is left
# after filtering, $output is reset so the translation still runs.
if (not $force and $ret != 0) {
    $output = $result;

    #print STDERR "output: [$output]\n";

    # $output is aliased to $_; the substitutions run in this exact order.
    for ($output) {
        s/\s*In file included from\s+.*?:\d+:\d+:\s*//g;
        s/code\.c:\d+:\d+://g;
        s/code\.c://g;
        s/error=edantic/error=pedantic/g;
        s/(\d+:\d+:\s*)*cc1: all warnings being treated as errors//;
        s/(\d+:\d+:\s*)* \(first use in this function\)//g;
        s/(\d+:\d+:\s*)*error: \(Each undeclared identifier is reported only once.*?\)//msg;
        s/(\d+:\d+:\s*)*ld: warning: cannot find entry symbol _start; defaulting to [^ ]+//;
        #$output =~ s/(\d+:\d+:\s*)*error: (.*?) error/error: $1; error/msg;
        s/(\d+:\d+:\s*)*\/tmp\/.*\.o://g;
        s/(\d+:\d+:\s*)*collect2: ld returned \d+ exit status//g;
        s/\(\.text\+[^)]+\)://g;
        s/\[ In/[In/;
        s/(\d+:\d+:\s*)*warning: Can't read pathname for load map: Input.output error.//g;

        # Byte sequences 226,128,152 and 226,128,153 (curly quotes as
        # emitted by gcc) and backticks all become a plain apostrophe.
        my $left_quote  = chr(226) . chr(128) . chr(152);
        my $right_quote = chr(226) . chr(128) . chr(153);
        s/$left_quote/'/msg;
        s/$right_quote/'/msg;
        s/`/'/msg;

        s/\t/ /g;
        s/(\d+:\d+:\s*)*\s*In function .main.:\s*//g;
        s/(\d+:\d+:\s*)*warning: unknown conversion type character 'b' in format \[-Wformat\]\s+(\d+:\d+:\s*)*warning: too many arguments for format \[-Wformat-extra-args\]/info: %b is a candide extension/g;
        s/(\d+:\d+:\s*)*warning: unknown conversion type character 'b' in format \[-Wformat\]//g;
        s/\s\(core dumped\)/./;
        # $output =~ s/\[\s+/[/g;
        s/ \[enabled by default\]//g;
        s/initializer\s+warning: \(near/initializer (near/g;
        s/(\d+:\d+:\s*)*note: each undeclared identifier is reported only once for each function it appears in//g;
        s/\(gdb\)//g;
        s/", '\\(\d{3})' <repeats \d+ times>,? ?"/\\$1/g;
        s/, '\\(\d{3})' <repeats \d+ times>\s*//g;
        s/(\\000)+/\\0/g;
        s/\\0[^">']+/\\0/g;
        s/= (\d+) '\\0'/= $1/g;
        s/\\0"/"/g;
        s/"\\0/"/g;
        s/\.\.\.>/>/g;
        # $output =~ s/(\\\d{3})+//g;
        s/<\s*included at \/home\/compiler\/>\s*//g;
        s/\s*compilation terminated due to -Wfatal-errors\.//g;
        s/^======= Backtrace.*\[vsyscall\]\s*$//ms;
        s/glibc detected \*\*\* \/home\/compiler\/code: //;
        s/: \/home\/compiler\/code terminated//;
        s/<Defined at \/home\/compiler\/>/<Defined at \/home\/compiler\/code.c:0>/g;
        s/\s*In file included from\s+\/usr\/include\/.*?:\d+:\d+:\s*/, /g;
        s/\s*collect2: error: ld returned 1 exit status//g;
        s/In function\s*`main':\s*\/home\/compiler\/ undefined reference to/error: undefined reference to/g;
        s/\/home\/compiler\///g;
        s/compilation terminated.//;
        s/<'(.*)' = char>/<'$1' = int>/g;
        s/= (-?\d+) ''/= $1/g;
        s/, <incomplete sequence >//g;
        s/\s*error: expected ';' before 'return'//g;
        s/^\s+//;
        s/\s+$//;
        s/error: ISO C forbids nested functions\s+//g;
        s/\s*note: this is the location of the previous definition//g;
        s/\s*note: use option -std=c99 or -std=gnu99 to compile your code//g;
        s/\s*\(declared at .*?\)//g;
        s/, note: declared here//g;
        s#/usr/include/.*?.h:\d+:\d+:/##g;

        # don't error about undeclared objects
        s/error: '[^']+' undeclared\s*//g;
    }

    if (length $output) {
        print "$output\n";
        exit 0;
    }

    # Every diagnostic was filtered away: fall through to the translation.
    $output = undef;
}

# The translator gets the snippet without the default prelude.
$code =~ s/^\Q$prelude_base\E\s*//;

# Write the translator input.  print and close are checked as well as open
# so that a failed write is reported instead of translating a truncated
# file.
open $fh, '>', 'code2eng.c' or die "Could not write code: $!";
print {$fh} $code or die "Could not write code: $!";
close $fh or die "Could not write code: $!";

$output = `./c2eng.pl code2eng.c` if not defined $output;

# Backticks yield undef when the command could not be started; use an empty
# string so the later clean-up substitutions do not operate on undef.
$output = '' if not defined $output;

# When the snippet had no main() of its own, main() was synthesised around
# it; remove the sentences about that wrapper from the translation.
if (not $has_main) {
    if ($has_function) {
        for ($output) {
            s/\s*Let `main` be a function taking no arguments and returning int.\s*When called, the function will do nothing.//;
            s/\s*Finally, return 0.$//;
            s/\s*and then return 0.$/./;
        }
    }
    else {
        for ($output) {
            s/Let .main. be a function taking no arguments and returning int.\s*When called, the function will.\s*(do nothing.)?//i;
            s/\s*Return 0.\s*End of function .main..\s*//;
            s/\s*Finally, return 0.$//;
            s/\s*and then return 0.$/./;
            s/\s*Do nothing.\s*$//;

            # Capitalise the first character and the start of each sentence.
            s/^\s*(.)/\U$1/;
            s/\.\s+(\S)/. \U$1/g;
        }
    }
}

$output =~ s/\s+/ /;

$output = "Does not compute; I only understand valid C11 code.\n" if not $output;

print "$output\n";

# execute($timeout, $cmdline)
#
# Runs $cmdline through the shell with stderr merged into stdout and
# collects everything it prints, giving up after $timeout seconds.
#
# Returns a two-element list ($ret, $result):
#   $ret    - the command's exit status; 128 + signal number when the
#             command was killed by a signal (so it is never mistaken for a
#             successful exit); -1 on timeout or when the command could not
#             be started.
#   $result - the captured output; on timeout, the output gathered so far
#             followed by " [Timed-out]"; on start-up failure, the error
#             message.
sub execute {
    my ($timeout, $cmdline) = @_;

    my ($ret, $result) = eval {
        my $captured = '';

        my $pid = open(my $pipe, '-|', "$cmdline 2>&1")
            or die "Could not run command: $!\n";

        # On timeout, terminate the child and abort the eval with whatever
        # output has been collected so far.
        local $SIG{ALRM} = sub { kill 'TERM', $pid; die "$captured [Timed-out]\n"; };
        alarm($timeout);

        while (my $line = <$pipe>) {
            $captured .= $line;
        }

        # close() on a pipe waits for the child and stores its status in $?.
        close $pipe;
        my $status = $?;
        alarm 0;

        # Low 7 bits: terminating signal (if any); high byte: exit status.
        my $signal    = $status & 127;
        my $exit_code = $signal ? 128 + $signal : $status >> 8;

        return ($exit_code, $captured);
    };

    # Save $@ before anything else can clobber it.
    my $error = $@;
    alarm 0;

    return (-1, $error) if $error;
    return ($ret, $result);
}