diff --git a/misc/spinach/ja_convert.pl b/misc/spinach/ja_convert.pl new file mode 100755 index 00000000..d91691a9 --- /dev/null +++ b/misc/spinach/ja_convert.pl @@ -0,0 +1,174 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +use Data::Dumper; +use HTML::DOM; +use HTML::Entities; +use JSON; + +my $questions = { questions => [] }; + +my $debug = 0; +my $kill_id = 9999999; +#my $kill_id = 10; + +my @files = glob '*game*.html'; +#my @files = glob '*game-1022.html'; + +my $id = 0; + +foreach my $file (@files) { + print "Processing $file...\n"; + + my $text; + + { + open my $fh, '<', $file or die "Couldn't open $file: $!"; + local $/ = undef; + $text = <$fh>; + close $fh; + } + + my $doc = HTML::DOM->new; + $doc->write($text); + + my @rounds = $doc->getElementsByClassName('round'); + my $round_nr = 0; + + foreach my $round (@rounds) { + $round_nr++; + print " Round $round_nr!\n"; + + my ($category, $question, $answer); + + my @categories = $round->getElementsByClassName('category_name'); + my @clues = $round->getElementsByClassName('clue'); + + foreach my $clue (@clues) { + my $div = $clue->getElementsByTagName('div'); + + if (not defined $div->[0]) { + print "No div!\n"; + next; + } + + my $mouseover = $div->[0]->{onmouseover}; + my $mouseout = $div->[0]->{onmouseout}; + + my $clue_values = $div->[0]->getElementsByClassName('clue_value'); + + if (not defined $clue_values->[0]) { + $clue_values = $div->[0]->getElementsByClassName('clue_value_daily_double'); + } + + if (not defined $clue_values->[0]) { + print "No clue value.\n"; + die; + } + + my $clue_value = $clue_values->[0]->innerHTML; + + if ($debug) { + print " mouseover: $mouseover\n"; + print " mouseout: $mouseout\n"; + print " clue value: $clue_value\n"; + } + + my ($col, $row); + if ($mouseover =~ m/J_(\d+)_(\d+)/) { + $col = $1; + $row = $2; + } else { + print "Failed to find col, row\n"; + print " mouseover: $mouseover\n"; + print " mouseout: $mouseout\n"; + die; + } + + if ($mouseover =~ m{(.*?)}) { + $answer = $1; + } else { + print "Failed to find answer.\n"; + print " mouseover: $mouseover\n"; + print " mouseout: $mouseout\n"; + die; + } + + if ($mouseout =~ m/toggle\('clue[^']+', '[^']+', '(.*?)'\)$/) { + $question = $1; + } else { + print "Failed to find question.\n"; + print " mouseover: $mouseover\n"; + print " mouseout: $mouseout\n"; + die; + } + + print "row: $row, col: $col\n"; + + $category = $categories[$col - 1]->innerHTML; + + $category =~ s/\\'/'/g; + $question =~ s/\\'/'/g; + $answer =~ s/\\'/'/g; + + next if $category =~ m/]*>//gs; + $question =~ s/<[^>]*>//gs; + $answer =~ s/<[^>]*>//gs; + + $category = decode_entities $category; + $question = decode_entities $question; + $answer = decode_entities $answer; + + $answer =~ s/\(.*?\)//g; + + $category =~ s/^\s+|\s+$//g; + $question =~ s/^\s+|\s+$//g; + $answer =~ s/^\s+|\s+$//g; + + if ($clue_value =~ m/\$(\d+,?\d+)/) { + $clue_value = $1; + } elsif ($clue_value =~ m/^(\d+)$/) { + $clue_value = $1; + } + + if (not $clue_value) { + print "Bad clue value.\n"; + die; + } + + print "$id: [$category] $question ($answer) $clue_value\n"; + + my @alternates = split / or |\//i, $answer; + my $answer = shift @alternates; + + if (@alternates) { + print "Has alternates: [$answer] ", join (', ', @alternates), "\n"; + } + + my $new_question = { + alternativeSpellings => \@alternates, + suggestions => [], + question => $question, + id => ++$id, + answer => $answer, + category => $category, + last_seen => 0, + value => $clue_value + }; + + push @{$questions->{questions}}, $new_question; + } + } + last if $id >= $kill_id; +} + +my $json = encode_json $questions; +open my $fh, '>', 'jeopardy.json'; +print $fh "$json\n"; +close $fh; diff --git a/misc/spinach/ja_scrape.pl b/misc/spinach/ja_scrape.pl new file mode 100755 index 00000000..7ed3296c --- /dev/null +++ b/misc/spinach/ja_scrape.pl @@ -0,0 +1,55 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +use LWP::UserAgent::WithCache; + +use Data::Dumper; + +my %cache_opt = ( + 'namespace' => 'ja', + 'cache_root' => File::Spec->catfile(File::HomeDir->my_home, '.jacache'), + 'default_expires_in' => 600 * 6 * 24); +my $ua = LWP::UserAgent::WithCache->new(\%cache_opt); + +$ua->agent("Mozilla 5.0"); +$ua->cookie_jar({ file => "$ENV{HOME}/.jacookies" }); + +my @seasons = (1 .. 35, 'superjeopardy'); + +foreach my $season (@seasons) { + print "Downloading season $season ... \n"; + + my $response = $ua->get("http://website.com/showseason.php?season=$season"); + + if (not $response->is_success) { + print Dumper $response; + die; + } + + my $text = $response->content; + + open my $fh, '>', "season-$season.html"; + print $fh "$text\n"; + close $fh; + + while ($text =~ m{http://www.website.com/showgame.php\?game_id=(\d+)}g) { + my $gameid = $1; + + print " Downloading game $gameid ...\n"; + + $response = $ua->get("http://website.com/showgame.php?game_id=$gameid"); + + if (not $response->is_success) { + print Dumper $response; + die; + } + + my $gametext = $response->content; + + open $fh, '>', "game-$gameid.html"; + print $fh "$gametext\n"; + close $fh; + } +}