Spinach: Add misc tools

This commit is contained in:
Pragmatic Software 2019-05-06 10:08:32 -07:00
parent c032450d82
commit 6dc911c100
2 changed files with 229 additions and 0 deletions

174
misc/spinach/ja_convert.pl Executable file
View File

@ -0,0 +1,174 @@
#!/usr/bin/perl
use warnings;
use strict;
use Data::Dumper;
use HTML::DOM;
use HTML::Entities;
use JSON;
my $questions = { questions => [] };
my $debug = 0;
my $kill_id = 9999999;
#my $kill_id = 10;
my @files = glob '*game*.html';
#my @files = glob '*game-1022.html';
my $id = 0;
foreach my $file (@files) {
print "Processing $file...\n";
my $text;
{
open my $fh, '<', $file or die "Couldn't open $file: $!";
local $/ = undef;
$text = <$fh>;
close $fh;
}
my $doc = HTML::DOM->new;
$doc->write($text);
my @rounds = $doc->getElementsByClassName('round');
my $round_nr = 0;
foreach my $round (@rounds) {
$round_nr++;
print " Round $round_nr!\n";
my ($category, $question, $answer);
my @categories = $round->getElementsByClassName('category_name');
my @clues = $round->getElementsByClassName('clue');
foreach my $clue (@clues) {
my $div = $clue->getElementsByTagName('div');
if (not defined $div->[0]) {
print "No div!\n";
next;
}
my $mouseover = $div->[0]->{onmouseover};
my $mouseout = $div->[0]->{onmouseout};
my $clue_values = $div->[0]->getElementsByClassName('clue_value');
if (not defined $clue_values->[0]) {
$clue_values = $div->[0]->getElementsByClassName('clue_value_daily_double');
}
if (not defined $clue_values->[0]) {
print "No clue value.\n";
die;
}
my $clue_value = $clue_values->[0]->innerHTML;
if ($debug) {
print " mouseover: $mouseover\n";
print " mouseout: $mouseout\n";
print " clue value: $clue_value\n";
}
my ($col, $row);
if ($mouseover =~ m/J_(\d+)_(\d+)/) {
$col = $1;
$row = $2;
} else {
print "Failed to find col, row\n";
print " mouseover: $mouseover\n";
print " mouseout: $mouseout\n";
die;
}
if ($mouseover =~ m{<em class="correct_response">(.*?)</em>}) {
$answer = $1;
} else {
print "Failed to find answer.\n";
print " mouseover: $mouseover\n";
print " mouseout: $mouseout\n";
die;
}
if ($mouseout =~ m/toggle\('clue[^']+', '[^']+', '(.*?)'\)$/) {
$question = $1;
} else {
print "Failed to find question.\n";
print " mouseover: $mouseover\n";
print " mouseout: $mouseout\n";
die;
}
print "row: $row, col: $col\n";
$category = $categories[$col - 1]->innerHTML;
$category =~ s/\\'/'/g;
$question =~ s/\\'/'/g;
$answer =~ s/\\'/'/g;
next if $category =~ m/<a href/;
next if $question =~ m/<a href/;
next if $answer =~ m/<a href/;
$category =~ s/<[^>]*>//gs;
$question =~ s/<[^>]*>//gs;
$answer =~ s/<[^>]*>//gs;
$category = decode_entities $category;
$question = decode_entities $question;
$answer = decode_entities $answer;
$answer =~ s/\(.*?\)//g;
$category =~ s/^\s+|\s+$//g;
$question =~ s/^\s+|\s+$//g;
$answer =~ s/^\s+|\s+$//g;
if ($clue_value =~ m/\$(\d+,?\d+)/) {
$clue_value = $1;
} elsif ($clue_value =~ m/^(\d+)$/) {
$clue_value = $1;
}
if (not $clue_value) {
print "Bad clue value.\n";
die;
}
print "$id: [$category] $question ($answer) $clue_value\n";
my @alternates = split / or |\//i, $answer;
my $answer = shift @alternates;
if (@alternates) {
print "Has alternates: [$answer] ", join (', ', @alternates), "\n";
}
my $new_question = {
alternativeSpellings => \@alternates,
suggestions => [],
question => $question,
id => ++$id,
answer => $answer,
category => $category,
last_seen => 0,
value => $clue_value
};
push @{$questions->{questions}}, $new_question;
}
}
last if $id >= $kill_id;
}
my $json = encode_json $questions;
open my $fh, '>', 'jeopardy.json';
print $fh "$json\n";
close $fh;

55
misc/spinach/ja_scrape.pl Executable file
View File

@ -0,0 +1,55 @@
#!/usr/bin/perl
use warnings;
use strict;
use LWP::UserAgent::WithCache;
use Data::Dumper;
my %cache_opt = (
'namespace' => 'ja',
'cache_root' => File::Spec->catfile(File::HomeDir->my_home, '.jacache'),
'default_expires_in' => 600 * 6 * 24);
my $ua = LWP::UserAgent::WithCache->new(\%cache_opt);
$ua->agent("Mozilla 5.0");
$ua->cookie_jar({ file => "$ENV{HOME}/.jacookies" });
my @seasons = (1 .. 35, 'superjeopardy');
foreach my $season (@seasons) {
print "Downloading season $season ... \n";
my $response = $ua->get("http://website.com/showseason.php?season=$season");
if (not $response->is_success) {
print Dumper $response;
die;
}
my $text = $response->content;
open my $fh, '>', "season-$season.html";
print $fh "$text\n";
close $fh;
while ($text =~ m{http://www.website.com/showgame.php\?game_id=(\d+)}g) {
my $gameid = $1;
print " Downloading game $gameid ...\n";
$response = $ua->get("http://website.com/showgame.php?game_id=$gameid");
if (not $response->is_success) {
print Dumper $response;
die;
}
my $gametext = $response->content;
open $fh, '>', "game-$gameid.html";
print $fh "$gametext\n";
close $fh;
}
}