3
0
mirror of https://github.com/pragma-/pbot.git synced 2025-01-10 20:12:35 +01:00

misc: update Spinach cat.pl

This commit is contained in:
Pragmatic Software 2018-02-20 12:47:13 -08:00
parent 092f36260d
commit 08edb697c1

View File

@ -3,23 +3,33 @@
use strict; use strict;
use warnings; use warnings;
use Lingua::Stem qw/stem/;
my %docs; my %docs;
my @uncat; my @uncat;
my $minimum_category_size = 6; my $minimum_category_size = 6;
open my $handle, '<dedup_questions' or die $@; open my $handle, '<dedup_questions' or die $!;
chomp(my @lines = <$handle>); close $handle; chomp(my @lines = <$handle>); close $handle;
my %stopwords;
open $handle, '<stopwords' or die $!;
foreach my $word (<$handle>) {
chomp $word;
$stopwords{$word} = 1;
}
close $handle;
my @doc_rules = ( my @doc_rules = (
{ regex => qr/(?:james bond| 007)/i, category => 'JAMES BOND' }, { regex => qr/(?:james bond| 007)/i, category => 'JAMES BOND' },
{ regex => qr/^194\d /, category => "THE 1940'S" }, { regex => qr/^(?:in (?:the year )?)?194\d /i, category => "THE 1940'S" },
{ regex => qr/^195\d /, category => "THE 1950'S" }, { regex => qr/^(?:in (?:the year )?)?195\d /i, category => "THE 1950'S" },
{ regex => qr/^196\d /, category => "THE 1960'S" }, { regex => qr/^(?:in (?:the year )?)?196\d /i, category => "THE 1960'S" },
{ regex => qr/^197\d /, category => "THE 1970'S" }, { regex => qr/^(?:in (?:the year )?)?197\d /i, category => "THE 1970'S" },
{ regex => qr/^198\d /, category => "THE 1980'S" }, { regex => qr/^(?:in (?:the year )?)?198\d /i, category => "THE 1980'S" },
{ regex => qr/^199\d /, category => "THE 1990'S" }, { regex => qr/^(?:in (?:the year )?)?199\d /i, category => "THE 1990'S" },
{ regex => qr/^200\d /, category => "THE 2000'S" }, { regex => qr/^(?:in (?:the year )?)?20\d\d /i, category => "THE 2000'S" },
{ regex => qr/(?:Name The Year|In what year)/, category => 'NAME THE YEAR' }, { regex => qr/(?:Name The Year|In what year)/, category => 'NAME THE YEAR' },
{ regex => qr/baby names/i, category => 'BABY NAMES' }, { regex => qr/baby names/i, category => 'BABY NAMES' },
{ regex => qr/what word mean/i, category => 'Definitions' }, { regex => qr/what word mean/i, category => 'Definitions' },
@ -38,19 +48,27 @@ my @doc_rules = (
{ regex => qr/world.cup/i, category => "WORLD CUP" }, { regex => qr/world.cup/i, category => "WORLD CUP" },
{ regex => qr/computer science/i, category => "COMPUTER SCIENCE" }, { regex => qr/computer science/i, category => "COMPUTER SCIENCE" },
{ regex => qr/computer/i, category => "COMPUTERS" }, { regex => qr/computer/i, category => "COMPUTERS" },
{ regex => qr/science fict/i, category => "SCI-FI" },
{ regex => qr/science/i, category => "SCIENCE" }, { regex => qr/science/i, category => "SCIENCE" },
{ regex => qr/technolog/i, category => "TECHNOLOGY" }, { regex => qr/technolog/i, category => "TECHNOLOGY" },
{ regex => qr/^games /i, category => "GAMES" }, { regex => qr/^games /i, category => "GAMES" },
{ regex => qr/x[ -]?men/i, category => "COMICS" }, { regex => qr/x.?men/i, category => "X-MEN" },
{ regex => qr/beatles/i, category => "BEATLES" }, { regex => qr/beatles/i, category => "BEATLES" },
{ regex => qr/^chiefly british/i, category => "BRITISH SLANG" }, { regex => qr/^chiefly british/i, category => "BRITISH SLANG" },
{ regex => qr/^SLANG /i, category => "SLANG" }, { regex => qr/^SLANG /i, category => "SLANG" },
{ regex => qr/^US SLANG$/i, category => "SLANG" }, { regex => qr/^US SLANG$/i, category => "SLANG" },
{ regex => qr/chess/i, category => "CHESS" }, { regex => qr/\bchess\b/i, category => "CHESS" },
{ regex => qr/sherlock holmes/i, category => "SHERLOCK HOLMES" }, { regex => qr/sherlock holmes/i, category => "SHERLOCK HOLMES" },
{ regex => qr/stephen king/i, category => "STEPHEN KING" }, { regex => qr/stephen king/i, category => "STEPHEN KING" },
{ regex => qr/wizard of oz/i, category => "WIZARD OF OZ" }, { regex => qr/wizard of oz/i, category => "WIZARD OF OZ" },
{ regex => qr/philosoph/i, category => "PHILOSOPHY" }, { regex => qr/philosoph/i, category => "PHILOSOPHY" },
{ regex => qr/.*: '.*\.'/, category => "GUESS THE WORD" },
{ regex => qr/monty python/i, category => "MONTY PYTHON" },
{ regex => qr/musical/i, category => "MUSICALS" },
{ regex => qr/^the name/, category => "NAME THAT THING" },
{ regex => qr/hit single/, category => "HIT SINGLES" },
{ regex => qr/^a group of/, category => "A GROUP OF IS CALLED" },
{ regex => qr/^music/, category => "MUSIC" },
); );
my @rename_rules = ( my @rename_rules = (
@ -144,7 +162,7 @@ my @rename_rules = (
{ old => qr/WHO RULED ROME/, new => "ROMAN RULERS" }, { old => qr/WHO RULED ROME/, new => "ROMAN RULERS" },
{ old => qr/^WHO DIRECTED/, new => "NAME THE DIRECTOR" }, { old => qr/^WHO DIRECTED/, new => "NAME THE DIRECTOR" },
{ old => qr/PHILOSOPHER/, new => "PHILOSOPHY" }, { old => qr/PHILOSOPHER/, new => "PHILOSOPHY" },
{ old => qr/^SIMILI?ES/, new => "SIMILES" }, { old => qr/^SIMILI?ES?/, new => "SIMILES" },
{ old => qr/^SCIENCE /, new => "SCIENCE" }, { old => qr/^SCIENCE /, new => "SCIENCE" },
{ old => qr/^ROMEO & JULIET/, new => "SHAKESPEARE" }, { old => qr/^ROMEO & JULIET/, new => "SHAKESPEARE" },
{ old => qr/^SAYINGS & SMILES$/, new => "SAYINGS & SIMILES" }, { old => qr/^SAYINGS & SMILES$/, new => "SAYINGS & SIMILES" },
@ -152,6 +170,20 @@ my @rename_rules = (
{ old => qr/^EPL$/, new => "SOCCER" }, { old => qr/^EPL$/, new => "SOCCER" },
{ old => qr/^NZ$/, new => "NEW ZEALAND" }, { old => qr/^NZ$/, new => "NEW ZEALAND" },
{ old => qr/^NZ /, new => "NEW ZEALAND" }, { old => qr/^NZ /, new => "NEW ZEALAND" },
{ old => qr/[NB]URSERY RHYME/, new => "FAIRYTALES & NURSERY RHYMES" },
{ old => qr/NURESRY RHYME/, new => "FAIRYTALES & NURSERY RHYMES" },
{ old => qr/^GEOGRAPH/, new => "GEOGRAPHY" },
{ old => qr/TREKKIE/, new => "STAR TREK" },
{ old => qr/^STAR TREK/, new => "STAR TREK" },
{ old => qr/^SPORT(?!S)/, new => "SPORTS" },
{ old => qr/WORDS CONTAINING/, new => "GUESS THE WORD" },
{ old => qr/MONTY PYTHON/, new => "MONTY PYTHON" },
{ old => qr/BARBIE/, new => "BARBIE DOLL" },
{ old => qr/(?:AMERICAN|INTL) BEER/, new => "BEER" },
);
my @skip_rules = (
qr/true or false/i,
); );
my @not_a_category = ( my @not_a_category = (
@ -201,7 +233,7 @@ print STDERR "Categorizing documents\n";
for my $i (0 .. $#lines) { for my $i (0 .. $#lines) {
# Remove/fix stupid things # Remove/fix stupid things
$lines[$i] =~ s/\s*Category:\s*//g; $lines[$i] =~ s/\s*category:\s*//gi;
$lines[$i] =~ s/(\w:)(\w)/$1 $2/g; $lines[$i] =~ s/(\w:)(\w)/$1 $2/g;
$lines[$i] =~ s{/}{ / }g; $lines[$i] =~ s{/}{ / }g;
$lines[$i] =~ s{&}{ & }g; $lines[$i] =~ s{&}{ & }g;
@ -217,6 +249,16 @@ for my $i (0 .. $#lines) {
my @l = split /`/, $lines[$i]; my @l = split /`/, $lines[$i];
my $skip = 0;
foreach my $rule (@skip_rules) {
if ($l[0] =~ m/$rule/) {
print STDERR "Skipping doc $i (matches $rule): $l[0] ($l[1])\n";
$skip = 1;
last;
}
}
next if $skip;
# If the question has an obvious category, use that # If the question has an obvious category, use that
if ($l[0] =~ m/^(.{3,30}?)\s*[:;-]/ or $l[0] =~ m/^(.{3,30}?)\s*\./) { if ($l[0] =~ m/^(.{3,30}?)\s*[:;-]/ or $l[0] =~ m/^(.{3,30}?)\s*\./) {
my $cat = uc $1; my $cat = uc $1;
@ -241,15 +283,7 @@ for my $i (0 .. $#lines) {
$cat =~ s/(?:\s+$|\R|^"|"$|^-|^\[|\]$)//g; $cat =~ s/(?:\s+$|\R|^"|"$|^-|^\[|\]$)//g;
$cat =~ s/\s+/ /g; $cat =~ s/\s+/ /g;
$cat =~ s/(\d+)S/$1'S/g; $cat =~ s/(\d+)S/$1'S/g;
$cat =~ s/^SPORT(?!S)/SPORTS/;
$cat =~ s/ (?:AND|N|'N) / & /; $cat =~ s/ (?:AND|N|'N) / & /;
#$cat =~ s/\s*\/\s*/\//;
$cat =~ s/^GEOGRAPH.*/GEOGRAPHY/;
$cat = 'STAR TREK' if ($cat =~ m/^STAR TREK/);
$cat = 'GUESS THE WORD' if $l[0] =~ m/.*: '.*\.'/;
foreach my $rule (@rename_rules) { foreach my $rule (@rename_rules) {
if ($cat =~ m/$rule->{old}/) { if ($cat =~ m/$rule->{old}/) {
@ -320,6 +354,68 @@ print STDERR "-" x 80, "\n";
print STDERR "Small categories: $small; total cats: ", (scalar keys %docs) - $small, " with $total questions.\n"; print STDERR "Small categories: $small; total cats: ", (scalar keys %docs) - $small, " with $total questions.\n";
print STDERR "-" x 80, "\n"; print STDERR "-" x 80, "\n";
foreach my $cat (sort keys %docs) {
print STDERR " $cat: ", scalar @{$docs{$cat}}, "\n" if @{$docs{$cat}} < $minimum_category_size;
}
print STDERR "Uncategorized: ", scalar @uncat, "\n";
my @remaining_uncat;
my $i = 0;
$total = @uncat;
foreach my $doc (sort { $lines[$a] cmp $lines[$b] } @uncat) {
print STDERR "$i / $total\n" if $i % 1000 == 0;
$i++;
my @l = split /`/, $lines[$doc];
my @doc_words = split / /, $l[0];
@doc_words = map { local $_ = $_; s/\p{PosixPunct}//g; lc $_ } @doc_words;
@doc_words = @{ stem grep { length $_ and not exists $stopwords{$_} } @doc_words};
#print STDERR "doc words for $doc: $l[0]: @doc_words\n";
my $categorized = 0;
foreach my $cat (sort { length $b <=> length $a } @approved) {
next if $cat =~ m/ANIMAL IN YOU/;
next if $cat =~ m/BOXING/;
my @cat_words = split / /, $cat;
@cat_words = map { local $_ = $_; s/\p{PosixPunct}//g; lc $_ } @cat_words;
@cat_words = @{ stem grep { length $_ and not exists $stopwords{$_} } @cat_words};
my %matches;
foreach my $cat_word (@cat_words) {
foreach my $doc_word (@doc_words) {
if ($cat_word eq $doc_word) {
$matches{$cat_word} = 1;
goto MATCH if keys %matches == @cat_words;
}
}
}
MATCH:
if (keys %matches == @cat_words) {
print STDERR "Adding doc $doc to $cat: $l[0] ($l[1])\n";
push @{$docs{$cat}}, $doc;
$categorized = 1;
last;
}
}
if (not $categorized) {
push @remaining_uncat, $doc;
}
}
$total = 0;
foreach my $cat (@approved) {
$total += @{$docs{$cat}};
}
print STDERR "-" x 80, "\n";
print STDERR "Categories: ", scalar @approved, " with $total questions.\n";
print STDERR "-" x 80, "\n";
foreach my $cat (sort @approved) { foreach my $cat (sort @approved) {
print STDERR "$cat ... "; print STDERR "$cat ... ";
@ -332,14 +428,10 @@ foreach my $cat (sort @approved) {
print STDERR "$count questions.\n"; print STDERR "$count questions.\n";
} }
print STDERR "Uncategorized: ", scalar @uncat, "\n"; print STDERR "-" x 80, "\n";
foreach my $cat (sort keys %docs) { print STDERR "Remaining uncategorized: ", scalar @remaining_uncat, "\n";
print STDERR " $cat: ", scalar @{$docs{$cat}}, "\n" if @{$docs{$cat}} < $minimum_category_size;
}
foreach my $i (sort { $lines[$a] cmp $lines[$b] } @uncat) { foreach my $i (sort { $lines[$a] cmp $lines[$b] } @remaining_uncat) {
print STDERR "uncategorized: $lines[$i]\n"; print STDERR "uncategorized: $lines[$i]\n";
} }