2018-02-19 03:21:31 +01:00
|
|
|
#!/usr/bin/perl
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
|
|
|
|
|
|
|
my %docs;
|
|
|
|
my @uncat;
|
|
|
|
|
|
|
|
open my $handle, '<dedup_questions' or die $@;
|
|
|
|
chomp(my @lines = <$handle>); close $handle;
|
|
|
|
|
|
|
|
my @rules = (
|
2018-02-20 04:35:42 +01:00
|
|
|
{ regex => qr/(?:james bond| 007)/i, category => 'JAMES BOND' },
|
2018-02-19 03:21:31 +01:00
|
|
|
{ regex => qr/^194\d /, category => "THE 1940'S" },
|
|
|
|
{ regex => qr/^195\d /, category => "THE 1950'S" },
|
|
|
|
{ regex => qr/^196\d /, category => "THE 1960'S" },
|
|
|
|
{ regex => qr/^197\d /, category => "THE 1970'S" },
|
|
|
|
{ regex => qr/^198\d /, category => "THE 1980'S" },
|
|
|
|
{ regex => qr/^199\d /, category => "THE 1990'S" },
|
|
|
|
{ regex => qr/^200\d /, category => "THE 2000'S" },
|
|
|
|
{ regex => qr/(?:Name The Year|In what year)/, category => 'NAME THE YEAR' },
|
|
|
|
{ regex => qr/baby names/i, category => 'BABY NAMES' },
|
|
|
|
{ regex => qr/what word mean/i, category => 'Definitions' },
|
|
|
|
{ regex => qr/What (?:one word|word links)/i, category => 'GUESS THE WORD' },
|
|
|
|
{ regex => qr/^(If [Yy]ou [Ww]ere [Bb]orn|Astrology)/i, category => 'Astrology' },
|
|
|
|
{ regex => qr/[Oo]lympics/, category => 'Olympics' },
|
|
|
|
{ regex => qr/^How many/, category => 'HOW MANY' },
|
|
|
|
{ regex => qr/(?:^What is a group|Group Nouns)/, category => 'animal groups' },
|
|
|
|
{ regex => qr/(?:[Ww]hat is the fear|phobia is (?:a|the) fear|Phobias)/, category => 'Phobias' },
|
|
|
|
{ regex => qr/who won the oscar/i, category => 'Oscars' },
|
|
|
|
{ regex => qr/(?:area code|country code)/, category => 'Phone COUNTRY Codes' },
|
|
|
|
{ regex => qr/17th century/i, category => "17TH CENTURY" },
|
|
|
|
{ regex => qr/18th century/i, category => "18TH CENTURY" },
|
|
|
|
{ regex => qr/19th century/i, category => "19TH CENTURY" },
|
|
|
|
{ regex => qr/shakespear/i, category => "SHAKESPEARE" },
|
|
|
|
{ regex => qr/world cup/i, category => "WORLD CUP" },
|
|
|
|
{ regex => qr/computer science/i, category => "COMPUTER SCIENCE" },
|
|
|
|
{ regex => qr/computer/i, category => "COMPUTERS" },
|
|
|
|
{ regex => qr/science/i, category => "SCIENCE" },
|
|
|
|
{ regex => qr/technolog/i, category => "TECHNOLOGY" },
|
|
|
|
{ regex => qr/^games /i, category => "GAMES" },
|
2018-02-20 04:35:42 +01:00
|
|
|
{ regex => qr/x[ -]?men/i, category => "COMICS" },
|
|
|
|
{ regex => qr/beatles/i, category => "BEATLES" },
|
2018-02-19 03:21:31 +01:00
|
|
|
);
|
|
|
|
|
|
|
|
my @rename_rules = (
|
2018-02-20 04:35:42 +01:00
|
|
|
{ old => qr/^007$/, new => "JAMES BOND" },
|
2018-02-19 03:21:31 +01:00
|
|
|
{ old => qr/^191\d/, new => "THE 1910'S" },
|
|
|
|
{ old => qr/^192\d/, new => "THE 1920'S" },
|
|
|
|
{ old => qr/^193\d/, new => "THE 1930'S" },
|
|
|
|
{ old => qr/^194\d/, new => "THE 1940'S" },
|
|
|
|
{ old => qr/^195\d/, new => "THE 1950'S" },
|
|
|
|
{ old => qr/^196\d/, new => "THE 1960'S" },
|
|
|
|
{ old => qr/^197\d/, new => "THE 1970'S" },
|
|
|
|
{ old => qr/^198\d/, new => "THE 1980'S" },
|
|
|
|
{ old => qr/^199\d/, new => "THE 1990'S" },
|
|
|
|
{ old => qr/^200\d/, new => "THE 2000'S" },
|
|
|
|
{ old => qr/19TH CENT ART/, new => "19TH CENTURY" },
|
|
|
|
{ old => qr/^20'S$/, new => "THE 1920'S" },
|
|
|
|
{ old => qr/^30'S$/, new => "THE 1930'S" },
|
|
|
|
{ old => qr/^40'S$/, new => "THE 1940'S" },
|
|
|
|
{ old => qr/^50'S$/, new => "THE 1950'S" },
|
|
|
|
{ old => qr/^60'S$/, new => "THE 1960'S" },
|
|
|
|
{ old => qr/^70'S$/, new => "THE 1970'S" },
|
|
|
|
{ old => qr/^80'S$/, new => "THE 1980'S" },
|
|
|
|
{ old => qr/^THE 50'S$/, new => "THE 1950'S" },
|
|
|
|
{ old => qr/^THE 60'S$/, new => "THE 1960'S" },
|
|
|
|
{ old => qr/^THE 70'S$/, new => "THE 1970'S" },
|
|
|
|
{ old => qr/^THE 80'S$/, new => "THE 1980'S" },
|
|
|
|
{ old => qr/^80'S TRIVIA$/, new => "THE 1980'S" },
|
|
|
|
{ old => qr/^90'S$/, new => "THE 1990'S" },
|
|
|
|
{ old => qr/(?:MOVIES|FILM) \/ TV/, new => "TV / MOVIES"},
|
|
|
|
{ old => qr/TV-MOVIES/, new => "TV / MOVIES"},
|
|
|
|
{ old => qr/MOVIE TRIVIA/, new => "MOVIES" },
|
|
|
|
{ old => qr/AT THE MOVIES/, new => "MOVIES" },
|
|
|
|
{ old => qr/^\d+ MOVIES/, new => "MOVIES" },
|
|
|
|
{ old => qr/^1993 THE YEAR/, new => "THE 1990'S" },
|
|
|
|
{ old => qr/TV \/ MOVIE/, new => "TV / MOVIES" },
|
|
|
|
{ old => qr/^TV (?:SITCOM|TRIVIA|SHOWS|HOSTS)/, new => "TV" },
|
|
|
|
{ old => qr/^TV:/, new => "TV" },
|
|
|
|
{ old => qr/TVS STTNG/, new => "STAR TREK" },
|
|
|
|
{ old => qr/ACRONYM/, new => "ACRONYM SOUP" },
|
|
|
|
{ old => qr/ANIMAL TRIVIA/, new => "ANIMAL KINGDOM" },
|
|
|
|
{ old => qr/^ANIA?MALS$/, new => "ANIMAL KINGDOM" },
|
|
|
|
{ old => qr/^ADS$/, new => "ADVERTISING" },
|
|
|
|
{ old => qr/^TELEVISION$/, new => "TV" },
|
|
|
|
{ old => qr/^QUICK QUICK$/, new => "QUICK! QUICK!" },
|
|
|
|
{ old => qr/^QUOTES$/, new => "QUOTATIONS" },
|
|
|
|
{ old => qr/^SHAKESPEAREAN CHARACTER$/, new => "SHAKESPEARE" },
|
|
|
|
{ old => qr/^USELESS INFO$/, new => "USELESS FACTS" },
|
|
|
|
{ old => qr/^WORLD CUP 2002$/, new => "WORLD CUP" },
|
|
|
|
{ old => qr/^AUTHOR$/, new => "AUTHORS" },
|
|
|
|
{ old => qr/^ART$/, new => "ARTS" },
|
|
|
|
{ old => qr/^BOOZE/, new => "BOOZE" },
|
|
|
|
{ old => qr/^SCIFI/, new => "SCI-FI" },
|
|
|
|
{ old => qr/^HITCHHIKER/, new => "HITCHHIKER'S GUIDE" },
|
|
|
|
{ old => qr/^SCIENCE FANTASY/, new => "SCI-FI" },
|
2018-02-20 04:35:42 +01:00
|
|
|
{ old => qr/^ANATOMY$/, new => "ANATOMY & MEDICAL" },
|
|
|
|
{ old => qr/^THE BODY$/, new => "ANATOMY & MEDICAL" },
|
|
|
|
{ old => qr/^BEATLES FIRST WORDS$/, new => "BEATLES" },
|
|
|
|
{ old => qr/^MUSIC LEGENDS$/, new => "MUSIC ARTISTS" },
|
|
|
|
{ old => qr/^TOYS GAMES$/, new => "TOYS & GAMES" },
|
|
|
|
{ old => qr/^PEANUTS COMICS$/, new => "COMICS" },
|
|
|
|
{ old => qr/^COMPUTER GAMES$/, new => "VIDEO GAMES" },
|
|
|
|
{ old => qr/^ABBR$/, new => "ABBREVIATIONS" },
|
|
|
|
{ old => qr/^BABY NAMES BEG/, new => "BABY NAMES" },
|
|
|
|
{ old => qr/^CURRENCY & FLAGS$/, new => "CURRENCIES & FLAGS" },
|
|
|
|
{ old => qr/^CURRENCIES$/, new => "CURRENCIES & FLAGS" },
|
|
|
|
{ old => qr/^FUN$/, new => "FUN & GAMES" },
|
|
|
|
{ old => qr/^GAMES$/, new => "FUN & GAMES" },
|
|
|
|
{ old => qr/^HOBBIES & LEISURE$/, new => "FUN & GAMES" },
|
|
|
|
{ old => qr/^MISC GAMES$/, new => "FUN & GAMES" },
|
|
|
|
{ old => qr/^SIMPSONS$/, new => "THE SIMPSONS" },
|
|
|
|
{ old => qr/^SMURFS$/, new => "THE SMURFS" },
|
|
|
|
{ old => qr/^MLB$/, new => "BASEBALL" },
|
|
|
|
{ old => qr/ENTERTAINMENT/, new => "ENTERTAINMENT" },
|
|
|
|
{ old => qr/CONFUSCIOUS SAY/, new => "CONFUCIUS SAY" },
|
2018-02-19 03:21:31 +01:00
|
|
|
);
|
|
|
|
|
|
|
|
print STDERR "Categorizing documents\n";
|
|
|
|
|
|
|
|
for my $i (0 .. $#lines) {
|
2018-02-19 03:23:26 +01:00
|
|
|
# Remove/fix stupid things
|
|
|
|
$lines[$i] =~ s/\s*Category:\s*//g;
|
2018-02-19 03:21:31 +01:00
|
|
|
$lines[$i] =~ s/(\w:)(\w)/$1 $2/g;
|
|
|
|
$lines[$i] =~ s{/}{ / }g;
|
|
|
|
$lines[$i] =~ s{&}{ & }g;
|
|
|
|
$lines[$i] =~ s/\s+/ /g;
|
|
|
|
$lines[$i] =~ s/^Useless Trivia: What word means/Definitions: What word means/i;
|
2018-02-20 04:35:42 +01:00
|
|
|
$lines[$i] =~ s/^useless triv \d+/Useless Trivia/i;
|
|
|
|
$lines[$i] =~ s/^general\s*(?:knowledge)?\s*\p{PosixPunct}\s*//i;
|
|
|
|
$lines[$i] =~ s/^(?:\(|\[)(.*?)(?:\)|\])\s*/$1: /;
|
|
|
|
$lines[$i] =~ s/star\s?wars/Star Wars/ig;
|
2018-02-19 03:21:31 +01:00
|
|
|
|
|
|
|
$lines[$i] =~ s/^sport\s*[:-]\s*(.*?)\s*[:-]/$1: /i;
|
|
|
|
|
|
|
|
my @l = split /`/, $lines[$i];
|
|
|
|
|
2018-02-19 03:23:26 +01:00
|
|
|
# If the question has an obvious category, use that
|
2018-02-20 04:35:42 +01:00
|
|
|
if ($l[0] =~ m/^(.{3,30}?)\s*[:-]/ or $l[0] =~ m/^(.{3,30}?)\s*\./) {
|
2018-02-19 03:23:26 +01:00
|
|
|
my $cat = $1;
|
2018-02-20 04:35:42 +01:00
|
|
|
my $max_spaces = 5;
|
|
|
|
$max_spaces = 3 if $cat =~ s/\.$//;
|
2018-02-19 03:23:26 +01:00
|
|
|
my $nspc = () = $cat =~ m/\s+/g;
|
2018-02-20 04:35:42 +01:00
|
|
|
if ($nspc <= $max_spaces) {
|
|
|
|
if ($cat !~ m/(general|^A |_+| u$| "c$)/i) {
|
2018-02-19 03:23:26 +01:00
|
|
|
$cat =~ s/^\s+|\s+$//g;
|
|
|
|
$cat = uc $cat;
|
2018-02-19 03:21:31 +01:00
|
|
|
$cat =~ s/'//g;
|
|
|
|
$cat =~ s/\.//g;
|
2018-02-19 03:23:26 +01:00
|
|
|
$cat =~ s/(?:\s+$|\R|^"|"$|^-|^\[|\]$)//g;
|
|
|
|
$cat =~ s/\s+/ /g;
|
2018-02-19 03:21:31 +01:00
|
|
|
$cat =~ s/(\d+)S/$1'S/g;
|
|
|
|
|
|
|
|
$cat =~ s/^SPORT(?!S)/SPORTS/;
|
2018-02-19 03:23:26 +01:00
|
|
|
$cat =~ s/ (?:AND|N|'N) / & /;
|
2018-02-19 03:21:31 +01:00
|
|
|
#$cat =~ s/\s*\/\s*/\//;
|
|
|
|
|
|
|
|
$cat =~ s/^GEOGRAPH.*/GEOGRAPHY/;
|
2018-02-19 03:23:26 +01:00
|
|
|
$cat = 'STAR TREK' if ($cat =~ m/^STAR TREK/);
|
2018-02-19 03:21:31 +01:00
|
|
|
|
|
|
|
$cat = 'GUESS THE WORD' if $l[0] =~ m/.*: '.*\.'/;
|
|
|
|
|
|
|
|
foreach my $rule (@rename_rules) {
|
|
|
|
if ($cat =~ m/$rule->{old}/) {
|
|
|
|
$cat = uc $rule->{new};
|
|
|
|
last;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-02-19 03:23:26 +01:00
|
|
|
print STDERR "Using obvious $cat for doc $i: $l[0] ($l[1])\n";
|
|
|
|
push @{$docs{$cat}}, $i;
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2018-02-19 03:21:31 +01:00
|
|
|
|
|
|
|
my $found = 0;
|
2018-02-19 03:23:26 +01:00
|
|
|
foreach my $rule (@rules) {
|
|
|
|
if ($l[0] =~ m/$rule->{regex}/) {
|
2018-02-19 03:21:31 +01:00
|
|
|
my $cat = uc $rule->{'category'};
|
|
|
|
push @{$docs{$cat}}, $i;
|
|
|
|
$found = 1;
|
|
|
|
print STDERR "Using rules $cat for doc $i: $l[0] ($l[1])\n";
|
2018-02-19 03:23:26 +01:00
|
|
|
last;
|
|
|
|
}
|
|
|
|
}
|
2018-02-19 03:21:31 +01:00
|
|
|
|
|
|
|
next if $found;
|
|
|
|
|
|
|
|
print STDERR "Uncategorized doc $i: $l[0] ($l[1])\n";
|
|
|
|
|
|
|
|
push @uncat, $i;
|
|
|
|
}
|
|
|
|
|
|
|
|
print STDERR "Done phase 1\n";
|
|
|
|
print STDERR "Generated ", scalar keys %docs, " categories.\n";
|
|
|
|
|
|
|
|
my $small = 0;
|
|
|
|
my $total = 0;
|
|
|
|
my @approved;
|
|
|
|
|
|
|
|
foreach my $cat (sort { @{$docs{$b}} <=> @{$docs{$a}} } keys %docs) {
|
|
|
|
print STDERR " $cat: ", scalar @{$docs{$cat}}, "\n";
|
|
|
|
|
|
|
|
if (@{$docs{$cat}} < 10) {
|
|
|
|
$small++
|
|
|
|
} else {
|
|
|
|
$total += @{$docs{$cat}};
|
|
|
|
push @approved, $cat;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
print STDERR "-" x 80, "\n";
|
|
|
|
print STDERR "Small categories: $small; total cats: ", (scalar keys %docs) - $small, " with $total questions.\n";
|
|
|
|
print STDERR "-" x 80, "\n";
|
|
|
|
|
|
|
|
foreach my $cat (sort @approved) {
|
|
|
|
print STDERR "Printing $cat ... ";
|
|
|
|
|
|
|
|
my $count = 0;
|
|
|
|
foreach my $i (@{$docs{$cat}}) {
|
|
|
|
print "$cat`$lines[$i]\n";
|
|
|
|
$count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
print STDERR "$count questions.\n";
|
|
|
|
}
|
2018-02-20 04:35:42 +01:00
|
|
|
|
|
|
|
print STDERR "Uncategorized: ", scalar @uncat, "\n";
|
|
|
|
|
|
|
|
foreach my $cat (sort keys %docs) {
|
|
|
|
print STDERR " $cat: ", scalar @{$docs{$cat}}, "\n" if @{$docs{$cat}} < 10;
|
|
|
|
}
|
|
|
|
|
|
|
|
foreach my $i (sort { $lines[$a] cmp $lines[$b] } @uncat) {
|
|
|
|
print STDERR "uncategorized: $lines[$i]\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
|