diff --git a/misc/spinach/cat.pl b/misc/spinach/cat.pl index 41876127..b5b1285f 100755 --- a/misc/spinach/cat.pl +++ b/misc/spinach/cat.pl @@ -3,7 +3,7 @@ use strict; use warnings; -use Lingua::Stem qw/stem/; +use Lingua::Stem; my %docs; my @uncat; @@ -13,34 +13,36 @@ my $minimum_category_size = 6; open my $handle, '); close $handle; -my %stopwords; -open $handle, ') { - chomp $word; - $stopwords{$word} = 1; -} -close $handle; - my @doc_rules = ( { regex => qr/(?:james bond| 007)/i, category => 'JAMES BOND' }, - { regex => qr/^(?:in (?:the year )?)?194\d /i, category => "THE 1940'S" }, - { regex => qr/^(?:in (?:the year )?)?195\d /i, category => "THE 1950'S" }, - { regex => qr/^(?:in (?:the year )?)?196\d /i, category => "THE 1960'S" }, - { regex => qr/^(?:in (?:the year )?)?197\d /i, category => "THE 1970'S" }, - { regex => qr/^(?:in (?:the year )?)?198\d /i, category => "THE 1980'S" }, - { regex => qr/^(?:in (?:the year )?)?199\d /i, category => "THE 1990'S" }, - { regex => qr/^(?:in (?:the year )?)?20\d\d /i, category => "THE 2000'S" }, - { regex => qr/(?:Name The Year|In what year)/, category => 'NAME THE YEAR' }, + { regex => qr/192\d.?s/i, category => "THE 1920'S" }, + { regex => qr/193\d.?s/i, category => "THE 1930'S" }, + { regex => qr/194\d.?s/i, category => "THE 1940'S" }, + { regex => qr/195\d.?s/i, category => "THE 1950'S" }, + { regex => qr/196\d.?s/i, category => "THE 1960'S" }, + { regex => qr/197\d.?s/i, category => "THE 1970'S" }, + { regex => qr/198\d.?s/i, category => "THE 1980'S" }, + { regex => qr/199\d.?s/i, category => "THE 1990'S" }, + { regex => qr/^(?:in (?:the year )?)?192\d\p{PosixPunct}?/i, category => "THE 1920'S" }, + { regex => qr/^(?:in (?:the year )?)?193\d\p{PosixPunct}?/i, category => "THE 1930'S" }, + { regex => qr/^(?:in (?:the year )?)?194\d\p{PosixPunct}?/i, category => "THE 1940'S" }, + { regex => qr/^(?:in (?:the year )?)?195\d\p{PosixPunct}?/i, category => "THE 1950'S" }, + { regex => qr/^(?:in (?:the year )?)?196\d\p{PosixPunct}?/i, category => "THE 1960'S" }, + { regex => qr/^(?:in (?:the year )?)?197\d\p{PosixPunct}?/i, category => "THE 1970'S" }, + { regex => qr/^(?:in (?:the year )?)?198\d\p{PosixPunct}?/i, category => "THE 1980'S" }, + { regex => qr/^(?:in (?:the year )?)?199\d\p{PosixPunct}?/i, category => "THE 1990'S" }, + { regex => qr/^(?:in (?:the year )?)?20\d\d\p{PosixPunct}?/i, category => "THE 2000'S" }, + { regex => qr/(?:Name The Year|In what year)/i, category => 'NAME THE YEAR' }, { regex => qr/baby names/i, category => 'BABY NAMES' }, { regex => qr/what word mean/i, category => 'Definitions' }, { regex => qr/What (?:one word|word links)/i, category => 'GUESS THE WORD' }, { regex => qr/^(If [Yy]ou [Ww]ere [Bb]orn|Astrology)/i, category => 'Astrology' }, - { regex => qr/[Oo]lympics/, category => 'Olympics' }, + { regex => qr/[Oo]lympics/i, category => 'Olympics' }, { regex => qr/^How many/i, category => 'HOW MANY' }, - { regex => qr/(?:^What is a group|Group Nouns)/, category => 'animal groups' }, - { regex => qr/(?:[Ww]hat is the fear|phobia is (?:a|the) fear|Phobias)/, category => 'Phobias' }, + { regex => qr/(?:^What is a group|Group Nouns)/i, category => 'animal groups' }, + { regex => qr/(?:[Ww]hat is the fear|phobia is (?:a|the) fear|Phobias)/i, category => 'Phobias' }, { regex => qr/who won the oscar/i, category => 'Oscars' }, - { regex => qr/(?:area code|country code)/, category => 'Phone COUNTRY Codes' }, + { regex => qr/(?:area code|country code)/i, category => 'Phone COUNTRY Codes' }, { regex => qr/17th.century/i, category => "17TH CENTURY" }, { regex => qr/18th.century/i, category => "18TH CENTURY" }, { regex => qr/19th.century/i, category => "19TH CENTURY" }, @@ -62,16 +64,60 @@ my @doc_rules = ( { regex => qr/stephen king/i, category => "STEPHEN KING" }, { regex => qr/wizard of oz/i, category => "WIZARD OF OZ" }, { regex => qr/philosoph/i, category => "PHILOSOPHY" }, - { regex => qr/.*: '.*\.'/, category => "GUESS THE WORD" }, + { regex => qr/.*: '.*\.'/i, category => "GUESS THE WORD" }, { regex => qr/monty python/i, category => "MONTY PYTHON" }, + { regex => qr/^the name/i, category => "NAME THAT THING" }, + { regex => qr/hit single/i, category => "HIT SINGLES" }, + { regex => qr/^a group of/i, category => "A GROUP OF IS CALLED" }, + { regex => qr/^music/i, category => "MUSIC" }, + { regex => qr/(?:canada|canadian)/i, category => "CANADA" }, + { regex => qr/who (is|was) the author/i, category => "NAME THE AUTHOR" }, + { regex => qr/who.?s the author/i, category => "NAME THE AUTHOR" }, + { regex => qr/which.*author wrote/i, category => "NAME THE AUTHOR" }, + { regex => qr/\bmusic\b/i, category => "MUSIC" }, + { regex => qr/\bauthor\b/i, category => "AUTHORS" }, + { regex => qr/bitish slang/i, category => "BRITISH SLANG" }, + { regex => qr/australian slang/i, category => "AUSSIE SLANG" }, + { regex => qr/aussie slang/i, category => "AUSSIE SLANG" }, + { regex => qr/slang term/i, category => "SLANG" }, + { regex => qr/\bslang\b/i, category => "SLANG" }, + { regex => qr/theme songi/i, category => "THEME SONGS" }, + { regex => qr/christian name/i, category => "FIRST NAMES" }, + { regex => qr/^who directed /i, category => "NAME THE DIRECTOR" }, + { regex => qr/mathemat/i, category => "MATHEMATICS" }, + { regex => qr/autobiograp/i, category => "AUTOBIOGRAPHIES" }, + { regex => qr/biograph/i, category => "BIOGRAPHIES" }, + { regex => qr/^who recorded/i, category => "NAME THE ARTIST" }, + { regex => qr/^who painted/i, category => "NAME THE PAINTER" }, + { regex => qr/^who play(?:s|ed) for/i, category => "NAME THE PLAYER" }, + { regex => qr/^who play/i, category => "NAME THE ACTOR" }, + { regex => qr/patron saint/i, category => "PATRON SAINTS" }, + { regex => qr/^who founded/i, category => "NAME THE FOUNDER" }, + { regex => qr/^who discovered/i, category => "DISCOVERIES" }, + { regex => qr/^who created/i, category => "NAME THE CREATOR" }, + { regex => qr/^who composed/i, category => "NAME THE COMPOSER" }, + { regex => qr/(?:which|what) planet/i, category => "PLANETS" }, + { regex => qr/opera/i, category => "OPERA" }, + { regex => qr/who wrote the book/i, category => "NAME THE AUTHOR" }, + { regex => qr/(?:chemistry|chemical)/i, category => "CHEMISTRY" }, + { regex => qr/(?:darth vader|chewbacca|luke skywalker|han solo|jabba the hut)/i, category => "STAR WARS" }, + { regex => qr/(?:US|U S) president/i, category => "US PRESIDENTS" }, + { regex => qr/united states president/i, category => "US PRESIDENTS" }, + { regex => qr/(?:US| U S) (?:state|city|capiti?al)/i, category => "UNITED STATES" }, + { regex => qr/authority/i, category => "AUTHORITY" }, + { regex => qr/musical instrum/, category => "MUSICAL INSTRUMENTS" }, + { regex => qr/musical term/, category => "MUSICAL TERMS" }, { regex => qr/musical/i, category => "MUSICALS" }, - { regex => qr/^the name/, category => "NAME THAT THING" }, - { regex => qr/hit single/, category => "HIT SINGLES" }, - { regex => qr/^a group of/, category => "A GROUP OF IS CALLED" }, - { regex => qr/^music/, category => "MUSIC" }, +); + +my @remaining_uncat_rules = ( + { regex => qr/^who wrote/i, category => "WHO WROTE" }, + { regex => qr/^who won/i, category => "WHO WON" }, + { regex => qr/^who was/i, category => "WHO WAS" }, ); my @rename_rules = ( + { old => qr/MUSIC TERM/, new => "MUSIC TERMS" }, { old => qr/^007$/, new => "JAMES BOND" }, { old => qr/^191\d/, new => "THE 1910'S" }, { old => qr/^192\d/, new => "THE 1920'S" }, @@ -83,6 +129,7 @@ my @rename_rules = ( { old => qr/^198\d/, new => "THE 1980'S" }, { old => qr/^199\d/, new => "THE 1990'S" }, { old => qr/^200\d/, new => "THE 2000'S" }, + { old => qr/NEWS 2002/, new => "THE 2000'S" }, { old => qr/19TH CENT ART/, new => "19TH CENTURY" }, { old => qr/^20'S$/, new => "THE 1920'S" }, { old => qr/^30'S$/, new => "THE 1930'S" }, @@ -101,6 +148,7 @@ my @rename_rules = ( { old => qr/TV-MOVIES/, new => "TV / MOVIES"}, { old => qr/MOVIE TRIVIA/, new => "MOVIES" }, { old => qr/AT THE MOVIES/, new => "MOVIES" }, + { old => qr/AUTHORS OF 1992/, new => "AUTHORS" }, { old => qr/^\d+ MOVIES/, new => "MOVIES" }, { old => qr/^1993 THE YEAR/, new => "THE 1990'S" }, { old => qr/TV \/ MOVIE/, new => "TV / MOVIES" }, @@ -180,6 +228,11 @@ my @rename_rules = ( { old => qr/MONTY PYTHON/, new => "MONTY PYTHON" }, { old => qr/BARBIE/, new => "BARBIE DOLL" }, { old => qr/(?:AMERICAN|INTL) BEER/, new => "BEER" }, + { old => qr/80'S TUNE/, new => "80'S SONGS" }, + { old => qr/^UK FOOTY$/, new => "UK FOOTBALL CLUBS" }, + { old => qr/^MISCELLANEOUS$/, new => "MISC" }, + { old => qr/CRAP JOKES/, new => "CRAPPY JOKES" }, + { old => qr/IF YOU WERE BORN ON/, new => "BIRTHS" }, ); my @skip_rules = ( @@ -227,13 +280,302 @@ my %refilter_rules = ( { regex => qr/x-men/i, category => "X-MEN" }, { regex => qr/dc comics/i, category => "DC COMICS" }, ], + "SONGS" => [ + { regex => qr/theme song/i, category => "THEME SONGS" }, + ], + "MUSIC" => [ + { regex => qr/theme song/i, category => "THEME SONGS" }, + { regex => qr/80's tune performed by/i, category => "80'S TUNE PERFORMED BY" }, + { regex => qr/90's tune performed by/i, category => "90'S TUNE PERFORMED BY" }, + { regex => qr/50's chart toppers/i, category => "1950'S CHART TOPPERS" }, + { regex => qr/60's chart toppers/i, category => "1960'S CHART TOPPERS" }, + { regex => qr/70's chart toppers/i, category => "1970'S CHART TOPPERS" }, + { regex => qr/80's chart toppers/i, category => "1980'S CHART TOPPERS" }, + { regex => qr/90's chart toppers/i, category => "1990'S CHART TOPPERS" }, + { regex => qr/\b191\d/i, category => "1910'S MUSIC" }, + { regex => qr/\b192\d/i, category => "1920'S MUSIC" }, + { regex => qr/\b193\d/i, category => "1930'S MUSIC" }, + { regex => qr/\b194\d/i, category => "1940'S MUSIC" }, + { regex => qr/\b195\d/i, category => "1950'S MUSIC" }, + { regex => qr/\b196\d/i, category => "1960'S MUSIC" }, + { regex => qr/\b197\d/i, category => "1970'S MUSIC" }, + { regex => qr/\b198\d/i, category => "1980'S MUSIC" }, + { regex => qr/\b199\d/i, category => "1990'S MUSIC" }, + { regex => qr/\b200\d/i, category => "2000'S MUSIC" }, + { regex => qr/\b201\d/i, category => "2010'S MUSIC" }, + { regex => qr/fifties/i, category => "1950'S MUSIC" }, + { regex => qr/sixties/i, category => "1960'S MUSIC" }, + { regex => qr/seventies/i, category => "1970'S MUSIC" }, + { regex => qr/eighties/i, category => "1958'S MUSIC" }, + { regex => qr/nineties/i, category => "1990'S MUSIC" }, + { regex => qr/who sang/i, category => "WHO SANG THIS SONG" }, + { regex => qr/beatles/i, category => "BEATLES" }, + { regex => qr/20s tune/i, category => "1920'S MUSIC" }, + { regex => qr/30s tune/i, category => "1930'S MUSIC" }, + { regex => qr/40s tune/i, category => "1940'S MUSIC" }, + { regex => qr/50s tune/i, category => "1950'S MUSIC" }, + { regex => qr/60s tune/i, category => "1960'S MUSIC" }, + { regex => qr/70s tune/i, category => "1970'S MUSIC" }, + { regex => qr/80s tune/i, category => "1980'S MUSIC" }, + { regex => qr/90s tune/i, category => "1990'S MUSIC" }, + { regex => qr/musicals/i, category => "MUSICALS" }, + { regex => qr/grease:/i, category => "GREASE" }, + { regex => qr/terms:/i, category => "MUSIC TERMS" }, + { regex => qr/animaniacs/i, category => "ANIMANIACS" }, + { regex => qr/alternative/i, category => "ALTERNATIVE MUSIC" }, + { regex => qr/aerosmith/i, category => "AEROSMITH" }, + { regex => qr/alan parsons project/i, category => "ALAN PARSONS PROJECT" }, + { regex => qr/biggest hits/i, category => "BIGGEST HITS" }, + { regex => qr/blues brothers/i, category => "BLUES BROTHERS" }, + { regex => qr/christmas songs/i, category => "CHRISTMAS SONGS" }, + { regex => qr/country music/i, category => "COUNTRY MUSIC" }, + { regex => qr/covers:/i, category => "MUSIC COVERS" }, + { regex => qr/disney songs/i, category => "DISNEY SONGS" }, + { regex => qr/pop rock/i, category => "POP ROCK" }, + { regex => qr/frank sinatra/i, category => "FRANK SINATRA" }, + { regex => qr/finish the song line/i, category => "FINISH THE SONG LINE" }, + { regex => qr/first song on album/i, category => "FIRST SONG ON ALBUM" }, + { regex => qr/female vocalists/i, category => "FEMALE VOCALISTS" }, + { regex => qr/epic songs/i, category => "EPIC SONGS" }, + { regex => qr/FOOPY MUSIC/i, category => "FOOPY MUSIC" }, + { regex => qr/gee music/i, category => "MUSICAL LETTER G" }, + { regex => qr/:g music/i, category => "MUSICAL LETTER G" }, + { regex => qr/grunge/i, category => "GRUNGE MUSIC" }, + { regex => qr/john lennon/i, category => "JOHN LENNON" }, + { regex => qr/i get around:/i, category => "I GET AROUND MUSIC" }, + { regex => qr/keep on rocking/i, category => "KEEP ON ROCKING" }, + { regex => qr/LEAD SINGERS:/i, category => "LEAD SINGERS" }, + { regex => qr/LITERATE ROCK/i, category => "LITERATE ROCK" }, + { regex => qr/MALE VOCALISTS/i, category => "MALE VOCALISTS" }, + { regex => qr/MISHEARD LYRICS/i, category => "MISHEARD LYRICS" }, + { regex => qr/MODERN ROCK/i, category => "MODERN ROCK" }, + { regex => qr/MONTY PYTHON LYRIC/i, category => "MONTY PYTHON LYRIC" }, + { regex => qr/\bMTV\b/i, category => "MTV" }, + { regex => qr/musical food/i, category => "MUSICAL FOOD" }, + { regex => qr/MUSICAL GRAB BAG/i, category => "MUSICAL GRAB BAG" }, + { regex => qr/ONE HIT WONDERS/i, category => "ONE HIT WONDERS" }, + { regex => qr/NUMBER ONE SONGS/i, category => "NUMBER ONE SONGS" }, + { regex => qr/NUMBER 1 SONGS/i, category => "NUMBER ONE SONGS" }, + { regex => qr/RAP TRIVIA/i, category => "RAP TRIVIA" }, + { regex => qr/RUSH:/i, category => "RUSH" }, + { regex => qr/SKA MUSIC/i, category => "SKA MUSIC" }, + { regex => qr/SONG TITLES/i, category => "SONG TITLES" }, + { regex => qr/THE JACKSONS:/i, category => "MUSIC: THE JACKSONS" }, + { regex => qr/THE POLICE:/i, category => "MUSIC: THE POLICE" }, + { regex => qr/videos:/i, category => "NAME THIS MUSIC VIDEO" }, + { regex => qr/WEIRD AL/i, category => "WEIRD AL" }, + { regex => qr/WHO SANG IT/i, category => "WHO SANG IT" }, + { regex => qr/CLASSICAL/i, category => "CLASSICAL MUSIC" }, + { regex => qr/COLOURFUL SONGS/i, category => "COLOURFUL SONGS" }, + { regex => qr/COMPLETE THIS ELVIS SONG TITLE/i, category => "COMPLETE THIS ELVIS SONG TITLE" }, + { regex => qr/COPYCATS:/i, category => "MUSIC COPYCATS" }, + { regex => qr/COMPOSER:/i, category => "COMPOSERS" }, + { regex => qr/david bowie/i, category => "DAVID BOWIE" }, + { regex => qr/eagles:/i, category => "MUSIC: EAGLES" }, + { regex => qr/eagles song/i, category => "MUSIC: EAGLES" }, + { regex => qr/elton john/i, category => "ELTON JOHN" }, + { regex => qr/elvis costello/i, category => "ELVIS COSTELLO" }, + { regex => qr/elvis presley/i, category => "ELVIS PRESLEY" }, + { regex => qr/essential clapton/i, category => "ERIC CLAPTON" }, + { regex => qr/first lines of songs/i, category => "FIRST LINES OF SONGS" }, + { regex => qr/food:/i, category => "MUSIC AND FOOD" }, + { regex => qr/garth brooks/i, category => "GARTH BROOKS" }, + { regex => qr/grateful dead/i, category => "GRATEFUL DEAD" }, + { regex => qr/jazz/i, category => "JAZZ" }, + { regex => qr/JIMMY BUFFET/i, category => "JIMMY BUFFET" }, + { regex => qr/METALLICA/i, category => "METALLICA" }, + { regex => qr/MIDNIGHT OIL:/i, category => "MUSIC: MIDNIGHT OIL" }, + { regex => qr/MONTY PYTHON SONGS/i, category => "MONTY PYTHON SONGS" }, + { regex => qr/MUSICAL STYLES:/i, category => "MUSICAL STYLES" }, + { regex => qr/MUSICAL (?:LETTER )? A:/i, category => "MUSICAL LETTER A" }, + { regex => qr/MUSICAL (?:LETTER )? B:/i, category => "MUSICAL LETTER B" }, + { regex => qr/MUSICAL (?:LETTER )? C:/i, category => "MUSICAL LETTER C" }, + { regex => qr/MUSICAL (?:LETTER )? D:/i, category => "MUSICAL LETTER D" }, + { regex => qr/MUSICAL (?:LETTER )? E:/i, category => "MUSICAL LETTER E" }, + { regex => qr/MUSICAL (?:LETTER )? F:/i, category => "MUSICAL LETTER F" }, + { regex => qr/MUSICAL (?:LETTER )? G:/i, category => "MUSICAL LETTER G" }, + { regex => qr/MUSICAL (?:LETTER )? H:/i, category => "MUSICAL LETTER H" }, + { regex => qr/MUSICAL (?:LETTER )? I:/i, category => "MUSICAL LETTER I" }, + { regex => qr/MUSICAL (?:LETTER )? J:/i, category => "MUSICAL LETTER J" }, + { regex => qr/MUSICAL (?:LETTER )? K:/i, category => "MUSICAL LETTER K" }, + { regex => qr/MUSICAL (?:LETTER )? L:/i, category => "MUSICAL LETTER L" }, + { regex => qr/MUSICAL (?:LETTER )? M:/i, category => "MUSICAL LETTER M" }, + { regex => qr/MUSICAL (?:LETTER )? N:/i, category => "MUSICAL LETTER N" }, + { regex => qr/MUSICAL (?:LETTER )? O:/i, category => "MUSICAL LETTER O" }, + { regex => qr/MUSICAL (?:LETTER )? P:/i, category => "MUSICAL LETTER P" }, + { regex => qr/MUSICAL (?:LETTER )? Q:/i, category => "MUSICAL LETTER Q" }, + { regex => qr/MUSICAL (?:LETTER )? R:/i, category => "MUSICAL LETTER R" }, + { regex => qr/MUSICAL (?:LETTER )? S:/i, category => "MUSICAL LETTER S" }, + { regex => qr/MUSICAL (?:LETTER )? T:/i, category => "MUSICAL LETTER T" }, + { regex => qr/MUSICAL (?:LETTER )? U:/i, category => "MUSICAL LETTER U" }, + { regex => qr/MUSICAL (?:LETTER )? V:/i, category => "MUSICAL LETTER V" }, + { regex => qr/MUSICAL (?:LETTER )? W:/i, category => "MUSICAL LETTER W" }, + { regex => qr/MUSICAL (?:LETTER )? X:/i, category => "MUSICAL LETTER X" }, + { regex => qr/MUSICAL (?:LETTER )? Y:/i, category => "MUSICAL LETTER Y" }, + { regex => qr/MUSICAL (?:LETTER )? Z:/i, category => "MUSICAL LETTER Z" }, + { regex => qr/MICHAEL JACKSON/i, category => "MICHAEL JACKSON" }, + { regex => qr/NAMES IN SONGS/i, category => "NAMES IN SONGS" }, + { regex => qr/NAME THE ARTIST/i, category => "NAME THE BAND" }, + { regex => qr/NAME THE ALBUM/i, category => "NAME THE ALBUM" }, + { regex => qr/name the band/i, category => "NAME THE BAND" }, + { regex => qr/NAME THE SONG/i, category => "NAME THE SONG" }, + { regex => qr/NAME THE GROUP/i, category => "NAME THE BAND" }, + { regex => qr/ROLLING STONES/i, category => "ROLLING STONES" }, + { regex => qr/REGGAE:/i, category => "REGGAE" }, + { regex => qr/RECORD LABELS/i, category => "RECORD LABELS" }, + { regex => qr/RAVE CULTURE/i, category => "RAVE CULTURE" }, + { regex => qr/RAY CHARLES/i, category => "RAY CHARLES" }, + { regex => qr/SINATRA/i, category => "FRANK SINATRA" }, + { regex => qr/SONG TITLE"/i, category => "SONG TITLES" }, + { regex => qr/SPORTS IN MUSIC/i, category => "SPORTS IN MUSIC" }, + { regex => qr/STYLES:/i, category => "MUSIC STYLES" }, + { regex => qr/THE DOORS:/i, category => "MUSIC: THE DOORS" }, + { regex => qr/TOM PETTY/i, category => "TOM PETTY" }, + { regex => qr/TORI AMOS/i, category => "TORI AMOS" }, + { regex => qr/TV THEMES/i, category => "THEME SONGS" }, + { regex => qr/WHAT INSTRUMENT /i, category => "WHAT INSTRUMENT DID THEY USE" }, + { regex => qr/MADONNA/i, category => "MADONNA" }, + { regex => qr/ERIC CLAPTON/i, category => "ERIC CLAPTON" }, + { regex => qr/: WHO RECORDED/i, category => "NAME THAT BAND" }, + { regex => qr/: WHO SANG/i, category => "NAME THAT SINGER" }, + { regex => qr/pink floyd/i, category => "PINK FLOYD" }, + { regex => qr/pop singers/i, category => "POP SINGERS" }, + { regex => qr/punk rock/i, category => "PUNK ROCK" }, + { regex => qr/NINE INCH NAILS/i, category => "NINE INCH NAILS" }, + { regex => qr/NIRVANA/i, category => "NIRVANA" }, + { regex => qr/NICKNAMES:/i, category => "NICKNAMES" }, + { regex => qr/opera/i, category => "OPERA" }, + ], + "TV / MOVIES" => [ + { regex => qr/007:/i, category => "JAMES BOND" }, + { regex => qr/charlie chaplin/i, category => "CHARLIE CHAPLIN" }, + { regex => qr/starred in this movie/i, category => "NAME THE ACTOR" }, + { regex => qr/w[io]n the oscar/i, category => "WHICH FILM WON THE OSCAR FOR..." }, + { regex => qr/\b191\d/i, category => "1910'S TV / MOVIES" }, + { regex => qr/\b192\d/i, category => "1920'S TV / MOVIES" }, + { regex => qr/\b193\d/i, category => "1930'S TV / MOVIES" }, + { regex => qr/\b194\d/i, category => "1940'S TV / MOVIES" }, + { regex => qr/\b195\d/i, category => "1950'S TV / MOVIES" }, + { regex => qr/\b196\d/i, category => "1960'S TV / MOVIES" }, + { regex => qr/\b197\d/i, category => "1970'S TV / MOVIES" }, + { regex => qr/\b198\d/i, category => "1980'S TV / MOVIES" }, + { regex => qr/\b199\d/i, category => "1990'S TV / MOVIES" }, + { regex => qr/\b200\d/i, category => "2000'S TV / MOVIES" }, + { regex => qr/\b201\d/i, category => "2010'S TV / MOVIES" }, + { regex => qr/cartoons/i, category => "CARTOONS" }, + { regex => qr/: 50.?s/i, category => "1950'S TV / MOVIES" }, + { regex => qr/: 60.?s/i, category => "1960'S TV / MOVIES" }, + { regex => qr/: 70.?s/i, category => "1970'S TV / MOVIES" }, + { regex => qr/: 80.?s/i, category => "1980'S TV / MOVIES" }, + { regex => qr/: 90.?s/i, category => "1990'S TV / MOVIES" }, + { regex => qr/academy award/i, category => "ACADAMY AWARDS" }, + { regex => qr/(?:ACTORS|ACTRESSES) IN FILM/i, category => "ACTORS IN FILM" }, + { regex => qr/(?:ACTORS|ACTRESSES) IN THE ROLE/i, category => "ACTORS IN THE ROLE" }, + { regex => qr/(?:ACTORS|ACTRESSES) IN TV/i, category => "ACTORS IN TV" }, + { regex => qr/CARTOONISTS/i, category => "CARTOONISTS" }, + { regex => qr/ANIMANIACS/i, category => "ANIMANIACS" }, + { regex => qr/ANIME/i, category => "ANIME" }, + { regex => qr/BACK TO THE FUTURE/i, category => "BACK TO THE FUTURE" }, + { regex => qr/BATMAN/i, category => "BATMAN" }, + { regex => qr/BLADE RUNNER/i, category => "BLADE RUNNER" }, + { regex => qr/90210/i, category => "BEVERLY HILLS 90210" }, + { regex => qr/BLAZING SADDLES/i, category => "BLAZING SADDLES" }, + { regex => qr/BLUES BROTHERS/i, category => "BLUES BROTHERS" }, + { regex => qr/B MOVIES/i, category => "B MOVIES" }, + { regex => qr/BRADY MANIA/i, category => "BRADY MANIA" }, + { regex => qr/CARTOON SIDEKICKS/i, category => "CARTOON SIDEKICKS" }, + { regex => qr/CHEERS TRIVIA/i, category => "CHEERS TRIVIA" }, + { regex => qr/DEFINING ROLES/i, category => "DEFINING ROLES" }, + { regex => qr/DICK VAN DYKE/i, category => "DICK VAN DYKE" }, + { regex => qr/DISNEY/i, category => "DISNEY" }, + { regex => qr/DIRECTORS:/i, category => "DIRECTORS" }, + { regex => qr/DOCTOR WHO/i, category => "DOCTOR WHO" }, + { regex => qr/DR.? SEUSS/i, category => "DR SEUSS" }, + { regex => qr/DUKES OF HAZZARD/i, category => "DUKES OF HAZZARD" }, + { regex => qr/FAMILY FLICKS/i, category => "FAMILY FLICKS" }, + { regex => qr/FILM TOP COPS/i, category => "FILM TOP COPS" }, + { regex => qr/TV TOP COPS/i, category => "TV TOP COPS" }, + { regex => qr/game shows/i, category => "GAME SHOWS" }, + { regex => qr/fox tv/i, category => "FOX TV" }, + { regex => qr/: FULL HOUSE/i, category => "FULL HOUSE" }, + { regex => qr/GILLIGANS ISLAND/i, category => "GILLIGANS ISLAND" }, + { regex => qr/GREASE:/i, category => "GREASE" }, + { regex => qr/HIGHLANDER/i, category => "HIGHLANDER" }, + { regex => qr/: HOLLYWOOD/i, category => "HOLLYWOOD" }, + { regex => qr/INDEPENDENT FILMS/i, category => "INDEPENDENT FILMS" }, + { regex => qr/INDIANA JONES/i, category => "INDIANA JONES" }, + { regex => qr/JAMES BOND/i, category => "JAMES BOND" }, + { regex => qr/: LETTERMAN/i, category => "LETTERMAN" }, + { regex => qr/MARX MOVIES/i, category => "MARX MOVIES" }, + { regex => qr/MASH/i, category => "M*A*S*H" }, + { regex => qr/MONTY PYTHON/i, category => "MONTY PYTHON" }, + { regex => qr/OLDER MOVIES/i, category => "OLDER MOVIES" }, + { regex => qr/MOVIE BOMBS/i, category => "MOVIE BOMBS" }, + { regex => qr/MOVIE LINES/i, category => "MOVIE LINES" }, + { regex => qr/MOVIE MUSICALS:/i, category => "MOVIE MUSICALS" }, + { regex => qr/MOVIE QUOTES/i, category => "MOVIE QUOTES" }, + { regex => qr/MOVIE TAG LINE/i, category => "MOVIE TAG LINES" }, + { regex => qr/MOVIE THAT FEATURES/i, category => "MOVIE THAT FEATURES..." }, + { regex => qr/MOVIE TRIVIA/i, category => "MOVIE TRIVIA" }, + { regex => qr/MTV FEATURES/i, category => "MTV MOVIES" }, + { regex => qr/MUPPET MANIA/i, category => "MUPPET MANIA" }, + { regex => qr/NAME THEIR NETWORK/i, category => "NAME THEIR TV NETWORK" }, + { regex => qr/NAME THAT TV SHOW/i, category => "NAME THAT TV SHOW" }, + { regex => qr/POWER RANGERS/i, category => "POWER RANGERS" }, + { regex => qr/PULP FICTION/i, category => "PULP FICTION" }, + { regex => qr/QUALITY MOVIES/i, category => "QUALITY MOVIES" }, + { regex => qr/QUANTUM LEAP/i, category => "QUANTUM LEAP" }, + { regex => qr/ROBOTECH/i, category => "ROBOTECH" }, + { regex => qr/ROCKY HORROR/i, category => "ROCKY HORROR" }, + { regex => qr/SCI FI MOVIES/i, category => "SCI FI MOVIES" }, + { regex => qr/RUSH LIMBAUGH/i, category => "RUSH LIMBAUGH" }, + { regex => qr/SIDEKICK/i, category => "SIDEKICKS" }, + { regex => qr/SIMPSONS/i, category => "THE SIMPSONS" }, + { regex => qr/SITCOMS/i, category => "SITCOMS" }, + { regex => qr/SPORTS ACTORS/i, category => "SPORTS ACTORS" }, + { regex => qr/STAR TREK/i, category => "STAR TREK" }, + { regex => qr/SUPERSTARS/i, category => "SUPERSTARS" }, + { regex => qr/TARANTINO/i, category => "TARANTINO" }, + { regex => qr/THREES COMPANY/i, category => "THREES COMPANY" }, + { regex => qr/THE TICK/i, category => "THE TICK" }, + { regex => qr/TV LAST NAMES/i, category => "TV LAST NAMES" }, + { regex => qr/TV OCCUPATIONS/i, category => "TV OCCUPATIONS" }, + { regex => qr/TV PETS/i, category => "TV PETS" }, + { regex => qr/TWIN PEAKS/i, category => "TWIN PEAKS" }, + { regex => qr/UK TV/i, category => "UK TV" }, + { regex => qr/WIZARD OF OZ/i, category => "WIZARD OF OZ" }, + { regex => qr/VAMPIRES/i, category => "VAMPIRES" }, + { regex => qr/WINONA RYDER/i, category => "WINONA RYDER" }, + { regex => qr/x.files/i, category => "X FILES" }, + { regex => qr/CLIVE BARKER/i, category => "CLIVE BARKER" }, + { regex => qr/COMEDY/i, category => "COMEDIES" }, + { regex => qr/who (?:play|star|was)/i, category => "NAME THE ACTOR" }, + ], + "SCIENCE" => [ + { regex => qr/computer/i, category => "COMPUTERS" }, + { regex => qr/science & nature/i, category => "SCIENCE & NATURE" }, + { regex => qr/science & technology/i, category => "SCIENCE & TECHNOLOGY" }, + { regex => qr/periodic table/i, category => "PERIODIC TABLE" }, + ], +); + +my @disregard_rules = ( + qr/ANIMAL IN YOU/, + qr/BOXING/, + qr/ON THIS DAY/, + qr/AUTHORITY/, + qr/PHYSICS/, + qr/CANADIANISMS/, + qr/IN COMMON/, + qr/ELECTRICITY/, + qr/B MOVIES/, ); print STDERR "Categorizing documents\n"; for my $i (0 .. $#lines) { # Remove/fix stupid things - $lines[$i] =~ s/\s*category:\s*//gi; + $lines[$i] =~ s/\s*(?:category|potpourri)\s*:\s*//gi; $lines[$i] =~ s/(\w:)(\w)/$1 $2/g; $lines[$i] =~ s{/}{ / }g; $lines[$i] =~ s{&}{ & }g; @@ -249,6 +591,12 @@ for my $i (0 .. $#lines) { my @l = split /`/, $lines[$i]; + if (not $l[0] =~ m/ /) { + print STDERR "Skipping doc $i (no spaces): $l[0] ($l[1])\n"; + next; + } + + # skip questions that we don't want my $skip = 0; foreach my $rule (@skip_rules) { if ($l[0] =~ m/$rule/) { @@ -285,6 +633,7 @@ for my $i (0 .. $#lines) { $cat =~ s/(\d+)S/$1'S/g; $cat =~ s/ (?:AND|N|'N) / & /; + # rename any categories foreach my $rule (@rename_rules) { if ($cat =~ m/$rule->{old}/) { $cat = uc $rule->{new}; @@ -300,6 +649,7 @@ for my $i (0 .. $#lines) { } } + # No obvious category to extract, use rule-based filtering my $found = 0; foreach my $rule (@doc_rules) { if ($l[0] =~ m/$rule->{regex}/) { @@ -318,6 +668,9 @@ for my $i (0 .. $#lines) { push @uncat, $i; } +my %approved; + +# refilter questions from certain categories into better sub-categories foreach my $key (keys %refilter_rules) { for (my $i = 0; $i < @{$docs{$key}}; $i++) { my $doc = $docs{$key}->[$i]; @@ -327,17 +680,19 @@ foreach my $key (keys %refilter_rules) { print STDERR "Refiltering doc $doc from $key to $rule->{category} $l[0] ($l[1])\n"; push @{$docs{$rule->{category}}}, $doc; splice @{$docs{$key}}, $i--, 1; + $approved{$rule->{category}} = 1; + last; } } } } +# generate a list of categories that meet the minimum category size print STDERR "Done phase 1\n"; print STDERR "Generated ", scalar keys %docs, " categories.\n"; my $small = 0; my $total = 0; -my @approved; foreach my $cat (sort { @{$docs{$b}} <=> @{$docs{$a}} } keys %docs) { print STDERR " $cat: ", scalar @{$docs{$cat}}, "\n"; @@ -346,10 +701,11 @@ foreach my $cat (sort { @{$docs{$b}} <=> @{$docs{$a}} } keys %docs) { $small++ } else { $total += @{$docs{$cat}}; - push @approved, $cat; + $approved{$cat} = 1; } } +# dump the small categories to see what weird things are being mistaken for a category print STDERR "-" x 80, "\n"; print STDERR "Small categories: $small; total cats: ", (scalar keys %docs) - $small, " with $total questions.\n"; print STDERR "-" x 80, "\n"; @@ -360,27 +716,52 @@ foreach my $cat (sort keys %docs) { print STDERR "Uncategorized: ", scalar @uncat, "\n"; +# go through all uncategorized questions and compare the questions word-by-word +# against existing categories word-by-word and filter questions that contain all +# the words of a category into said category +my $stemmer = Lingua::Stem->new; +$stemmer->stem_caching({ -level => 2 }); +$stemmer->add_exceptions( + { + 'authority' => 'authority', + 'musical' => 'musical', + 'anime' => 'anime', + } +); + my @remaining_uncat; my $i = 0; $total = @uncat; foreach my $doc (sort { $lines[$a] cmp $lines[$b] } @uncat) { - print STDERR "$i / $total\n" if $i % 1000 == 0; + if ($i % 1000 == 0) { + print STDERR "-" x 80, "\n"; + print STDERR "$i / $total\n"; + print STDERR "-" x 80, "\n"; + } $i++; my @l = split /`/, $lines[$doc]; my @doc_words = split / /, $l[0]; @doc_words = map { local $_ = $_; s/\p{PosixPunct}//g; lc $_ } @doc_words; - @doc_words = @{ stem grep { length $_ and not exists $stopwords{$_} } @doc_words}; - - #print STDERR "doc words for $doc: $l[0]: @doc_words\n"; + my @doc_numbers = grep { $_ =~ m/\d/ } @doc_words; + @doc_words = @{ $stemmer->stem_in_place(grep { length $_ > 1 and $_ !~ m/\d/ } @doc_words) }; + push @doc_words, @doc_numbers; my $categorized = 0; - foreach my $cat (sort { length $b <=> length $a } @approved) { - next if $cat =~ m/ANIMAL IN YOU/; - next if $cat =~ m/BOXING/; + foreach my $cat (sort { length $b <=> length $a } keys %approved) { + my $skip = 0; + foreach my $rule (@disregard_rules) { + if ($cat =~ m/$rule/) { + $skip = 1; + last; + } + } + next if $skip; my @cat_words = split / /, $cat; @cat_words = map { local $_ = $_; s/\p{PosixPunct}//g; lc $_ } @cat_words; - @cat_words = @{ stem grep { length $_ and not exists $stopwords{$_} } @cat_words}; + my @cat_numbers = grep { $_ =~ m/\d/ } @cat_words; + @cat_words = @{ $stemmer->stem_in_place(grep { length $_ > 1 and $_ !~ m/\d/ } @cat_words) }; + push @cat_words, @cat_numbers; my %matches; foreach my $cat_word (@cat_words) { @@ -394,7 +775,7 @@ foreach my $doc (sort { $lines[$a] cmp $lines[$b] } @uncat) { MATCH: if (keys %matches == @cat_words) { - print STDERR "Adding doc $doc to $cat: $l[0] ($l[1])\n"; + print STDERR "Adding doc $doc to $cat: $l[0] ($l[1]) -- @doc_words == @cat_words\n"; push @{$docs{$cat}}, $doc; $categorized = 1; last; @@ -406,17 +787,54 @@ foreach my $doc (sort { $lines[$a] cmp $lines[$b] } @uncat) { } } +# filter remaining uncategorized questions by uncat rules +my @new_uncat; +foreach my $doc (@remaining_uncat) { + my @l = split /`/, $lines[$doc]; + foreach my $rule (@remaining_uncat_rules) { + if ($l[0] =~ m/$rule->{regex}/) { + my $cat = uc $rule->{'category'}; + push @{$docs{$cat}}, $doc; + if (@{$docs{$cat}} == $minimum_category_size) { + $approved{$cat} = 1; + } + print STDERR "Using uncat rules $cat for doc $i: $l[0] ($l[1])\n"; + } else { + push @new_uncat, $doc; + } + } +} +@remaining_uncat = @new_uncat; + +# refilter questions in certain categories to other categories instead +foreach my $key (keys %refilter_rules) { + for (my $i = 0; $i < @{$docs{$key}}; $i++) { + my $doc = $docs{$key}->[$i]; + my @l = split /`/, $lines[$doc]; + foreach my $rule (@{$refilter_rules{$key}}) { + if ($l[0] =~ m/$rule->{regex}/) { + print STDERR "Refiltering doc $doc from $key to $rule->{category} $l[0] ($l[1])\n"; + push @{$docs{$rule->{category}}}, $doc; + splice @{$docs{$key}}, $i--, 1; + $approved{$rule->{category}} = 1; + last; + } + } + } +} + $total = 0; -foreach my $cat (@approved) { +foreach my $cat (keys %approved) { $total += @{$docs{$cat}}; } +# write the final questions to stdout, dump categories and counts to stderr print STDERR "-" x 80, "\n"; -print STDERR "Categories: ", scalar @approved, " with $total questions.\n"; +print STDERR "Categories: ", scalar keys %approved, " with $total questions.\n"; print STDERR "-" x 80, "\n"; -foreach my $cat (sort @approved) { +foreach my $cat (sort keys %approved) { print STDERR "$cat ... "; my $count = 0; @@ -428,10 +846,18 @@ foreach my $cat (sort @approved) { print STDERR "$count questions.\n"; } +# dump the remaining uncategorized questions to see what we've missed print STDERR "-" x 80, "\n"; - print STDERR "Remaining uncategorized: ", scalar @remaining_uncat, "\n"; - foreach my $i (sort { $lines[$a] cmp $lines[$b] } @remaining_uncat) { print STDERR "uncategorized: $lines[$i]\n"; } + +# dump the final questions sorted by category to make sure the rules are sane +print STDERR "x" x 80, "\n"; +foreach my $cat (sort keys %approved) { + foreach my $i (@{$docs{$cat}}) { + print STDERR "$cat`$lines[$i]\n"; + } +} +