pbot/modules/lookupbot.pl

#!/usr/bin/perl -T

use strict;
use LWP::Simple;
use LWP::UserAgent;
use Encode qw/ decode is_utf8 /;
use CGI qw/escape unescapeHTML/;
use HTML::Entities;
use utf8;

my $VERSION = '1.0.2';

my %IRSSI = (
    'authors' => 'Craig Andrews',
    'contact' => 'craig@simplyspiffing.com',
    'name'    => 'lookupbot',
    'description' => 'Some kind of magical internet searcher',
    'license' => 'Craig\'s Magical Freebie License');

## Changes ##
# 0.0.1 - Initial version, not very good
# 1.0.0 - Finished!
# 1.0.1 - Added 'pre' and 'escape' options to give more flexibility
#         Added tinyurl and !cndb
# 1.0.2 - Changed privmsg handling
#         Removed !dict and !thes because they're too badly broken
#         Added !memetic, !horoscope, !cricket
#         Added different handling for public/private responders
# 1.0.3 - Added !tdm
#         Split request processing away from event handling
#         Added the start of a simple cache
# 1.1.0 - Completion of refactoring, so officially a new version!

##
# Clean up the input data and separate the
# trigger and parameter portions
##
sub get_data {
    my $data = shift;

    my @params = split / +/, $data;
    my $trigger = shift @params;

    $data = join ' ', @params;
    $data =~ s/[^[:print:]]/ /g;
    $data =~ s/  */ /g;

    return $trigger, $data;
}

##
# Retrieve the content from a url
# Params:
#  $url - The URL to query
#  $data - If defined, data to insert into the URL using sprintf
#  $escape - URL encode the data before insertion? 1 = true, 0 = false
##
my %url_cache;
sub get_content {
    my ($url, $data, $escape, $cache) = @_;

    $data = escape($data) unless $escape == 0;
    $url = sprintf($url, $data) if defined $data;

    # Use the cache if requested
    my $timeout = time() - $cache;
    if (defined $cache &&
        $cache > 0 &&
	exists $url_cache{$url} &&
	$url_cache{$url}->{'time'} > $timeout) {

	return $url_cache{$url}->{'content'};
    }

    my $ua = LWP::UserAgent->new(agent => "ME");
    my $result = $ua->get($url, ('Accept-Charset' => 'utf-8,iso-8859-1,*'));

    my $content;
    if ($result->is_success)
    {
        my $encoding = $result->content_encoding;
        if ($encoding eq "") {
            $encoding = is_utf8($result->content)?'utf-8':'iso-8859-1';
        }
        $content = decode($encoding, $result->content()) if $result->is_success;
	$url_cache{$url} = {'time' => time(), 'content' => $content};
    }

    return $content;
}

##
# Google image search
##
sub image_search {
    my $content = shift;

    my $lines = join ' ', $content =~ /imgurl=(.*?)\&/is;
    return $lines;
}

##
# Basic google search
##
sub google_search {
    my $content = shift;

    my ($calcs) = $content =~ /<td nowrap><h2 class=r><font size=\+1><b>(.+?)<\/b>/sm;
    return $calcs if defined $calcs and length $calcs;

    my $lines = join ' ', $content =~ /<div class=g><a href="(.+?)"/is;
    return $lines;
}

##
# Google definition search
##
sub define_search {
    my $content = shift;

    my $lines = join ' ', $content =~ /(?<=<li>)(.+?)(?=<br>|<li>)/is;
    return $lines;
}

##
# Urban dictionary search
##
sub urban_search {
    my $content = shift;
    my $term = shift;

    my @rawlines = $content =~ /<div class=["'](meaning|definition|example|def_p)["']>(.+?)<\/?div/gism;
    my @lines;
    foreach (@rawlines) {
        my @s = split /(?:\n|<br\/?>)/;
        push @lines, $_ for @s;
    }

    my $definition;
    my $def_word = 0;
    my $paragraphs = 0;

    while ($def_word <= 1 &&
           $paragraphs <= 4 &&
           scalar(@lines) > 0) {

        my $s = shift(@lines);
        $s =~ s/^\s*//;
        $s =~ s/\s*$//;
        $s =~ s/<.+?>//g;

        if ($s =~ /(meaning|definition|def_p)/) {
            $def_word++;
        } elsif ($s =~ /example/) {
# Do nothing
        } elsif (length $s > 0) {
            $definition .= "$s\n";
            $paragraphs++;
        }
    }

    return decode_entities($definition);
}

##
# Profanisaurus search
##
sub profan_search {
    my $content = shift;

    my @matches = $content =~ /<a href="(profan_results.php\?profan=searchstory.+?)">(.+?)</gsi;
    return '' unless @matches;

    my %definitions;
    my $ix;
    for ($ix = 0; $ix < @matches; $ix+=2) {
        my $key = $matches[$ix+1];
        $key =~ tr/A-Z/a-z/;
        $definitions{$key} = $matches[$ix];
    }

    my @keys = sort keys %definitions;

    $content = get_content('http://www.viz.co.uk/profanisaurus/'.$definitions{$keys[0]}, 1);
    @matches = $content =~ /class=profandefinition>(.+)/;

    return join "\n", @matches;
}

##
# Urban word of the day
##
sub uwotd_search {
    my $content = shift;

    my ($word) = $content =~ /(<item .+?<\/item>)/s;
    my ($title, $description) = $word =~ /<(?:title|description)>(.+?)<\//gs;

    $description = unescapeHTML($description);
    $description =~ s/<br\s*\/?>/\n/g;

    my @lines = $description =~ /<p>(.+?)<\/p>/gs;
    unshift @lines, $title;

    return join("\n", @lines);
}

##
# Worthless word of the day
##
sub wwotd_search {
    my $content = shift;

    my ($matches) = $content =~ m|(?<=<PRE>\s)(the worthless word for the day is:.+?)(?=</PRE>)|ism;
    my @lines = split "\n", $matches;
    return if length @lines == 0;

    my $blanks = 2;
    my @result;
    while ($blanks > 0 && scalar(@lines)) {
        my $line = shift @lines;
        if (length $line) {
            push @result, $line;
        } else {
            $blanks --;
        }
    }

    return join "\n", @result;
}

##
# Dictionary.com word of the day
##
=cut
sub wotd_search {
    my $content = shift;

    my @lines = $content =~ m|(?<=<span class="hw">).+?(?=</p>)|igosm;
    return if length @lines == 0;
    s/<.+?>//g foreach (@lines);
    @lines = grep { /^.+$/ } split (/\n/, @lines[0]);

    return join ("\n", @lines);
}
=cut
##
# Sloganizer
##
sub slogan_search {
    my $content = shift;

    my ($lines) = $content =~ /<div class="slogan" id="slogan">.<b>(.*?)<\/b>.<\/div>/is;
    return $lines;
}

##
# Compliment generator
##
sub compliment_search {
    my $content = shift;

    my $lines = join ' ', $content =~ /<h2>(.*?)<\/h2>/is;
    $lines =~ s/[\r\n]/ /g;
    return $lines;
}

##
# Insult generator
##
sub insult_search {
    my $content = shift;

    my $lines = join ' ', $content =~ /<div class="insult" id="insult">(.+?)<\/div>/is;
    $lines =~ s/[\r\n]/ /g;
    return $lines;
}

##
# Limerick DB search
##
sub limerick_preprocessor {
    my $parameter = shift;

    if (!defined($parameter) || $parameter == 0) {
        $parameter = 'random';
    }

    return $parameter;
}

sub limerick_search {
    my $content = shift;

    my $lines = join ' ', $content =~ /<div class="quote_output">(.*?)<\/div>/is;
    $lines =~ s/\t//g;
    $lines =~ s/<br\s*\/?>/\n/g;

    return $lines;
}

##
# Bash.org ID search
##
sub bash_preprocessor {
    my $parameter = shift;

    if (!defined($parameter) || $parameter == 0) {
        $parameter = 'random';
    }

    return $parameter;
}

sub bash_search {
    my $content = shift;

    my $lines = join ' ', $content =~ /<p class="qt">(.*?)<\/p>/is;
    $lines =~ s/<br\/?>/\n/g;
    return $lines;
}

##
# Memetic.org ID search
# Preprocessor converts empty parameter to 'random' search
##
sub memetic_preprocessor {
    my $parameter = shift;

    if (!defined($parameter) || $parameter == 0) {
        $parameter = 'random';
    }

    return $parameter;
}

sub memetic_search {
    my $content = shift;

    my @lines = $content =~ /<font size='-1' face='Courier New, Courier, mono'>(.*?)<\/font>/isg;
    my $lines = $lines[1];
    $lines =~ s/<br\/?>/\n/g;
    return $lines;
}

##
# Generate a tinyurl for a given URL
# Only really useful as a privmsg
##
sub tinyurl_search {
    my $content = shift;
    my $term = shift;
    my $server = shift;
    my $nick = shift;

    my @lines = $content =~ /<blockquote><b>(.+?)</gism;

    my $result = '';
    if (scalar(@lines)) {
        $result = $lines[1];
    }

    return $result;
}

##
# Get the current England game score, if any
##
sub cricket_search {
    my $content = shift;

    my @lines = grep {/England/} split /$/m, $content;

    return $lines[0];
}

##
# Celebrity Nude Database search
# Preprocessor switches "Forename Surname" to
# "Surname, Forename" format
##
sub cndb_search {
    my $content = shift;

    my ($name) = $content =~ /<title>CNdb: (.+?)<\/title>/igosm;
    return "" unless defined $name && length $name;

    my @raw = $content =~ m/class="bold">(.+?)<\/td>/gosm;
    return "" unless scalar(@raw);
    my @lines;
    while (scalar(@raw) &&
        $raw[0] !~ /(was this review helpful|login to rate this review|^\s*$)/i) {

        my $l = shift @raw;
        push @lines, $l if $l !~ /\&nbsp;/;
    }

    my $output = "$name has appeared nude in:\n";
    $output .= join "\n", @lines;

    return $output;
}

sub cndb_preprocessor {
    my $parameter = shift;

    $parameter =~ s/(?<=\b)(\w)/\u$1/g;
    my @parts = split /\s+/, $parameter;
    my $last = pop @parts;
    $last .= "," if scalar(@parts);
    unshift @parts, $last;

    return join " ", @parts;
}

##
# Horoscope search
##
sub horoscope_search {
    my $content = shift;
    my $term = shift;

    $content =~ s/[\r\n]/ /gsm;
    my ($line) = $content =~ m|CHANGE $term HERE -->(.+)<!-- END $term HERE|i;
    $line =~ s/  +/ /g;

    if ($line eq "") {
      return "No results found; signs of the Zodiac are Aquarius, Pisces, Aries, Taurus, Gemini, Cancer, Leo, Virgo, Libra, Scorpio, Sagittarius, Capricorn";
    }

    $line =~ s/<ins class.*$//;

    return $line;
}

##
# Horoscope search
##
sub horrorscope_search {
    my $content = shift;
    my $term = shift;

    if ($term eq"") {
      return "Usage: horrorscope sign; signs of the Zodiac are Aquarius, Pisces, Aries, Taurus, Gemini, Cancer, Leo, Virgo, Libra, Scorpio, Sagittarius, Capricorn";
    }

    $content =~ s/[\r\n]/ /gsm;
    my ($line) = $content =~ m|<tr>.*?$term.*?</td>(.*?)</tr>|i;
    $line =~ s/  +/ /g;

    if ($line eq "") {
      return "No results found; signs of the Zodiac are Aquarius, Pisces, Aries, Taurus, Gemini, Cancer, Leo, Virgo, Libra, Scorpio, Sagittarius, Capricorn";
    }

    return $line;
}

##
# Bored.com entertainment provider
##
sub bored_search {
    my $content = shift;

    my @stuff = $content =~ /<b><a href="(.+?)" target="_blank"><font .+?>(.+?)<\/font><\/a> - <\/b> *(.+?)<br>/g;
    my @lines;
    while (scalar(@stuff) > 0) {
        my $url = shift @stuff;
        my $title = shift @stuff;
        my $desc = shift @stuff;

        $url = 'http://www.bored.com'.$url unless $url =~ /^http/;

        my $line = "$title - $url\n$desc";
        push @lines, $line;
    }

    my $pick = rand(scalar(@lines));
    return $lines[$pick];
}

##
# Sickipedia - Sick jokes for all
##

sub sick_search {
    my $content = shift;

    my @stuff = $content =~ /<description><!\[CDATA\[(.+?)]]><\/description>/gosm;

# Try and pick one with less than 5 lines ...
    my $pick = 0;
    my $brs = 0;
    my $count = 3;
    do {
        $pick = rand(scalar(@stuff));
        my @brs = $stuff[$pick] =~ /<br\/>/g;
        $brs = @brs;
        $count--;
    } while ($count > 0 && $brs > 4);
    my $line = $stuff[$pick];

    $line =~ s/<br\/>/\n/g;
    return $line;
}

##
# Random joke
##

sub joke_search {
    my $content = shift;

    my ($line) = $content =~ /<div class="chiste">(.+?)<\/div>/gosm;
    return $line;
}

##
# The Daily Mash random headline
##

sub tdm_search {
    my $content = shift;
    my $term = shift;

    my @lines = $content =~ /<item>(.+?)<\/item>/gosm;

    my $id = rand(scalar(@lines));
    if ($term =~ /^\d+$/ &&
        $term > 0 &&
        $term <= scalar(@lines)) {

	$id = $term - 1;
    }

    my @item = grep { /<(title|description|link)>/ } split /\n/, $lines[$id];
    foreach (@item) {
		s/^\s*//;
		$_ = unescapeHTML($_);
	}

    $item[1] =~ s/<.+?>//g;
    my ($url) = process_request('tinyurl', $item[1]);

    return "$item[0]\n$item[2]\n$url";
}


##
# Random proverbs
##

sub proverb_search {
    my $content = shift;

    $content =~ s/\n/ /g;
    my ($line) = $content =~ /<h2>(.+?)<\/h2>/sm;
    return $line;
}

###
# Many different lookerupperers
# Basic structure is:
#   '!triger' => { detail }
# Where the detail hash can have the following keys
#  'url' (mandatory) - The URL to search, optionally with %s for insertion of parameter
#  'sub' (mandatory) - Reference to sub to call with URL content
#  'pre' - Preprocessor to mangle the parameter before being passed to URL
#  'escape' - URL encode the parameter (1 - true, 0 - false). Defaults to true
#  'cache' - Cache individual URLs for 'cache' seconds (e.g. 3600 = 1 hr)
# All triggers can be called via privmsg. To be able to respond to public
# messages (i.e. 'in channel') the trigger must be prefixed by !
# The only 'private only' responder at the moment is tinyurl
###
my %ENGINES = ('!image' =>      {'url' => 'http://images.google.co.uk/images?hl=en&safe=off&q=%s',
                                 'sub' => \&image_search,
				 'cache' => 600},
               '!google' =>     {'url' => 'http://www.google.co.uk/search?hl=en&q=%s',
                                 'sub' => \&google_search},
               '!define' =>     {'url' => 'http://www.google.co.uk/search?hl=en&q=define%%3A%%20%s',
                                 'sub' => \&define_search},
               '!urban' =>      {'url' => 'http://www.urbandictionary.com/define.php?term=%s',
                                 'sub' => \&urban_search,
				 'cache' => 60},
               '!profan' =>     {'url' => 'http://www.viz.co.uk/profanisaurus/profan_results.php?profan=search&prof_search=%s',
                                 'sub' => \&profan_search},
               '!uwotd' =>      {'url' => 'http://feeds.urbandictionary.com/UrbanWordOfTheDay',
                                 'sub' => \&uwotd_search,
				 'cache' => 3600},
               '!wwotd' =>      {'url' => 'http://home.comcast.net/~wwftd/Frame1.html',
                                 'sub' => \&wwotd_search,
				 'cache' => 3600},
               '!wotd' =>       {'url' => 'http://dictionary.reference.com/wordoftheday/',
                                 'sub' => \&wotd_search,
				 'cache' => 3600},
               '!slogan' =>     {'url' => 'http://www.sloganizer.net/en/?slogan=%s',
                                'sub' => \&slogan_search},
               '!insult' => {'url' => 'http://www.webinsult.com/',
                                 'sub' => \&insult_search},
               '!compliment' => {'url' => 'http://www.madsci.org/cgi-bin/cgiwrap/~lynn/jardin/SCG/',
                                 'sub' => \&compliment_search},
               '!limerick' =>       {'url' => 'http://limerickdb.com/?%s',
                                 'sub' => \&limerick_search,
                                 'pre' => \&limerick_preprocessor},
               '!bash' =>       {'url' => 'http://bash.org/?%s',
                                 'sub' => \&bash_search,
                                 'pre' => \&bash_preprocessor},
               '!memetic' =>    {'url' => 'http://www.memetic.org/%s',
                                 'sub' => \&memetic_search,
                                 'pre' => \&memetic_preprocessor},
               '!cricket' =>    {'url' => 'http://www.cricinfo.com/rss/livescores.xml',
                                 'sub' => \&cricket_search},
               'tinyurl' =>     {'url' => 'http://tinyurl.com/create.php?url=%s',
                                 'sub' => \&tinyurl_search,
                                 'escape' => 0,
				 'cache' => 3600},
               '!cndb' =>       {'url' => 'http://cndb.com/actor.html?name=%s',
                                 'sub' => \&cndb_search,
                                 'pre' => \&cndb_preprocessor,
				 'cache' => 3600},
               '!horoscope' =>  {'url' => 'http://www.astrology-online.com/daily.htm',
                                 'sub' => \&horoscope_search,
				 'cache' => 3600},
               '!horrorscope' =>  {'url' => 'http://www.emilystrange.com/beware/horrorscopes.cfm',
                                 'sub' => \&horrorscope_search,
				 'cache' => 3600},
               '!bored' =>      {'url' => 'http://www.bored.com/',
                                 'sub' => \&bored_search,
				 'cache' => 3600},
               '!procrastinate' =>      {'url' => 'http://www.bored.com/',
                                 'sub' => \&bored_search,
				 'cache' => 3600},
               #'!sick' =>       {'url' => 'http://sickipedia.org/feeds/?1195996408.xml',
               #                  'sub' => \&sick_search},
               '!joke' =>       {'url' => 'http://www.ajokeaday.com/ChisteAlAzar.asp',
                                 'sub' => \&joke_search},
	       '!tdm' =>        {'url' => 'http://www.thedailymash.co.uk/rss.xml',
	                         'sub' => \&tdm_search,
				 'cache' => 3600},
                 '!proverb' => {'url' => 'http://server52204.uk2net.com/b3taproverbs/',
                                'sub' => \&proverb_search});

sub process_request {
    my ($trigger, $term, $server, $nick, $target) = @_;

    my $result = '';
    if (exists $ENGINES{$trigger}) {

        my $url = $ENGINES{$trigger}->{'url'};
        my $sub = $ENGINES{$trigger}->{'sub'};
        my $pre = exists $ENGINES{$trigger}->{'pre'} ?
            $ENGINES{$trigger}->{'pre'} : undef;
        my $escape = exists $ENGINES{$trigger}->{'escape'} ?
            $ENGINES{$trigger}->{'escape'} : 1;
        my $cache = exists $ENGINES{$trigger}->{'cache'} ?
            $ENGINES{$trigger}->{'cache'} : 0;

# Pre-process the parameter if a pre function is defined
        $term = $pre->($term) if defined $pre;

# Get the content from the URL
        my $content = get_content($url, $term, $escape, $cache);

# Get the results of the search
        $result = $sub->($content, $term, $server, $nick, $target) if defined $content;
    }
    else
    {
# Quit if this isn't for us
        return undef;
    }

# Split the resulting lines at linebreaks or
# whitespace delimited lines up to 400 characters long
# to prevent IRSSI truncating the output lines
	my @lines = $result =~ /(.{0,400})(?:\r|\n|\s+|$)/g;
	@lines = () unless @lines;

	my @output = ();
	foreach my $text (@lines) {
		next if $text =~ /^\s*$/;

# Strip HTML
        $text =~ s/<(.*?)>/ /g;
        $text = unescapeHTML($text);

# Strip non-printable characters
        $text =~ s/[^[:print:]]/ /g;

# Sort out whitespace
        $text =~ s/ +/ /g;
        $text =~ s/^ *//;
        $text =~ s/ *$//;

	push @output, $text;
	}

    @output = ('No results found') unless scalar(@output) > 0;

    return @output;
}

# Private responder, for privmsg functionality
##
sub private_responder {
    my ($server, $data, $nick, $mask) = @_;
    public_responder($server, $data, $nick, $mask, $nick);
}

##
# Public responder, where all the work gets done
##
sub public_responder {
    my ($server, $data, $nick, $mask, $target) = @_;
    $data =~ s/`//gosm;

    my ($trigger, $term) = get_data($data);
    $trigger =~ y/A-Z/a-z/;

    my $result;
    my $func;

# If this is a public message and the trigger has no !, silently ignore it
    return if ($nick ne $target && $trigger !~ /^!/);

# If the trigger exists, call the URL and process the result
    my @lines = process_request($trigger, $term, $server, $nick, $target);

# Display if necessary
    if (@lines) {
        $server->command("msg $target -!- $_")
            for grep { /./ } @lines;
    }
}

sub main {
  my ($trigger, $term);

  $trigger = shift(@ARGV);
  $term = join(' ', @ARGV);

  if (not defined $trigger) {
    print "Usage: $0 <trigger> [terms]";
    exit 1;
  }

  if ($trigger eq "list") {
    my $comma = "Triggers: ";
    foreach my $key (sort keys(%ENGINES)) {
      print "$comma$key";
      $comma = ", ";
    }
    print "\n";
    exit 1;
  }

  $trigger =~ s/^/!/;

  my @lines = process_request($trigger, $term, "server", "nick", "target");

  my $result = join(' ', @lines);

  if ($term ne "") {
    print "$term: ";
  }

  print $result . "\n";
}

main;