Limnoria/plugins/URL.py

###
# Copyright (c) 2002-2004, Jeremiah Fincher
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions, and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions, and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of the author of this software nor the name of
#     contributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###

"""
Keeps track of URLs posted to a channel, along with relevant context.  Allows
searching for URLs and returning random URLs.  Also provides statistics on the
URLs in the database.
"""

__revision__ = "$Id$"

import supybot.plugins as plugins

import re

import supybot.dbi as dbi
import supybot.conf as conf
import supybot.utils as utils
from supybot.commands import *
import supybot.ircmsgs as ircmsgs
import supybot.ircutils as ircutils
import supybot.webutils as webutils
import supybot.registry as registry
import supybot.callbacks as callbacks

def configure(advanced):
    from supybot.questions import output, expect, anything, something, yn
    conf.registerPlugin('URL', True)
    if yn("""This plugin also offers a snarfer that will try to fetch the
             title of URLs that it sees in the channel.  Would like you this
             snarfer to be enabled?""", default=False):
        conf.supybot.plugins.URL.titleSnarfer.setValue(True)

conf.registerPlugin('URL')
conf.registerChannelValue(conf.supybot.plugins.URL, 'titleSnarfer',
    registry.Boolean(False, """Determines whether the bot will output the HTML
    title of URLs it sees in the channel."""))
conf.registerChannelValue(conf.supybot.plugins.URL, 'nonSnarfingRegexp',
    registry.Regexp(None, """Determines what URLs are to be snarfed and stored
    in the database in the channel; URLs matching the regexp given will not be
    snarfed.  Give the empty string if you have no URLs that you'd like to
    exclude from being snarfed."""))

class UrlRecord(dbi.Record):
    __fields__ = [
        ('url', eval),
        ('by', eval),
        ('near', eval),
        ('at', eval),
        ]

class DbiUrlDB(plugins.DbiChannelDB):
    class DB(dbi.DB):
        Record = UrlRecord
        def add(self, url, msg):
            record = self.Record(url=url, by=msg.nick,
                                 near=msg.args[1], at=msg.receivedAt)
            super(self.__class__, self).add(record)
        def urls(self, p):
            L = list(self.select(p))
            L.reverse()
            return L

URLDB = plugins.DB('URL', {'flat': DbiUrlDB})

class URL(callbacks.PrivmsgCommandAndRegexp):
    priority = 100 # lower than 99, the normal priority.
    regexps = ['titleSnarfer']
    _titleRe = re.compile('<title>(.*?)</title>', re.I | re.S)
    def __init__(self):
        self.__parent = super(URL, self)
        self.__parent.__init__()
        self.db = URLDB()

    def doPrivmsg(self, irc, msg):
        channel = msg.args[0]
        if ircutils.isChannel(channel):
            if ircmsgs.isAction(msg):
                text = ircmsgs.unAction(msg)
            else:
                text = msg.args[1]
            for url in webutils.urlRe.findall(text):
                r = self.registryValue('nonSnarfingRegexp', channel)
                if r and r.search(url):
                    self.log.debug('Skipping adding %s to db.',
                                   utils.quoted(url))
                    continue
                self.log.debug('Adding %s to db.', utils.quoted(url))
                self.db.add(channel, url, msg)
        self.__parent.doPrivmsg(irc, msg)

    def titleSnarfer(self, irc, msg, match):
        r"https?://[^\])>\s]+"
        channel = msg.args[0]
        if not ircutils.isChannel(channel):
            return
        if callbacks.addressed(irc.nick, msg):
            return
        if self.registryValue('titleSnarfer', channel):
            url = match.group(0)
            r = self.registryValue('nonSnarfingRegexp', channel)
            if r and r.search(url):
                self.log.debug('Not titleSnarfing %s.', utils.quoted(url))
                return
            try:
                size = conf.supybot.protocols.http.peekSize()
                text = webutils.getUrl(url, size=size)
            except webutils.WebError, e:
                self.log.info('Couldn\'t snarf title of %s, %s.', url, e)
                return
            m = self._titleRe.search(text)
            if m is not None:
                domain = webutils.getDomain(url)
                title = utils.htmlToText(m.group(1).strip())
                s = 'Title: %s (at %s)' % (title, domain)
                irc.reply(s, prefixName=False)
    titleSnarfer = urlSnarfer(titleSnarfer)

    def stats(self, irc, msg, args, channel):
        """[<channel>]

        Returns the number of URLs in the URL database.  <channel> is only
        required if the message isn't sent in the channel itself.
        """
        self.db.vacuum(channel)
        count = self.db.size(channel)
        irc.reply('I have %s in my database.' % utils.nItems('URL', count))
    stats = wrap(stats, ['channeldb'])

    def last(self, irc, msg, args, channel, optlist):
        """[<channel>] [--{from,with,without,near,proto}=<value>] --nolimit

        Gives the last URL matching the given criteria.  --from is from whom
        the URL came; --proto is the protocol the URL used; --with is something
        inside the URL; --without is something that should not be in the URL;
        --near is something in the same message as the URL; If --nolimit is
        given, returns all the URLs that are found. to just the URL.
        <channel> is only necessary if the message isn't sent in the channel
        itself.
        """
        predicates = []
        f = None
        nolimit = False
        for (option, arg) in optlist:
            if option == 'nolimit':
                nolimit = True
            elif option == 'from':
                def f(record, arg=arg):
                    return ircutils.strEqual(record.by, arg)
            elif option == 'with':
                def f(record, arg=arg):
                    return arg in record.url
            elif option == 'without':
                def f(record, arg=arg):
                    return arg not in record.url
            elif option == 'proto':
                def f(record, arg=arg):
                    return record.url.startswith(arg)
            elif option == 'near':
                def f(record, arg=arg):
                    return arg in record.near
            if f is not None:
                predicates.append(f)
        def predicate(record):
            for predicate in predicates:
                if not predicate(record):
                    return False
            return True
        urls = [record.url for record in self.db.urls(channel, predicate)]
        if not urls:
            irc.reply('No URLs matched that criteria.')
        else:
            if nolimit:
                urls = ['<%s>' % url for url in urls]
                s = ', '.join(urls)
            else:
                # We should optimize this with another URLDB method eventually.
                s = urls[0]
            irc.reply(s)
    last = wrap(last, ['channeldb',
                       getopts({'from': 'something', 'with': 'something',
                                'near': 'something', 'proto': 'something',
                                'nolimit': '', 'without': 'something',})])

Class = URL

# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: