2003-08-17 11:31:04 +02:00
|
|
|
###
|
2004-08-23 15:14:06 +02:00
|
|
|
# Copyright (c) 2002-2004, Jeremiah Fincher
|
2003-08-17 11:31:04 +02:00
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
#
|
|
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer.
|
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer in the
|
|
|
|
# documentation and/or other materials provided with the distribution.
|
|
|
|
# * Neither the name of the author of this software nor the name of
|
|
|
|
# contributors to this software may be used to endorse or promote products
|
|
|
|
# derived from this software without specific prior written consent.
|
|
|
|
#
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
###
|
|
|
|
|
|
|
|
"""
|
|
|
|
Keeps track of URLs posted to a channel, along with relevant context. Allows
|
|
|
|
searching for URLs and returning random URLs. Also provides statistics on the
|
|
|
|
URLs in the database.
|
|
|
|
"""
|
|
|
|
|
2003-11-25 09:23:47 +01:00
|
|
|
__revision__ = "$Id$"
|
|
|
|
|
2004-07-24 07:18:26 +02:00
|
|
|
import supybot.plugins as plugins
|
2003-08-17 11:31:04 +02:00
|
|
|
|
2003-10-04 15:53:13 +02:00
|
|
|
import os
|
2003-08-17 11:31:04 +02:00
|
|
|
import re
|
2004-07-28 08:02:09 +02:00
|
|
|
import sets
|
2003-08-17 11:31:04 +02:00
|
|
|
import time
|
2003-08-19 11:10:41 +02:00
|
|
|
import getopt
|
2003-08-17 11:31:04 +02:00
|
|
|
import urlparse
|
2004-07-31 01:39:57 +02:00
|
|
|
import itertools
|
2003-08-17 11:31:04 +02:00
|
|
|
|
2004-09-28 21:58:32 +02:00
|
|
|
import supybot.dbi as dbi
|
2004-07-24 07:18:26 +02:00
|
|
|
import supybot.conf as conf
|
|
|
|
import supybot.utils as utils
|
2004-09-30 12:04:22 +02:00
|
|
|
from supybot.commands import wrap
|
2004-07-24 07:18:26 +02:00
|
|
|
import supybot.ircmsgs as ircmsgs
|
|
|
|
import supybot.ircutils as ircutils
|
2004-09-30 12:04:22 +02:00
|
|
|
import supybot.webutils as webutils
|
2004-07-24 07:18:26 +02:00
|
|
|
import supybot.privmsgs as privmsgs
|
|
|
|
import supybot.registry as registry
|
|
|
|
import supybot.callbacks as callbacks
|
2003-08-17 11:31:04 +02:00
|
|
|
|
2004-01-30 00:58:27 +01:00
|
|
|
def configure(advanced):
    """Register the URL plugin and ask whether to enable the title snarfer.

    `advanced` is accepted as part of the hook's signature but is not
    consulted by this plugin.
    """
    # Only the yes/no prompt helper is needed here.
    from supybot.questions import yn
    conf.registerPlugin('URL', True)
    # The snarfer stays off unless the user explicitly answers yes.
    if yn("""This plugin also offers a snarfer that will try to fetch the
             title of URLs that it sees in the channel. Would you like this
             snarfer to be enabled?""", default=False):
        conf.supybot.plugins.URL.titleSnarfer.setValue(True)
|
|
|
|
|
|
|
|
# Configuration group for this plugin, followed by its per-channel settings.
conf.registerPlugin('URL')

# titleSnarfer: off by default; read by URL.titleSnarfer before it fetches
# anything.
conf.registerChannelValue(
    conf.supybot.plugins.URL, 'titleSnarfer',
    registry.Boolean(False, """Determines whether the bot will output the HTML
    title of URLs it sees in the channel."""))

# nonSnarfingRegexp: no pattern by default; URLs matching it are skipped both
# when recording URLs and when snarfing titles.
conf.registerChannelValue(
    conf.supybot.plugins.URL, 'nonSnarfingRegexp',
    registry.Regexp(None, """Determines what URLs are to be snarfed and stored
    in the database in the channel; URLs matching the regexp given will not be
    snarfed. Give the empty string if you have no URLs that you'd like to
    exclude from being snarfed."""))
|
2003-08-17 11:31:04 +02:00
|
|
|
|
2004-09-28 21:58:32 +02:00
|
|
|
class UrlRecord(dbi.Record):
    # One stored URL sighting.  DbiUrlDB.DB.add fills the fields as follows:
    #   url  - the URL itself
    #   by   - msg.nick of the message that contained it
    #   near - msg.args[1], i.e. the text the URL appeared in
    #   at   - msg.receivedAt
    __fields__ = ['url', 'by', 'near', 'at']
|
2003-08-17 11:31:04 +02:00
|
|
|
|
2004-09-28 21:58:32 +02:00
|
|
|
class DbiUrlDB(plugins.DbiChannelDB):
    # URL store built on plugins.DbiChannelDB.  Callers in this file pass the
    # channel as the first argument (e.g. db.add(channel, url, msg)); the
    # methods of the inner DB class receive the remaining arguments.
    class DB(dbi.DB):
        Record = UrlRecord

        def add(self, url, msg):
            """Store `url` along with context taken from `msg`.

            The record keeps the sender's nick, the message text the URL
            appeared in, and the time the message was received.
            """
            record = self.Record(url=url, by=msg.nick,
                                 near=msg.args[1], at=msg.receivedAt)
            # Name the class explicitly: super(self.__class__, self) recurses
            # without end as soon as DB is subclassed, because
            # self.__class__ is then the subclass.
            super(DbiUrlDB.DB, self).add(record)

        def urls(self, p):
            """Return the records selected by predicate `p` as a list, in the
            reverse of the order select() yields them."""
            records = list(self.select(p))
            records.reverse()
            return records
|
2003-08-17 11:31:04 +02:00
|
|
|
|
2004-09-28 21:58:32 +02:00
|
|
|
# Database factory for this plugin: URL.__init__ calls it with no arguments to
# obtain its db object.  'flat' is the only key in the mapping and points at
# DbiUrlDB (presumably the storage backend name -- confirm against plugins.DB).
URLDB = plugins.DB('URL', {'flat': DbiUrlDB})
|
2004-07-31 01:39:57 +02:00
|
|
|
|
2004-07-28 07:59:30 +02:00
|
|
|
class URL(callbacks.PrivmsgCommandAndRegexp):
    # Records URLs seen in channels, optionally announces the HTML title of
    # URLs it sees, and offers the `stats` and `last` commands over the stored
    # URLs.
    priority = 100 # lower than 99, the normal priority.
    # Methods whose docstring is the pattern to match (see titleSnarfer).
    regexps = ['titleSnarfer']
    _titleRe = re.compile('<title>(.*?)</title>', re.I | re.S)

    def __init__(self):
        """Initialise the parent callback and open the URL database."""
        self.__parent = super(URL, self)
        self.__parent.__init__()
        self.db = URLDB()

    def doPrivmsg(self, irc, msg):
        """Record every URL in a channel message, then defer to the parent."""
        channel = msg.args[0]
        if ircutils.isChannel(channel):
            if ircmsgs.isAction(msg):
                text = ircmsgs.unAction(msg)
            else:
                text = msg.args[1]
            for url in webutils.urlRe.findall(text):
                # Looked up per URL on purpose: most messages contain no URL
                # at all, so hoisting this would add work to the common path.
                r = self.registryValue('nonSnarfingRegexp', channel)
                if r and r.search(url):
                    self.log.debug('Skipping adding %r to db.', url)
                    continue
                self.log.debug('Adding %r to db.', url)
                self.db.add(channel, url, msg)
        # Always hand the message on so normal processing still happens.
        self.__parent.doPrivmsg(irc, msg)

    def titleSnarfer(self, irc, msg, match):
        r"https?://[^\])>\s]+"
        # NOTE: the docstring above is the pattern this snarfer is matched
        # with; it must stay exactly as written.
        channel = msg.args[0]
        if not ircutils.isChannel(channel):
            return
        # Stay quiet when the message was addressed to the bot.
        if callbacks.addressed(irc.nick, msg):
            return
        if self.registryValue('titleSnarfer', channel):
            url = match.group(0)
            r = self.registryValue('nonSnarfingRegexp', channel)
            if r and r.search(url):
                self.log.debug('Not titleSnarfing %r.', url)
                return
            try:
                # Only the first peekSize bytes are requested.
                size = conf.supybot.protocols.http.peekSize()
                text = webutils.getUrl(url, size=size)
            # The `as` spelling parses on Python 2.6+ and 3.x; the older
            # comma form is a syntax error on 3.x.
            except webutils.WebError as e:
                self.log.info('Couldn\'t snarf title of %s, %s.', url, e)
                return
            m = self._titleRe.search(text)
            if m is not None:
                domain = webutils.getDomain(url)
                title = utils.htmlToText(m.group(1).strip())
                s = 'Title: %s (at %s)' % (title, domain)
                irc.reply(s, prefixName=False)
    titleSnarfer = wrap(titleSnarfer, decorators=['urlSnarfer'])

    def stats(self, irc, msg, args):
        """[<channel>]

        Returns the number of URLs in the URL database. <channel> is only
        required if the message isn't sent in the channel itself.
        """
        channel = privmsgs.getChannel(msg, args)
        self.db.vacuum(channel)
        count = self.db.size(channel)
        irc.reply('I have %s in my database.' % utils.nItems('URL', count))

    def last(self, irc, msg, args):
        """[<channel>] [--{from,with,near,proto}=<value>] [--nolimit]

        Gives the last URL matching the given criteria. --from is from whom
        the URL came; --proto is the protocol the URL used; --with is something
        inside the URL; --near is something in the same message as the URL. If
        --nolimit is given, returns all the URLs that are found instead of
        just the last one. <channel> is only necessary if the message isn't
        sent in the channel itself.
        """
        channel = privmsgs.getChannel(msg, args)
        optlist = getopt.getopt(args, '', ['from=', 'with=', 'near=',
                                           'proto=', 'nolimit'])[0]
        predicates = []
        nolimit = False
        # Each filter option contributes exactly one predicate.  `arg` is
        # bound as a default argument so every closure keeps its own value
        # rather than the loop's final one.
        for (option, arg) in optlist:
            if option == '--nolimit':
                nolimit = True
            elif option == '--from':
                def fromNick(record, arg=arg):
                    return ircutils.strEqual(record.by, arg)
                predicates.append(fromNick)
            elif option == '--with':
                def withText(record, arg=arg):
                    return arg in record.url
                predicates.append(withText)
            elif option == '--proto':
                def withProto(record, arg=arg):
                    return record.url.startswith(arg)
                predicates.append(withProto)
            elif option == '--near':
                def nearText(record, arg=arg):
                    return arg in record.near
                predicates.append(nearText)

        def matchesAll(record):
            # A record qualifies only if every requested filter accepts it.
            for p in predicates:
                if not p(record):
                    return False
            return True

        urls = [record.url for record in self.db.urls(channel, matchesAll)]
        if not urls:
            irc.reply('No URLs matched that criteria.')
        elif nolimit:
            irc.reply(', '.join(['<%s>' % url for url in urls]))
        else:
            # We should optimize this with another URLDB method eventually.
            irc.reply(urls[0])
|
2003-08-20 18:26:23 +02:00
|
|
|
|
|
|
|
|
2003-11-04 09:48:05 +01:00
|
|
|
# Exposes the plugin class under the generic module-level name `Class`
# (presumably the name supybot's plugin loader looks up -- confirm).
Class = URL
|
2003-08-17 11:31:04 +02:00
|
|
|
|
|
|
|
# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78:
|