Limnoria/plugins/URL.py

394 lines
16 KiB
Python
Raw Normal View History

#!/usr/bin/env python
###
# Copyright (c) 2002, Jeremiah Fincher
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions, and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the author of this software nor the name of
# contributors to this software may be used to endorse or promote products
# derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###
"""
Keeps track of URLs posted to a channel, along with relevant context. Allows
searching for URLs and returning random URLs. Also provides statistics on the
URLs in the database.
"""
2003-11-25 09:23:47 +01:00
__revision__ = "$Id$"
import plugins
import os
import re
import time
2003-08-19 11:10:41 +02:00
import getopt
2003-11-03 06:39:14 +01:00
import urllib2
import urlparse
import conf
import utils
2003-09-09 09:36:41 +02:00
import ircmsgs
2003-12-17 14:55:22 +01:00
import webutils
import ircutils
import privmsgs
2004-01-27 19:09:35 +01:00
import registry
import callbacks
try:
    import sqlite
except ImportError:
    # The plugin cannot work without the database driver; turn the bare
    # ImportError into an error that tells the operator where to get it.
    _missingSqlite = 'You need to have PySQLite installed to use this ' \
                     'plugin. Download it at <http://pysqlite.sf.net/>'
    raise callbacks.Error(_missingSqlite)
def configure(advanced):
    # Interactive setup hook for the plugin.
    #
    # Registers the plugin (second argument True) and then asks, once per
    # snarfer, whether it should be switched on; a "yes" flips the matching
    # registry value to True.  Both questions default to "no".
    #
    # `advanced` is accepted but not consulted by this function.
    #
    # Only `yn` is used here, so it is the only name pulled from `questions`.
    from questions import yn
    conf.registerPlugin('URL', True)
    if yn("""This plugin offers a snarfer that will go to tinyurl.com and get
             a shorter version of long URLs that are sent to the channel.
             Would you like this snarfer to be enabled?""", default=False):
        conf.supybot.plugins.URL.tinyurlSnarfer.setValue(True)
    # The wording of this prompt used to read "Would like you this snarfer".
    if yn("""This plugin also offers a snarfer that will try to fetch the
             title of URLs that it sees in the channel. Would you like this
             snarfer to be enabled?""", default=False):
        conf.supybot.plugins.URL.titleSnarfer.setValue(True)
# Registry declarations for the plugin.  Every value is registered as a
# channel value under supybot.plugins.URL.
conf.registerPlugin('URL')

# Master switch for the tinyurl snarfer (off by default).
conf.registerChannelValue(conf.supybot.plugins.URL, 'tinyurlSnarfer',
    registry.Boolean(False, """Determines whether the
    tinyurl snarfer is enabled. This snarfer will watch for URLs in the
    channel, and if they're sufficiently long (as determined by
    supybot.plugins.URL.tinyurlSnarfer.minimumLength) it will post a smaller
    URL from tinyurl.com."""))

# Child of the switch above: URLs shorter than this are left alone.
conf.registerChannelValue(conf.supybot.plugins.URL.tinyurlSnarfer,
    'minimumLength',
    registry.PositiveInteger(48, """The minimum length a URL must be before the
    tinyurl snarfer will snarf it."""))

# Switch for the <title> snarfer (off by default).
conf.registerChannelValue(conf.supybot.plugins.URL, 'titleSnarfer',
    registry.Boolean(False, """Determines whether the bot will output the HTML
    title of URLs it sees in the channel."""))

# Exclusion filter applied before a URL is stored; no filter by default.
conf.registerChannelValue(conf.supybot.plugins.URL, 'nonSnarfingRegexp',
    registry.Regexp(None, """Determines what URLs are to be snarfed and stored
    in the database in the channel; URLs matching the regexp given will not be
    snarfed. Give the empty string if you have no URLs that you'd like to
    exclude from being snarfed."""))
class URL(callbacks.PrivmsgCommandAndRegexp,
          plugins.ChannelDBHandler):
    # Logs every URL seen in a channel into a per-channel SQLite database
    # (tables `urls` and `tinyurls`, see makeDb) together with the messages
    # around it, and offers commands to query that log (random, stats, last)
    # and to shorten URLs through tinyurl.com (tiny).  Two optional snarfers
    # react to URLs as they are posted.
    #
    # NOTE: docstrings in this class are data, not documentation.  The two
    # snarfer methods carry their trigger regexp as their docstring, and the
    # command methods carry their usage text (presumably read by the
    # framework at runtime).  Explanatory notes therefore live in '#'
    # comments and the docstrings are kept word for word.

    # Names of the methods whose docstring is a regexp to match messages with.
    regexps = ['tinyurlSnarfer', 'titleSnarfer']
    # Extracts the contents of an HTML <title> element (case-insensitive).
    _titleRe = re.compile('<title>(.*?)</title>', re.I)

    def __init__(self):
        # (nick, channel) -> [(url, added), ...]: URLs whose next_msg column
        # is still waiting for the next message from that nick in that
        # channel.  Filled and drained by doPrivmsg.
        self.nextMsgs = {}
        plugins.ChannelDBHandler.__init__(self)
        callbacks.PrivmsgCommandAndRegexp.__init__(self)

    def die(self):
        # Shut down both base classes explicitly.
        plugins.ChannelDBHandler.die(self)
        callbacks.PrivmsgCommandAndRegexp.die(self)

    def makeDb(self, filename):
        # Open the database stored at `filename`, creating the schema first
        # if the file does not exist yet.  Returns the open connection.
        if os.path.exists(filename):
            return sqlite.connect(filename)
        db = sqlite.connect(filename)
        cursor = db.cursor()
        cursor.execute("""CREATE TABLE urls (
                          id INTEGER PRIMARY KEY,
                          url TEXT,
                          added TIMESTAMP,
                          added_by TEXT,
                          previous_msg TEXT,
                          current_msg TEXT,
                          next_msg TEXT,
                          protocol TEXT,
                          site TEXT,
                          filename TEXT
                          )""")
        # tinyurls.url_id refers to urls.id (joined on in _getTinyUrl).
        cursor.execute("""CREATE TABLE tinyurls (
                          id INTEGER PRIMARY KEY,
                          url_id INTEGER,
                          tinyurl TEXT
                          )""")
        db.commit()
        return db

    def doPrivmsg(self, irc, msg):
        # Record every URL contained in `msg`, then hand the message on to
        # the base class so commands and snarfers still run.
        channel = msg.args[0]
        db = self.getDb(channel)
        cursor = db.cursor()
        key = (msg.nick, channel)
        # This message is the "next message" for any URL the same nick
        # posted earlier in this channel; fill that column in now.
        for (url, added) in self.nextMsgs.pop(key, []):
            cursor.execute("""UPDATE urls SET next_msg=%s
                              WHERE url=%s AND added=%s""",
                           msg.args[1], url, added)
        if ircmsgs.isAction(msg):
            text = ircmsgs.unAction(msg)
        else:
            text = msg.args[1]
        urls = webutils.urlRe.findall(text)
        if urls:
            # Both of these are the same for every URL in the message, so
            # compute them once.
            r = self.registryValue('nonSnarfingRegexp', channel)
            # Walk the history newest-first and stop at the first earlier
            # message from this nick in this channel.  Without the break the
            # loop kept overwriting the result and ended on the *oldest*
            # match.  The identity test skips the message being handled in
            # case the history already contains it (assumption -- TODO
            # confirm how irc.state.history is populated).
            previousMsg = ''
            for oldMsg in reversed(irc.state.history):
                if oldMsg is msg or oldMsg.command != 'PRIVMSG':
                    continue
                if oldMsg.nick == msg.nick and oldMsg.args[0] == channel:
                    previousMsg = oldMsg.args[1]
                    break
            addedBy = msg.nick
            for url in urls:
                if r and r.search(url):
                    continue
                (protocol, site, filename, _, _, _) = urlparse.urlparse(url)
                added = int(time.time())
                # Column order is (id, url, added, added_by, previous_msg,
                # current_msg, next_msg, protocol, site, filename); the two
                # message values used to be passed the wrong way round.
                cursor.execute("""INSERT INTO urls VALUES
                                  (NULL, %s, %s, %s, %s, %s, '', %s, %s, %s)""",
                               url, added, addedBy, previousMsg, msg.args[1],
                               protocol, site, filename)
                self.nextMsgs.setdefault(key, []).append((url, added))
        db.commit()
        super(URL, self).doPrivmsg(irc, msg)

    # Snarfer: replies with a tinyurl.com alias for long URLs posted in a
    # channel, when enabled for that channel.  The docstring is the trigger
    # regexp and must stay the first statement.
    def tinyurlSnarfer(self, irc, msg, match):
        r"https?://[^\])>\s]{18,}"
        channel = msg.args[0]
        if not ircutils.isChannel(channel):
            return
        if not self.registryValue('tinyurlSnarfer', channel):
            return
        url = match.group(0)
        minlen = self.registryValue('tinyurlSnarfer.minimumLength', channel)
        if len(url) < minlen:
            return
        (tinyurl, updateDb) = self._getTinyUrl(url, channel)
        if tinyurl is None:
            self.log.warning('tinyurl was None for url %r', url)
            return
        if updateDb:
            self._updateTinyDb(url, tinyurl, channel)
        domain = webutils.getDomain(url)
        s = '%s (at %s)' % (ircutils.bold(tinyurl), domain)
        irc.reply(s, prefixName=False)
    tinyurlSnarfer = privmsgs.urlSnarfer(tinyurlSnarfer)

    # Snarfer: replies with the HTML <title> of URLs posted in a channel,
    # when enabled for that channel and the bot itself was not addressed.
    # The docstring is the trigger regexp and must stay the first statement.
    def titleSnarfer(self, irc, msg, match):
        r"https?://[^\])>\s]+"
        channel = msg.args[0]
        if not ircutils.isChannel(channel):
            return
        if callbacks.addressed(irc.nick, msg):
            return
        if not self.registryValue('titleSnarfer', channel):
            return
        url = match.group(0)
        try:
            # Only the first conf.supybot.httpPeekSize() units of the page
            # are requested.
            text = webutils.getUrl(url, size=conf.supybot.httpPeekSize())
        except webutils.WebError as e:
            self.log.info('Couldn\'t snarf title of %s, %s.', url, e)
            return
        m = self._titleRe.search(text)
        if m is not None:
            domain = webutils.getDomain(url)
            title = utils.htmlToText(m.group(1).strip())
            s = 'Title: %s (at %s)' % (title, domain)
            irc.reply(s, prefixName=False)
    titleSnarfer = privmsgs.urlSnarfer(titleSnarfer)

    def _updateTinyDb(self, url, tinyurl, channel):
        # Remember that `tinyurl` is the short form of `url` in `channel`'s
        # database.  The cache row points at a row of `urls`, so when the URL
        # was never logged there is nothing to attach it to and the call is a
        # no-op.  (Previously a placeholder row was inserted first and the
        # lookup then crashed on `fetchone()[0]` for unlogged URLs.)
        db = self.getDb(channel)
        cursor = db.cursor()
        cursor.execute("""SELECT id FROM urls WHERE url=%s""", url)
        row = cursor.fetchone()
        if row is None:
            return
        cursor.execute("""INSERT INTO tinyurls
                          VALUES (NULL, %s, %s)""", row[0], tinyurl)
        db.commit()

    # Picks the generated short URL out of tinyurl.com's result page.
    _tinyRe = re.compile(r'(http://tinyurl\.com/\w+)</blockquote>')

    def _getTinyUrl(self, url, channel, cmd=False):
        # Return (tinyurl, updateDb) for `url`.
        #
        # A cached alias from `channel`'s database is returned with
        # updateDb=False.  Otherwise tinyurl.com is asked and updateDb is
        # True; tinyurl is None when the result page could not be parsed or
        # the request failed with an HTTP error.
        #
        # cmd: when true, an HTTP error is raised as callbacks.Error instead
        # of being logged as a warning.
        db = self.getDb(channel)
        cursor = db.cursor()
        cursor.execute("""SELECT tinyurls.tinyurl FROM urls, tinyurls
                          WHERE urls.url=%s AND
                          tinyurls.url_id=urls.id""", url)
        if cursor.rowcount != 0:
            return (cursor.fetchone()[0], False)
        # Bound up front so the return below is safe on every path; it used
        # to be unbound after a logged HTTP error.
        tinyurl = None
        try:
            # NOTE(review): `url` is interpolated without URL-quoting, so
            # characters such as '&' may be read by the service as
            # separators -- confirm before changing the request format.
            fd = urllib2.urlopen('http://tinyurl.com/create.php?url=%s' %
                                 url)
            try:
                s = fd.read()
            finally:
                # Release the connection even if the read fails.
                fd.close()
            m = self._tinyRe.search(s)
            if m is not None:
                tinyurl = m.group(1)
        except urllib2.HTTPError as e:
            if cmd:
                # HTTPError.msg is a plain attribute, not a method.
                raise callbacks.Error(e.msg)
            self.log.warning(str(e))
        return (tinyurl, True)

    def _formatUrl(self, url, added, addedBy):
        # Render one database row as '<url> (added by nick at time)', using
        # the configured human-readable timestamp format.
        when = time.strftime(conf.supybot.humanTimestampFormat(),
                             time.localtime(int(added)))
        return '<%s> (added by %s at %s)' % (url, addedBy, when)

    def random(self, irc, msg, args):
        """[<channel>]
        Returns a random URL from the URL database. <channel> is only required
        if the message isn't sent in the channel itself.
        """
        channel = privmsgs.getChannel(msg, args)
        db = self.getDb(channel)
        cursor = db.cursor()
        cursor.execute("""SELECT url, added, added_by
                          FROM urls
                          ORDER BY random()
                          LIMIT 1""")
        if cursor.rowcount == 0:
            irc.reply('I have no URLs in my database for %s' % channel)
            return
        irc.reply(self._formatUrl(*cursor.fetchone()))

    def tiny(self, irc, msg, args):
        """<url>
        Returns a TinyURL.com version of <url>
        """
        url = privmsgs.getArgs(args)
        if len(url) < 24:
            irc.error(
                'Stop being a lazy-biotch and type the URL yourself.')
            return
        channel = msg.args[0]
        # Stay silent only when the snarfer is going to answer instead.  The
        # snarfer ignores anything that is not a channel message, so outside
        # a channel this command has to reply itself.
        if ircutils.isChannel(channel):
            snarf = self.registryValue('tinyurlSnarfer', channel)
            minlen = self.registryValue('tinyurlSnarfer.minimumLength',
                                        channel)
            if snarf and len(url) >= minlen:
                return
        (tinyurl, updateDb) = self._getTinyUrl(url, channel, cmd=True)
        if tinyurl:
            if updateDb:
                self._updateTinyDb(url, tinyurl, channel)
            irc.reply(tinyurl)
        else:
            s = 'Could not parse the TinyURL.com results page.'
            irc.errorPossibleBug(s)
    tiny = privmsgs.thread(tiny)

    def stats(self, irc, msg, args):
        """[<channel>]
        Returns the number of URLs in the URL database. <channel> is only
        required if the message isn't sent in the channel itself.
        """
        channel = privmsgs.getChannel(msg, args)
        db = self.getDb(channel)
        cursor = db.cursor()
        cursor.execute("""SELECT COUNT(*) FROM urls""")
        (count,) = cursor.fetchone()
        count = int(count)
        if count == 1:
            noun = 'URL'
        else:
            noun = 'URLs'
        irc.reply('I have %s %s in my database.' % (count, noun))

    def last(self, irc, msg, args):
        """[<channel>] [--{from,with,at,proto,near}=<value>] --{nolimit,fancy}
        Gives the last URL matching the given criteria. --from is from whom
        the URL came; --at is the site of the URL; --proto is the protocol the
        URL used; --with is something inside the URL; --near is a string in the
        messages before and after the link. If --nolimit is given, returns as
        many URLs as can fit in the message. --fancy returns information in
        addition to just the URL. <channel> is only necessary if the
        message isn't sent in the channel itself.
        """
        channel = privmsgs.getChannel(msg, args)
        (optlist, rest) = getopt.getopt(args, '', ['from=', 'with=', 'at=',
                                                   'proto=', 'near=',
                                                   'nolimit', 'fancy'])
        # WHERE-clause fragments and the values for their placeholders; the
        # always-true seed keeps the ' AND '.join valid with no options.
        criteria = ['1=1']
        formats = []
        simple = True
        nolimit = False
        for (option, argument) in optlist:
            option = option.lstrip('-') # Strip off the --.
            if option == 'nolimit':
                nolimit = True
            elif option == 'fancy':
                simple = False
            elif option == 'from':
                criteria.append('added_by LIKE %s')
                formats.append(argument)
            elif option == 'with':
                # An argument without LIKE wildcards means "contains".
                if '%' not in argument and '_' not in argument:
                    argument = '%%%s%%' % argument
                criteria.append('url LIKE %s')
                formats.append(argument)
            elif option == 'at':
                # An argument without LIKE wildcards means "ends with".
                if '%' not in argument and '_' not in argument:
                    argument = '%' + argument
                criteria.append('site LIKE %s')
                formats.append(argument)
            elif option == 'proto':
                criteria.append('protocol=%s')
                formats.append(argument)
            elif option == 'near':
                criteria.append("""(previous_msg LIKE %s OR
                                    next_msg LIKE %s OR
                                    current_msg LIKE %s)""")
                if '%' not in argument:
                    argument = '%%%s%%' % argument
                # One value per placeholder in the fragment above.
                formats.extend([argument] * 3)
        db = self.getDb(channel)
        cursor = db.cursor()
        criterion = ' AND '.join(criteria)
        # Only fixed fragments are spliced into the statement; user-supplied
        # values travel separately as parameters.
        sql = """SELECT id, url, added, added_by
                 FROM urls
                 WHERE %s ORDER BY id DESC
                 LIMIT 100""" % criterion
        cursor.execute(sql, *formats)
        if cursor.rowcount == 0:
            irc.reply('No URLs matched that criteria.')
            return
        if nolimit:
            urls = ['<%s>' % t[1] for t in cursor.fetchall()]
            s = ', '.join(urls)
        elif simple:
            s = cursor.fetchone()[1]
        else:
            (urlId, url, added, addedBy) = cursor.fetchone()
            # Same configurable timestamp format that _formatUrl uses.
            timestamp = time.strftime(conf.supybot.humanTimestampFormat(),
                                      time.localtime(int(added)))
            s = '#%s: <%s>, added by %s at %s.' % \
                (urlId, url, addedBy, timestamp)
        irc.reply(s)
2003-08-20 18:26:23 +02:00
2003-11-04 09:48:05 +01:00
# Module-level alias for the plugin class defined above (presumably the name
# the plugin loader looks up -- TODO confirm against the loader).
Class = URL
# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: