Limnoria/plugins/RSS.py

396 lines
16 KiB
Python
Raw Normal View History

2003-04-06 13:42:41 +02:00
#!/usr/bin/env python
###
# Copyright (c) 2002, Jeremiah Fincher
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions, and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the author of this software nor the name of
# contributors to this software may be used to endorse or promote products
# derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###
"""
Provides basic functionality for handling RSS/RDF feeds.
2003-04-06 13:42:41 +02:00
"""
2003-11-25 09:23:47 +01:00
__revision__ = "$Id$"
import plugins
2003-04-06 13:42:41 +02:00
2003-12-03 22:21:43 +01:00
import sets
2003-04-06 13:49:53 +02:00
import time
2004-07-21 21:12:57 +02:00
import getopt
import sgmllib
import threading
from itertools import imap
2003-04-06 13:49:53 +02:00
2003-04-06 13:42:41 +02:00
import rssparser
2003-12-03 22:21:43 +01:00
import conf
2003-04-19 23:40:39 +02:00
import utils
import world
import ircmsgs
import ircutils
2003-04-06 13:42:41 +02:00
import privmsgs
import registry
2003-04-06 13:42:41 +02:00
import callbacks
def configure(advanced):
from questions import output, expect, anything, something, yn
conf.registerPlugin('RSS', True)
prompt = 'Would you like to add an RSS feed?'
while yn(prompt):
prompt = 'Would you like to add another RSS feed?'
2003-08-28 23:27:15 +02:00
name = something('What\'s the name of the website?')
url = something('What\'s the URL of the RSS feed?')
registerFeed(name, url)
2003-08-20 18:26:23 +02:00
2004-07-21 21:12:57 +02:00
class AnnouncedFeeds(registry.SpaceSeparatedListOf):
Value = registry.String
List = ircutils.IrcSet
conf.registerPlugin('RSS')
conf.registerChannelValue(conf.supybot.plugins.RSS, 'bold', registry.Boolean(
True, """Determines whether the bot will bold the title of the feed when it
announces new news."""))
conf.registerChannelValue(conf.supybot.plugins.RSS, 'headlineSeparator',
registry.StringSurroundedBySpaces(' || ', """Determines what string is used
to separate headlines in new feeds."""))
conf.registerChannelValue(conf.supybot.plugins.RSS, 'announcementPrefix',
registry.StringWithSpaceOnRight('New news from ', """Determines what prefix
is prepended (if any) to the new news item announcements made in the
channel."""))
conf.registerChannelValue(conf.supybot.plugins.RSS, 'announce',
2004-07-21 21:12:57 +02:00
AnnouncedFeeds([], """Determines which RSS feeds should be announced in
the channel; valid input is a list of strings (either registered RSS feeds
or RSS feed URLs) separated by spaces."""))
conf.registerGlobalValue(conf.supybot.plugins.RSS, 'waitPeriod',
registry.PositiveInteger(1800, """Indicates how many seconds the bot will
wait between retrieving RSS feeds; requests made within this period will
return cached results."""))
conf.registerGroup(conf.supybot.plugins.RSS, 'feeds')
conf.supybot.plugins.RSS.feeds.help = utils.normalizeWhitespace("""These are
the registered feeds for the RSS plugin.""")
def registerFeed(name, url):
conf.supybot.plugins.RSS.feeds.register(name, registry.String(url, ''))
2004-07-21 21:12:57 +02:00
class RSS(callbacks.Privmsg):
2003-04-06 13:42:41 +02:00
threaded = True
def __init__(self):
callbacks.Privmsg.__init__(self)
2003-12-03 22:21:43 +01:00
self.feedNames = sets.Set()
self.locks = {}
self.lastRequest = {}
self.cachedFeeds = {}
2004-04-13 20:25:49 +02:00
self.gettingLockLock = threading.Lock()
for (name, url) in registry._cache.iteritems():
if name.startswith('supybot.plugins.rss.feeds.'):
name = rsplit(name, '.', 1)[-1]
v = registry.String('', 'help is not needed here')
v.set(url)
url = v()
self.makeFeedCommand(name, url)
def __call__(self, irc, msg):
callbacks.Privmsg.__call__(self, irc, msg)
irc = callbacks.IrcObjectProxyRegexp(irc, msg)
L = conf.supybot.plugins.RSS.announce.getValues(fullNames=False)
newFeeds = {}
for (channel, v) in L:
2004-07-21 21:12:57 +02:00
feeds = v()
for name in feeds:
2004-04-15 16:17:07 +02:00
commandName = callbacks.canonicalName(name)
if self.isCommand(commandName):
name = commandName
url = self.getCommand(name).url
else:
url = name
if self.willGetNewFeed(url):
newFeeds.setdefault((url, name), []).append(channel)
for ((url, name), channels) in newFeeds.iteritems():
# We check if we can acquire the lock right here because if we
# don't, we'll possibly end up spawning a lot of threads to get
# the feed, because this thread may run for a number of bytecodes
# before it switches to a thread that'll get the lock in
# _newHeadlines.
2004-04-13 20:25:49 +02:00
if self.acquireLock(url, blocking=False):
try:
t = threading.Thread(target=self._newHeadlines,
name='Fetching <%s>' % url,
args=(irc, channels, name, url))
self.log.info('Spawning thread to fetch <%s>', url)
world.threadsSpawned += 1
t.setDaemon(True)
t.start()
finally:
2004-04-13 20:25:49 +02:00
self.releaseLock(url)
2004-04-09 17:13:26 +02:00
time.sleep(0.1) # So other threads can run.
def _newHeadlines(self, irc, channels, name, url):
try:
# We acquire the lock here so there's only one announcement thread
# in this code at any given time. Otherwise, several announcement
# threads will getFeed (all blocking, in turn); then they'll all
# want to sent their news messages to the appropriate channels.
2004-04-09 17:13:26 +02:00
# Note that we're allowed to acquire this lock twice within the
# same thread because it's an RLock and not just a normal Lock.
2004-04-13 20:25:49 +02:00
self.acquireLock(url)
try:
oldresults = self.cachedFeeds[url]
oldheadlines = self.getHeadlines(oldresults)
except KeyError:
oldheadlines = []
newresults = self.getFeed(url)
newheadlines = self.getHeadlines(newresults)
2004-06-01 18:22:48 +02:00
def canonicalize(headline):
return tuple(headline.lower().split())
oldheadlines = sets.Set(map(canonicalize, oldheadlines))
for (i, headline) in enumerate(newheadlines):
if canonicalize(headline) in oldheadlines:
2004-04-20 11:51:55 +02:00
newheadlines[i] = None
newheadlines = filter(None, newheadlines) # Removes Nones.
if newheadlines:
for channel in channels:
bold = self.registryValue('bold', channel)
sep = self.registryValue('headlineSeparator', channel)
prefix = self.registryValue('announcementPrefix', channel)
pre = '%s%s: ' % (prefix, name)
if bold:
pre = ircutils.bold(pre)
irc.replies(newheadlines, prefixer=pre, joiner=sep,
to=channel, prefixName=False, private=True)
finally:
2004-04-13 20:25:49 +02:00
self.releaseLock(url)
2004-07-21 21:12:57 +02:00
def willGetNewFeed(self, url):
now = time.time()
wait = self.registryValue('waitPeriod')
if url not in self.lastRequest or now - self.lastRequest[url] > wait:
return True
else:
return False
2004-02-10 04:14:46 +01:00
2004-04-13 20:25:49 +02:00
def acquireLock(self, url, blocking=True):
try:
self.gettingLockLock.acquire()
try:
lock = self.locks[url]
except KeyError:
lock = threading.RLock()
self.locks[url] = lock
return lock.acquire(blocking=blocking)
finally:
self.gettingLockLock.release()
def releaseLock(self, url):
self.locks[url].release()
2004-07-21 21:12:57 +02:00
def getFeed(self, url):
try:
# This is the most obvious place to acquire the lock, because a
# malicious user could conceivably flood the bot with rss commands
# and DoS the website in question.
2004-04-13 20:25:49 +02:00
self.acquireLock(url)
if self.willGetNewFeed(url):
2004-02-10 04:14:46 +01:00
try:
self.log.info('Downloading new feed from <%s>', url)
results = rssparser.parse(url)
except sgmllib.SGMLParseError:
self.log.exception('Uncaught exception from rssparser:')
raise callbacks.Error, 'Invalid (unparseable) RSS feed.'
self.cachedFeeds[url] = results
self.lastRequest[url] = time.time()
try:
return self.cachedFeeds[url]
except KeyError:
self.lastRequest[url] = 0
return {'items': [{'title': 'Unable to download feed.'}]}
finally:
2004-04-13 20:25:49 +02:00
self.releaseLock(url)
def getHeadlines(self, feed):
return [utils.htmlToText(d['title'].strip()) for d in feed['items']]
2004-04-29 13:49:24 +02:00
def _validFeedName(self, name):
if not registry.isValidRegistryName(name):
raise ValueError, name
return callbacks.canonicalName(name)
def makeFeedCommand(self, name, url):
docstring = """<number of headlines>
2003-12-03 22:21:43 +01:00
Reports the titles for %s at the RSS feed <%s>. If
<number of headlines> is given, returns only that many headlines.
RSS feeds are only looked up every supybot.plugins.RSS.waitPeriod
seconds, which defaults to 1800 (30 minutes) since that's what most
websites prefer.
2003-12-03 22:21:43 +01:00
""" % (name, url)
2004-04-29 13:49:24 +02:00
assert name == self._validFeedName(name)
2004-04-13 20:25:49 +02:00
if url not in self.locks:
self.locks[url] = threading.RLock()
if hasattr(self, name):
s = 'I already have a command in this plugin named %s' % name
raise callbacks.Error, s
2003-12-03 22:21:43 +01:00
def f(self, irc, msg, args):
args.insert(0, url)
self.rss(irc, msg, args)
f = utils.changeFunctionName(f, name, docstring)
f.url = url # Used by __call__.
2003-12-03 22:21:43 +01:00
self.feedNames.add(name)
setattr(self.__class__, name, f)
registerFeed(name, url)
2004-07-21 21:12:57 +02:00
def add(self, irc, msg, args):
"""<name> <url>
Adds a command to this plugin that will look up the RSS feed at the
given URL.
"""
(name, url) = privmsgs.getArgs(args, required=2)
2004-04-29 13:49:24 +02:00
try:
name = self._validFeedName(name)
except ValueError:
irc.error('%r is not a valid feed name. Feed names must not '
'include dots, colons, or spaces.' % name)
return
self.makeFeedCommand(name, url)
irc.replySuccess()
2003-12-03 22:21:43 +01:00
def remove(self, irc, msg, args):
"""<name>
Removes the command for looking up RSS feeds at <name> from
this plugin.
"""
name = privmsgs.getArgs(args)
name = callbacks.canonicalName(name)
if name not in self.feedNames:
irc.error('That\'s not a valid RSS feed command name.')
2003-12-03 22:21:43 +01:00
return
self.feedNames.remove(name)
2003-12-03 22:21:43 +01:00
delattr(self.__class__, name)
conf.supybot.plugins.RSS.feeds.unregister(name)
irc.replySuccess()
2004-02-07 23:38:49 +01:00
2004-02-11 08:22:13 +01:00
def announce(self, irc, msg, args, channel):
2004-07-21 21:12:57 +02:00
"""[<channel>] [--remove] [<name|url> ...]
2004-02-07 23:38:49 +01:00
Sets the current list of announced feeds in the channel to the feeds
given. Valid feeds include the names of registered feeds as well as
URLs for a RSS feeds. <channel> is only necessary if the message isn't
2004-07-21 21:12:57 +02:00
sent in the channel itself. If no arguments are specified, replies
with the current list of feeds to announce. If --remove is given,
the specified feeds will be removed from the list of feeds to announce.
2004-02-07 23:38:49 +01:00
"""
2004-07-21 21:12:57 +02:00
(optlist, rest) = getopt.getopt(args, '', ['remove'])
remove = False
announce = conf.supybot.plugins.RSS.announce
for (option, _) in optlist:
if option == '--remove':
remove = True
if remove:
if rest:
feeds = announce.get(channel)()
for feed in rest:
if feed in feeds:
feeds.remove(feed)
announce.get(channel).setValue(feeds)
irc.replySuccess()
return
else:
raise callbacks.ArgumentError
elif not rest:
feeds = utils.commaAndify(announce.get(channel)())
irc.reply(feeds or 'I am currently not announcing any feeds.')
return
else:
2004-07-21 21:12:57 +02:00
announce.get(channel).setValue(rest)
irc.replySuccess()
2004-07-21 21:12:57 +02:00
return
2004-02-11 08:22:13 +01:00
announce = privmsgs.checkChannelCapability(announce, 'op')
2004-07-21 21:12:57 +02:00
def rss(self, irc, msg, args):
"""<url> [<number of headlines>]
Gets the title components of the given RSS feed.
If <number of headlines> is given, return only that many headlines.
"""
(url, n) = privmsgs.getArgs(args, optional=1)
self.log.debug('Fetching %s', url)
feed = self.getFeed(url)
if ircutils.isChannel(msg.args[0]):
channel = msg.args[0]
else:
channel = None
headlines = self.getHeadlines(feed)
if not headlines:
irc.error('Couldn\'t get RSS feed')
return
if n:
try:
n = int(n)
except ValueError:
2004-04-29 13:49:24 +02:00
raise callbacks.ArgumentError
headlines = headlines[:n]
headlines = imap(utils.htmlToText, headlines)
sep = self.registryValue('headlineSeparator', channel)
irc.reply(sep.join(headlines))
2003-08-20 18:26:23 +02:00
def info(self, irc, msg, args):
"""<url|feed>
Returns information from the given RSS feed, namely the title,
URL, description, and last update date, if available.
"""
url = privmsgs.getArgs(args)
try:
url = self.registryValue('feeds.%s' % url)
except registry.NonExistentRegistryEntry:
pass
feed = self.getFeed(url)
info = feed['channel']
if not info:
irc.error('I couldn\'t retrieve that RSS feed.')
return
# check the 'modified' key, if it's there, convert it here first
2003-04-19 23:40:39 +02:00
if 'modified' in feed:
seconds = time.mktime(feed['modified'])
2003-08-28 19:55:01 +02:00
now = time.mktime(time.gmtime())
when = utils.timeElapsed(now - seconds) + ' ago'
else:
2003-04-19 23:40:39 +02:00
when = "time unavailable"
# The rest of the entries are all available in the channel key
2003-04-19 23:40:39 +02:00
response = 'Title: %s; URL: <%s>; ' \
'Description: %s; Last updated %s.' % (
2003-08-20 18:26:23 +02:00
info.get('title', 'unavailable').strip(),
2003-04-19 23:40:39 +02:00
info.get('link', 'unavailable').strip(),
info.get('description', 'unavailable').strip(),
when)
irc.reply(' '.join(response.split()))
2003-04-06 13:42:41 +02:00
Class = RSS
# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=78: