From 94fbea6266e263bd2650399a5d2aed3540552f07 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 14:24:47 +0200 Subject: [PATCH 01/14] Add utils.structures.TruncatableSet. --- src/utils/structures.py | 35 +++++++++++++++++++++++++++++++++++ test/test_utils.py | 22 +++++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/src/utils/structures.py b/src/utils/structures.py index 31e8f561f..1be386ffd 100644 --- a/src/utils/structures.py +++ b/src/utils/structures.py @@ -454,5 +454,40 @@ class CacheDict(collections.MutableMapping): def __len__(self): return len(self.d) +class TruncatableSet(collections.MutableSet): + """A set that keeps track of the order of inserted elements so + the oldest can be removed.""" + def __init__(self, iterable): + self._ordered_items = list(iterable) + self._items = set(self._ordered_items) + def __contains__(self, item): + return item in self._items + def __iter__(self): + return iter(self._items) + def __len__(self): + return len(self._items) + def add(self, item): + if item not in self._items: + self._items.add(item) + self._ordered_items.append(item) + def discard(self, item): + self._items.discard(item) + self._ordered_items.remove(item) + def truncate(self, size): + assert size >= 0 + removed_size = len(self)-size + # I make two different cases depending on removed_size Date: Thu, 31 Jul 2014 14:59:22 +0200 Subject: [PATCH 02/14] RSS: Group config variables in the code in a more logical way. --- plugins/RSS/config.py | 70 ++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/plugins/RSS/config.py b/plugins/RSS/config.py index 0d4c824a0..eafc1b946 100644 --- a/plugins/RSS/config.py +++ b/plugins/RSS/config.py @@ -50,12 +50,48 @@ class FeedItemSortOrder(registry.OnlySomeStrings): validStrings = ('asInFeed', 'oldestFirst', 'newestFirst') RSS = conf.registerPlugin('RSS') + +conf.registerGlobalValue(RSS, 'feeds', + FeedNames([], _("""Determines what feeds should be accessible as + commands."""))) + +######## +# Format + +# Common conf.registerChannelValue(RSS, 'bold', registry.Boolean( True, _("""Determines whether the bot will bold the title of the feed when it announces news."""))) conf.registerChannelValue(RSS, 'headlineSeparator', registry.StringSurroundedBySpaces('|', _("""Determines what string is used to separate headlines in new feeds."""))) + +# Format for non-announced headlines +conf.registerChannelValue(RSS, 'showLinks', + registry.Boolean(True, _("""Determines whether the bot will list the link + along with the title of the feed when the rss command is called. + supybot.plugins.RSS.announce.showLinks affects whether links will be + listed when a feed is automatically announced."""))) +conf.registerChannelValue(RSS, 'showPubDate', + registry.Boolean(False, """Determines whether the bot will list the + publication datetime stamp along with the title of the feed when the rss + command is called. + supybot.plugins.RSS.announce.showPubDate affects whether this will be + listed when a feed is automatically announced.""")) + +# Format for announced headlines +conf.registerGroup(RSS, 'announce') +conf.registerChannelValue(RSS.announce, 'showLinks', + registry.Boolean(True, _("""Determines whether the bot will list the link + along with the title of the feed when a feed is automatically + announced."""))) +conf.registerChannelValue(RSS.announce, 'showPubDate', + registry.Boolean(False, """Determines whether the bot will list the + publication datetime stamp along with the title of the feed when a feed + is automatically announced.""")) +conf.registerGlobalValue(RSS.announce, 'cachePeriod', + registry.PositiveInteger(604800, """Maximum age of cached RSS headlines, + in seconds. Headline cache is used to avoid re-announcing old news.""")) conf.registerChannelValue(RSS, 'announcementPrefix', registry.StringWithSpaceOnRight(_('News from '), _("""Determines what prefix is prepended (if any) to the news item announcements made in the @@ -63,6 +99,10 @@ conf.registerChannelValue(RSS, 'announcementPrefix', conf.registerChannelValue(RSS, 'announcementSeparator', registry.StringWithSpaceOnRight(_(': '), _("""Determines what suffix is appended to the feed name in a news item."""))) + +########### +# Announces + conf.registerChannelValue(RSS, 'announce', registry.SpaceSeparatedSetOfStrings([], _("""Determines which RSS feeds should be announced in the channel; valid input is a list of strings @@ -79,20 +119,8 @@ conf.registerGlobalValue(RSS, 'stripRedirect', registry.Boolean( True, """Determines whether the bot will attempt to strip url redirection from headline links, by taking things after the last http://.""")) -conf.registerGlobalValue(RSS, 'feeds', - FeedNames([], _("""Determines what feeds should be accessible as - commands."""))) -conf.registerChannelValue(RSS, 'showLinks', - registry.Boolean(True, _("""Determines whether the bot will list the link - along with the title of the feed when the rss command is called. - supybot.plugins.RSS.announce.showLinks affects whether links will be - listed when a feed is automatically announced."""))) -conf.registerChannelValue(RSS, 'showPubDate', - registry.Boolean(False, """Determines whether the bot will list the - publication datetime stamp along with the title of the feed when the rss - command is called. - supybot.plugins.RSS.announce.showPubDate affects whether this will be - listed when a feed is automatically announced.""")) +#################### +# Headlines filtering conf.registerGlobalValue(RSS, 'defaultNumberOfHeadlines', registry.PositiveInteger(1, _("""Indicates how many headlines an rss feed will output by default, if no number is provided."""))) @@ -108,19 +136,5 @@ conf.registerChannelValue(RSS, 'keywordBlacklist', strings, lets you filter headlines to those not containing any items in this blacklist."""))) -conf.registerGroup(RSS, 'announce') -conf.registerChannelValue(RSS.announce, 'showLinks', - registry.Boolean(True, _("""Determines whether the bot will list the link - along with the title of the feed when a feed is automatically - announced."""))) - -conf.registerChannelValue(RSS.announce, 'showPubDate', - registry.Boolean(False, """Determines whether the bot will list the - publication datetime stamp along with the title of the feed when a feed - is automatically announced.""")) -conf.registerGlobalValue(RSS.announce, 'cachePeriod', - registry.PositiveInteger(604800, """Maximum age of cached RSS headlines, - in seconds. Headline cache is used to avoid re-announcing old news.""")) - # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: From 00aaf79ef8ed10772cdef38fa7d2d3ea2c4933bf Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 15:50:26 +0200 Subject: [PATCH 03/14] Add test for union with TruncatableSet. --- test/test_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 7032a69f6..c5b53fe5b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1153,6 +1153,12 @@ class TestTruncatableSet(SupyTestCase): s.truncate(3) self.assertEqual(s, {'bar', 'baz', 'qux'}) + def testTruncateUnion(self): + s = TruncatableSet(['bar', 'foo']) + s |= {'baz', 'qux'} + s.truncate(3) + self.assertEqual(s, {'foo', 'baz', 'qux'}) + # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: From c51876b4d6fea0da3a561aaab2afd526dc0604ac Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 18:43:40 +0200 Subject: [PATCH 04/14] Allow TruncatableSet to get no argument. --- src/utils/structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/structures.py b/src/utils/structures.py index 1be386ffd..db2a7706b 100644 --- a/src/utils/structures.py +++ b/src/utils/structures.py @@ -457,7 +457,7 @@ class CacheDict(collections.MutableMapping): class TruncatableSet(collections.MutableSet): """A set that keeps track of the order of inserted elements so the oldest can be removed.""" - def __init__(self, iterable): + def __init__(self, iterable=[]): self._ordered_items = list(iterable) self._items = set(self._ordered_items) def __contains__(self, item): From af24192b0f8c9db4ee803b8c1bc1366c8b64de0f Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 18:44:00 +0200 Subject: [PATCH 05/14] Allow utils.str.timestamp to take a struct_time as argument. --- src/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/conf.py b/src/conf.py index b99feecf4..8015ae60a 100644 --- a/src/conf.py +++ b/src/conf.py @@ -374,7 +374,8 @@ registerChannelValue(supybot.reply.format, 'time', def timestamp(t): if t is None: t = time.time() - t = time.localtime(t) + elif isinstance(t, int): + t = time.localtime(t) format = get(supybot.reply.format.time, dynamic.channel) return time.strftime(format, t) utils.str.timestamp = timestamp From 219c47d1c89e0cd3ed494a5cae9b062639ce7142 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 18:44:49 +0200 Subject: [PATCH 06/14] RSS: Major rewriting. @rss works. --- plugins/RSS/config.py | 55 ++--- plugins/RSS/plugin.py | 510 ++++++++++++++++-------------------------- 2 files changed, 211 insertions(+), 354 deletions(-) diff --git a/plugins/RSS/config.py b/plugins/RSS/config.py index eafc1b946..7021d2cb6 100644 --- a/plugins/RSS/config.py +++ b/plugins/RSS/config.py @@ -58,47 +58,21 @@ conf.registerGlobalValue(RSS, 'feeds', ######## # Format -# Common -conf.registerChannelValue(RSS, 'bold', registry.Boolean( - True, _("""Determines whether the bot will bold the title of the feed when - it announces news."""))) conf.registerChannelValue(RSS, 'headlineSeparator', registry.StringSurroundedBySpaces('|', _("""Determines what string is used to separate headlines in new feeds."""))) - -# Format for non-announced headlines -conf.registerChannelValue(RSS, 'showLinks', - registry.Boolean(True, _("""Determines whether the bot will list the link - along with the title of the feed when the rss command is called. - supybot.plugins.RSS.announce.showLinks affects whether links will be - listed when a feed is automatically announced."""))) -conf.registerChannelValue(RSS, 'showPubDate', - registry.Boolean(False, """Determines whether the bot will list the - publication datetime stamp along with the title of the feed when the rss - command is called. - supybot.plugins.RSS.announce.showPubDate affects whether this will be - listed when a feed is automatically announced.""")) - -# Format for announced headlines -conf.registerGroup(RSS, 'announce') -conf.registerChannelValue(RSS.announce, 'showLinks', - registry.Boolean(True, _("""Determines whether the bot will list the link - along with the title of the feed when a feed is automatically - announced."""))) -conf.registerChannelValue(RSS.announce, 'showPubDate', - registry.Boolean(False, """Determines whether the bot will list the - publication datetime stamp along with the title of the feed when a feed - is automatically announced.""")) -conf.registerGlobalValue(RSS.announce, 'cachePeriod', - registry.PositiveInteger(604800, """Maximum age of cached RSS headlines, - in seconds. Headline cache is used to avoid re-announcing old news.""")) -conf.registerChannelValue(RSS, 'announcementPrefix', - registry.StringWithSpaceOnRight(_('News from '), _("""Determines what - prefix is prepended (if any) to the news item announcements made in the - channel."""))) -conf.registerChannelValue(RSS, 'announcementSeparator', - registry.StringWithSpaceOnRight(_(': '), _("""Determines what - suffix is appended to the feed name in a news item."""))) +conf.registerChannelValue(RSS, 'format', + registry.String(_('$date: $title <$link>'), _("""The format the bot + will use for displaying headlines of a RSS feed that is triggered + manually. In addition to fields defined by feedparser ($published + (the entry date), $title, $link, $description, $id, etc.), the following + variables can be used: $feed_name, $date (parsed date, as defined in + supybot.reply.format.time)"""))) +conf.registerChannelValue(RSS, 'announceFormat', + registry.String(_('News from $feed_name: $title <$link>'), + _("""The format the bot will use for displaying headlines of a RSS feed + that is announced. See supybot.plugins.RSS.format for the available + variables."""))) ########### # Announces @@ -115,13 +89,10 @@ conf.registerGlobalValue(RSS, 'sortFeedItems', FeedItemSortOrder('asInFeed', _("""Determines whether feed items should be sorted by their update timestamp or kept in the same order as they appear in a feed."""))) -conf.registerGlobalValue(RSS, 'stripRedirect', registry.Boolean( - True, """Determines whether the bot will attempt to strip url redirection - from headline links, by taking things after the last http://.""")) #################### # Headlines filtering -conf.registerGlobalValue(RSS, 'defaultNumberOfHeadlines', +conf.registerChannelValue(RSS, 'defaultNumberOfHeadlines', registry.PositiveInteger(1, _("""Indicates how many headlines an rss feed will output by default, if no number is provided."""))) conf.registerChannelValue(RSS, 'initialAnnounceHeadlines', diff --git a/plugins/RSS/plugin.py b/plugins/RSS/plugin.py index b8c81a011..9306d0978 100644 --- a/plugins/RSS/plugin.py +++ b/plugins/RSS/plugin.py @@ -1,6 +1,7 @@ ### # Copyright (c) 2002-2004, Jeremiah Fincher # Copyright (c) 2008-2010, James McCoy +# Copyright (c) 2014, Valentin Lorentz # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -30,6 +31,7 @@ import time import types +import string import socket import threading import re @@ -46,12 +48,67 @@ import supybot.callbacks as callbacks from supybot.i18n import PluginInternationalization, internationalizeDocstring _ = PluginInternationalization('RSS') -def getFeedName(irc, msg, args, state): +def get_feedName(irc, msg, args, state): if not registry.isValidRegistryName(args[0]): state.errorInvalid('feed name', args[0], 'Feed names must not include spaces.') state.args.append(callbacks.canonicalName(args.pop(0))) -addConverter('feedName', getFeedName) +addConverter('feedName', get_feedName) + +class Feed: + __slots__ = ('url', 'name', 'data', 'last_update', 'entries', + 'lock', 'announced_entries') + def __init__(self, name, url, plugin_is_loading=False): + self.name = name + self.url = url + self.data = None + # We don't want to fetch feeds right after the plugin is + # loaded (the bot could be starting, and thus already busy) + self.last_update = time.time() if plugin_is_loading else 0 + self.entries = None + self.lock = threading.Thread() + self.announced_entries = utils.structures.TruncatableSet() + + @property + def command(self): + docstring = format(_("""[] + + Reports the titles for %s at the RSS feed %u. If + is given, returns only that many headlines. + RSS feeds are only looked up every supybot.plugins.RSS.waitPeriod + seconds, which defaults to 1800 (30 minutes) since that's what most + websites prefer."""), self.name, self.url) + if self.isCommandMethod(name): + s = format('I already have a command in this plugin named %s.',name) + raise callbacks.Error(s) + def f(self, irc, msg, args): + args.insert(0, url) + self.rss(irc, msg, args) + f = utils.python.changeFunctionName(f, name, docstring) + f = types.MethodType(f, self) + return f + +def lock_feed(f): + def newf(feed, *args, **kwargs): + with feed.lock: + return f(feed, *args, **kwargs) + return f + +def sort_feed_items(items, order): + """Return feed items, sorted according to sortFeedItems.""" + if order not in ['oldestFirst', 'newestFirst']: + return items + if order == 'oldestFirst': + reverse = False + if order == 'newestFirst': + reverse = True + try: + sitems = sorted(items, key=lambda i: i['updated'], reverse=reverse) + except KeyError: + # feedparser normalizes required timestamp fields in ATOM and RSS + # to the "updated" field. Feeds missing it are unsortable by date. + return items + return sitems class RSS(callbacks.Plugin): """This plugin is useful both for announcing updates to RSS feeds in a @@ -62,326 +119,153 @@ class RSS(callbacks.Plugin): def __init__(self, irc): self.__parent = super(RSS, self) self.__parent.__init__(irc) - # Schema is feed : [url, command] - self.feedNames = callbacks.CanonicalNameDict() - self.locks = {} - self.lastRequest = {} - self.cachedFeeds = {} - self.cachedHeadlines = {} - self.gettingLockLock = threading.Lock() + # Scheme: {name: url} + self.feed_names = callbacks.CanonicalNameDict() + # Scheme: {url: feed} + self.feeds = {} for name in self.registryValue('feeds'): - self._registerFeed(name) + self.register_feed_config(name) try: url = self.registryValue(registry.join(['feeds', name])) except registry.NonExistentRegistryEntry: self.log.warning('%s is not a registered feed, removing.',name) continue - self.makeFeedCommand(name, url) - self.getFeed(url) # So announced feeds don't announce on startup. + self.feed_names[name] = url + self.feeds[url] = Feed(name, url, True) + + ################## + # Feed registering + + def register_feed_config(self, name, url=''): + self.registryValue('feeds').add(name) + group = self.registryValue('feeds', value=False) + conf.registerGlobalValue(group, name, registry.String(url, '')) + + def remove_feed(self, feed): + del self.feed_names[feed.name] + del self.feeds[feed.url] + conf.supybot.plugins.RSS.feeds().remove(name) + conf.supybot.plugins.RSS.feeds.unregister(name) + + ################## + # Methods handling def isCommandMethod(self, name): if not self.__parent.isCommandMethod(name): - if name in self.feedNames: - return True - else: - return False + return bool(self.get_feed(name)) else: return True def listCommands(self): - return self.__parent.listCommands(self.feedNames.keys()) + return self.__parent.listCommands(self.feeds.keys()) def getCommandMethod(self, command): try: return self.__parent.getCommandMethod(command) except AttributeError: - return self.feedNames[command[0]][1] - - def _registerFeed(self, name, url=''): - self.registryValue('feeds').add(name) - group = self.registryValue('feeds', value=False) - conf.registerGlobalValue(group, name, registry.String(url, '')) + return self.feeds[command[0]].command def __call__(self, irc, msg): self.__parent.__call__(irc, msg) - irc = callbacks.SimpleProxy(irc, msg) - newFeeds = {} - for channel in irc.state.channels: - feeds = self.registryValue('announce', channel) - for name in feeds: - commandName = callbacks.canonicalName(name) - if self.isCommandMethod(commandName): - url = self.feedNames[commandName][0] - else: - url = name - if self.willGetNewFeed(url): - newFeeds.setdefault((url, name), []).append(channel) - for ((url, name), channels) in newFeeds.iteritems(): - # We check if we can acquire the lock right here because if we - # don't, we'll possibly end up spawning a lot of threads to get - # the feed, because this thread may run for a number of bytecodes - # before it switches to a thread that'll get the lock in - # _newHeadlines. - if self.acquireLock(url, blocking=False): - try: - t = threading.Thread(target=self._newHeadlines, - name=format('Fetching %u', url), - args=(irc, channels, name, url)) - self.log.info('Checking for announcements at %u', url) - world.threadsSpawned += 1 - t.setDaemon(True) - t.start() - finally: - self.releaseLock(url) - time.sleep(0.1) # So other threads can run. + self.update_feeds() - def buildHeadlines(self, headlines, channel, linksconfig='announce.showLinks', dateconfig='announce.showPubDate'): - newheadlines = [] - for headline in headlines: - link = '' - pubDate = '' - if self.registryValue(linksconfig, channel): - if headline[1]: - if self.registryValue('stripRedirect'): - link = re.sub('^.*http://', 'http://', headline[1]) - else: - link = headline[1] - if self.registryValue(dateconfig, channel): - if headline[2]: - pubDate = ' [%s]' % (headline[2],) - if sys.version_info[0] < 3: - if isinstance(headline[0], unicode): - try: - import charade.universaldetector - u = charade.universaldetector.UniversalDetector() - u.feed(headline[0]) - u.close() - encoding = u.result['encoding'] - except ImportError: - encoding = 'utf8' - newheadlines.append(format('%s %u%s', - headline[0].encode(encoding,'replace'), - link, - pubDate)) - else: - newheadlines.append(format('%s %u%s', - headline[0], - link, - pubDate)) - else: - newheadlines.append(format('%s %u%s', - headline[0], - link, - pubDate)) - return newheadlines - def _newHeadlines(self, irc, channels, name, url): - try: - # We acquire the lock here so there's only one announcement thread - # in this code at any given time. Otherwise, several announcement - # threads will getFeed (all blocking, in turn); then they'll all - # want to send their news messages to the appropriate channels. - # Note that we're allowed to acquire this lock twice within the - # same thread because it's an RLock and not just a normal Lock. - self.acquireLock(url) - t = time.time() - try: - #oldresults = self.cachedFeeds[url] - #oldheadlines = self.getHeadlines(oldresults) - oldheadlines = self.cachedHeadlines[url] - oldheadlines = list(filter(lambda x: t - x[3] < - self.registryValue('announce.cachePeriod'), oldheadlines)) - except KeyError: - oldheadlines = [] - newresults = self.getFeed(url) - newheadlines = self.getHeadlines(newresults) - if len(newheadlines) == 1: - s = newheadlines[0][0] - if s in ('Timeout downloading feed.', - 'Unable to download feed.'): - self.log.debug('%s %u', s, url) - return - def normalize(headline): - return (tuple(headline[0].lower().split()), headline[1]) - oldheadlinesset = set(map(normalize, oldheadlines)) - for (i, headline) in enumerate(newheadlines): - if normalize(headline) in oldheadlinesset: - newheadlines[i] = None - newheadlines = list(filter(None, newheadlines)) # Removes Nones. - number_of_headlines = len(oldheadlines) - oldheadlines.extend(newheadlines) - self.cachedHeadlines[url] = oldheadlines - if newheadlines: - def filter_whitelist(headline): - v = False - for kw in whitelist: - if kw in headline[0] or kw in headline[1]: - v = True - break - return v - def filter_blacklist(headline): - v = True - for kw in blacklist: - if kw in headline[0] or kw in headline[1]: - v = False - break - return v - for channel in channels: - if number_of_headlines == 0: - channelnewheadlines = newheadlines[:self.registryValue('initialAnnounceHeadlines', channel)] - else: - channelnewheadlines = newheadlines[:] - whitelist = self.registryValue('keywordWhitelist', channel) - blacklist = self.registryValue('keywordBlacklist', channel) - if len(whitelist) != 0: - channelnewheadlines = filter(filter_whitelist, channelnewheadlines) - if len(blacklist) != 0: - channelnewheadlines = filter(filter_blacklist, channelnewheadlines) - channelnewheadlines = list(channelnewheadlines) - if len(channelnewheadlines) == 0: - return - bold = self.registryValue('bold', channel) - sep = self.registryValue('headlineSeparator', channel) - prefix = self.registryValue('announcementPrefix', channel) - suffix = self.registryValue('announcementSeparator', channel) - pre = format('%s%s%s', prefix, name, suffix) - if bold: - pre = ircutils.bold(pre) - sep = ircutils.bold(sep) - headlines = self.buildHeadlines(channelnewheadlines, channel) - irc.replies(headlines, prefixer=pre, joiner=sep, - to=channel, prefixNick=False, private=True) - finally: - self.releaseLock(url) + ################## + # Status accessors - def willGetNewFeed(self, url): - now = time.time() - wait = self.registryValue('waitPeriod') - if url not in self.lastRequest or now - self.lastRequest[url] > wait: - return True - else: - return False + def get_feed(self, name): + return self.feeds.get(self.feed_names.get(name, name), None) - def acquireLock(self, url, blocking=True): - try: - self.gettingLockLock.acquire() - try: - lock = self.locks[url] - except KeyError: - lock = threading.RLock() - self.locks[url] = lock - return lock.acquire(blocking=blocking) - finally: - self.gettingLockLock.release() + def is_expired(self, feed): + assert feed + event_horizon = time.time() - self.registryValue('waitPeriod') + return feed.last_update < event_horizon - def releaseLock(self, url): - self.locks[url].release() - def getFeed(self, url): - def error(s): - return {'items': [{'title': s}]} - try: - # This is the most obvious place to acquire the lock, because a - # malicious user could conceivably flood the bot with rss commands - # and DoS the website in question. - self.acquireLock(url) - if self.willGetNewFeed(url): - results = {} - try: - self.log.debug('Downloading new feed from %u', url) - results = feedparser.parse(url) - if 'bozo_exception' in results and not results['entries']: - raise results['bozo_exception'] - except feedparser.sgmllib.SGMLParseError: - self.log.exception('Uncaught exception from feedparser:') - raise callbacks.Error('Invalid (unparsable) RSS feed.') - except socket.timeout: - return error('Timeout downloading feed.') - except Exception as e: - # These seem mostly harmless. We'll need reports of a - # kind that isn't. - self.log.debug('Allowing bozo_exception %r through.', e) - if results.get('feed', {}) and self.getHeadlines(results): - self.cachedFeeds[url] = results - self.lastRequest[url] = time.time() - else: - self.log.debug('Not caching results; feed is empty.') - try: - return self.cachedFeeds[url] - except KeyError: - wait = self.registryValue('waitPeriod') - # If there's a problem retrieving the feed, we should back off - # for a little bit before retrying so that there is time for - # the error to be resolved. - self.lastRequest[url] = time.time() - .5 * wait - return error('Unable to download feed.') - finally: - self.releaseLock(url) + ############### + # Feed fetching + + @lock_feed + def update_feed(self, feed): + d = feedparser.parse(feed.url) + feed.data = d.feed + feed.entries = d.entries + self.announce_feed(feed) + + def update_feed_in_thread(self, feed): + feed.last_update = time.time() + t = world.SupyThread(target=self.update_feed, + name=format('Fetching feed %u', feed.url), + args=(feed,)) + t.setDaemon(True) + t.start() + + def update_feed_if_needed(self, feed): + if self.is_expired(feed): + self.update_feed(feed) + + def update_feeds(self): + for name in self.registryValue('feeds'): + self.update_feed_if_needed(self.get_feed(name)) + + @lock_feed + def announce_feed(self, feed): + entries = feed.entries + new_entries = [entry for entry in entries + if entry.id not in feed.announced_entries] + if not new_entries: + return - def _getConverter(self, feed): - toText = utils.web.htmlToText - if 'encoding' in feed: - def conv(s): - # encode() first so there implicit encoding doesn't happen in - # other functions when unicode and bytestring objects are used - # together - s = s.encode(feed['encoding'], 'replace') - s = toText(s).strip() - return s - return conv - else: - return lambda s: toText(s).strip() - def _sortFeedItems(self, items): - """Return feed items, sorted according to sortFeedItems.""" order = self.registryValue('sortFeedItems') - if order not in ['oldestFirst', 'newestFirst']: - return items - if order == 'oldestFirst': - reverse = False - if order == 'newestFirst': - reverse = True - try: - sitems = sorted(items, key=lambda i: i['updated'], reverse=reverse) - except KeyError: - # feedparser normalizes required timestamp fields in ATOM and RSS - # to the "updated" field. Feeds missing it are unsortable by date. - return items - return sitems + new_entries = sort_feed_items(new_entries, order) + for irc in world.ircs: + for channel in irc.state.channels: + if feed.name not in self.registryValue('announce', channel): + continue + for entry in new_entries: + self.announce_entry(irc, channel, feed, entry) + feed.announced_entries |= {entry.id for entry in new_entries} + # We keep a little more because we don't want to re-announce + # oldest entries if one of the newest gets removed. + feed.announced_entries.truncate(2*len(entries)) - def getHeadlines(self, feed): - headlines = [] - t = time.time() - conv = self._getConverter(feed) - for d in self._sortFeedItems(feed['items']): - if 'title' in d: - title = conv(d['title']) - link = d.get('link') - pubDate = d.get('pubDate', d.get('updated')) - headlines.append((title, link, pubDate, t)) - return headlines - @internationalizeDocstring - def makeFeedCommand(self, name, url): - docstring = format("""[] + ################# + # Entry rendering - Reports the titles for %s at the RSS feed %u. If - is given, returns only that many headlines. - RSS feeds are only looked up every supybot.plugins.RSS.waitPeriod - seconds, which defaults to 1800 (30 minutes) since that's what most - websites prefer. - """, name, url) - if url not in self.locks: - self.locks[url] = threading.RLock() - if self.isCommandMethod(name): - s = format('I already have a command in this plugin named %s.',name) - raise callbacks.Error(s) - def f(self, irc, msg, args): - args.insert(0, url) - self.rss(irc, msg, args) - f = utils.python.changeFunctionName(f, name, docstring) - f = types.MethodType(f, self) - self.feedNames[name] = (url, f) - self._registerFeed(name, url) + def should_send_entry(self, channel, entry): + whitelist = self.registryValue('keywordWhitelist', channel) + blacklist = self.registryValue('keywordBlacklist', channel) + if whitelist: + if all(kw not in entry.title and kw not in entry.description + for kw in whitelist): + return False + if blacklist: + if any(kw in entry.title or kw in entry.description + for kw in blacklist): + return False + return True + + def format_entry(self, channel, feed, entry, is_announce): + if is_announce: + template = self.registryValue('announceFormat', channel) + else: + template = self.registryValue('format', channel) + date = entry.get('published_parsed', entry.get('updated_parsed')) + date = utils.str.timestamp(date) + return string.Template(template).safe_substitute(template, + feed_name=feed.name, + date=date, + **entry) + + def announce_entry(self, irc, channel, feed, entry): + if self.should_send_entry(channel, entry): + s = format_entry(channel, feed, entry, True) + irc.sendMsg(ircmsgs.privmsg(channel, s)) + + + ########## + # Commands @internationalizeDocstring def add(self, irc, msg, args, name, url): @@ -390,7 +274,8 @@ class RSS(callbacks.Plugin): Adds a command to this plugin that will look up the RSS feed at the given URL. """ - self.makeFeedCommand(name, url) + self.register_feed_config(name, url) + self.feeds[name] = Feed(name, url) irc.replySuccess() add = wrap(add, ['feedName', 'url']) @@ -401,12 +286,11 @@ class RSS(callbacks.Plugin): Removes the command for looking up RSS feeds at from this plugin. """ - if name not in self.feedNames: + feed = self.get_feed(name) + if not feed: irc.error(_('That\'s not a valid RSS feed command name.')) return - del self.feedNames[name] - conf.supybot.plugins.RSS.feeds().remove(name) - conf.supybot.plugins.RSS.feeds.unregister(name) + self.remove_feed(feed) irc.replySuccess() remove = wrap(remove, ['feedName']) @@ -467,23 +351,25 @@ class RSS(callbacks.Plugin): If is given, return only that many headlines. """ self.log.debug('Fetching %u', url) - feed = self.getFeed(url) + feed = self.get_feed(url) + if not feed: + feed = Feed(url, url) if irc.isChannel(msg.args[0]): channel = msg.args[0] else: channel = None - headlines = self.getHeadlines(feed) - if not headlines: + self.update_feed_if_needed(feed) + entries = feed.entries + if not entries: irc.error(_('Couldn\'t get RSS feed.')) return - headlines = self.buildHeadlines(headlines, channel, 'showLinks', 'showPubDate') - if n: - headlines = headlines[:n] - else: - headlines = headlines[:self.registryValue('defaultNumberOfHeadlines')] + n = n or self.registryValue('defaultNumberOfHeadlines', channel) + entries = list(filter(lambda e:self.should_send_entry(channel, e), + feed.entries)) + entries = entries[:n] + headlines = map(lambda e:self.format_entry(channel, feed, e, False), + entries) sep = self.registryValue('headlineSeparator', channel) - if self.registryValue('bold', channel): - sep = ircutils.bold(sep) irc.replies(headlines, joiner=sep) rss = wrap(rss, ['url', additional('int')]) @@ -498,9 +384,9 @@ class RSS(callbacks.Plugin): url = self.registryValue('feeds.%s' % url) except registry.NonExistentRegistryEntry: pass - feed = self.getFeed(url) - conv = self._getConverter(feed) - info = feed.get('feed') + feed = self.get_feed(url) + self.update_feed_if_needed(feed) + info = feed.data if not info: irc.error(_('I couldn\'t retrieve that RSS feed.')) return @@ -510,10 +396,10 @@ class RSS(callbacks.Plugin): now = time.mktime(time.gmtime()) when = utils.timeElapsed(now - seconds) + ' ago' else: - when = 'time unavailable' - title = conv(info.get('title', 'unavailable')) - desc = conv(info.get('description', 'unavailable')) - link = conv(info.get('link', 'unavailable')) + when = _('time unavailable') + title = info.get('title', _('unavailable')) + desc = info.get('description', _('unavailable')) + link = info.get('link', _('unavailable')) # The rest of the entries are all available in the channel key response = format(_('Title: %s; URL: %u; ' 'Description: %s; Last updated: %s.'), From 0a6a4991a5f0e0d50577032c5abcbee92034135b Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 18:56:52 +0200 Subject: [PATCH 07/14] RSS: Make feed commands work. --- plugins/RSS/plugin.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/plugins/RSS/plugin.py b/plugins/RSS/plugin.py index 9306d0978..f4dc71718 100644 --- a/plugins/RSS/plugin.py +++ b/plugins/RSS/plugin.py @@ -59,6 +59,10 @@ class Feed: __slots__ = ('url', 'name', 'data', 'last_update', 'entries', 'lock', 'announced_entries') def __init__(self, name, url, plugin_is_loading=False): + assert name, name + if not url: + assert utils.web.httpUrlRe.match(name), name + url = name self.name = name self.url = url self.data = None @@ -69,8 +73,7 @@ class Feed: self.lock = threading.Thread() self.announced_entries = utils.structures.TruncatableSet() - @property - def command(self): + def get_command(self, plugin): docstring = format(_("""[] Reports the titles for %s at the RSS feed %u. If @@ -78,14 +81,11 @@ class Feed: RSS feeds are only looked up every supybot.plugins.RSS.waitPeriod seconds, which defaults to 1800 (30 minutes) since that's what most websites prefer."""), self.name, self.url) - if self.isCommandMethod(name): - s = format('I already have a command in this plugin named %s.',name) - raise callbacks.Error(s) - def f(self, irc, msg, args): - args.insert(0, url) - self.rss(irc, msg, args) - f = utils.python.changeFunctionName(f, name, docstring) - f = types.MethodType(f, self) + def f(self2, irc, msg, args): + args.insert(0, self.url) + self2.rss(irc, msg, args) + f = utils.python.changeFunctionName(f, self.name, docstring) + f = types.MethodType(f, plugin) return f def lock_feed(f): @@ -130,8 +130,7 @@ class RSS(callbacks.Plugin): except registry.NonExistentRegistryEntry: self.log.warning('%s is not a registered feed, removing.',name) continue - self.feed_names[name] = url - self.feeds[url] = Feed(name, url, True) + self.register_feed(name, url, True) ################## # Feed registering @@ -141,6 +140,13 @@ class RSS(callbacks.Plugin): group = self.registryValue('feeds', value=False) conf.registerGlobalValue(group, name, registry.String(url, '')) + def register_feed(self, name, url, plugin_is_loading): + self.feed_names[name] = url + if self.isCommandMethod(name): + s = format('I already have a command in this plugin named %s.',name) + raise callbacks.Error(s) + self.feeds[url] = Feed(name, url, plugin_is_loading) + def remove_feed(self, feed): del self.feed_names[feed.name] del self.feeds[feed.url] @@ -163,7 +169,7 @@ class RSS(callbacks.Plugin): try: return self.__parent.getCommandMethod(command) except AttributeError: - return self.feeds[command[0]].command + return self.get_feed(command[0]).get_command(self) def __call__(self, irc, msg): self.__parent.__call__(irc, msg) @@ -275,7 +281,7 @@ class RSS(callbacks.Plugin): given URL. """ self.register_feed_config(name, url) - self.feeds[name] = Feed(name, url) + self.register_feed(name, url, False) irc.replySuccess() add = wrap(add, ['feedName', 'url']) From ca425b7b0a71e607bc69ada8fc35b328f71d828a Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 19:31:20 +0200 Subject: [PATCH 08/14] RSS: Fix checking of feed existancy. --- plugins/RSS/plugin.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/plugins/RSS/plugin.py b/plugins/RSS/plugin.py index f4dc71718..b1b1e5479 100644 --- a/plugins/RSS/plugin.py +++ b/plugins/RSS/plugin.py @@ -124,6 +124,7 @@ class RSS(callbacks.Plugin): # Scheme: {url: feed} self.feeds = {} for name in self.registryValue('feeds'): + self.assert_feed_does_not_exist(name) self.register_feed_config(name) try: url = self.registryValue(registry.join(['feeds', name])) @@ -135,6 +136,11 @@ class RSS(callbacks.Plugin): ################## # Feed registering + def assert_feed_does_not_exist(self, name): + if self.isCommandMethod(name): + s = format('I already have a command in this plugin named %s.',name) + raise callbacks.Error(s) + def register_feed_config(self, name, url=''): self.registryValue('feeds').add(name) group = self.registryValue('feeds', value=False) @@ -142,16 +148,13 @@ class RSS(callbacks.Plugin): def register_feed(self, name, url, plugin_is_loading): self.feed_names[name] = url - if self.isCommandMethod(name): - s = format('I already have a command in this plugin named %s.',name) - raise callbacks.Error(s) self.feeds[url] = Feed(name, url, plugin_is_loading) def remove_feed(self, feed): del self.feed_names[feed.name] del self.feeds[feed.url] - conf.supybot.plugins.RSS.feeds().remove(name) - conf.supybot.plugins.RSS.feeds.unregister(name) + conf.supybot.plugins.RSS.feeds().remove(feed.name) + conf.supybot.plugins.RSS.feeds.unregister(feed.name) ################## # Methods handling @@ -280,6 +283,7 @@ class RSS(callbacks.Plugin): Adds a command to this plugin that will look up the RSS feed at the given URL. """ + self.assert_feed_does_not_exist(name) self.register_feed_config(name, url) self.register_feed(name, url, False) irc.replySuccess() From 70ad2328878aaf8fb14d274e1ec12577d5923ebb Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 19:56:17 +0200 Subject: [PATCH 09/14] Fix af24192b0f (support for struct_time in utils.str.timestamp). --- src/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/conf.py b/src/conf.py index 8015ae60a..401703fb0 100644 --- a/src/conf.py +++ b/src/conf.py @@ -374,7 +374,7 @@ registerChannelValue(supybot.reply.format, 'time', def timestamp(t): if t is None: t = time.time() - elif isinstance(t, int): + elif isinstance(t, float): t = time.localtime(t) format = get(supybot.reply.format.time, dynamic.channel) return time.strftime(format, t) From 8010cd5ae2fae741aebdf6b5ca65faf6dd25a245 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 19:57:14 +0200 Subject: [PATCH 10/14] Fix unicode handling issue of utils.str.format on Python 2. --- src/utils/str.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/utils/str.py b/src/utils/str.py index a227ff7dc..f3003b43a 100644 --- a/src/utils/str.py +++ b/src/utils/str.py @@ -448,6 +448,13 @@ def format(s, *args, **kwargs): # to add the character to the _formatRe regexp or it will be ignored # (and hard to debug if you don't know the trick). # Of course, you should also document it in the docstring above. + if sys.version_info.major < 3: + def pred(s): + if isinstance(s, unicode): + return s.encode('utf8') + else: + return s + args = map(pred, args) args = list(args) args.reverse() # For more efficient popping. def sub(match): From a5c928b3643d81205937f35734b3f4eefd393491 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 20:12:37 +0200 Subject: [PATCH 11/14] RSS: Fix test broken by 2caade8f. --- plugins/RSS/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/RSS/test.py b/plugins/RSS/test.py index fa1dec66c..eb20e4f57 100644 --- a/plugins/RSS/test.py +++ b/plugins/RSS/test.py @@ -103,7 +103,7 @@ class RSSTestCase(ChannelPluginTestCase): def testRss(self): self.assertNotError('rss %s' % url) m = self.assertNotError('rss %s 2' % url) - self.failUnless(m.args[1].count('||') == 1) + self.failUnless(m.args[1].count(' | ') == 1) def testRssAdd(self): self.assertNotError('rss add advogato %s' % url) From f35ece814758700307eaeed43167ec4846e93057 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 20:14:09 +0200 Subject: [PATCH 12/14] RSS: Fix stupid bugs. --- plugins/RSS/plugin.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/plugins/RSS/plugin.py b/plugins/RSS/plugin.py index b1b1e5479..67f464988 100644 --- a/plugins/RSS/plugin.py +++ b/plugins/RSS/plugin.py @@ -42,6 +42,7 @@ import supybot.conf as conf import supybot.utils as utils import supybot.world as world from supybot.commands import * +import supybot.ircmsgs as ircmsgs import supybot.ircutils as ircutils import supybot.registry as registry import supybot.callbacks as callbacks @@ -269,7 +270,7 @@ class RSS(callbacks.Plugin): def announce_entry(self, irc, channel, feed, entry): if self.should_send_entry(channel, entry): - s = format_entry(channel, feed, entry, True) + s = self.format_entry(channel, feed, entry, True) irc.sendMsg(ircmsgs.privmsg(channel, s)) @@ -395,6 +396,8 @@ class RSS(callbacks.Plugin): except registry.NonExistentRegistryEntry: pass feed = self.get_feed(url) + if not feed: + feed = Feed(url, url) self.update_feed_if_needed(feed) info = feed.data if not info: From 784b534a3d8f6c7cef9588b53dbdcfe5ca3e5b2b Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 20:50:12 +0200 Subject: [PATCH 13/14] RSS: Remove deadlock and make announces work. --- plugins/RSS/plugin.py | 58 ++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/plugins/RSS/plugin.py b/plugins/RSS/plugin.py index 67f464988..04e1dc8da 100644 --- a/plugins/RSS/plugin.py +++ b/plugins/RSS/plugin.py @@ -70,8 +70,8 @@ class Feed: # We don't want to fetch feeds right after the plugin is # loaded (the bot could be starting, and thus already busy) self.last_update = time.time() if plugin_is_loading else 0 - self.entries = None - self.lock = threading.Thread() + self.entries = [] + self.lock = threading.Lock() self.announced_entries = utils.structures.TruncatableSet() def get_command(self, plugin): @@ -89,12 +89,6 @@ class Feed: f = types.MethodType(f, plugin) return f -def lock_feed(f): - def newf(feed, *args, **kwargs): - with feed.lock: - return f(feed, *args, **kwargs) - return f - def sort_feed_items(items, order): """Return feed items, sorted according to sortFeedItems.""" if order not in ['oldestFirst', 'newestFirst']: @@ -191,15 +185,15 @@ class RSS(callbacks.Plugin): event_horizon = time.time() - self.registryValue('waitPeriod') return feed.last_update < event_horizon - ############### # Feed fetching - @lock_feed def update_feed(self, feed): - d = feedparser.parse(feed.url) - feed.data = d.feed - feed.entries = d.entries + with feed.lock: + d = feedparser.parse(feed.url) + feed.data = d.feed + feed.entries = d.entries + feed.last_update = time.time() self.announce_feed(feed) def update_feed_in_thread(self, feed): @@ -215,16 +209,28 @@ class RSS(callbacks.Plugin): self.update_feed(feed) def update_feeds(self): - for name in self.registryValue('feeds'): + announced_feeds = set() + for irc in world.ircs: + for channel in irc.state.channels: + announced_feeds |= self.registryValue('announce', channel) + for name in announced_feeds: self.update_feed_if_needed(self.get_feed(name)) - @lock_feed + def get_new_entries(self, feed): + with feed.lock: + entries = feed.entries + new_entries = [entry for entry in entries + if entry.id not in feed.announced_entries] + if not new_entries: + return [] + feed.announced_entries |= {entry.id for entry in new_entries} + # We keep a little more because we don't want to re-announce + # oldest entries if one of the newest gets removed. + feed.announced_entries.truncate(2*len(entries)) + return new_entries + def announce_feed(self, feed): - entries = feed.entries - new_entries = [entry for entry in entries - if entry.id not in feed.announced_entries] - if not new_entries: - return + new_entries = self.get_new_entries(feed) order = self.registryValue('sortFeedItems') new_entries = sort_feed_items(new_entries, order) @@ -234,10 +240,6 @@ class RSS(callbacks.Plugin): continue for entry in new_entries: self.announce_entry(irc, channel, feed, entry) - feed.announced_entries |= {entry.id for entry in new_entries} - # We keep a little more because we don't want to re-announce - # oldest entries if one of the newest gets removed. - feed.announced_entries.truncate(2*len(entries)) ################# @@ -329,10 +331,14 @@ class RSS(callbacks.Plugin): """ announce = conf.supybot.plugins.RSS.announce S = announce.get(channel)() - for feed in feeds: - S.add(feed) + plugin = irc.getCallback('RSS') + for name in feeds: + S.add(name) announce.get(channel).setValue(S) irc.replySuccess() + for name in feeds: + feed = plugin.get_feed(name) + plugin.announce_feed(feed) add = wrap(add, [('checkChannelCapability', 'op'), many(first('url', 'feedName'))]) From e9b58f7820dc22879fea722b15791dcf6ea924d7 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Thu, 31 Jul 2014 19:17:27 +0000 Subject: [PATCH 14/14] =?UTF-8?q?RSS:=20Use=20queueMsg=20instead=20of=20se?= =?UTF-8?q?ndMsg=20(flood=E2=80=A6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- plugins/RSS/plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/RSS/plugin.py b/plugins/RSS/plugin.py index 04e1dc8da..0978599a1 100644 --- a/plugins/RSS/plugin.py +++ b/plugins/RSS/plugin.py @@ -273,7 +273,7 @@ class RSS(callbacks.Plugin): def announce_entry(self, irc, channel, feed, entry): if self.should_send_entry(channel, entry): s = self.format_entry(channel, feed, entry, True) - irc.sendMsg(ircmsgs.privmsg(channel, s)) + irc.queueMsg(ircmsgs.privmsg(channel, s)) ##########