RSS: keep track of headlines over multiple feed fetches, with configurable expiration.

This avoids repeated announcements better than keeping only the last fetch, since
some feeds (such as Google News search) shuffle their items around between fetches.

Conflicts:
	plugins/RSS/config.py
This commit is contained in:
Valentin Lorentz 2013-06-27 07:03:20 +02:00
commit 20bef2dcd0
2 changed files with 17 additions and 9 deletions

View File

@ -75,6 +75,7 @@ conf.registerGlobalValue(RSS, 'sortFeedItems',
FeedItemSortOrder('asInFeed', _("""Determines whether feed items should be FeedItemSortOrder('asInFeed', _("""Determines whether feed items should be
sorted by their update timestamp or kept in the same order as they appear sorted by their update timestamp or kept in the same order as they appear
in a feed."""))) in a feed.""")))
conf.registerGlobalValue(RSS, 'feeds', conf.registerGlobalValue(RSS, 'feeds',
FeedNames([], _("""Determines what feeds should be accessible as FeedNames([], _("""Determines what feeds should be accessible as
commands."""))) commands.""")))
@ -104,6 +105,9 @@ conf.registerChannelValue(RSS.announce, 'showLinks',
along with the title of the feed when a feed is automatically along with the title of the feed when a feed is automatically
announced."""))) announced.""")))
# Maximum age, in seconds, of entries kept in the headline cache (the default
# of 86400 is 24 hours). The help text is wrapped in _() like the other
# registrations in this file so that it can be translated.
conf.registerGlobalValue(RSS.announce, 'cachePeriod',
    registry.PositiveInteger(86400, _("""Maximum age of cached RSS headlines,
in seconds. Headline cache is used to avoid re-announcing old news.""")))
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:

View File

@ -72,6 +72,7 @@ class RSS(callbacks.Plugin):
self.locks = {} self.locks = {}
self.lastRequest = {} self.lastRequest = {}
self.cachedFeeds = {} self.cachedFeeds = {}
self.cachedHeadlines = {}
self.gettingLockLock = threading.Lock() self.gettingLockLock = threading.Lock()
for name in self.registryValue('feeds'): for name in self.registryValue('feeds'):
self._registerFeed(name) self._registerFeed(name)
@ -164,9 +165,12 @@ class RSS(callbacks.Plugin):
# Note that we're allowed to acquire this lock twice within the # Note that we're allowed to acquire this lock twice within the
# same thread because it's an RLock and not just a normal Lock. # same thread because it's an RLock and not just a normal Lock.
self.acquireLock(url) self.acquireLock(url)
t = time.time()
try: try:
oldresults = self.cachedFeeds[url] #oldresults = self.cachedFeeds[url]
oldheadlines = self.getHeadlines(oldresults) #oldheadlines = self.getHeadlines(oldresults)
oldheadlines = self.cachedHeadlines[url]
oldheadlines = filter(lambda x: t - x[2] < self.registryValue('announce.cachePeriod'), oldheadlines)
except KeyError: except KeyError:
oldheadlines = [] oldheadlines = []
newresults = self.getFeed(url) newresults = self.getFeed(url)
@ -179,11 +183,13 @@ class RSS(callbacks.Plugin):
return return
def normalize(headline): def normalize(headline):
return (tuple(headline[0].lower().split()), headline[1]) return (tuple(headline[0].lower().split()), headline[1])
oldheadlines = set(map(normalize, oldheadlines)) oldheadlinesset = set(map(normalize, oldheadlines))
for (i, headline) in enumerate(newheadlines): for (i, headline) in enumerate(newheadlines):
if normalize(headline) in oldheadlines: if normalize(headline) in oldheadlinesset:
newheadlines[i] = None newheadlines[i] = None
newheadlines = filter(None, newheadlines) # Removes Nones. newheadlines = filter(None, newheadlines) # Removes Nones.
oldheadlines.extend(newheadlines)
self.cachedHeadlines[url] = oldheadlines
if newheadlines: if newheadlines:
def filter_whitelist(headline): def filter_whitelist(headline):
v = False v = False
@ -324,15 +330,13 @@ class RSS(callbacks.Plugin):
def getHeadlines(self, feed):
    """Build the list of headlines for a parsed feed.

    Items are taken from ``feed['items']`` in the order returned by
    ``self._sortFeedItems``; items without a ``'title'`` key are skipped.

    Returns a list of ``(title, link, timestamp)`` tuples where:

    * ``title`` is the item's title passed through the converter
      returned by ``self._getConverter(feed)``;
    * ``link`` is the item's link, or None when it is missing or empty;
    * ``timestamp`` is a single ``time.time()`` reading taken at the
      start of this call, shared by every headline in the result.
    """
    # One timestamp for the whole batch, so every headline from the same
    # fetch carries the same age.
    t = time.time()
    conv = self._getConverter(feed)
    headlines = []
    for d in self._sortFeedItems(feed['items']):
        if 'title' not in d:
            continue
        # `or None` keeps the old behaviour of mapping an empty link to
        # None rather than storing an empty string.
        headlines.append((conv(d['title']), d.get('link') or None, t))
    return headlines
@internationalizeDocstring @internationalizeDocstring