RSS: keep track of headlines over multiple feed fetches, with configurable expiration.

This is better at avoiding repeats than just keeping the last fetch, since some feeds
shuffle items around (like Google News search).
This commit is contained in:
Daniel Folkinshteyn 2013-05-03 23:39:34 -04:00
parent 5d6a3c5a46
commit bc0d16a4e1
2 changed files with 17 additions and 10 deletions

View File

@@ -62,6 +62,7 @@ conf.registerGlobalValue(RSS, 'waitPeriod',
registry.PositiveInteger(1800, """Indicates how many seconds the bot will
wait between retrieving RSS feeds; requests made within this period will
return cached results."""))
conf.registerGlobalValue(RSS, 'feeds',
FeedNames([], """Determines what feeds should be accessible as
commands."""))
@@ -91,7 +92,9 @@ conf.registerChannelValue(RSS.announce, 'showLinks',
registry.Boolean(False, """Determines whether the bot will list the link
along with the title of the feed when a feed is automatically
announced."""))
conf.registerGlobalValue(RSS.announce, 'cachePeriod',
registry.PositiveInteger(86400, """Maximum age of cached RSS headlines,
in seconds. Headline cache is used to avoid re-announcing old news."""))
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:

View File

@@ -70,6 +70,7 @@ class RSS(callbacks.Plugin):
self.locks = {}
self.lastRequest = {}
self.cachedFeeds = {}
self.cachedHeadlines = {}
self.gettingLockLock = threading.Lock()
for name in self.registryValue('feeds'):
self._registerFeed(name)
@@ -161,9 +162,12 @@ class RSS(callbacks.Plugin):
# Note that we're allowed to acquire this lock twice within the
# same thread because it's an RLock and not just a normal Lock.
self.acquireLock(url)
t = time.time()
try:
oldresults = self.cachedFeeds[url]
oldheadlines = self.getHeadlines(oldresults)
#oldresults = self.cachedFeeds[url]
#oldheadlines = self.getHeadlines(oldresults)
oldheadlines = self.cachedHeadlines[url]
oldheadlines = filter(lambda x: t - x[2] < self.registryValue('announce.cachePeriod'), oldheadlines)
except KeyError:
oldheadlines = []
newresults = self.getFeed(url)
@@ -176,11 +180,13 @@ class RSS(callbacks.Plugin):
return
def normalize(headline):
return (tuple(headline[0].lower().split()), headline[1])
oldheadlines = set(map(normalize, oldheadlines))
oldheadlinesset = set(map(normalize, oldheadlines))
for (i, headline) in enumerate(newheadlines):
if normalize(headline) in oldheadlines:
if normalize(headline) in oldheadlinesset:
newheadlines[i] = None
newheadlines = filter(None, newheadlines) # Removes Nones.
oldheadlines.extend(newheadlines)
self.cachedHeadlines[url] = oldheadlines
if newheadlines:
def filter_whitelist(headline):
v = False
@@ -301,15 +307,13 @@ class RSS(callbacks.Plugin):
def getHeadlines(self, feed):
    """Build a list of headline tuples from a parsed *feed* dict.

    NOTE(review): this span is a rendered diff hunk with the +/- markers
    stripped, so it interleaves the pre-change lines (the if/else that
    appends 2-tuples of (title, link)) with the post-change lines (which
    append 3-tuples of (title, link, fetch_time)); only the last two body
    lines exist in the new revision.
    """
    headlines = []
    # One timestamp per fetch; it is attached to every headline so the
    # cache-expiry filter elsewhere can age entries out.
    t = time.time()
    # presumably a charset/entity converter for this feed — TODO confirm
    # against _getConverter's definition (not visible in this hunk).
    conv = self._getConverter(feed)
    for d in feed['items']:
        if 'title' in d:
            title = conv(d['title'])
            # --- pre-change body (removed by this commit): ---
            link = d.get('link')
            if link:
                headlines.append((title, link))
            else:
                headlines.append((title, None))
            # --- post-change body (added by this commit): ---
            link = d.get('link') # defaults to None
            headlines.append((title, link, t))
    return headlines
def makeFeedCommand(self, name, url):