Updated to the newest version of feedparser (nee rssparser; we kept the old name).

2026-01-29 22:17:59 +01:00 · 2003-12-18 09:02:39 +00:00 · 2003-12-18 09:02:39 +00:00 · b6c780cff3
commit b6c780cff3
parent 36b6821c5e
1 changed files with 272 additions and 69 deletions
--- a/others/rssparser.py
+++ b/others/rssparser.py
@ -1,34 +1,41 @@
-#!/usr/bin/env python
-"""Ultra-liberal RSS parser
+#!/usr/bin/python
+"""Ultra-liberal feed parser

-Visit http://diveintomark.org/projects/rss_parser/ for the latest version
+Visit http://diveintomark.org/projects/feed_parser/ for the latest version

-Handles RSS 0.9x and RSS 1.0 feeds
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, Pie/Atom/Echo feeds

-RSS 0.9x elements:
- title, link, description, webMaster, managingEditor, language
+RSS 0.9x/common elements:
+- title, link, guid, description, webMaster, managingEditor, language
  copyright, lastBuildDate, pubDate

-RSS 1.0 elements:
+Additional RSS 1.0/2.0 elements:
 - dc:rights, dc:language, dc:creator, dc:date, dc:subject,
-  content:encoded
+  content:encoded, admin:generatorAgent, admin:errorReportsTo,
  
-Things it handles that choke other RSS parsers:
- bastard combinations of RSS 0.9x and RSS 1.0 (most Movable Type feeds)
- illegal XML characters (most Radio feeds)
- naked and/or invalid HTML in description (The Register)
- content:encoded in item element (Aaron Swartz)
- guid in item element (Scripting News)
- fullitem in item element (Jon Udell)
- non-standard namespaces (BitWorking)
+Addition Pie/Atom/Echo elements:
+- subtitle, created, issued, modified, summary, id, content
+
+Things it handles that choke other parsers:
+- bastard combinations of RSS 0.9x and RSS 1.0
+- illegal XML characters
+- naked and/or invalid HTML in description
+- content:encoded in item element
+- guid in item element
+- fullitem in item element
+- non-standard namespaces
+- inline XML in content (Pie/Atom/Echo)
+- multiple content items per entry (Pie/Atom/Echo)

 Requires Python 2.2 or later
 """

-__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
-__copyright__ = "Copyright 2002, Mark Pilgrim"
-__contributors__ = ["Jason Diamond (jason@injektilo.org)"]
-__license__ = "GPL"
+__version__ = "2.5.3"
+__author__ = "Mark Pilgrim <http://diveintomark.org/>"
+__copyright__ = "Copyright 2002-3, Mark Pilgrim"
+__contributors__ = ["Jason Diamond <http://injektilo.org/>",
+                    "John Beimler <http://john.beimler.org/>"]
+__license__ = "Python"
 __history__ = """
 1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
  added Simon Fell's test suite
@ -52,6 +59,25 @@ __history__ = """
 2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
  start_admingeneratoragent is an example of how to handle elements with
  only attributes, no content.
+2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
+  also, make sure we send the User-Agent even if urllib2 isn't available.
+  Match any variation of backend.userland.com/rss namespace.
+2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
+2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
+  snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
+  project name
+2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
+  removed unnecessary urllib code -- urllib2 should always be available anyway;
+  return actual url, status, and full HTTP headers (as result['url'],
+  result['status'], and result['headers']) if parsing a remote feed over HTTP --
+  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
+  added the latest namespace-of-the-week for RSS 2.0
+2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
+  User-Agent (otherwise urllib2 sends two, which confuses some servers)
+2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
+  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
+2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
+  textInput, and also to return the character encoding (if specified)
 """

 try:
@ -59,9 +85,11 @@ try:
    timeoutsocket.setDefaultSocketTimeout(10)
 except ImportError:
    pass
-import cgi, re, sgmllib, string, StringIO, urllib, gzip
+import cgi, re, sgmllib, string, StringIO, gzip, urllib2
 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

+USER_AGENT = "UltraLiberalFeedParser/%s +http://diveintomark.org/projects/feed_parser/" % __version__
+
 def decodeEntities(data):
    data = data or ''
    data = data.replace('&lt;', '<')
@ -71,15 +99,21 @@ def decodeEntities(data):
    data = data.replace('&amp;', '&')
    return data

-class RSSParser(sgmllib.SGMLParser):
+class FeedParser(sgmllib.SGMLParser):
    namespaces = {"http://backend.userland.com/rss": "",
-                  "http://backend.userland.com/rss2": "",
+                  "http://blogs.law.harvard.edu/tech/rss": "",
                  "http://purl.org/rss/1.0/": "",
+                  "http://example.com/newformat#": "",
+                  "http://example.com/necho": "",
+                  "http://purl.org/echo/": "",
+                  "uri/of/echo/namespace#": "",
+                  "http://purl.org/pie/": "",
                  "http://purl.org/rss/1.0/modules/textinput/": "ti",
                  "http://purl.org/rss/1.0/modules/company/": "co",
                  "http://purl.org/rss/1.0/modules/syndication/": "sy",
                  "http://purl.org/dc/elements/1.1/": "dc",
-                  "http://webns.net/mvcb/": "admin"}
+                  "http://webns.net/mvcb/": "admin",
+                  "http://www.w3.org/1999/xhtml": "xhtml"}

    def reset(self):
        self.channel = {}
@ -87,6 +121,12 @@ class RSSParser(sgmllib.SGMLParser):
        self.elementstack = []
        self.inchannel = 0
        self.initem = 0
+        self.incontent = 0
+        self.intextinput = 0
+        self.inimage = 0
+        self.contentmode = None
+        self.contenttype = None
+        self.contentlang = None
        self.namespacemap = {}
        sgmllib.SGMLParser.reset(self)

@ -100,15 +140,22 @@ class RSSParser(sgmllib.SGMLParser):
        if not expectingText: return
        output = "".join(pieces)
        output = decodeEntities(output)
-        if self.initem:
+        if self.incontent and self.initem:
+            if not self.items[-1].has_key(element):
+                self.items[-1][element] = []
+            self.items[-1][element].append({"language":self.contentlang, "type":self.contenttype, "value":output})
+        elif self.initem:
            self.items[-1][element] = output
-        elif self.inchannel:
+        elif self.inchannel and (not self.intextinput) and (not self.inimage):
            self.channel[element] = output

    def _addNamespaces(self, attrs):
        for prefix, value in attrs:
            if not prefix.startswith("xmlns:"): continue
            prefix = prefix[6:]
+            if prefix.find('backend.userland.com/rss') <> -1:
+                # match any backend.userland.com namespace
+                prefix = 'http://backend.userland.com/rss'
            if self.namespaces.has_key(value):
                self.namespacemap[prefix] = self.namespaces[value]

@ -137,6 +184,18 @@ class RSSParser(sgmllib.SGMLParser):
        self.pop('channel')
        self.inchannel = 0
    
+    def start_image(self, attrs):
+        self.inimage = 1
+            
+    def end_image(self):
+        self.inimage = 0
+                
+    def start_textinput(self, attrs):
+        self.intextinput = 1
+        
+    def end_textinput(self):
+        self.intextinput = 0
+
    def start_item(self, attrs):
        self.items.append({})
        self.push('item', 0)
@ -201,6 +260,9 @@ class RSSParser(sgmllib.SGMLParser):
    def end_guid(self):
        self.pop('guid')
        if self.guidislink:
+            if not self.items[-1].has_key('link'):
+                # guid acts as link, but only if "ispermalink" is not present or is "true",
+                # and only if the item doesn't already have a link element
                self.items[-1]['link'] = self.items[-1]['guid']

    def start_title(self, attrs):
@ -224,7 +286,101 @@ class RSSParser(sgmllib.SGMLParser):
            self.elementstack[-1][2].append(value)
        self.pop('generator')

+    def start_feed(self, attrs):
+        self.inchannel = 1
+
+    def end_feed(self):
+        self.inchannel = 0
+
+    def start_entry(self, attrs):
+        self.items.append({})
+        self.push('item', 0)
+        self.initem = 1
+
+    def end_entry(self):
+        self.pop('item')
+        self.initem = 0
+        
+    def start_subtitle(self, attrs):
+        self.push('subtitle', 1)
+
+    def end_subtitle(self):
+        self.pop('subtitle')
+
+    def start_summary(self, attrs):
+        self.push('summary', 1)
+
+    def end_summary(self):
+        self.pop('summary')
+        
+    def start_modified(self, attrs):
+        self.push('modified', 1)
+
+    def end_modified(self):
+        self.pop('modified')
+
+    def start_created(self, attrs):
+        self.push('created', 1)
+
+    def end_created(self):
+        self.pop('created')
+
+    def start_issued(self, attrs):
+        self.push('issued', 1)
+
+    def end_issued(self):
+        self.pop('issued')
+
+    def start_id(self, attrs):
+        self.push('id', 1)
+
+    def end_id(self):
+        self.pop('id')
+
+    def start_content(self, attrs):
+        self.incontent = 1
+        if ('mode', 'escaped') in attrs:
+            self.contentmode = 'escaped'
+        elif ('mode', 'base64') in attrs:
+            self.contentmode = 'base64'
+        else:
+            self.contentmode = 'xml'
+        mimetype = [v for k, v in attrs if k=='type']
+        if mimetype:
+            self.contenttype = mimetype[0]
+        xmllang = [v for k, v in attrs if k=='xml:lang']
+        if xmllang:
+            self.contentlang = xmllang[0]
+        self.push('content', 1)
+
+    def end_content(self):
+        self.pop('content')
+        self.incontent = 0
+        self.contentmode = None
+        self.contenttype = None
+        self.contentlang = None
+
+    def start_body(self, attrs):
+        self.incontent = 1
+        self.contentmode = 'xml'
+        self.contenttype = 'application/xhtml+xml'
+        xmllang = [v for k, v in attrs if k=='xml:lang']
+        if xmllang:
+            self.contentlang = xmllang[0]
+        self.push('content', 1)
+
+    start_div = start_body
+    start_xhtml_body = start_body
+    start_xhtml_div = start_body
+    end_body = end_content
+    end_div = end_content
+    end_xhtml_body = end_content
+    end_xhtml_div = end_content
+        
    def unknown_starttag(self, tag, attrs):
+        if self.incontent and self.contentmode == 'xml':
+            self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])))
+            return
        self._addNamespaces(attrs)
        colonpos = tag.find(':')
        if colonpos <> -1:
@ -242,6 +398,9 @@ class RSSParser(sgmllib.SGMLParser):
        return self.push(tag, 0)

    def unknown_endtag(self, tag):
+        if self.incontent and self.contentmode == 'xml':
+            self.handle_data("</%s>" % tag)
+            return
        colonpos = tag.find(':')
        if colonpos <> -1:
            prefix = tag[:colonpos]
@ -261,18 +420,26 @@ class RSSParser(sgmllib.SGMLParser):
        # called for each character reference, e.g. for "&#160;", ref will be "160"
        # Reconstruct the original character reference.
        if not self.elementstack: return
-        self.elementstack[-1][2].append("&#%(ref)s;" % locals())
+        text = "&#%s;" % ref
+        if self.incontent and self.contentmode == 'xml':
+            text = cgi.escape(text)
+        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for "&copy;", ref will be "copy"
        # Reconstruct the original entity reference.
        if not self.elementstack: return
-        self.elementstack[-1][2].append("&%(ref)s;" % locals())
+        text = "&%s;" % ref
+        if self.incontent and self.contentmode == 'xml':
+            text = cgi.escape(text)
+        self.elementstack[-1][2].append(text)

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
+        if self.incontent and self.contentmode == 'xml':
+            text = cgi.escape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
@ -315,6 +482,29 @@ class RSSParser(sgmllib.SGMLParser):
            return k+3
        return sgmllib.SGMLParser.parse_declaration(self, i)

+class FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
+    def http_error_default(self, req, fp, code, msg, headers):
+        if ((code / 100) == 3) and (code != 304):
+            return self.http_error_302(req, fp, code, msg, headers)
+        from urllib import addinfourl
+        infourl = addinfourl(fp, headers, req.get_full_url())
+        infourl.status = code
+        return infourl
+#        raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+    def http_error_302(self, req, fp, code, msg, headers):
+        infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
+        infourl.status = code
+        return infourl
+
+    def http_error_301(self, req, fp, code, msg, headers):
+        infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
+        infourl.status = code
+        return infourl
+
+    http_error_300 = http_error_302
+    http_error_307 = http_error_302
+        
 def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
    """
    URI, filename, or string --> stream
@ -338,10 +528,6 @@ def open_resource(source, etag=None, modified=None, agent=None, referrer=None):

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.
-
-    The optional arguments are only used if the source argument is an HTTP
-    URL and the urllib2 module is importable (i.e., you must be using Python
-    version 2.0 or higher).
    """

    if hasattr(source, "read"):
@ -350,35 +536,25 @@ def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
    if source == "-":
        return sys.stdin

+    if not agent:
+        agent = USER_AGENT
+        
    # try to open with urllib2 (to use optional headers)
-    try:
-        import urllib2
    request = urllib2.Request(source)
    if etag:
        request.add_header("If-None-Match", etag)
    if modified:
        request.add_header("If-Modified-Since", format_http_date(modified))
-        if agent:
    request.add_header("User-Agent", agent)
    if referrer:
-            # http://www.dictionary.com/search?q=referer
        request.add_header("Referer", referrer)
        request.add_header("Accept-encoding", "gzip")
+    opener = urllib2.build_opener(FeedURLHandler())
+    opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
    try:
-            return urllib2.urlopen(request)
-        except urllib2.HTTPError:
-            # either the resource is not modified or some other HTTP
-            # error occurred so return an empty resource
-            return StringIO.StringIO("")
+        return opener.open(request)
    except:
-            # source must not be a valid URL but it might be a valid filename
-            pass
-    except ImportError:
-        # urllib2 isn't available so try to open with urllib
-        try:
-            return urllib.urlopen(source)
-        except:
-            # source still might be a filename
+        # source is not a valid URL, but it might be a valid filename
        pass
    
    # try to open with native open function (if source is a filename)
@ -478,12 +654,16 @@ def parse_http_date(date):
        return None

 def parse(uri, etag=None, modified=None, agent=None, referrer=None):
-    r = RSSParser()
+    r = FeedParser()
    f = open_resource(uri, etag=etag, modified=modified, agent=agent, referrer=referrer)
    data = f.read()
    if hasattr(f, "headers"):
-        if f.headers.get('content-encoding', None) == 'gzip':
+        if f.headers.get('content-encoding', '') == 'gzip':
+            try:
                data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
+            except:
+                # some feeds claim to be gzipped but they're not, so we get garbage
+                data = ''
    r.feed(data)
    result = {"channel": r.channel, "items": r.items}
    newEtag = get_etag(f)
@ -492,6 +672,20 @@ def parse(uri, etag=None, modified=None, agent=None, referrer=None):
    newModified = get_modified(f)
    if newModified: result["modified"] = newModified
    elif modified: result["modified"] = modified
+    if hasattr(f, "url"):
+        result["url"] = f.url
+    if hasattr(f, "headers"):
+        result["headers"] = f.headers.dict
+    if hasattr(f, "status"):
+        result["status"] = f.status
+    elif hasattr(f, "url"):
+        result["status"] = 200
+    # get the xml encoding
+    if result.get('encoding', '') == '':
+        xmlheaderRe = re.compile('<\?.*encoding="(.*)".*\?>')
+        match = xmlheaderRe.match(data)
+        if match:
+            result['encoding'] = match.groups()[0].lower()
    f.close()
    return result

@ -515,5 +709,14 @@ if __name__ == '__main__':
        print url
        print
        result = parse(url)
-        pprint(result['channel'])
+        pprint(result)
        print
+
+"""
+TODO
+- textinput/textInput
+- image
+- author
+- contributor
+- comments
+"""