Updated to the newest version of feedparser (nee rssparser; we kept the old name).

This commit is contained in:
Jeremy Fincher 2003-12-18 09:02:39 +00:00
parent 36b6821c5e
commit b6c780cff3
1 changed files with 272 additions and 69 deletions

View File

@ -1,34 +1,41 @@
#!/usr/bin/env python #!/usr/bin/python
"""Ultra-liberal RSS parser """Ultra-liberal feed parser
Visit http://diveintomark.org/projects/rss_parser/ for the latest version Visit http://diveintomark.org/projects/feed_parser/ for the latest version
Handles RSS 0.9x and RSS 1.0 feeds Handles RSS 0.9x, RSS 1.0, RSS 2.0, Pie/Atom/Echo feeds
RSS 0.9x elements: RSS 0.9x/common elements:
- title, link, description, webMaster, managingEditor, language - title, link, guid, description, webMaster, managingEditor, language
copyright, lastBuildDate, pubDate copyright, lastBuildDate, pubDate
RSS 1.0 elements: Additional RSS 1.0/2.0 elements:
- dc:rights, dc:language, dc:creator, dc:date, dc:subject, - dc:rights, dc:language, dc:creator, dc:date, dc:subject,
content:encoded content:encoded, admin:generatorAgent, admin:errorReportsTo,
Additional Pie/Atom/Echo elements:
- subtitle, created, issued, modified, summary, id, content
Things it handles that choke other RSS parsers: Things it handles that choke other parsers:
- bastard combinations of RSS 0.9x and RSS 1.0 (most Movable Type feeds) - bastard combinations of RSS 0.9x and RSS 1.0
- illegal XML characters (most Radio feeds) - illegal XML characters
- naked and/or invalid HTML in description (The Register) - naked and/or invalid HTML in description
- content:encoded in item element (Aaron Swartz) - content:encoded in item element
- guid in item element (Scripting News) - guid in item element
- fullitem in item element (Jon Udell) - fullitem in item element
- non-standard namespaces (BitWorking) - non-standard namespaces
- inline XML in content (Pie/Atom/Echo)
- multiple content items per entry (Pie/Atom/Echo)
Requires Python 2.2 or later Requires Python 2.2 or later
""" """
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)" __version__ = "2.5.3"
__copyright__ = "Copyright 2002, Mark Pilgrim" __author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond (jason@injektilo.org)"] __copyright__ = "Copyright 2002-3, Mark Pilgrim"
__license__ = "GPL" __contributors__ = ["Jason Diamond <http://injektilo.org/>",
"John Beimler <http://john.beimler.org/>"]
__license__ = "Python"
__history__ = """ __history__ = """
1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements, 1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
added Simon Fell's test suite added Simon Fell's test suite
@ -52,6 +59,25 @@ __history__ = """
2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent. 2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
start_admingeneratoragent is an example of how to handle elements with start_admingeneratoragent is an example of how to handle elements with
only attributes, no content. only attributes, no content.
2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
also, make sure we send the User-Agent even if urllib2 isn't available.
Match any variation of backend.userland.com/rss namespace.
2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
project name
2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
removed unnecessary urllib code -- urllib2 should always be available anyway;
return actual url, status, and full HTTP headers (as result['url'],
result['status'], and result['headers']) if parsing a remote feed over HTTP --
this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
added the latest namespace-of-the-week for RSS 2.0
2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
User-Agent (otherwise urllib2 sends two, which confuses some servers)
2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
textInput, and also to return the character encoding (if specified)
""" """
try: try:
@ -59,9 +85,11 @@ try:
timeoutsocket.setDefaultSocketTimeout(10) timeoutsocket.setDefaultSocketTimeout(10)
except ImportError: except ImportError:
pass pass
import cgi, re, sgmllib, string, StringIO, urllib, gzip import cgi, re, sgmllib, string, StringIO, gzip, urllib2
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
USER_AGENT = "UltraLiberalFeedParser/%s +http://diveintomark.org/projects/feed_parser/" % __version__
def decodeEntities(data): def decodeEntities(data):
data = data or '' data = data or ''
data = data.replace('&lt;', '<') data = data.replace('&lt;', '<')
@ -71,15 +99,21 @@ def decodeEntities(data):
data = data.replace('&amp;', '&') data = data.replace('&amp;', '&')
return data return data
class RSSParser(sgmllib.SGMLParser): class FeedParser(sgmllib.SGMLParser):
namespaces = {"http://backend.userland.com/rss": "", namespaces = {"http://backend.userland.com/rss": "",
"http://backend.userland.com/rss2": "", "http://blogs.law.harvard.edu/tech/rss": "",
"http://purl.org/rss/1.0/": "", "http://purl.org/rss/1.0/": "",
"http://example.com/newformat#": "",
"http://example.com/necho": "",
"http://purl.org/echo/": "",
"uri/of/echo/namespace#": "",
"http://purl.org/pie/": "",
"http://purl.org/rss/1.0/modules/textinput/": "ti", "http://purl.org/rss/1.0/modules/textinput/": "ti",
"http://purl.org/rss/1.0/modules/company/": "co", "http://purl.org/rss/1.0/modules/company/": "co",
"http://purl.org/rss/1.0/modules/syndication/": "sy", "http://purl.org/rss/1.0/modules/syndication/": "sy",
"http://purl.org/dc/elements/1.1/": "dc", "http://purl.org/dc/elements/1.1/": "dc",
"http://webns.net/mvcb/": "admin"} "http://webns.net/mvcb/": "admin",
"http://www.w3.org/1999/xhtml": "xhtml"}
def reset(self): def reset(self):
self.channel = {} self.channel = {}
@ -87,6 +121,12 @@ class RSSParser(sgmllib.SGMLParser):
self.elementstack = [] self.elementstack = []
self.inchannel = 0 self.inchannel = 0
self.initem = 0 self.initem = 0
self.incontent = 0
self.intextinput = 0
self.inimage = 0
self.contentmode = None
self.contenttype = None
self.contentlang = None
self.namespacemap = {} self.namespacemap = {}
sgmllib.SGMLParser.reset(self) sgmllib.SGMLParser.reset(self)
@ -100,15 +140,22 @@ class RSSParser(sgmllib.SGMLParser):
if not expectingText: return if not expectingText: return
output = "".join(pieces) output = "".join(pieces)
output = decodeEntities(output) output = decodeEntities(output)
if self.initem: if self.incontent and self.initem:
if not self.items[-1].has_key(element):
self.items[-1][element] = []
self.items[-1][element].append({"language":self.contentlang, "type":self.contenttype, "value":output})
elif self.initem:
self.items[-1][element] = output self.items[-1][element] = output
elif self.inchannel: elif self.inchannel and (not self.intextinput) and (not self.inimage):
self.channel[element] = output self.channel[element] = output
def _addNamespaces(self, attrs): def _addNamespaces(self, attrs):
for prefix, value in attrs: for prefix, value in attrs:
if not prefix.startswith("xmlns:"): continue if not prefix.startswith("xmlns:"): continue
prefix = prefix[6:] prefix = prefix[6:]
if prefix.find('backend.userland.com/rss') <> -1:
# match any backend.userland.com namespace
prefix = 'http://backend.userland.com/rss'
if self.namespaces.has_key(value): if self.namespaces.has_key(value):
self.namespacemap[prefix] = self.namespaces[value] self.namespacemap[prefix] = self.namespaces[value]
@ -120,7 +167,7 @@ class RSSParser(sgmllib.SGMLParser):
prefix = self.namespacemap.get(prefix, prefix) prefix = self.namespacemap.get(prefix, prefix)
name = prefix + ':' + suffix name = prefix + ':' + suffix
return name return name
def _getAttribute(self, attrs, name): def _getAttribute(self, attrs, name):
value = [v for k, v in attrs if self._mapToStandardPrefix(k) == name] value = [v for k, v in attrs if self._mapToStandardPrefix(k) == name]
if value: if value:
@ -128,7 +175,7 @@ class RSSParser(sgmllib.SGMLParser):
else: else:
value = None value = None
return value return value
def start_channel(self, attrs): def start_channel(self, attrs):
self.push('channel', 0) self.push('channel', 0)
self.inchannel = 1 self.inchannel = 1
@ -136,6 +183,18 @@ class RSSParser(sgmllib.SGMLParser):
def end_channel(self): def end_channel(self):
self.pop('channel') self.pop('channel')
self.inchannel = 0 self.inchannel = 0
def start_image(self, attrs):
self.inimage = 1
def end_image(self):
self.inimage = 0
def start_textinput(self, attrs):
self.intextinput = 1
def end_textinput(self):
self.intextinput = 0
def start_item(self, attrs): def start_item(self, attrs):
self.items.append({}) self.items.append({})
@ -201,7 +260,10 @@ class RSSParser(sgmllib.SGMLParser):
def end_guid(self): def end_guid(self):
self.pop('guid') self.pop('guid')
if self.guidislink: if self.guidislink:
self.items[-1]['link'] = self.items[-1]['guid'] if not self.items[-1].has_key('link'):
# guid acts as link, but only if "ispermalink" is not present or is "true",
# and only if the item doesn't already have a link element
self.items[-1]['link'] = self.items[-1]['guid']
def start_title(self, attrs): def start_title(self, attrs):
self.push('title', self.inchannel or self.initem) self.push('title', self.inchannel or self.initem)
@ -224,7 +286,101 @@ class RSSParser(sgmllib.SGMLParser):
self.elementstack[-1][2].append(value) self.elementstack[-1][2].append(value)
self.pop('generator') self.pop('generator')
def start_feed(self, attrs):
self.inchannel = 1
def end_feed(self):
self.inchannel = 0
def start_entry(self, attrs):
self.items.append({})
self.push('item', 0)
self.initem = 1
def end_entry(self):
self.pop('item')
self.initem = 0
def start_subtitle(self, attrs):
self.push('subtitle', 1)
def end_subtitle(self):
self.pop('subtitle')
def start_summary(self, attrs):
self.push('summary', 1)
def end_summary(self):
self.pop('summary')
def start_modified(self, attrs):
self.push('modified', 1)
def end_modified(self):
self.pop('modified')
def start_created(self, attrs):
self.push('created', 1)
def end_created(self):
self.pop('created')
def start_issued(self, attrs):
self.push('issued', 1)
def end_issued(self):
self.pop('issued')
def start_id(self, attrs):
self.push('id', 1)
def end_id(self):
self.pop('id')
def start_content(self, attrs):
self.incontent = 1
if ('mode', 'escaped') in attrs:
self.contentmode = 'escaped'
elif ('mode', 'base64') in attrs:
self.contentmode = 'base64'
else:
self.contentmode = 'xml'
mimetype = [v for k, v in attrs if k=='type']
if mimetype:
self.contenttype = mimetype[0]
xmllang = [v for k, v in attrs if k=='xml:lang']
if xmllang:
self.contentlang = xmllang[0]
self.push('content', 1)
def end_content(self):
self.pop('content')
self.incontent = 0
self.contentmode = None
self.contenttype = None
self.contentlang = None
def start_body(self, attrs):
self.incontent = 1
self.contentmode = 'xml'
self.contenttype = 'application/xhtml+xml'
xmllang = [v for k, v in attrs if k=='xml:lang']
if xmllang:
self.contentlang = xmllang[0]
self.push('content', 1)
start_div = start_body
start_xhtml_body = start_body
start_xhtml_div = start_body
end_body = end_content
end_div = end_content
end_xhtml_body = end_content
end_xhtml_div = end_content
def unknown_starttag(self, tag, attrs): def unknown_starttag(self, tag, attrs):
if self.incontent and self.contentmode == 'xml':
self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])))
return
self._addNamespaces(attrs) self._addNamespaces(attrs)
colonpos = tag.find(':') colonpos = tag.find(':')
if colonpos <> -1: if colonpos <> -1:
@ -242,6 +398,9 @@ class RSSParser(sgmllib.SGMLParser):
return self.push(tag, 0) return self.push(tag, 0)
def unknown_endtag(self, tag): def unknown_endtag(self, tag):
if self.incontent and self.contentmode == 'xml':
self.handle_data("</%s>" % tag)
return
colonpos = tag.find(':') colonpos = tag.find(':')
if colonpos <> -1: if colonpos <> -1:
prefix = tag[:colonpos] prefix = tag[:colonpos]
@ -261,18 +420,26 @@ class RSSParser(sgmllib.SGMLParser):
# called for each character reference, e.g. for "&#160;", ref will be "160" # called for each character reference, e.g. for "&#160;", ref will be "160"
# Reconstruct the original character reference. # Reconstruct the original character reference.
if not self.elementstack: return if not self.elementstack: return
self.elementstack[-1][2].append("&#%(ref)s;" % locals()) text = "&#%s;" % ref
if self.incontent and self.contentmode == 'xml':
text = cgi.escape(text)
self.elementstack[-1][2].append(text)
def handle_entityref(self, ref): def handle_entityref(self, ref):
# called for each entity reference, e.g. for "&copy;", ref will be "copy" # called for each entity reference, e.g. for "&copy;", ref will be "copy"
# Reconstruct the original entity reference. # Reconstruct the original entity reference.
if not self.elementstack: return if not self.elementstack: return
self.elementstack[-1][2].append("&%(ref)s;" % locals()) text = "&%s;" % ref
if self.incontent and self.contentmode == 'xml':
text = cgi.escape(text)
self.elementstack[-1][2].append(text)
def handle_data(self, text): def handle_data(self, text):
# called for each block of plain text, i.e. outside of any tag and # called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references # not containing any character or entity references
if not self.elementstack: return if not self.elementstack: return
if self.incontent and self.contentmode == 'xml':
text = cgi.escape(text)
self.elementstack[-1][2].append(text) self.elementstack[-1][2].append(text)
def handle_comment(self, text): def handle_comment(self, text):
@ -315,6 +482,29 @@ class RSSParser(sgmllib.SGMLParser):
return k+3 return k+3
return sgmllib.SGMLParser.parse_declaration(self, i) return sgmllib.SGMLParser.parse_declaration(self, i)
class FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
def http_error_default(self, req, fp, code, msg, headers):
if ((code / 100) == 3) and (code != 304):
return self.http_error_302(req, fp, code, msg, headers)
from urllib import addinfourl
infourl = addinfourl(fp, headers, req.get_full_url())
infourl.status = code
return infourl
# raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
def http_error_302(self, req, fp, code, msg, headers):
infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
infourl.status = code
return infourl
def http_error_301(self, req, fp, code, msg, headers):
infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
infourl.status = code
return infourl
http_error_300 = http_error_302
http_error_307 = http_error_302
def open_resource(source, etag=None, modified=None, agent=None, referrer=None): def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
""" """
URI, filename, or string --> stream URI, filename, or string --> stream
@ -338,10 +528,6 @@ def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
If the referrer argument is supplied, it will be used as the value of a If the referrer argument is supplied, it will be used as the value of a
Referer[sic] request header. Referer[sic] request header.
The optional arguments are only used if the source argument is an HTTP
URL and the urllib2 module is importable (i.e., you must be using Python
version 2.0 or higher).
""" """
if hasattr(source, "read"): if hasattr(source, "read"):
@ -350,37 +536,27 @@ def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
if source == "-": if source == "-":
return sys.stdin return sys.stdin
if not agent:
agent = USER_AGENT
# try to open with urllib2 (to use optional headers) # try to open with urllib2 (to use optional headers)
try: request = urllib2.Request(source)
import urllib2 if etag:
request = urllib2.Request(source) request.add_header("If-None-Match", etag)
if etag: if modified:
request.add_header("If-None-Match", etag) request.add_header("If-Modified-Since", format_http_date(modified))
if modified: request.add_header("User-Agent", agent)
request.add_header("If-Modified-Since", format_http_date(modified)) if referrer:
if agent: request.add_header("Referer", referrer)
request.add_header("User-Agent", agent)
if referrer:
# http://www.dictionary.com/search?q=referer
request.add_header("Referer", referrer)
request.add_header("Accept-encoding", "gzip") request.add_header("Accept-encoding", "gzip")
try: opener = urllib2.build_opener(FeedURLHandler())
return urllib2.urlopen(request) opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
except urllib2.HTTPError: try:
# either the resource is not modified or some other HTTP return opener.open(request)
# error occurred so return an empty resource except:
return StringIO.StringIO("") # source is not a valid URL, but it might be a valid filename
except: pass
# source must not be a valid URL but it might be a valid filename
pass
except ImportError:
# urllib2 isn't available so try to open with urllib
try:
return urllib.urlopen(source)
except:
# source still might be a filename
pass
# try to open with native open function (if source is a filename) # try to open with native open function (if source is a filename)
try: try:
return open(source) return open(source)
@ -392,7 +568,7 @@ def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
def get_etag(resource): def get_etag(resource):
""" """
Get the ETag associated with a response returned from a call to Get the ETag associated with a response returned from a call to
open_resource(). open_resource().
If the resource was not returned from an HTTP server or the server did If the resource was not returned from an HTTP server or the server did
@ -478,12 +654,16 @@ def parse_http_date(date):
return None return None
def parse(uri, etag=None, modified=None, agent=None, referrer=None): def parse(uri, etag=None, modified=None, agent=None, referrer=None):
r = RSSParser() r = FeedParser()
f = open_resource(uri, etag=etag, modified=modified, agent=agent, referrer=referrer) f = open_resource(uri, etag=etag, modified=modified, agent=agent, referrer=referrer)
data = f.read() data = f.read()
if hasattr(f, "headers"): if hasattr(f, "headers"):
if f.headers.get('content-encoding', None) == 'gzip': if f.headers.get('content-encoding', '') == 'gzip':
data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read() try:
data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
except:
# some feeds claim to be gzipped but they're not, so we get garbage
data = ''
r.feed(data) r.feed(data)
result = {"channel": r.channel, "items": r.items} result = {"channel": r.channel, "items": r.items}
newEtag = get_etag(f) newEtag = get_etag(f)
@ -492,6 +672,20 @@ def parse(uri, etag=None, modified=None, agent=None, referrer=None):
newModified = get_modified(f) newModified = get_modified(f)
if newModified: result["modified"] = newModified if newModified: result["modified"] = newModified
elif modified: result["modified"] = modified elif modified: result["modified"] = modified
if hasattr(f, "url"):
result["url"] = f.url
if hasattr(f, "headers"):
result["headers"] = f.headers.dict
if hasattr(f, "status"):
result["status"] = f.status
elif hasattr(f, "url"):
result["status"] = 200
# get the xml encoding
if result.get('encoding', '') == '':
xmlheaderRe = re.compile('<\?.*encoding="(.*)".*\?>')
match = xmlheaderRe.match(data)
if match:
result['encoding'] = match.groups()[0].lower()
f.close() f.close()
return result return result
@ -515,5 +709,14 @@ if __name__ == '__main__':
print url print url
print print
result = parse(url) result = parse(url)
pprint(result['channel']) pprint(result)
print print
"""
TODO
- textinput/textInput
- image
- author
- contributor
- comments
"""