mirror of
https://github.com/Mikaela/Limnoria.git
synced 2024-11-27 05:09:23 +01:00
Updated to the newest version of feedparser (nee rssparser; we kept the old name).
This commit is contained in:
parent
36b6821c5e
commit
b6c780cff3
@ -1,34 +1,41 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/python
|
||||||
"""Ultra-liberal RSS parser
|
"""Ultra-liberal feed parser
|
||||||
|
|
||||||
Visit http://diveintomark.org/projects/rss_parser/ for the latest version
|
Visit http://diveintomark.org/projects/feed_parser/ for the latest version
|
||||||
|
|
||||||
Handles RSS 0.9x and RSS 1.0 feeds
|
Handles RSS 0.9x, RSS 1.0, RSS 2.0, Pie/Atom/Echo feeds
|
||||||
|
|
||||||
RSS 0.9x elements:
|
RSS 0.9x/common elements:
|
||||||
- title, link, description, webMaster, managingEditor, language
|
- title, link, guid, description, webMaster, managingEditor, language
|
||||||
copyright, lastBuildDate, pubDate
|
copyright, lastBuildDate, pubDate
|
||||||
|
|
||||||
RSS 1.0 elements:
|
Additional RSS 1.0/2.0 elements:
|
||||||
- dc:rights, dc:language, dc:creator, dc:date, dc:subject,
|
- dc:rights, dc:language, dc:creator, dc:date, dc:subject,
|
||||||
content:encoded
|
content:encoded, admin:generatorAgent, admin:errorReportsTo
|
||||||
|
|
||||||
Things it handles that choke other RSS parsers:
|
Additional Pie/Atom/Echo elements:
|
||||||
- bastard combinations of RSS 0.9x and RSS 1.0 (most Movable Type feeds)
|
- subtitle, created, issued, modified, summary, id, content
|
||||||
- illegal XML characters (most Radio feeds)
|
|
||||||
- naked and/or invalid HTML in description (The Register)
|
Things it handles that choke other parsers:
|
||||||
- content:encoded in item element (Aaron Swartz)
|
- bastard combinations of RSS 0.9x and RSS 1.0
|
||||||
- guid in item element (Scripting News)
|
- illegal XML characters
|
||||||
- fullitem in item element (Jon Udell)
|
- naked and/or invalid HTML in description
|
||||||
- non-standard namespaces (BitWorking)
|
- content:encoded in item element
|
||||||
|
- guid in item element
|
||||||
|
- fullitem in item element
|
||||||
|
- non-standard namespaces
|
||||||
|
- inline XML in content (Pie/Atom/Echo)
|
||||||
|
- multiple content items per entry (Pie/Atom/Echo)
|
||||||
|
|
||||||
Requires Python 2.2 or later
|
Requires Python 2.2 or later
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
|
__version__ = "2.5.3"
|
||||||
__copyright__ = "Copyright 2002, Mark Pilgrim"
|
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
|
||||||
__contributors__ = ["Jason Diamond (jason@injektilo.org)"]
|
__copyright__ = "Copyright 2002-3, Mark Pilgrim"
|
||||||
__license__ = "GPL"
|
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
|
||||||
|
"John Beimler <http://john.beimler.org/>"]
|
||||||
|
__license__ = "Python"
|
||||||
__history__ = """
|
__history__ = """
|
||||||
1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
|
1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
|
||||||
added Simon Fell's test suite
|
added Simon Fell's test suite
|
||||||
@ -52,6 +59,25 @@ __history__ = """
|
|||||||
2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
|
2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
|
||||||
start_admingeneratoragent is an example of how to handle elements with
|
start_admingeneratoragent is an example of how to handle elements with
|
||||||
only attributes, no content.
|
only attributes, no content.
|
||||||
|
2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
|
||||||
|
also, make sure we send the User-Agent even if urllib2 isn't available.
|
||||||
|
Match any variation of backend.userland.com/rss namespace.
|
||||||
|
2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
|
||||||
|
2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
|
||||||
|
snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
|
||||||
|
project name
|
||||||
|
2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
|
||||||
|
removed unnecessary urllib code -- urllib2 should always be available anyway;
|
||||||
|
return actual url, status, and full HTTP headers (as result['url'],
|
||||||
|
result['status'], and result['headers']) if parsing a remote feed over HTTP --
|
||||||
|
this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
|
||||||
|
added the latest namespace-of-the-week for RSS 2.0
|
||||||
|
2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
|
||||||
|
User-Agent (otherwise urllib2 sends two, which confuses some servers)
|
||||||
|
2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
|
||||||
|
inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
|
||||||
|
2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
|
||||||
|
textInput, and also to return the character encoding (if specified)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -59,9 +85,11 @@ try:
|
|||||||
timeoutsocket.setDefaultSocketTimeout(10)
|
timeoutsocket.setDefaultSocketTimeout(10)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
import cgi, re, sgmllib, string, StringIO, urllib, gzip
|
import cgi, re, sgmllib, string, StringIO, gzip, urllib2
|
||||||
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
|
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
|
||||||
|
|
||||||
|
USER_AGENT = "UltraLiberalFeedParser/%s +http://diveintomark.org/projects/feed_parser/" % __version__
|
||||||
|
|
||||||
def decodeEntities(data):
|
def decodeEntities(data):
|
||||||
data = data or ''
|
data = data or ''
|
||||||
data = data.replace('&lt;', '<')
|
data = data.replace('&lt;', '<')
|
||||||
@ -71,15 +99,21 @@ def decodeEntities(data):
|
|||||||
data = data.replace('&amp;', '&')
|
data = data.replace('&amp;', '&')
|
||||||
return data
|
return data
|
||||||
|
|
||||||
class RSSParser(sgmllib.SGMLParser):
|
class FeedParser(sgmllib.SGMLParser):
|
||||||
namespaces = {"http://backend.userland.com/rss": "",
|
namespaces = {"http://backend.userland.com/rss": "",
|
||||||
"http://backend.userland.com/rss2": "",
|
"http://blogs.law.harvard.edu/tech/rss": "",
|
||||||
"http://purl.org/rss/1.0/": "",
|
"http://purl.org/rss/1.0/": "",
|
||||||
|
"http://example.com/newformat#": "",
|
||||||
|
"http://example.com/necho": "",
|
||||||
|
"http://purl.org/echo/": "",
|
||||||
|
"uri/of/echo/namespace#": "",
|
||||||
|
"http://purl.org/pie/": "",
|
||||||
"http://purl.org/rss/1.0/modules/textinput/": "ti",
|
"http://purl.org/rss/1.0/modules/textinput/": "ti",
|
||||||
"http://purl.org/rss/1.0/modules/company/": "co",
|
"http://purl.org/rss/1.0/modules/company/": "co",
|
||||||
"http://purl.org/rss/1.0/modules/syndication/": "sy",
|
"http://purl.org/rss/1.0/modules/syndication/": "sy",
|
||||||
"http://purl.org/dc/elements/1.1/": "dc",
|
"http://purl.org/dc/elements/1.1/": "dc",
|
||||||
"http://webns.net/mvcb/": "admin"}
|
"http://webns.net/mvcb/": "admin",
|
||||||
|
"http://www.w3.org/1999/xhtml": "xhtml"}
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.channel = {}
|
self.channel = {}
|
||||||
@ -87,6 +121,12 @@ class RSSParser(sgmllib.SGMLParser):
|
|||||||
self.elementstack = []
|
self.elementstack = []
|
||||||
self.inchannel = 0
|
self.inchannel = 0
|
||||||
self.initem = 0
|
self.initem = 0
|
||||||
|
self.incontent = 0
|
||||||
|
self.intextinput = 0
|
||||||
|
self.inimage = 0
|
||||||
|
self.contentmode = None
|
||||||
|
self.contenttype = None
|
||||||
|
self.contentlang = None
|
||||||
self.namespacemap = {}
|
self.namespacemap = {}
|
||||||
sgmllib.SGMLParser.reset(self)
|
sgmllib.SGMLParser.reset(self)
|
||||||
|
|
||||||
@ -100,15 +140,22 @@ class RSSParser(sgmllib.SGMLParser):
|
|||||||
if not expectingText: return
|
if not expectingText: return
|
||||||
output = "".join(pieces)
|
output = "".join(pieces)
|
||||||
output = decodeEntities(output)
|
output = decodeEntities(output)
|
||||||
if self.initem:
|
if self.incontent and self.initem:
|
||||||
|
if not self.items[-1].has_key(element):
|
||||||
|
self.items[-1][element] = []
|
||||||
|
self.items[-1][element].append({"language":self.contentlang, "type":self.contenttype, "value":output})
|
||||||
|
elif self.initem:
|
||||||
self.items[-1][element] = output
|
self.items[-1][element] = output
|
||||||
elif self.inchannel:
|
elif self.inchannel and (not self.intextinput) and (not self.inimage):
|
||||||
self.channel[element] = output
|
self.channel[element] = output
|
||||||
|
|
||||||
def _addNamespaces(self, attrs):
|
def _addNamespaces(self, attrs):
|
||||||
for prefix, value in attrs:
|
for prefix, value in attrs:
|
||||||
if not prefix.startswith("xmlns:"): continue
|
if not prefix.startswith("xmlns:"): continue
|
||||||
prefix = prefix[6:]
|
prefix = prefix[6:]
|
||||||
|
if prefix.find('backend.userland.com/rss') <> -1:
|
||||||
|
# match any backend.userland.com namespace
|
||||||
|
prefix = 'http://backend.userland.com/rss'
|
||||||
if self.namespaces.has_key(value):
|
if self.namespaces.has_key(value):
|
||||||
self.namespacemap[prefix] = self.namespaces[value]
|
self.namespacemap[prefix] = self.namespaces[value]
|
||||||
|
|
||||||
@ -137,6 +184,18 @@ class RSSParser(sgmllib.SGMLParser):
|
|||||||
self.pop('channel')
|
self.pop('channel')
|
||||||
self.inchannel = 0
|
self.inchannel = 0
|
||||||
|
|
||||||
|
def start_image(self, attrs):
|
||||||
|
self.inimage = 1
|
||||||
|
|
||||||
|
def end_image(self):
|
||||||
|
self.inimage = 0
|
||||||
|
|
||||||
|
def start_textinput(self, attrs):
|
||||||
|
self.intextinput = 1
|
||||||
|
|
||||||
|
def end_textinput(self):
|
||||||
|
self.intextinput = 0
|
||||||
|
|
||||||
def start_item(self, attrs):
|
def start_item(self, attrs):
|
||||||
self.items.append({})
|
self.items.append({})
|
||||||
self.push('item', 0)
|
self.push('item', 0)
|
||||||
@ -201,6 +260,9 @@ class RSSParser(sgmllib.SGMLParser):
|
|||||||
def end_guid(self):
|
def end_guid(self):
|
||||||
self.pop('guid')
|
self.pop('guid')
|
||||||
if self.guidislink:
|
if self.guidislink:
|
||||||
|
if not self.items[-1].has_key('link'):
|
||||||
|
# guid acts as link, but only if "ispermalink" is not present or is "true",
|
||||||
|
# and only if the item doesn't already have a link element
|
||||||
self.items[-1]['link'] = self.items[-1]['guid']
|
self.items[-1]['link'] = self.items[-1]['guid']
|
||||||
|
|
||||||
def start_title(self, attrs):
|
def start_title(self, attrs):
|
||||||
@ -224,7 +286,101 @@ class RSSParser(sgmllib.SGMLParser):
|
|||||||
self.elementstack[-1][2].append(value)
|
self.elementstack[-1][2].append(value)
|
||||||
self.pop('generator')
|
self.pop('generator')
|
||||||
|
|
||||||
|
def start_feed(self, attrs):
|
||||||
|
self.inchannel = 1
|
||||||
|
|
||||||
|
def end_feed(self):
|
||||||
|
self.inchannel = 0
|
||||||
|
|
||||||
|
def start_entry(self, attrs):
|
||||||
|
self.items.append({})
|
||||||
|
self.push('item', 0)
|
||||||
|
self.initem = 1
|
||||||
|
|
||||||
|
def end_entry(self):
|
||||||
|
self.pop('item')
|
||||||
|
self.initem = 0
|
||||||
|
|
||||||
|
def start_subtitle(self, attrs):
|
||||||
|
self.push('subtitle', 1)
|
||||||
|
|
||||||
|
def end_subtitle(self):
|
||||||
|
self.pop('subtitle')
|
||||||
|
|
||||||
|
def start_summary(self, attrs):
|
||||||
|
self.push('summary', 1)
|
||||||
|
|
||||||
|
def end_summary(self):
|
||||||
|
self.pop('summary')
|
||||||
|
|
||||||
|
def start_modified(self, attrs):
|
||||||
|
self.push('modified', 1)
|
||||||
|
|
||||||
|
def end_modified(self):
|
||||||
|
self.pop('modified')
|
||||||
|
|
||||||
|
def start_created(self, attrs):
|
||||||
|
self.push('created', 1)
|
||||||
|
|
||||||
|
def end_created(self):
|
||||||
|
self.pop('created')
|
||||||
|
|
||||||
|
def start_issued(self, attrs):
|
||||||
|
self.push('issued', 1)
|
||||||
|
|
||||||
|
def end_issued(self):
|
||||||
|
self.pop('issued')
|
||||||
|
|
||||||
|
def start_id(self, attrs):
|
||||||
|
self.push('id', 1)
|
||||||
|
|
||||||
|
def end_id(self):
|
||||||
|
self.pop('id')
|
||||||
|
|
||||||
|
def start_content(self, attrs):
|
||||||
|
self.incontent = 1
|
||||||
|
if ('mode', 'escaped') in attrs:
|
||||||
|
self.contentmode = 'escaped'
|
||||||
|
elif ('mode', 'base64') in attrs:
|
||||||
|
self.contentmode = 'base64'
|
||||||
|
else:
|
||||||
|
self.contentmode = 'xml'
|
||||||
|
mimetype = [v for k, v in attrs if k=='type']
|
||||||
|
if mimetype:
|
||||||
|
self.contenttype = mimetype[0]
|
||||||
|
xmllang = [v for k, v in attrs if k=='xml:lang']
|
||||||
|
if xmllang:
|
||||||
|
self.contentlang = xmllang[0]
|
||||||
|
self.push('content', 1)
|
||||||
|
|
||||||
|
def end_content(self):
|
||||||
|
self.pop('content')
|
||||||
|
self.incontent = 0
|
||||||
|
self.contentmode = None
|
||||||
|
self.contenttype = None
|
||||||
|
self.contentlang = None
|
||||||
|
|
||||||
|
def start_body(self, attrs):
|
||||||
|
self.incontent = 1
|
||||||
|
self.contentmode = 'xml'
|
||||||
|
self.contenttype = 'application/xhtml+xml'
|
||||||
|
xmllang = [v for k, v in attrs if k=='xml:lang']
|
||||||
|
if xmllang:
|
||||||
|
self.contentlang = xmllang[0]
|
||||||
|
self.push('content', 1)
|
||||||
|
|
||||||
|
start_div = start_body
|
||||||
|
start_xhtml_body = start_body
|
||||||
|
start_xhtml_div = start_body
|
||||||
|
end_body = end_content
|
||||||
|
end_div = end_content
|
||||||
|
end_xhtml_body = end_content
|
||||||
|
end_xhtml_div = end_content
|
||||||
|
|
||||||
def unknown_starttag(self, tag, attrs):
|
def unknown_starttag(self, tag, attrs):
|
||||||
|
if self.incontent and self.contentmode == 'xml':
|
||||||
|
self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])))
|
||||||
|
return
|
||||||
self._addNamespaces(attrs)
|
self._addNamespaces(attrs)
|
||||||
colonpos = tag.find(':')
|
colonpos = tag.find(':')
|
||||||
if colonpos <> -1:
|
if colonpos <> -1:
|
||||||
@ -242,6 +398,9 @@ class RSSParser(sgmllib.SGMLParser):
|
|||||||
return self.push(tag, 0)
|
return self.push(tag, 0)
|
||||||
|
|
||||||
def unknown_endtag(self, tag):
|
def unknown_endtag(self, tag):
|
||||||
|
if self.incontent and self.contentmode == 'xml':
|
||||||
|
self.handle_data("</%s>" % tag)
|
||||||
|
return
|
||||||
colonpos = tag.find(':')
|
colonpos = tag.find(':')
|
||||||
if colonpos <> -1:
|
if colonpos <> -1:
|
||||||
prefix = tag[:colonpos]
|
prefix = tag[:colonpos]
|
||||||
@ -261,18 +420,26 @@ class RSSParser(sgmllib.SGMLParser):
|
|||||||
# called for each character reference, e.g. for "&#160;", ref will be "160"
|
# called for each character reference, e.g. for "&#160;", ref will be "160"
|
||||||
# Reconstruct the original character reference.
|
# Reconstruct the original character reference.
|
||||||
if not self.elementstack: return
|
if not self.elementstack: return
|
||||||
self.elementstack[-1][2].append("&#%(ref)s;" % locals())
|
text = "&#%s;" % ref
|
||||||
|
if self.incontent and self.contentmode == 'xml':
|
||||||
|
text = cgi.escape(text)
|
||||||
|
self.elementstack[-1][2].append(text)
|
||||||
|
|
||||||
def handle_entityref(self, ref):
|
def handle_entityref(self, ref):
|
||||||
# called for each entity reference, e.g. for "&copy;", ref will be "copy"
|
# called for each entity reference, e.g. for "&copy;", ref will be "copy"
|
||||||
# Reconstruct the original entity reference.
|
# Reconstruct the original entity reference.
|
||||||
if not self.elementstack: return
|
if not self.elementstack: return
|
||||||
self.elementstack[-1][2].append("&%(ref)s;" % locals())
|
text = "&%s;" % ref
|
||||||
|
if self.incontent and self.contentmode == 'xml':
|
||||||
|
text = cgi.escape(text)
|
||||||
|
self.elementstack[-1][2].append(text)
|
||||||
|
|
||||||
def handle_data(self, text):
|
def handle_data(self, text):
|
||||||
# called for each block of plain text, i.e. outside of any tag and
|
# called for each block of plain text, i.e. outside of any tag and
|
||||||
# not containing any character or entity references
|
# not containing any character or entity references
|
||||||
if not self.elementstack: return
|
if not self.elementstack: return
|
||||||
|
if self.incontent and self.contentmode == 'xml':
|
||||||
|
text = cgi.escape(text)
|
||||||
self.elementstack[-1][2].append(text)
|
self.elementstack[-1][2].append(text)
|
||||||
|
|
||||||
def handle_comment(self, text):
|
def handle_comment(self, text):
|
||||||
@ -315,6 +482,29 @@ class RSSParser(sgmllib.SGMLParser):
|
|||||||
return k+3
|
return k+3
|
||||||
return sgmllib.SGMLParser.parse_declaration(self, i)
|
return sgmllib.SGMLParser.parse_declaration(self, i)
|
||||||
|
|
||||||
|
class FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
|
||||||
|
def http_error_default(self, req, fp, code, msg, headers):
|
||||||
|
if ((code / 100) == 3) and (code != 304):
|
||||||
|
return self.http_error_302(req, fp, code, msg, headers)
|
||||||
|
from urllib import addinfourl
|
||||||
|
infourl = addinfourl(fp, headers, req.get_full_url())
|
||||||
|
infourl.status = code
|
||||||
|
return infourl
|
||||||
|
# raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
|
||||||
|
|
||||||
|
def http_error_302(self, req, fp, code, msg, headers):
|
||||||
|
infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
|
||||||
|
infourl.status = code
|
||||||
|
return infourl
|
||||||
|
|
||||||
|
def http_error_301(self, req, fp, code, msg, headers):
|
||||||
|
infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
|
||||||
|
infourl.status = code
|
||||||
|
return infourl
|
||||||
|
|
||||||
|
http_error_300 = http_error_302
|
||||||
|
http_error_307 = http_error_302
|
||||||
|
|
||||||
def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
|
def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
|
||||||
"""
|
"""
|
||||||
URI, filename, or string --> stream
|
URI, filename, or string --> stream
|
||||||
@ -338,10 +528,6 @@ def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
|
|||||||
|
|
||||||
If the referrer argument is supplied, it will be used as the value of a
|
If the referrer argument is supplied, it will be used as the value of a
|
||||||
Referer[sic] request header.
|
Referer[sic] request header.
|
||||||
|
|
||||||
The optional arguments are only used if the source argument is an HTTP
|
|
||||||
URL and the urllib2 module is importable (i.e., you must be using Python
|
|
||||||
version 2.0 or higher).
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if hasattr(source, "read"):
|
if hasattr(source, "read"):
|
||||||
@ -350,35 +536,25 @@ def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
|
|||||||
if source == "-":
|
if source == "-":
|
||||||
return sys.stdin
|
return sys.stdin
|
||||||
|
|
||||||
|
if not agent:
|
||||||
|
agent = USER_AGENT
|
||||||
|
|
||||||
# try to open with urllib2 (to use optional headers)
|
# try to open with urllib2 (to use optional headers)
|
||||||
try:
|
|
||||||
import urllib2
|
|
||||||
request = urllib2.Request(source)
|
request = urllib2.Request(source)
|
||||||
if etag:
|
if etag:
|
||||||
request.add_header("If-None-Match", etag)
|
request.add_header("If-None-Match", etag)
|
||||||
if modified:
|
if modified:
|
||||||
request.add_header("If-Modified-Since", format_http_date(modified))
|
request.add_header("If-Modified-Since", format_http_date(modified))
|
||||||
if agent:
|
|
||||||
request.add_header("User-Agent", agent)
|
request.add_header("User-Agent", agent)
|
||||||
if referrer:
|
if referrer:
|
||||||
# http://www.dictionary.com/search?q=referer
|
|
||||||
request.add_header("Referer", referrer)
|
request.add_header("Referer", referrer)
|
||||||
request.add_header("Accept-encoding", "gzip")
|
request.add_header("Accept-encoding", "gzip")
|
||||||
|
opener = urllib2.build_opener(FeedURLHandler())
|
||||||
|
opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
|
||||||
try:
|
try:
|
||||||
return urllib2.urlopen(request)
|
return opener.open(request)
|
||||||
except urllib2.HTTPError:
|
|
||||||
# either the resource is not modified or some other HTTP
|
|
||||||
# error occurred so return an empty resource
|
|
||||||
return StringIO.StringIO("")
|
|
||||||
except:
|
except:
|
||||||
# source must not be a valid URL but it might be a valid filename
|
# source is not a valid URL, but it might be a valid filename
|
||||||
pass
|
|
||||||
except ImportError:
|
|
||||||
# urllib2 isn't available so try to open with urllib
|
|
||||||
try:
|
|
||||||
return urllib.urlopen(source)
|
|
||||||
except:
|
|
||||||
# source still might be a filename
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# try to open with native open function (if source is a filename)
|
# try to open with native open function (if source is a filename)
|
||||||
@ -478,12 +654,16 @@ def parse_http_date(date):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def parse(uri, etag=None, modified=None, agent=None, referrer=None):
|
def parse(uri, etag=None, modified=None, agent=None, referrer=None):
|
||||||
r = RSSParser()
|
r = FeedParser()
|
||||||
f = open_resource(uri, etag=etag, modified=modified, agent=agent, referrer=referrer)
|
f = open_resource(uri, etag=etag, modified=modified, agent=agent, referrer=referrer)
|
||||||
data = f.read()
|
data = f.read()
|
||||||
if hasattr(f, "headers"):
|
if hasattr(f, "headers"):
|
||||||
if f.headers.get('content-encoding', None) == 'gzip':
|
if f.headers.get('content-encoding', '') == 'gzip':
|
||||||
|
try:
|
||||||
data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
|
data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
|
||||||
|
except:
|
||||||
|
# some feeds claim to be gzipped but they're not, so we get garbage
|
||||||
|
data = ''
|
||||||
r.feed(data)
|
r.feed(data)
|
||||||
result = {"channel": r.channel, "items": r.items}
|
result = {"channel": r.channel, "items": r.items}
|
||||||
newEtag = get_etag(f)
|
newEtag = get_etag(f)
|
||||||
@ -492,6 +672,20 @@ def parse(uri, etag=None, modified=None, agent=None, referrer=None):
|
|||||||
newModified = get_modified(f)
|
newModified = get_modified(f)
|
||||||
if newModified: result["modified"] = newModified
|
if newModified: result["modified"] = newModified
|
||||||
elif modified: result["modified"] = modified
|
elif modified: result["modified"] = modified
|
||||||
|
if hasattr(f, "url"):
|
||||||
|
result["url"] = f.url
|
||||||
|
if hasattr(f, "headers"):
|
||||||
|
result["headers"] = f.headers.dict
|
||||||
|
if hasattr(f, "status"):
|
||||||
|
result["status"] = f.status
|
||||||
|
elif hasattr(f, "url"):
|
||||||
|
result["status"] = 200
|
||||||
|
# get the xml encoding
|
||||||
|
if result.get('encoding', '') == '':
|
||||||
|
xmlheaderRe = re.compile('<\?.*encoding="(.*)".*\?>')
|
||||||
|
match = xmlheaderRe.match(data)
|
||||||
|
if match:
|
||||||
|
result['encoding'] = match.groups()[0].lower()
|
||||||
f.close()
|
f.close()
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@ -515,5 +709,14 @@ if __name__ == '__main__':
|
|||||||
print url
|
print url
|
||||||
print
|
print
|
||||||
result = parse(url)
|
result = parse(url)
|
||||||
pprint(result['channel'])
|
pprint(result)
|
||||||
print
|
print
|
||||||
|
|
||||||
|
"""
|
||||||
|
TODO
|
||||||
|
- textinput/textInput
|
||||||
|
- image
|
||||||
|
- author
|
||||||
|
- contributor
|
||||||
|
- comments
|
||||||
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user