mirror of
https://github.com/Mikaela/Limnoria.git
synced 2024-12-25 12:12:54 +01:00
520 lines
19 KiB
Python
520 lines
19 KiB
Python
#!/usr/bin/python
|
|
"""Ultra-liberal RSS parser
|
|
|
|
Visit http://diveintomark.org/projects/rss_parser/ for the latest version
|
|
|
|
Handles RSS 0.9x and RSS 1.0 feeds
|
|
|
|
RSS 0.9x elements:
|
|
- title, link, description, webMaster, managingEditor, language
|
|
copyright, lastBuildDate, pubDate
|
|
|
|
RSS 1.0 elements:
|
|
- dc:rights, dc:language, dc:creator, dc:date, dc:subject,
|
|
content:encoded
|
|
|
|
Things it handles that choke other RSS parsers:
|
|
- bastard combinations of RSS 0.9x and RSS 1.0 (most Movable Type feeds)
|
|
- illegal XML characters (most Radio feeds)
|
|
- naked and/or invalid HTML in description (The Register)
|
|
- content:encoded in item element (Aaron Swartz)
|
|
- guid in item element (Scripting News)
|
|
- fullitem in item element (Jon Udell)
|
|
- non-standard namespaces (BitWorking)
|
|
|
|
Requires Python 2.2 or later
|
|
"""
|
|
|
|
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
|
|
__copyright__ = "Copyright 2002, Mark Pilgrim"
|
|
__contributors__ = ["Jason Diamond (jason@injektilo.org)"]
|
|
__license__ = "GPL"
|
|
__history__ = """
|
|
1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
|
|
added Simon Fell's test suite
|
|
1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
|
|
2.0 - 10/19/2002
|
|
JD - use inchannel to watch out for image and textinput elements which can
|
|
also contain title, link, and description elements
|
|
JD - check for isPermaLink="false" attribute on guid elements
|
|
JD - replaced openAnything with open_resource supporting ETag and
|
|
If-Modified-Since request headers
|
|
JD - parse now accepts etag, modified, agent, and referrer optional
|
|
arguments
|
|
JD - modified parse to return a dictionary instead of a tuple so that any
|
|
etag or modified information can be returned and cached by the caller
|
|
2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
|
|
because of etag/modified, return the old etag/modified to the caller to
|
|
indicate why nothing is being returned
|
|
2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
|
|
useless. Fixes the problem JD was addressing by adding it.
|
|
2.1 - 11/14/2002 - MAP - added gzip support
|
|
2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
|
|
start_admingeneratoragent is an example of how to handle elements with
|
|
only attributes, no content.
|
|
"""
|
|
|
|
try:
|
|
import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
|
|
timeoutsocket.setDefaultSocketTimeout(10)
|
|
except ImportError:
|
|
pass
|
|
import cgi, re, sgmllib, string, StringIO, urllib, gzip
|
|
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
|
|
|
|
def decodeEntities(data):
    """Replace the five predefined XML entities in ``data`` with their characters.

    ``data`` may be None or empty, in which case an empty string is returned.
    Only ``&lt;``, ``&gt;``, ``&quot;``, ``&apos;`` and ``&amp;`` are handled;
    any other entity or character reference is left untouched.
    """
    data = data or ''
    data = data.replace('&lt;', '<')
    data = data.replace('&gt;', '>')
    data = data.replace('&quot;', '"')
    data = data.replace('&apos;', "'")
    # '&amp;' is decoded last so that a double-escaped '&amp;lt;' becomes the
    # text '&lt;' rather than being decoded a second time into '<'.
    data = data.replace('&amp;', '&')
    return data
|
|
|
|
class RSSParser(sgmllib.SGMLParser):
    """Forgiving SGML-based parser that collects RSS channel and item data.

    After ``feed()``, ``channel`` holds the channel-level elements and
    ``items`` holds one dict per ``<item>``.  Both are keyed by the element
    names the handlers below pass to ``push()``/``pop()``.
    """

    # Known namespace URIs mapped to the prefix used in handler method names.
    # An empty prefix means the element is dispatched as if it were unprefixed.
    namespaces = {"http://backend.userland.com/rss": "",
                  "http://backend.userland.com/rss2": "",
                  "http://purl.org/rss/1.0/": "",
                  "http://purl.org/rss/1.0/modules/textinput/": "ti",
                  "http://purl.org/rss/1.0/modules/company/": "co",
                  "http://purl.org/rss/1.0/modules/syndication/": "sy",
                  "http://purl.org/dc/elements/1.1/": "dc",
                  "http://webns.net/mvcb/": "admin"}

    def reset(self):
        """Clear all collected data and parser state."""
        self.channel = {}
        self.items = []
        # Each entry is [element name, expectingText flag, list of text pieces].
        self.elementstack = []
        self.inchannel = 0
        self.initem = 0
        # Initialised here so a stray </guid> without <guid> cannot raise
        # AttributeError in end_guid().
        self.guidislink = 0
        # Prefix declared in the document -> standard prefix from `namespaces`.
        self.namespacemap = {}
        sgmllib.SGMLParser.reset(self)

    def push(self, element, expectingText):
        """Open ``element``; its text is kept on pop() only if ``expectingText``."""
        self.elementstack.append([element, expectingText, []])

    def pop(self, element):
        """Close ``element`` and store its decoded text on the item or channel.

        Does nothing when the stack is empty or when ``element`` is not the
        innermost open element, so mismatched end tags are ignored.
        """
        if not self.elementstack:
            return
        if self.elementstack[-1][0] != element:
            return
        element, expectingText, pieces = self.elementstack.pop()
        if not expectingText:
            return
        output = decodeEntities("".join(pieces))
        if self.initem:
            self.items[-1][element] = output
        elif self.inchannel:
            self.channel[element] = output

    def _addNamespaces(self, attrs):
        """Record the prefix of every ``xmlns:*`` attribute naming a known URI."""
        for prefix, value in attrs:
            if not prefix.startswith("xmlns:"):
                continue
            prefix = prefix[6:]  # drop the leading "xmlns:"
            if value in self.namespaces:
                self.namespacemap[prefix] = self.namespaces[value]

    def _mapToStandardPrefix(self, name):
        """Return ``name`` with its document prefix replaced by the standard one."""
        colonpos = name.find(':')
        if colonpos != -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrs, name):
        """Return the first attribute value whose mapped name is ``name``, else None."""
        for key, value in attrs:
            if self._mapToStandardPrefix(key) == name:
                return value
        return None

    def _mappedHandlerSuffix(self, tag):
        """Return the handler-name suffix for a prefixed tag, or None if unprefixed.

        ``dc:creator`` becomes ``dc_creator``; a prefix that maps to the empty
        string is dropped, so the element is treated as unprefixed.
        """
        colonpos = tag.find(':')
        if colonpos == -1:
            return None
        prefix = tag[:colonpos]
        suffix = tag[colonpos+1:]
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'
        return prefix + suffix

    def start_channel(self, attrs):
        self.push('channel', 0)
        self.inchannel = 1

    def end_channel(self):
        self.pop('channel')
        self.inchannel = 0

    def start_item(self, attrs):
        self.items.append({})
        self.push('item', 0)
        self.initem = 1

    def end_item(self):
        self.pop('item')
        self.initem = 0

    def start_dc_language(self, attrs):
        self.push('language', 1)
    start_language = start_dc_language

    def end_dc_language(self):
        self.pop('language')
    end_language = end_dc_language

    def start_dc_creator(self, attrs):
        self.push('creator', 1)
    start_managingeditor = start_dc_creator
    start_webmaster = start_dc_creator

    def end_dc_creator(self):
        self.pop('creator')
    end_managingeditor = end_dc_creator
    end_webmaster = end_dc_creator

    def start_dc_rights(self, attrs):
        self.push('rights', 1)
    start_copyright = start_dc_rights

    def end_dc_rights(self):
        self.pop('rights')
    end_copyright = end_dc_rights

    def start_dc_date(self, attrs):
        self.push('date', 1)
    start_lastbuilddate = start_dc_date
    start_pubdate = start_dc_date

    def end_dc_date(self):
        self.pop('date')
    end_lastbuilddate = end_dc_date
    end_pubdate = end_dc_date

    def start_dc_subject(self, attrs):
        self.push('category', 1)

    def end_dc_subject(self):
        self.pop('category')

    def start_link(self, attrs):
        self.push('link', self.inchannel or self.initem)

    def end_link(self):
        self.pop('link')

    def start_guid(self, attrs):
        # A guid doubles as the item link unless isPermaLink="false" is given.
        self.guidislink = ('ispermalink', 'false') not in attrs
        self.push('guid', 1)

    def end_guid(self):
        self.pop('guid')
        # Copy the guid only when one was actually stored on the current item;
        # a guid outside an item, or one whose text was discarded by pop(),
        # must not abort parsing with IndexError/KeyError.
        if self.guidislink and self.initem and 'guid' in self.items[-1]:
            self.items[-1]['link'] = self.items[-1]['guid']

    def start_title(self, attrs):
        self.push('title', self.inchannel or self.initem)

    def start_description(self, attrs):
        self.push('description', self.inchannel or self.initem)

    def start_content_encoded(self, attrs):
        self.push('content_encoded', 1)
    start_fullitem = start_content_encoded

    def end_content_encoded(self):
        self.pop('content_encoded')
    end_fullitem = end_content_encoded

    def start_admin_generatoragent(self, attrs):
        # The value lives in the rdf:resource attribute and the element has no
        # text content, so it is pushed, filled in and popped in one go.
        self.push('generator', 1)
        value = self._getAttribute(attrs, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')

    def unknown_starttag(self, tag, attrs):
        """Dispatch a prefixed start tag to ``start_<prefix>_<name>`` if defined."""
        self._addNamespaces(attrs)
        name = self._mappedHandlerSuffix(tag)
        if name is None:
            return self.push(tag, 0)
        # Look the handler up with a default instead of catching AttributeError,
        # so an AttributeError raised inside a handler is not mistaken for a
        # missing handler.
        method = getattr(self, 'start_' + name, None)
        if method is None:
            return self.push(name, 0)
        return method(attrs)

    def unknown_endtag(self, tag):
        """Dispatch a prefixed end tag to ``end_<prefix>_<name>`` if defined."""
        name = self._mappedHandlerSuffix(tag)
        if name is None:
            return self.pop(tag)
        method = getattr(self, 'end_' + name, None)
        if method is None:
            return self.pop(name)
        return method()

    def handle_charref(self, ref):
        # called for each character reference, e.g. for "&#160;", ref will be "160"
        # Reconstruct the original character reference.
        if not self.elementstack:
            return
        self.elementstack[-1][2].append("&#%s;" % ref)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for "&copy;", ref will be "copy"
        # Reconstruct the original entity reference.
        if not self.elementstack:
            return
        self.elementstack[-1][2].append("&%s;" % ref)

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack:
            return
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        pass

    # Declaration-name pattern used by _scan_name(); it accepts ':' in names.
    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match

    def _scan_name(self, i, declstartpos):
        """Scan a declaration name at ``rawdata[i]``.

        Returns ``(lowercased name, end index)``, or ``(None, -1)`` when the
        buffer ends before the name is known to be complete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token")

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            # An unterminated section is treated as running to the end of the
            # buffer instead of being rescanned.
            if k == -1:
                k = len(self.rawdata)
            # Escape the raw content so it goes through the same entity
            # decoding in pop() as ordinary text.
            self.handle_data(cgi.escape(self.rawdata[i+9:k]))
            return k+3  # skip past the closing ']]>'
        return sgmllib.SGMLParser.parse_declaration(self, i)
|
|
|
|
def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
    """
    URI, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    An object that already has a read() method is returned unchanged, and
    the source "-" means standard input.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    The optional arguments are only used if the source argument is an HTTP
    URL and the urllib2 module is importable (i.e., you must be using Python
    version 2.0 or higher).
    """

    if hasattr(source, "read"):
        return source

    if source == "-":
        # sys is otherwise only imported under the __main__ guard, so import
        # it here to keep this branch working when used as a library.
        import sys
        return sys.stdin

    # try to open with urllib2 (to use optional headers)
    try:
        import urllib2
        request = urllib2.Request(source)
        if etag:
            request.add_header("If-None-Match", etag)
        if modified:
            request.add_header("If-Modified-Since", format_http_date(modified))
        if agent:
            request.add_header("User-Agent", agent)
        if referrer:
            # http://www.dictionary.com/search?q=referer
            request.add_header("Referer", referrer)
        request.add_header("Accept-encoding", "gzip")
        try:
            return urllib2.urlopen(request)
        except urllib2.HTTPError:
            # either the resource is not modified or some other HTTP
            # error occurred so return an empty resource
            return StringIO.StringIO("")
        except Exception:
            # source must not be a valid URL but it might be a valid filename
            # (Exception rather than a bare except, so Ctrl-C still works)
            pass
    except ImportError:
        # urllib2 isn't available so try to open with urllib
        try:
            return urllib.urlopen(source)
        except Exception:
            # source still might be a filename
            pass

    # try to open with native open function (if source is a filename)
    try:
        return open(source)
    except Exception:
        pass

    # treat source as string
    return StringIO.StringIO(str(source))
|
|
|
|
def get_etag(resource):
    """
    Get the ETag associated with a response returned from a call to
    open_resource().

    If the resource was not returned from an HTTP server or the server did
    not specify an ETag for the resource, this will return None.
    """

    # Only objects that expose info() carry response headers; anything else
    # has no ETag to report.
    if hasattr(resource, "info"):
        return resource.info().getheader("ETag")
    return None
|
|
|
|
def get_modified(resource):
    """
    Get the Last-Modified timestamp for a response returned from a call to
    open_resource().

    If the resource was not returned from an HTTP server or the server did
    not specify a Last-Modified timestamp, this function will return None.
    Otherwise, it returns a tuple of 9 integers as returned by gmtime() in
    the standard Python time module, or None if the header value cannot be
    parsed.
    """

    # Only objects that expose info() carry response headers.
    if hasattr(resource, "info"):
        last_modified = resource.info().getheader("Last-Modified")
        if last_modified:
            return parse_http_date(last_modified)
    return None
|
|
|
|
# English day and month names used in HTTP dates.  The position in each list
# is significant: weekday lists start at Monday (index 0) and months at
# January (index 0).  They stay lists because callers use .index() on them.
short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
long_weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                 "Saturday", "Sunday"]
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
|
|
|
|
def format_http_date(date):
    """
    Formats a tuple of 9 integers into an RFC 1123-compliant timestamp as
    required in RFC 2616. We don't use time.strftime() since the %a and %b
    directives can be affected by the current locale (HTTP dates have to be
    in English). The date MUST be in GMT (Greenwich Mean Time).

    Only the first seven fields are read: year, month (1-12), day, hour,
    minute, second and weekday (0 is Monday).
    """

    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        short_weekdays[date[6]],   # weekday name
        date[2],                   # day of month
        months[date[1] - 1],       # month name; the tuple's month is 1-based
        date[0],                   # year
        date[3], date[4], date[5]) # hour, minute, second
|
|
|
|
# Matchers for the three date formats HTTP allows.
# RFC 1123, e.g. "Sun, 06 Nov 1994 08:49:37 GMT"
rfc1123_match = re.compile(
    r"(?P<weekday>[A-Z][a-z]{2}), (?P<day>\d{2}) (?P<month>[A-Z][a-z]{2}) "
    r"(?P<year>\d{4}) (?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2}) GMT"
).match
# RFC 850, e.g. "Sunday, 06-Nov-94 08:49:37 GMT" (two-digit year)
rfc850_match = re.compile(
    r"(?P<weekday>[A-Z][a-z]+), (?P<day>\d{2})-(?P<month>[A-Z][a-z]{2})-"
    r"(?P<year>\d{2}) (?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2}) GMT"
).match
# asctime(), e.g. "Sun Nov  6 08:49:37 1994".  A single-digit day is padded
# with a space, so one or two spaces must be accepted after the month; the
# count is spelled out as [ ]{1,2} so it cannot be lost to whitespace folding.
asctime_match = re.compile(
    r"(?P<weekday>[A-Z][a-z]{2}) (?P<month>[A-Z][a-z]{2})[ ]{1,2}"
    r"(?P<day>\d\d?) (?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2}) "
    r"(?P<year>\d{4})"
).match
|
|
|
|
def parse_http_date(date):
    """
    Parses any of the three HTTP date formats into a tuple of 9 integers as
    returned by time.gmtime(). This should not use time.strptime() since
    that function is not available on all platforms and could also be
    affected by the current locale.

    Returns None when the text matches none of the formats or names an
    unknown month or weekday.  The weekday is taken from the text as given;
    it is not checked against the calendar date.
    """

    date = str(date)
    two_digit_year = 0
    weekdays = short_weekdays

    m = rfc1123_match(date)
    if not m:
        m = rfc850_match(date)
        if m:
            two_digit_year = 1
            weekdays = long_weekdays
        else:
            m = asctime_match(date)
            if not m:
                return None

    try:
        month = months.index(m.group("month")) + 1
        weekday = weekdays.index(m.group("weekday"))
    except ValueError:
        # the month or weekday lookup failed, indicating an invalid timestamp
        return None

    year = int(m.group("year"))
    if two_digit_year:
        # RFC 850 years have no century.  Use the same window as the C and
        # Python %y conversion (69-99 -> 1969-1999, 00-68 -> 2000-2068)
        # rather than always assuming 19xx.
        if year > 68:
            year += 1900
        else:
            year += 2000
    day = int(m.group("day"))
    hour = int(m.group("hour"))
    minute = int(m.group("minute"))
    second = int(m.group("second"))

    # Day of the year (1-based): days in the preceding months of a common
    # year, plus one leap day once February has passed in a leap year.
    days_before_month = (0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334)
    julian_day = days_before_month[month - 1] + day
    is_leap_year = (year % 4 == 0 and year % 100 != 0) or year % 400 == 0
    if is_leap_year and month > 2:
        julian_day += 1

    daylight_savings_flag = 0  # the timestamp is GMT, so DST never applies
    return (year, month, day, hour, minute, second, weekday, julian_day,
            daylight_savings_flag)
|
|
|
|
def parse(uri, etag=None, modified=None, agent=None, referrer=None):
    """Fetch and parse a feed, returning what was found as a dictionary.

    ``uri`` and the optional arguments are passed straight to
    open_resource().  The result always has "channel" (dict) and "items"
    (list of dicts).  It also has "etag" and/or "modified" when the response
    supplied them; when it did not, the caller's own etag/modified are
    returned unchanged.

    The opened resource is always closed before returning, including when
    reading or parsing raises.
    """
    r = RSSParser()
    f = open_resource(uri, etag=etag, modified=modified, agent=agent,
                      referrer=referrer)
    try:
        data = f.read()
        if hasattr(f, "headers"):
            # open_resource() asks for gzip, so undo it if the server obliged
            if f.headers.get('content-encoding', None) == 'gzip':
                data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
        r.feed(data)
        result = {"channel": r.channel, "items": r.items}

        newEtag = get_etag(f)
        if newEtag:
            result["etag"] = newEtag
        elif etag:
            result["etag"] = etag

        newModified = get_modified(f)
        if newModified:
            result["modified"] = newModified
        elif modified:
            result["modified"] = modified
    finally:
        f.close()
    return result
|
|
|
|
# Feeds parsed by default when the module is run as a script with no
# arguments: RSS 1.0 and 2.0 samples with modules, with and without a
# default namespace, and with local-name clashes.
TEST_SUITE = (
    'http://www.pocketsoap.com/rssTests/rss1.0withModules.xml',
    'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNS.xml',
    'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNSLocalNameClash.xml',
    'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModules.xml',
    'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModulesLocalNameClash.xml',
    'http://www.pocketsoap.com/rssTests/rss2.0NSwithModules.xml',
    'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNS.xml',
    'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNSLocalNameClash.xml',
)
|
|
|
|
if __name__ == '__main__':
    # Script mode: parse each URL given on the command line (or the built-in
    # test suite when none is given) and pretty-print its channel data.
    import sys
    from pprint import pprint

    urls = sys.argv[1:] or TEST_SUITE
    for url in urls:
        # print(x) with a single argument produces the same output under the
        # Python 2 print statement and as a function call; print("") emits
        # the blank line that a bare `print` statement did.
        print(url)
        print("")
        result = parse(url)
        pprint(result['channel'])
        print("")
|