mirror of
https://github.com/Mikaela/Limnoria.git
synced 2024-11-27 05:09:23 +01:00
Initial import.
This commit is contained in:
parent
83dd73a643
commit
1e353f7f12
519
others/rssparser.py
Normal file
519
others/rssparser.py
Normal file
@ -0,0 +1,519 @@
|
||||
#!/usr/bin/python
|
||||
"""Ultra-liberal RSS parser
|
||||
|
||||
Visit http://diveintomark.org/projects/rss_parser/ for the latest version
|
||||
|
||||
Handles RSS 0.9x and RSS 1.0 feeds
|
||||
|
||||
RSS 0.9x elements:
|
||||
- title, link, description, webMaster, managingEditor, language
|
||||
copyright, lastBuildDate, pubDate
|
||||
|
||||
RSS 1.0 elements:
|
||||
- dc:rights, dc:language, dc:creator, dc:date, dc:subject,
|
||||
content:encoded
|
||||
|
||||
Things it handles that choke other RSS parsers:
|
||||
- bastard combinations of RSS 0.9x and RSS 1.0 (most Movable Type feeds)
|
||||
- illegal XML characters (most Radio feeds)
|
||||
- naked and/or invalid HTML in description (The Register)
|
||||
- content:encoded in item element (Aaron Swartz)
|
||||
- guid in item element (Scripting News)
|
||||
- fullitem in item element (Jon Udell)
|
||||
- non-standard namespaces (BitWorking)
|
||||
|
||||
Requires Python 2.2 or later
|
||||
"""
|
||||
|
||||
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
|
||||
__copyright__ = "Copyright 2002, Mark Pilgrim"
|
||||
__contributors__ = ["Jason Diamond (jason@injektilo.org)"]
|
||||
__license__ = "GPL"
|
||||
__history__ = """
|
||||
1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
|
||||
added Simon Fell's test suite
|
||||
1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
|
||||
2.0 - 10/19/2002
|
||||
JD - use inchannel to watch out for image and textinput elements which can
|
||||
also contain title, link, and description elements
|
||||
JD - check for isPermaLink="false" attribute on guid elements
|
||||
JD - replaced openAnything with open_resource supporting ETag and
|
||||
If-Modified-Since request headers
|
||||
JD - parse now accepts etag, modified, agent, and referrer optional
|
||||
arguments
|
||||
JD - modified parse to return a dictionary instead of a tuple so that any
|
||||
etag or modified information can be returned and cached by the caller
|
||||
2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
|
||||
because of etag/modified, return the old etag/modified to the caller to
|
||||
indicate why nothing is being returned
|
||||
2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
|
||||
useless. Fixes the problem JD was addressing by adding it.
|
||||
2.1 - 11/14/2002 - MAP - added gzip support
|
||||
2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
|
||||
start_admingeneratoragent is an example of how to handle elements with
|
||||
only attributes, no content.
|
||||
"""
|
||||
|
||||
# Optional dependency: timeoutsocket gives every new socket a default
# timeout so a hung feed server cannot block the parser forever.  It is a
# third-party module, so its absence is tolerated silently.
try:
    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
    pass
import cgi, re, sgmllib, string, StringIO, urllib, gzip

# Widen sgmllib's tag-name pattern to accept ':' so namespaced tags such
# as "dc:date" reach unknown_starttag as one tag instead of being split
# at the colon.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
|
||||
|
||||
def decodeEntities(data):
    """Decode the five predefined XML character entities in *data*.

    Element text is accumulated with entity references re-encoded (see
    RSSParser.handle_entityref / handle_charref); this turns &lt;, &gt;,
    &quot;, &apos; and &amp; back into literal characters.  None or ''
    yields ''.  '&amp;' is decoded last so that a double-escaped
    reference such as '&amp;lt;' becomes '&lt;' rather than '<'.
    """
    data = data or ''
    # NOTE: the previous revision had the entity names collapsed to their
    # literal characters (e.g. replace('<', '<')), making every call a
    # no-op; the entity names are restored here.
    data = data.replace('&lt;', '<')
    data = data.replace('&gt;', '>')
    data = data.replace('&quot;', '"')
    data = data.replace('&apos;', "'")
    data = data.replace('&amp;', '&')
    return data
|
||||
|
||||
class RSSParser(sgmllib.SGMLParser):
    """SGML-based, error-tolerant parser for RSS 0.9x and RSS 1.0 feeds.

    Feed data through feed(); afterwards `self.channel` holds the
    channel-level metadata and `self.items` is a list of dictionaries,
    one per <item>.  Namespaced elements are normalized to the canonical
    prefixes in `namespaces` before handler dispatch.
    """

    # Known namespace URIs mapped to the canonical prefix used to build
    # start_*/end_* handler names.  '' means "no prefix" (core RSS).
    namespaces = {"http://backend.userland.com/rss": "",
                  "http://backend.userland.com/rss2": "",
                  "http://purl.org/rss/1.0/": "",
                  "http://purl.org/rss/1.0/modules/textinput/": "ti",
                  "http://purl.org/rss/1.0/modules/company/": "co",
                  "http://purl.org/rss/1.0/modules/syndication/": "sy",
                  "http://purl.org/dc/elements/1.1/": "dc",
                  "http://webns.net/mvcb/": "admin"}

    def reset(self):
        # (Re)initialize all parser state; SGMLParser.__init__ calls this.
        self.channel = {}       # channel-level element name -> decoded text
        self.items = []         # one dict per <item> seen so far
        self.elementstack = []  # open elements: [name, expectingText, pieces]
        self.inchannel = 0      # currently inside <channel>?
        self.initem = 0         # currently inside <item>?
        self.namespacemap = {}  # document prefix -> canonical prefix
        sgmllib.SGMLParser.reset(self)

    def push(self, element, expectingText):
        # Open an element.  Text chunks accumulate in the third slot;
        # expectingText=0 means the element's own text is discarded.
        self.elementstack.append([element, expectingText, []])

    def pop(self, element):
        # Close `element` and, when its text was wanted, store the decoded
        # text on the current item or on the channel.  Stray or mismatched
        # end tags are silently ignored (part of the "ultra-liberal" design).
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return
        element, expectingText, pieces = self.elementstack.pop()
        if not expectingText: return
        output = "".join(pieces)
        output = decodeEntities(output)
        if self.initem:
            self.items[-1][element] = output
        elif self.inchannel:
            self.channel[element] = output

    def _addNamespaces(self, attrs):
        # Record xmlns:<prefix> declarations whose URI we recognize so that
        # later tags written with the document's prefix dispatch to the
        # handler named after our canonical prefix.
        for prefix, value in attrs:
            if not prefix.startswith("xmlns:"): continue
            prefix = prefix[6:]
            if self.namespaces.has_key(value):
                self.namespacemap[prefix] = self.namespaces[value]

    def _mapToStandardPrefix(self, name):
        # Rewrite "docprefix:local" to "canonicalprefix:local"; names
        # without a colon (or with unknown prefixes) pass through unchanged.
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrs, name):
        # Return the value of the attribute whose normalized name equals
        # `name`, or None when absent.
        value = [v for k, v in attrs if self._mapToStandardPrefix(k) == name]
        if value:
            value = value[0]
        else:
            value = None
        return value

    def start_channel(self, attrs):
        self.push('channel', 0)
        self.inchannel = 1

    def end_channel(self):
        self.pop('channel')
        self.inchannel = 0

    def start_item(self, attrs):
        # Each <item> gets a fresh dict that pop() fills in.
        self.items.append({})
        self.push('item', 0)
        self.initem = 1

    def end_item(self):
        self.pop('item')
        self.initem = 0

    # The RSS 0.9x element names below are aliased onto the RSS 1.0 / Dublin
    # Core handlers so both vocabularies land under the same output key.

    def start_dc_language(self, attrs):
        self.push('language', 1)
    start_language = start_dc_language

    def end_dc_language(self):
        self.pop('language')
    end_language = end_dc_language

    def start_dc_creator(self, attrs):
        self.push('creator', 1)
    start_managingeditor = start_dc_creator
    start_webmaster = start_dc_creator

    def end_dc_creator(self):
        self.pop('creator')
    end_managingeditor = end_dc_creator
    end_webmaster = end_dc_creator

    def start_dc_rights(self, attrs):
        self.push('rights', 1)
    start_copyright = start_dc_rights

    def end_dc_rights(self):
        self.pop('rights')
    end_copyright = end_dc_rights

    def start_dc_date(self, attrs):
        self.push('date', 1)
    start_lastbuilddate = start_dc_date
    start_pubdate = start_dc_date

    def end_dc_date(self):
        self.pop('date')
    end_lastbuilddate = end_dc_date
    end_pubdate = end_dc_date

    def start_dc_subject(self, attrs):
        self.push('category', 1)

    def end_dc_subject(self):
        self.pop('category')

    def start_link(self, attrs):
        # Only keep link text inside <channel> or <item>; <image> and
        # <textinput> also contain <link>, which would clobber the real one.
        self.push('link', self.inchannel or self.initem)

    def end_link(self):
        self.pop('link')

    def start_guid(self, attrs):
        # A guid doubles as the item link unless isPermaLink="false".
        self.guidislink = ('ispermalink', 'false') not in attrs
        self.push('guid', 1)

    def end_guid(self):
        self.pop('guid')
        if self.guidislink:
            # NOTE(review): assumes <guid> only appears inside an <item>;
            # a stray channel-level guid would raise IndexError here.
            self.items[-1]['link'] = self.items[-1]['guid']

    def start_title(self, attrs):
        # Same <image>/<textinput> guard as start_link.
        self.push('title', self.inchannel or self.initem)

    def start_description(self, attrs):
        self.push('description', self.inchannel or self.initem)

    def start_content_encoded(self, attrs):
        self.push('content_encoded', 1)
    start_fullitem = start_content_encoded

    def end_content_encoded(self):
        self.pop('content_encoded')
    end_fullitem = end_content_encoded

    def start_admin_generatoragent(self, attrs):
        # This element carries its value in an rdf:resource attribute and
        # has no text content, so it is pushed and popped immediately.
        self.push('generator', 1)
        value = self._getAttribute(attrs, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')

    def unknown_starttag(self, tag, attrs):
        # Normalize the tag's namespace prefix and dispatch to a
        # start_<prefix>_<name> handler when one exists; otherwise push the
        # element with its text discarded.
        self._addNamespaces(attrs)
        colonpos = tag.find(':')
        if colonpos <> -1:
            prefix = tag[:colonpos]
            suffix = tag[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            if prefix:
                prefix = prefix + '_'
            methodname = 'start_' + prefix + suffix
            try:
                method = getattr(self, methodname)
                return method(attrs)
            except AttributeError:
                return self.push(prefix + suffix, 0)
        return self.push(tag, 0)

    def unknown_endtag(self, tag):
        # Mirror of unknown_starttag for end tags.
        colonpos = tag.find(':')
        if colonpos <> -1:
            prefix = tag[:colonpos]
            suffix = tag[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            if prefix:
                prefix = prefix + '_'
            methodname = 'end_' + prefix + suffix
            try:
                method = getattr(self, methodname)
                return method()
            except AttributeError:
                return self.pop(prefix + suffix)
        return self.pop(tag)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for "&#160;", ref will
        # be "160".  Reconstruct the original character reference verbatim;
        # decodeEntities runs later, in pop().
        if not self.elementstack: return
        self.elementstack[-1][2].append("&#%(ref)s;" % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for "&copy;", ref will be
        # "copy".  Reconstruct the original entity reference verbatim.
        if not self.elementstack: return
        self.elementstack[-1][2].append("&%(ref)s;" % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        pass

    # Replacement for sgmllib's declaration-name scanner; like the tagfind
    # patch above, it accepts ':' so namespaced names survive intact.
    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return string.lower(name), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token")

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks.
        # An unterminated CDATA section is treated as running to the end of
        # the buffer (fixes the infinite loop noted in __history__ 1.1).
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata)
            # re-escape the literal content so pop()'s entity decoding
            # round-trips it unchanged
            self.handle_data(cgi.escape(self.rawdata[i+9:k]))
            return k+3
        return sgmllib.SGMLParser.parse_declaration(self, i)
|
||||
|
||||
def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
    """
    URI, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module.  This MUST
    be in GMT (Greenwich Mean Time).  The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    The optional arguments are only used if the source argument is an HTTP
    URL and the urllib2 module is importable (i.e., you must be using Python
    version 2.0 or higher).
    """

    # file-like objects are passed through untouched
    if hasattr(source, "read"):
        return source

    if source == "-":
        # bugfix: sys is not imported at module level (only under
        # __main__), so this branch used to raise NameError; import it
        # locally before use.
        import sys
        return sys.stdin

    # try to open with urllib2 (to use optional request headers)
    try:
        import urllib2
        request = urllib2.Request(source)
        if etag:
            request.add_header("If-None-Match", etag)
        if modified:
            request.add_header("If-Modified-Since", format_http_date(modified))
        if agent:
            request.add_header("User-Agent", agent)
        if referrer:
            # the misspelled header name is mandated by the HTTP spec
            # http://www.dictionary.com/search?q=referer
            request.add_header("Referer", referrer)
        request.add_header("Accept-encoding", "gzip")
        try:
            return urllib2.urlopen(request)
        except urllib2.HTTPError:
            # either the resource is not modified or some other HTTP
            # error occurred so return an empty resource
            return StringIO.StringIO("")
        except:
            # source must not be a valid URL but it might be a valid filename
            pass
    except ImportError:
        # urllib2 isn't available so try to open with urllib
        try:
            return urllib.urlopen(source)
        except:
            # source still might be a filename
            pass

    # try to open with native open function (if source is a filename)
    try:
        return open(source)
    except:
        pass

    # treat source as a literal string of data
    return StringIO.StringIO(str(source))
|
||||
|
||||
def get_etag(resource):
    """Return the ETag of a response produced by open_resource().

    Yields None when the resource did not come from an HTTP server or
    the server sent no ETag header.
    """
    if not hasattr(resource, "info"):
        return None
    return resource.info().getheader("ETag")
|
||||
|
||||
def get_modified(resource):
    """Return the Last-Modified time of a response from open_resource().

    The value is a tuple of 9 integers as produced by time.gmtime()
    (parsed via parse_http_date), or None when the resource is not an
    HTTP response or carries no Last-Modified header.
    """
    if not hasattr(resource, "info"):
        return None
    stamp = resource.info().getheader("Last-Modified")
    if stamp:
        return parse_http_date(stamp)
    return None
|
||||
|
||||
# English day/month name tables shared by format_http_date and
# parse_http_date.  HTTP dates are always English (RFC 2616), so fixed
# tables are used instead of the locale-sensitive strftime/strptime names.
short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# RFC 850 dates spell the weekday out in full.
long_weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
|
||||
|
||||
def format_http_date(date):
    """Render a 9-integer GMT time tuple as an RFC 1123 timestamp.

    RFC 2616 requires English day/month names, so the module's lookup
    tables are used instead of time.strftime(), whose %a and %b
    directives are locale-sensitive.  *date* MUST be in GMT.
    """
    year, month, day, hour, minute, second, weekday = date[:7]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        short_weekdays[weekday], day, months[month - 1],
        year, hour, minute, second)
|
||||
|
||||
# Matchers for the three timestamp formats HTTP allows (RFC 2616 3.3.1).
# RFC 1123, e.g. "Sun, 06 Nov 1994 08:49:37 GMT"
rfc1123_match = re.compile(r"(?P<weekday>[A-Z][a-z]{2}), (?P<day>\d{2}) (?P<month>[A-Z][a-z]{2}) (?P<year>\d{4}) (?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2}) GMT").match
# RFC 850, e.g. "Sunday, 06-Nov-94 08:49:37 GMT" (full weekday, 2-digit year)
rfc850_match = re.compile(r"(?P<weekday>[A-Z][a-z]+), (?P<day>\d{2})-(?P<month>[A-Z][a-z]{2})-(?P<year>\d{2}) (?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2}) GMT").match
# ANSI C asctime(), e.g. "Sun Nov  6 08:49:37 1994" (no GMT suffix)
asctime_match = re.compile(r"(?P<weekday>[A-Z][a-z]{2}) (?P<month>[A-Z][a-z]{2}) ?(?P<day>\d\d?) (?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2}) (?P<year>\d{4})").match
|
||||
|
||||
def parse_http_date(date):
    """Parse any of the three HTTP date formats into a 9-integer tuple.

    The result has the shape of time.gmtime()'s return value.
    time.strptime() is deliberately avoided: it is not available on all
    platforms and is affected by the current locale.  Returns None for
    unrecognized or invalid timestamps.
    """
    text = str(date)
    base_year = 0                 # RFC 850 years are relative to 1900
    weekday_names = short_weekdays

    match = rfc1123_match(text)
    if match is None:
        match = rfc850_match(text)
        if match is not None:
            base_year = 1900
            weekday_names = long_weekdays
        else:
            match = asctime_match(text)
            if match is None:
                return None

    try:
        g = match.group
        year = base_year + int(g("year"))
        month = months.index(g("month")) + 1
        day = int(g("day"))
        hour = int(g("hour"))
        minute = int(g("minute"))
        second = int(g("second"))
        weekday = weekday_names.index(g("weekday"))
        # Julian day number via the standard Gregorian-calendar formula.
        a = int((14 - month) / 12)
        julian_day = (day - 32045 + int(((153 * (month + (12 * a) - 3)) + 2) / 5) + int((146097 * (year + 4800 - a)) / 400)) - (int((146097 * (year + 4799)) / 400) - 31738) + 1
        daylight_savings_flag = 0
        return (year, month, day, hour, minute, second, weekday, julian_day, daylight_savings_flag)
    except:
        # the month or weekday lookup probably failed, indicating an
        # invalid timestamp
        return None
|
||||
|
||||
def parse(uri, etag=None, modified=None, agent=None, referrer=None):
    """Fetch and parse an RSS feed.

    *uri* may be anything open_resource() accepts; etag/modified/agent/
    referrer become conditional-request headers.  Returns a dict with
    'channel' and 'items' keys, plus 'etag'/'modified' when the server
    (or the caller) supplied them, so the caller can cache them for the
    next request.
    """
    parser = RSSParser()
    stream = open_resource(uri, etag=etag, modified=modified,
                           agent=agent, referrer=referrer)
    data = stream.read()
    # transparently inflate gzip-compressed HTTP responses
    if hasattr(stream, "headers") and stream.headers.get('content-encoding', None) == 'gzip':
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    parser.feed(data)

    result = {"channel": parser.channel, "items": parser.items}
    # Echo back the freshest etag/modified we know of: the server's if it
    # sent one, otherwise the caller's own (explains an empty result).
    current_etag = get_etag(stream)
    if current_etag:
        result["etag"] = current_etag
    elif etag:
        result["etag"] = etag
    current_modified = get_modified(stream)
    if current_modified:
        result["modified"] = current_modified
    elif modified:
        result["modified"] = modified
    stream.close()
    return result
|
||||
|
||||
# Simon Fell's namespace test feeds (added in 1.0, see __history__);
# exercised by the command-line driver below when no URLs are given.
TEST_SUITE = ('http://www.pocketsoap.com/rssTests/rss1.0withModules.xml',
              'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNS.xml',
              'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNSLocalNameClash.xml',
              'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModules.xml',
              'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModulesLocalNameClash.xml',
              'http://www.pocketsoap.com/rssTests/rss2.0NSwithModules.xml',
              'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNS.xml',
              'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNSLocalNameClash.xml')
|
||||
|
||||
if __name__ == '__main__':
    # Command-line driver: parse each URL given on the command line
    # (default: the TEST_SUITE feeds) and pretty-print the channel data.
    import sys
    if sys.argv[1:]:
        urls = sys.argv[1:]
    else:
        urls = TEST_SUITE
    from pprint import pprint
    for url in urls:
        print url
        print
        result = parse(url)
        pprint(result['channel'])
        print
|
Loading…
Reference in New Issue
Block a user