utils/web.py: Attempt to fix handling of RSS feeds using broken entity references.

This commit is contained in:
Valentin Lorentz 2013-11-26 15:15:07 +00:00
parent ff5d83e7e6
commit 42b8a0676c

View File

@ -199,7 +199,14 @@ class HtmlToText(HTMLParser, object):
self.data.append(data) self.data.append(data)
def handle_entityref(self, data):
    """Append the text for the named entity reference ``&data;``.

    Known HTML entity names are replaced by the character they stand for.
    Unknown (broken) names are appended as their bare name; a byte string
    is decoded to text first so that every item in ``self.data`` is text
    and can be joined later.

    :param data: entity name without the leading ``&`` and trailing ``;``.
    """
    if data in htmlentitydefs.name2codepoint:
        self.data.append(unichr(htmlentitydefs.name2codepoint[data]))
        return
    # ``bytes`` is an alias of ``str`` on Python 2, so this single check
    # covers byte strings on both major versions.
    if isinstance(data, bytes):
        # 'replace' is passed positionally: str.decode() accepts no keyword
        # arguments before Python 2.7. Undecodable bytes become U+FFFD
        # instead of raising while parsing a feed that is already broken.
        data = data.decode('utf8', 'replace')
    # NOTE(review): the '&' and ';' around an unknown entity are not
    # restored here -- confirm that dropping them is intended.
    self.data.append(data)
def getText(self): def getText(self):
text = ''.join(self.data).strip() text = ''.join(self.data).strip()