Use HTMLParser.unescape instead of chr to decode HTML entities.

This adds support for character references written in hexadecimal notation (e.g. `&#x27;`), which the previous `int(name)` conversion could not parse.
This commit is contained in:
Valentin Lorentz 2016-03-08 22:00:44 +01:00
parent 25a913a82b
commit 7f38076e59

View File

@@ -261,7 +261,7 @@ class HtmlToText(HTMLParser, object):
return normalizeWhitespace(text) return normalizeWhitespace(text)
def handle_charref(self, name): def handle_charref(self, name):
self.append((unichr if minisix.PY2 else chr)(int(name))) self.append(self.unescape('&#%s;' % name))
def htmlToText(s, tagReplace=' '): def htmlToText(s, tagReplace=' '):
"""Turns HTML into text. tagReplace is a string to replace HTML tags with. """Turns HTML into text. tagReplace is a string to replace HTML tags with.