utils.web: Add function get_encoding.

This commit is contained in:
Valentin Lorentz 2013-07-09 12:02:25 +00:00
parent dbedcbb6c9
commit d4df5de91d

View File

@ -191,16 +191,24 @@ class HtmlToText(HTMLParser, object):
text = ''.join(self.data).strip()
return normalizeWhitespace(text)
def htmlToText(s, tagReplace=' '):
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
"""
def get_encoding(s):
    """Guess the character encoding of the byte string s.

    s is fed to charade's UniversalDetector; the detected encoding name is
    returned only after checking that s actually decodes with it.

    Returns None when charade cannot be imported, when no encoding is
    detected, or when detection or the trial decode fails for any reason
    (including s not being a byte string).
    """
    # TODO: use <meta charset />
    try:
        # Imported lazily so that a missing charade package only disables
        # detection instead of breaking the import of this module.
        import charade.universaldetector
        detector = charade.universaldetector.UniversalDetector()
        detector.feed(s)
        detector.close()
        encoding = detector.result['encoding']
        if not encoding:
            # The detector could not make a guess.
            return None
        # Trial decode: only report an encoding that s really decodes with.
        # The decoded text is discarded; callers decode again themselves.
        s.decode(encoding)
        return encoding
    except Exception:
        # Detection is best effort, so any ordinary failure means "unknown".
        # KeyboardInterrupt and SystemExit are deliberately not caught here.
        return None
def htmlToText(s, tagReplace=' '):
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
"""
encoding = get_encoding(s)
if encoding:
s = s.decode(encoding)
else:
try:
if sys.version_info[0] < 3 or isinstance(s, bytes):
s = s.decode('utf8')
@ -215,6 +223,7 @@ def mungeEmail(s):
s = s.replace('.', ' DOT ')
return s
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: