utils.web: Add function get_encoding.

This commit is contained in:
Valentin Lorentz 2013-07-09 12:02:25 +00:00
parent dbedcbb6c9
commit d4df5de91d

View File

@@ -191,16 +191,24 @@ class HtmlToText(HTMLParser, object):
text = ''.join(self.data).strip() text = ''.join(self.data).strip()
return normalizeWhitespace(text) return normalizeWhitespace(text)
def htmlToText(s, tagReplace=' '): def get_encoding(s):
"""Turns HTML into text. tagReplace is a string to replace HTML tags with. # TODO: use <meta charset />
"""
try: try:
import charade.universaldetector import charade.universaldetector
u = charade.universaldetector.UniversalDetector() u = charade.universaldetector.UniversalDetector()
u.feed(s) u.feed(s)
u.close() u.close()
s = s.decode(u.result['encoding']) return u.result['encoding']
except: except:
return None
def htmlToText(s, tagReplace=' '):
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
"""
encoding = get_encoding(s)
if encoding:
s = s.decode(encoding)
else:
try: try:
if sys.version_info[0] < 3 or isinstance(s, bytes): if sys.version_info[0] < 3 or isinstance(s, bytes):
s = s.decode('utf8') s = s.decode('utf8')
@@ -215,6 +223,7 @@ def mungeEmail(s):
s = s.replace('.', ' DOT ') s = s.replace('.', ' DOT ')
return s return s
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: