mirror of
https://github.com/Mikaela/Limnoria.git
synced 2024-12-24 03:33:11 +01:00
utils.web: Add function get_encoding.
This commit is contained in:
parent
dbedcbb6c9
commit
d4df5de91d
@ -191,16 +191,24 @@ class HtmlToText(HTMLParser, object):
|
||||
text = ''.join(self.data).strip()
|
||||
return normalizeWhitespace(text)
|
||||
|
||||
def htmlToText(s, tagReplace=' '):
|
||||
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
|
||||
"""
|
||||
def get_encoding(s):
    """Guess the character encoding of the byte string ``s``.

    ``s`` is fed to charade's ``UniversalDetector``; the encoding name it
    reports is returned only if ``s`` can actually be decoded with it.

    Returns None when charade cannot be imported, when the reported encoding
    is unusable (e.g. None or an unknown codec), when ``s`` has no
    ``decode`` method (already-decoded text on Python 3), or when detection
    fails for any other ordinary reason.
    """
    # TODO: use <meta charset />
    try:
        # Imported lazily so that charade stays an optional dependency: a
        # missing module simply means "encoding unknown".
        import charade.universaldetector
        detector = charade.universaldetector.UniversalDetector()
        detector.feed(s)
        detector.close()
        encoding = detector.result['encoding']
        # Trial decode, result discarded: it validates that the reported
        # encoding really works on this input before callers rely on it.
        s.decode(encoding)
        return encoding
    except Exception:
        # Best effort: any ordinary failure means "unknown".  Deliberately
        # not a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
        return None
|
||||
|
||||
def htmlToText(s, tagReplace=' '):
|
||||
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
|
||||
"""
|
||||
encoding = get_encoding(s)
|
||||
if encoding:
|
||||
s = s.decode(encoding)
|
||||
else:
|
||||
try:
|
||||
if sys.version_info[0] < 3 or isinstance(s, bytes):
|
||||
s = s.decode('utf8')
|
||||
@ -215,6 +223,7 @@ def mungeEmail(s):
|
||||
s = s.replace('.', ' DOT ')
|
||||
return s
|
||||
|
||||
|
||||
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user