mirror of
https://github.com/Mikaela/Limnoria.git
synced 2024-11-23 19:19:32 +01:00
utils.web: Add function get_encoding.
This commit is contained in:
parent
dbedcbb6c9
commit
d4df5de91d
@ -191,16 +191,24 @@ class HtmlToText(HTMLParser, object):
|
|||||||
text = ''.join(self.data).strip()
|
text = ''.join(self.data).strip()
|
||||||
return normalizeWhitespace(text)
|
return normalizeWhitespace(text)
|
||||||
|
|
||||||
def htmlToText(s, tagReplace=' '):
|
def get_encoding(s):
|
||||||
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
|
# TODO: use <meta charset />
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
import charade.universaldetector
|
import charade.universaldetector
|
||||||
u = charade.universaldetector.UniversalDetector()
|
u = charade.universaldetector.UniversalDetector()
|
||||||
u.feed(s)
|
u.feed(s)
|
||||||
u.close()
|
u.close()
|
||||||
s = s.decode(u.result['encoding'])
|
return u.result['encoding']
|
||||||
except:
|
except:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def htmlToText(s, tagReplace=' '):
|
||||||
|
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
|
||||||
|
"""
|
||||||
|
encoding = get_encoding(s)
|
||||||
|
if encoding:
|
||||||
|
s = s.decode(encoding)
|
||||||
|
else:
|
||||||
try:
|
try:
|
||||||
if sys.version_info[0] < 3 or isinstance(s, bytes):
|
if sys.version_info[0] < 3 or isinstance(s, bytes):
|
||||||
s = s.decode('utf8')
|
s = s.decode('utf8')
|
||||||
@ -215,6 +223,7 @@ def mungeEmail(s):
|
|||||||
s = s.replace('.', ' DOT ')
|
s = s.replace('.', ' DOT ')
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
|
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user