utils.web.getEncoding: use <meta charset /> if available.

This commit is contained in:
Valentin Lorentz 2013-07-09 12:40:42 +00:00
parent b4402b28ed
commit 771b739af7

View File

@ -166,8 +166,18 @@ def getUrl(url, size=None, headers=None, data=None):
def getDomain(url): def getDomain(url):
return urlparse.urlparse(url)[1] return urlparse.urlparse(url)[1]
# Pattern locating a quoted ``charset`` attribute inside a <meta> tag, e.g.
# <meta charset="utf-8"> or <meta charset='utf-8'>.
#
# * The named group ``charset`` captures the value *including* its
#   surrounding quotes; users of the pattern strip the first and last
#   character to obtain the bare encoding name.
# * Only non-letter characters (other than '<' and '>') may sit between
#   ``<meta`` and ``charset=``, so the attribute must come first in the tag.
# * The leading ``(?i)`` makes the match case-insensitive, because HTML tag
#   and attribute names are case-insensitive (<META CHARSET="..."> is valid).
# * Kept as a plain ``str`` (not pre-compiled) so it can also be
#   ``.encode()``d and used to search ``bytes`` input.
_charset_re = ('(?i)<meta[^a-z<>]+charset='
               """(?P<charset>("[^"]+"|'[^']+'))""")
def getEncoding(s): def getEncoding(s):
# TODO: use <meta charset /> try:
match = re.search(_charset_re, s, re.MULTILINE)
if match:
return match.group('charset')[1:-1]
except:
match = re.search(_charset_re.encode(), s, re.MULTILINE)
if match:
return match.group('charset').decode()[1:-1]
try: try:
import charade.universaldetector import charade.universaldetector
u = charade.universaldetector.UniversalDetector() u = charade.universaldetector.UniversalDetector()