Web: Decode using the charset advertised in response headers

And fall back to sniffing the content when no charset is present
This commit is contained in:
Valentin Lorentz 2022-11-26 09:06:47 +01:00
parent e6c4da0fff
commit 1a7c14f4b3

View File

@ -164,8 +164,17 @@ class Web(callbacks.PluginRegexp):
timeout = self.registryValue('timeout') timeout = self.registryValue('timeout')
headers = conf.defaultHttpHeaders(irc.network, msg.channel) headers = conf.defaultHttpHeaders(irc.network, msg.channel)
try: try:
(target, text) = utils.web.getUrlTargetAndContent(url, size=size, fd = utils.web.getUrlFd(url, timeout=timeout, headers=headers)
timeout=timeout, headers=headers) target = fd.geturl()
text = fd.read(size)
response_headers = fd.headers
fd.close()
except socket.timeout:
if raiseErrors:
irc.error(_('Connection to %s timed out') % url, Raise=True)
else:
selg.log.info('Web plugins TitleSnarfer: URL <%s> timed out',
url)
except Exception as e: except Exception as e:
if raiseErrors: if raiseErrors:
irc.error(_('That URL raised <' + str(e)) + '>', irc.error(_('That URL raised <' + str(e)) + '>',
@ -174,9 +183,19 @@ class Web(callbacks.PluginRegexp):
self.log.info('Web plugin TitleSnarfer: URL <%s> raised <%s>', self.log.info('Web plugin TitleSnarfer: URL <%s> raised <%s>',
url, str(e)) url, str(e))
return return
encoding = None
if 'Content-Type' in fd.headers:
mime_params = [p.split('=', 1)
for p in fd.headers['Content-Type'].split(';')[1:]]
mime_params = {k.strip(): v.strip() for (k, v) in mime_params}
if mime_params.get('charset'):
encoding = mime_params['charset']
encoding = encoding or utils.web.getEncoding(text) or 'utf8'
try: try:
text = text.decode(utils.web.getEncoding(text) or 'utf8', text = text.decode(encoding, 'replace')
'replace')
except UnicodeDecodeError: except UnicodeDecodeError:
if minisix.PY3: if minisix.PY3:
if raiseErrors: if raiseErrors: