Use chardet to guess the character encoding when running on Python 3

This commit is contained in:
Alexander Ralph Michael Minges 2013-01-15 20:55:42 +01:00
parent bbd9d1636a
commit 4316e5936e
1 changed files with 29 additions and 1 deletions

View File

@ -45,6 +45,14 @@ import supybot.world as world
import supybot.drivers as drivers
import supybot.schedule as schedule
from itertools import imap
# chardet is an optional dependency. ``chardetLoaded`` records whether
# ``UniversalDetector`` could be imported so that later code can check the
# flag before trying to guess the encoding of a received line.
try:
    from chardet.universaldetector import UniversalDetector
    chardetLoaded = True
except Exception:
    # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
    # and SystemExit are not swallowed, while any import-time failure of
    # chardet (missing module, incompatible release) still just disables
    # the feature.
    drivers.log.debug('chardet module not available, '
                      'cannot guess character encoding if '
                      'using Python3')
    chardetLoaded = False
try:
import ssl
SSLError = ssl.SSLError
@ -184,7 +192,27 @@ class SocketDriver(drivers.IrcDriver, drivers.ServersMixin):
self.inbuffer = lines.pop()
for line in lines:
if sys.version_info[0] >= 3:
line = line.decode(errors='replace')
#first, try to decode using utf-8
try:
line = line.decode(encoding='utf-8', errors='strict')
except UnicodeError:
# if this fails and chardet is loaded, try to guess the correct encoding
if chardetLoaded:
u = UniversalDetector()
u.feed(line)
u.close()
if u.result['encoding']:
# try to use the guessed encoding
try:
line = line.decode(u.result['encoding'], errors='strict')
# on error, give up and replace the offending characters
except UnicodeError:
line = line.decode(errors='replace')
# if chardet is not loaded, try to decode using utf-8 and replace any
# offending characters
else:
line = line.decode(encoding='utf-8', errors='replace')
msg = drivers.parseMsg(line)
if msg is not None:
self.irc.feedMsg(msg)