From 1246edaf2c110e0bc3f2653771992c2dd2bd6234 Mon Sep 17 00:00:00 2001 From: James Lu Date: Sat, 27 May 2017 01:27:09 -0700 Subject: [PATCH 1/3] Irc: initial work on encoding support (#101) --- classes.py | 14 ++++++++------ example-conf.yml | 6 ++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/classes.py b/classes.py index a21c908..0111a95 100644 --- a/classes.py +++ b/classes.py @@ -416,8 +416,6 @@ class Irc(utils.DeprecatedAttributesObject): def run(self): """Main IRC loop which listens for messages.""" - # Some magic below cause this to work, though anything that's - # not encoded in UTF-8 doesn't work very well. buf = b"" data = b"" while not self.aborted.is_set(): @@ -438,11 +436,14 @@ class Irc(utils.DeprecatedAttributesObject): elif (time.time() - self.lastping) > self.pingtimeout: log.error('(%s) Connection timed out.', self.name) return + + # Get the encoding from the config file, falling back to UTF-8 if none is specified. + encoding = self.serverdata.get('encoding') or 'utf-8' + while b'\n' in buf: line, buf = buf.split(b'\n', 1) line = line.strip(b'\r') - # FIXME: respect other encodings? - line = line.decode("utf-8", "replace") + line = line.decode(encoding, "replace") self.runline(line) def runline(self, line): @@ -509,8 +510,9 @@ class Irc(utils.DeprecatedAttributesObject): # Safeguard against newlines in input!! Otherwise, each line gets # treated as a separate command, which is particularly nasty. data = data.replace('\n', ' ') - data = data.encode("utf-8") + b"\n" - stripped_data = data.decode("utf-8").strip("\n") + encoding = self.serverdata.get('encoding') or 'utf-8' + data = data.encode(encoding) + b"\n" + stripped_data = data.decode(encoding).strip("\n") log.debug("(%s) -> %s", self.name, stripped_data) try: diff --git a/example-conf.yml b/example-conf.yml index b7c8358..0312ec5 100644 --- a/example-conf.yml +++ b/example-conf.yml @@ -201,6 +201,12 @@ servers: # This setting defaults to sha256. #ssl_fingerprint_type: sha256 + # Encoding: allows you to override the network's encoding. This can be useful for networks + # using m_nationalchars or something similar. Encoding defaults to utf-8 if not set, and + # should be one of the standard encodings defined at + # https://docs.python.org/3/library/codecs.html#standard-encodings + #encoding: utf-8 + ts6net: ip: ::1 From 2737b6bbfc54802fbe253dd5089e6ae37a0cc358 Mon Sep 17 00:00:00 2001 From: James Lu Date: Sat, 27 May 2017 02:21:12 -0700 Subject: [PATCH 2/3] Irc: simplify _send() code and replace unencodable characters --- classes.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/classes.py b/classes.py index 0111a95..34cb576 100644 --- a/classes.py +++ b/classes.py @@ -511,14 +511,14 @@ class Irc(utils.DeprecatedAttributesObject): # treated as a separate command, which is particularly nasty. data = data.replace('\n', ' ') encoding = self.serverdata.get('encoding') or 'utf-8' - data = data.encode(encoding) + b"\n" - stripped_data = data.decode(encoding).strip("\n") - log.debug("(%s) -> %s", self.name, stripped_data) + encoded_data = data.encode(encoding, 'replace') + b"\n" + + log.debug("(%s) -> %s", self.name, data) try: - self.socket.send(data) + self.socket.send(encoded_data) except (OSError, AttributeError): - log.exception("(%s) Failed to send message %r; did the network disconnect?", self.name, stripped_data) + log.exception("(%s) Failed to send message %r; did the network disconnect?", self.name, data) def send(self, data, queue=True): """send() wrapper with optional queueing support.""" From b9aee6ae85a488345d55c29d5471e376238bdc32 Mon Sep 17 00:00:00 2001 From: James Lu Date: Fri, 2 Jun 2017 07:30:20 -0700 Subject: [PATCH 3/3] Irc: only apply encoding settings on connect Changing the encoding after a connection has been established is somewhat dangerous, because it's possible to corrupt channel/user state if characters in the old encoding are no longer valid. Also, mark this option as experimental. --- classes.py | 12 ++++++------ example-conf.yml | 3 +++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/classes.py b/classes.py index 34cb576..96ddf98 100644 --- a/classes.py +++ b/classes.py @@ -56,6 +56,9 @@ class Irc(utils.DeprecatedAttributesObject): self.botdata = conf['bot'] self.protoname = proto.__name__.split('.')[-1] # Remove leading pylinkirc.protocols. self.proto = proto.Class(self) + + # These options depend on self.serverdata from above to be set. + self.encoding = None self.pingfreq = self.serverdata.get('pingfreq') or 90 self.pingtimeout = self.pingfreq * 2 @@ -111,6 +114,7 @@ class Irc(utils.DeprecatedAttributesObject): (Re)sets an IRC object to its default state. This should be called when an IRC object is first created, and on every reconnection to a network. """ + self.encoding = self.serverdata.get('encoding') or 'utf-8' self.pingfreq = self.serverdata.get('pingfreq') or 90 self.pingtimeout = self.pingfreq * 3 @@ -437,13 +441,10 @@ class Irc(utils.DeprecatedAttributesObject): log.error('(%s) Connection timed out.', self.name) return - # Get the encoding from the config file, falling back to UTF-8 if none is specified. - encoding = self.serverdata.get('encoding') or 'utf-8' - while b'\n' in buf: line, buf = buf.split(b'\n', 1) line = line.strip(b'\r') - line = line.decode(encoding, "replace") + line = line.decode(self.encoding, "replace") self.runline(line) def runline(self, line): @@ -510,8 +511,7 @@ class Irc(utils.DeprecatedAttributesObject): # Safeguard against newlines in input!! Otherwise, each line gets # treated as a separate command, which is particularly nasty. data = data.replace('\n', ' ') - encoding = self.serverdata.get('encoding') or 'utf-8' - encoded_data = data.encode(encoding, 'replace') + b"\n" + encoded_data = data.encode(self.encoding, 'replace') + b"\n" log.debug("(%s) -> %s", self.name, data) diff --git a/example-conf.yml b/example-conf.yml index 0312ec5..5cca36e 100644 --- a/example-conf.yml +++ b/example-conf.yml @@ -205,6 +205,9 @@ servers: # using m_nationalchars or something similar. Encoding defaults to utf-8 if not set, and # should be one of the standard encodings defined at # https://docs.python.org/3/library/codecs.html#standard-encodings + # Changing this setting requires a disconnect and reconnect of the corresponding network + # to apply. + # This setting is EXPERIMENTAL as of PyLink 1.2.x. #encoding: utf-8 ts6net: