From a4a595b39aef5ffee7a78067f3d926bc7840a900 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <progval@progval.net>
Date: Tue, 22 Jan 2013 20:35:11 +0100
Subject: [PATCH] Partial fix of encoding handling.

This fixes almost everything, except for a small part of the test framework. I'm saving this now in case I or my computer is destroyed in an alien invasion, because this commit represents hours of debugging.
---
 src/callbacks.py       | 21 ++++++++++++---------
 test/test_callbacks.py |  7 +++++++
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/src/callbacks.py b/src/callbacks.py
index 91784ce00..c647a8ac6 100644
--- a/src/callbacks.py
+++ b/src/callbacks.py
@@ -39,13 +39,15 @@ import sys
 import copy
 import time
 import shlex
+import codecs
 import getopt
 import inspect
 import operator
 
-if sys.version_info < (2, 7, 0):
-    # cStringIO is buggy.
-    # See http://paste.progval.net/show/227/
+if sys.version_info[0] < 3:
+    # cStringIO is buggy with Python 2.6
+    # (see http://paste.progval.net/show/227/),
+    # and it does not handle unicode objects in Python 2.x.
     from StringIO import StringIO
 else:
     from cStringIO import StringIO
@@ -290,16 +292,17 @@ class Tokenizer(object):
     def _handleToken(self, token):
         if token[0] == token[-1] and token[0] in self.quotes:
             token = token[1:-1]
-            encoding_prefix = 'string' if sys.version_info[0]<3 else 'unicode'
             # FIXME: No need to tell you this is a hack.
             # It has to handle both IRC commands and serialized configuration.
-            try:
-                token = token.decode(encoding_prefix + '_escape')
-            except:
+            if sys.version_info[0] < 3:
                 try:
-                    token = token.encode().decode(encoding_prefix + '_escape')
+                    token = token.encode('utf8').decode('string_escape')
                 except:
-                    pass
+                    token = token.decode('string_escape')
+            else:
+                token = codecs.getencoder('utf8')(token)[0]
+                token = codecs.getdecoder('unicode_escape')(token)[0]
+                token = token.encode('iso-8859-1').decode()
         return token
 
     def _insideBrackets(self, lexer):
diff --git a/test/test_callbacks.py b/test/test_callbacks.py
index f07ab902e..21626edda 100644
--- a/test/test_callbacks.py
+++ b/test/test_callbacks.py
@@ -1,3 +1,4 @@
+# -*- coding: utf8 -*-
 ###
 # Copyright (c) 2002-2005, Jeremiah Fincher
 # All rights reserved.
@@ -71,6 +72,12 @@ class TokenizerTestCase(SupyTestCase):
         self.assertEqual(tokenize('foo "bar baz" quux'),
                          ['foo', 'bar baz', 'quux'])
 
+    def testUnicode(self):
+        print repr((tokenize(u'好'), ['好']))
+        print repr((tokenize(u'"好"'), ['好']))
+        self.assertEqual(tokenize(u'好'), ['好'])
+        self.assertEqual(tokenize(u'"好"'), ['好'])
+
     def testNesting(self):
         self.assertEqual(tokenize('[]'), [[]])
         self.assertEqual(tokenize('[foo]'), [['foo']])