From a4a595b39aef5ffee7a78067f3d926bc7840a900 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Tue, 22 Jan 2013 20:35:11 +0100 Subject: [PATCH] Partial fix of encoding handling. This fixes almost everything, except a small part of the test framework. I'm just saving this in case my computer or I get destroyed in an alien invasion, because this commit is worth hours of debugging. --- src/callbacks.py | 21 ++++++++++++--------- test/test_callbacks.py | 7 +++++++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/callbacks.py b/src/callbacks.py index 91784ce00..c647a8ac6 100644 --- a/src/callbacks.py +++ b/src/callbacks.py @@ -39,13 +39,15 @@ import sys import copy import time import shlex +import codecs import getopt import inspect import operator -if sys.version_info < (2, 7, 0): - # cStringIO is buggy. - # See http://paste.progval.net/show/227/ +if sys.version_info[0] < 3: + # cStringIO is buggy with Python 2.6 ( + # see http://paste.progval.net/show/227/ ) + # and it does not handle unicode objects in Python 2.x from StringIO import StringIO else: from cStringIO import StringIO @@ -290,16 +292,17 @@ class Tokenizer(object): def _handleToken(self, token): if token[0] == token[-1] and token[0] in self.quotes: token = token[1:-1] - encoding_prefix = 'string' if sys.version_info[0]<3 else 'unicode' # FIXME: No need to tell you this is a hack. # It has to handle both IRC commands and serialized configuration. 
- try: - token = token.decode(encoding_prefix + '_escape') - except: + if sys.version_info[0] < 3: try: - token = token.encode().decode(encoding_prefix + '_escape') + token = token.encode('utf8').decode('string_escape') except: - pass + token = token.decode('string_escape') + else: + token = codecs.getencoder('utf8')(token)[0] + token = codecs.getdecoder('unicode_escape')(token)[0] + token = token.encode('iso-8859-1').decode() return token def _insideBrackets(self, lexer): diff --git a/test/test_callbacks.py b/test/test_callbacks.py index f07ab902e..21626edda 100644 --- a/test/test_callbacks.py +++ b/test/test_callbacks.py @@ -1,3 +1,4 @@ +# -*- coding: utf8 -*- ### # Copyright (c) 2002-2005, Jeremiah Fincher # All rights reserved. @@ -71,6 +72,12 @@ class TokenizerTestCase(SupyTestCase): self.assertEqual(tokenize('foo "bar baz" quux'), ['foo', 'bar baz', 'quux']) + def testUnicode(self): + print repr((tokenize(u'好'), ['好'])) + print repr((tokenize(u'"好"'), ['好'])) + self.assertEqual(tokenize(u'好'), ['好']) + self.assertEqual(tokenize(u'"好"'), ['好']) + def testNesting(self): self.assertEqual(tokenize('[]'), [[]]) self.assertEqual(tokenize('[foo]'), [['foo']])