Partial fix of encoding handling.

This fixes almost everything, except for a small part of the test framework. I'm just saving this in case my computer or I get destroyed in an alien invasion, because this commit is worth hours of debugging.
This commit is contained in:
Valentin Lorentz 2013-01-22 20:35:11 +01:00
parent c9b6b56244
commit a4a595b39a
2 changed files with 19 additions and 9 deletions

View File

@ -39,13 +39,15 @@ import sys
import copy
import time
import shlex
import codecs
import getopt
import inspect
import operator
if sys.version_info < (2, 7, 0):
# cStringIO is buggy.
# See http://paste.progval.net/show/227/
if sys.version_info[0] < 3:
# cStringIO is buggy with Python 2.6 (
# see http://paste.progval.net/show/227/ )
# and it does not handle unicode objects in Python 2.x
from StringIO import StringIO
else:
from cStringIO import StringIO
@ -290,16 +292,17 @@ class Tokenizer(object):
def _handleToken(self, token):
if token[0] == token[-1] and token[0] in self.quotes:
token = token[1:-1]
encoding_prefix = 'string' if sys.version_info[0]<3 else 'unicode'
# FIXME: No need to tell you this is a hack.
# It has to handle both IRC commands and serialized configuration.
if sys.version_info[0] < 3:
try:
token = token.decode(encoding_prefix + '_escape')
token = token.encode('utf8').decode('string_escape')
except:
try:
token = token.encode().decode(encoding_prefix + '_escape')
except:
pass
token = token.decode('string_escape')
else:
token = codecs.getencoder('utf8')(token)[0]
token = codecs.getdecoder('unicode_escape')(token)[0]
token = token.encode('iso-8859-1').decode()
return token
def _insideBrackets(self, lexer):

View File

@ -1,3 +1,4 @@
# -*- coding: utf8 -*-
###
# Copyright (c) 2002-2005, Jeremiah Fincher
# All rights reserved.
@ -71,6 +72,12 @@ class TokenizerTestCase(SupyTestCase):
self.assertEqual(tokenize('foo "bar baz" quux'),
['foo', 'bar baz', 'quux'])
def testUnicode(self):
    """Check that tokenize() accepts unicode input.

    An empty unicode string and an empty quoted unicode string are both
    expected to tokenize to a list holding one empty token.
    """
    # The u'' prefix is deliberate: the inputs must be unicode objects
    # rather than byte strings when run under Python 2.
    self.assertEqual(tokenize(u''), [''])
    self.assertEqual(tokenize(u'""'), [''])
def testNesting(self):
self.assertEqual(tokenize('[]'), [[]])
self.assertEqual(tokenize('[foo]'), [['foo']])