Do not break UTF-8 characters in long words. Closes GH-1333.

2026-01-25 20:17:57 +01:00 · 2018-04-14 22:31:30 +02:00 · 2018-04-14 22:31:30 +02:00 · 0d627c05b7
commit 0d627c05b7
parent cd479717b8
4 changed files with 25 additions and 11 deletions
--- a/src/callbacks.py
+++ b/src/callbacks.py
@ -941,8 +941,7 @@ class NestedCommandsIrcProxy(ReplyIrcProxy):
                                  stripCtcp=stripCtcp)
                        sendMsg(m)
                        return m
-                    msgs = ircutils.wrap(s, allowedLength,
-                            break_long_words=True)
+                    msgs = ircutils.wrap(s, allowedLength)
                    msgs.reverse()
                    instant = conf.get(conf.supybot.reply.mores.instant,target)
                    while instant > 1 and msgs:
--- a/src/ircutils.py
+++ b/src/ircutils.py
@ -596,10 +596,9 @@ class FormatParser(object):
        else:
            self.ungetChar(c)

-def wrap(s, length, break_on_hyphens = False, break_long_words = False):
+def wrap(s, length, break_on_hyphens = False):
    processed = []
-    chunks = utils.str.byteTextWrap(s, length,
-            break_long_words=break_long_words)
+    chunks = utils.str.byteTextWrap(s, length)
    context = None
    for chunk in chunks:
        if context is not None:
--- a/src/utils/str.py
+++ b/src/utils/str.py
@ -306,7 +306,21 @@ def perlVariableSubstitute(vars, text):
                return '$' + unbraced
    return _perlVarSubstituteRe.sub(replacer, text)

-def byteTextWrap(text, size, break_on_hyphens=False, break_long_words=True):
+def splitBytes(word, size):
+    """Split `word` into a (head, tail) pair at, or just before, offset `size`.
+
+    Candidate cut offsets size, size-1, size-2 and size-3 are tried in that
+    order.  The first offset whose tail (word[cut:]) passes the decode/encode
+    probe below without raising UnicodeDecodeError is used, and
+    (word[0:cut], word[cut:]) is returned.  If none of the four candidates
+    passes, the trailing assert fires with (word, size) as its message.
+
+    NOTE(review): presumably `word` is a UTF-8 encoded byte string and `size`
+    is a byte count -- confirm against the caller (byteTextWrap).
+    NOTE(review): when size < 4, size-i can reach zero (empty head, tail equal
+    to the whole word) or become negative (slice counted from the end) --
+    verify that callers never pass such small sizes.
+    """
+    # I'm going to hell for this function
+    for i in range(4): # a character takes at most 4 bytes in UTF-8
+        try:
+            # The results of decode()/encode() are discarded; the calls are
+            # made only to find out whether they raise.
+            if sys.version_info[0] >= 3:
+                word[size-i:].decode()
+            else:
+                # NOTE(review): presumably relies on the implicit decoding
+                # step that Python 2 performs before encoding a byte string
+                # to raise UnicodeDecodeError -- TODO confirm.
+                word[size-i:].encode('utf8')
+        except UnicodeDecodeError:
+            # The tail could not be decoded from this offset; move the cut
+            # one position to the left and try again.
+            continue
+        else:
+            return (word[0:size-i], word[size-i:])
+    # No candidate offset was accepted.  Note that assert statements are
+    # removed when Python runs with -O; the function would then fall through
+    # and implicitly return None.
+    assert False, (word, size)
+
+def byteTextWrap(text, size, break_on_hyphens=False):
    """Similar to textwrap.wrap(), but considers the size of strings (in bytes)
    instead of their length (in characters)."""
    try:
@ -320,8 +334,9 @@ def byteTextWrap(text, size, break_on_hyphens=False, break_long_words=True):
    while words:
        word = words.pop(-1)
        if len(word) > size:
-            words.append(word[size:])
-            word = word[0:size]
+            (before, after) = splitBytes(word, size)
+            words.append(after)
+            word = before
        if len(lines[-1]) + len(word) <= size:
            lines[-1] += word
        else:
--- a/test/test_ircutils.py
+++ b/test/test_ircutils.py
@ -222,9 +222,10 @@ class FunctionsTestCase(SupyTestCase):
        self.assertEqual(''.join(r), s)

        s = uchr(233)*500
-        print(repr(ircutils.wrap(s, 500)))
-        import supybot.utils.str
-        print(repr(supybot.utils.str.byteTextWrap(s, 500)))
+        r = ircutils.wrap(s, 500)
+        self.assertTrue(max(map(pred, r)) <= 500)
+        r = ircutils.wrap(s, 139)
+        self.assertTrue(max(map(pred, r)) <= 139)


    def testSafeArgument(self):