Implement and test byte-splitting helper function

This commit is contained in:
Ben Wiederhake 2024-03-07 22:34:15 +01:00
parent 56e7bd01ca
commit 05dddc42ff
2 changed files with 129 additions and 0 deletions

View File

@ -219,6 +219,33 @@ func ClipMessage(text string, length int, clippingMessage string) string {
return text
}
// ClipOrSplitMessage cuts text into parts of at most length bytes each and
// returns them in order.
//
// At most splitMax-1 parts are split off the front of text; whatever remains
// is handed to ClipMessage together with length and clippingMessage and
// becomes the final part. The result therefore always contains at least one
// element, even if splitMax is zero or negative. Split points are moved back
// by up to utf8.UTFMax-1 bytes so that a part does not end in the middle of a
// UTF-8 encoded rune. If length is not positive, nothing can be split off and
// the whole text goes straight to ClipMessage.
func ClipOrSplitMessage(text string, length int, clippingMessage string, splitMax int) []string {
	var msgParts []string
	remainingText := text
	// Invariant of this splitting loop: no text is lost (msgParts+remainingText is the original text),
	// and all parts are guaranteed to satisfy the length requirement.
	// The "length > 0" guard prevents emitting a run of empty parts for a degenerate length.
	for length > 0 && len(msgParts) < splitMax-1 && len(remainingText) > length {
		// Decision: the text needs to be split (again).
		var chunk string
		// The longest UTF-8 encoding of a valid rune is utf8.UTFMax (4) bytes,
		// so we should never need to waste 4 or more bytes at a time.
		for wasted := 0; wasted < utf8.UTFMax && wasted < length; wasted++ {
			chunk = remainingText[:length-wasted]
			// An invalid or truncated encoding decodes as RuneError with size 1.
			// A correctly encoded U+FFFD also yields RuneError, but with size 3,
			// and is a perfectly fine place to end a part.
			r, size := utf8.DecodeLastRuneInString(chunk)
			if r != utf8.RuneError || size > 1 {
				break
			}
		}
		// Note: at this point, "chunk" might still be invalid, if "text" is very broken.
		// It is never empty though, so the loop always makes progress.
		msgParts = append(msgParts, chunk)
		remainingText = remainingText[len(chunk):]
	}
	msgParts = append(msgParts, ClipMessage(remainingText, length, clippingMessage))
	return msgParts
}
// ParseMarkdown takes in an input string as markdown and parses it to html
func ParseMarkdown(input string) string {
extensions := parser.HardLineBreak | parser.NoIntraEmphasis | parser.FencedCode

View File

@ -125,3 +125,105 @@ func TestConvertWebPToPNG(t *testing.T) {
t.Fail()
}
}
// clippingOrSplittingTestCases is the table of cases for TestClipOrSplitMessage,
// keyed by a human-readable test name. Each entry holds the four arguments
// passed to ClipOrSplitMessage and the exact slice of parts it must return.
var clippingOrSplittingTestCases = map[string]struct {
// inputText is the message handed to ClipOrSplitMessage as "text".
inputText string
// clipSplitLength is passed as "length"; the test also checks that no part is longer than this in bytes.
clipSplitLength int
// clippingMessage is passed through as the clipping marker argument.
clippingMessage string
// splitMax is passed as the "splitMax" argument.
splitMax int
// expectedOutput is the exact list of parts the call must produce.
expectedOutput []string
}{
// Input fits into a single part, so no splitting happens regardless of splitMax.
"Short single-line message, split 3": {
inputText: "short",
clipSplitLength: 20,
clippingMessage: "?!?!",
splitMax: 3,
expectedOutput: []string{"short"},
},
"Short single-line message, split 1": {
inputText: "short",
clipSplitLength: 20,
clippingMessage: "?!?!",
splitMax: 1,
expectedOutput: []string{"short"},
},
"Short single-line message, split 0": {
// Mainly check that we don't crash.
inputText: "short",
clipSplitLength: 20,
clippingMessage: "?!?!",
splitMax: 0,
expectedOutput: []string{"short"},
},
// splitMax is larger than the number of parts needed, so nothing is clipped.
"Long single-line message, noclip": {
inputText: "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
clipSplitLength: 50,
clippingMessage: "?!?!",
splitMax: 10,
expectedOutput: []string{
"Lorem ipsum dolor sit amet, consectetur adipiscing",
" elit, sed do eiusmod tempor incididunt ut labore ",
"et dolore magna aliqua.",
},
},
// splitMax equals the number of parts needed, so the text still fits without clipping.
"Long single-line message, noclip tight": {
inputText: "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
clipSplitLength: 50,
clippingMessage: "?!?!",
splitMax: 3,
expectedOutput: []string{
"Lorem ipsum dolor sit amet, consectetur adipiscing",
" elit, sed do eiusmod tempor incididunt ut labore ",
"et dolore magna aliqua.",
},
},
// splitMax is too small for the text: the last part ends in the custom clipping message.
"Long single-line message, clip custom": {
inputText: "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
clipSplitLength: 50,
clippingMessage: "?!?!",
splitMax: 2,
expectedOutput: []string{
"Lorem ipsum dolor sit amet, consectetur adipiscing",
" elit, sed do eiusmod tempor inc <clipped message>",
},
},
// Same as above with an empty clippingMessage; the expected last part ends in
// " <clipped message>", presumably the default marker used by ClipMessage (confirm there).
"Long single-line message, clip built-in": {
inputText: "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
clipSplitLength: 50,
clippingMessage: "",
splitMax: 2,
expectedOutput: []string{
"Lorem ipsum dolor sit amet, consectetur adipiscing",
" elit, sed do eiusmod tempor inc <clipped message>",
},
},
// Newlines are ordinary bytes: a short multi-line text stays in one part.
"Short multi-line message": {
inputText: "I\ncan't\nget\nno\nsatisfaction!",
clipSplitLength: 50,
clippingMessage: "",
splitMax: 2,
expectedOutput: []string{"I\ncan't\nget\nno\nsatisfaction!"},
},
// Parts must end on rune boundaries, so they may be shorter than clipSplitLength.
"Long message containing UTF-8 multi-byte runes": {
inputText: "人人生而自由,在尊嚴和權利上一律平等。 他們都具有理性和良知,應該以兄弟情誼的精神對待彼此。",
clipSplitLength: 50,
clippingMessage: "",
splitMax: 10,
expectedOutput: []string{
"人人生而自由,在尊嚴和權利上一律", // Note: only 48 bytes!
"平等。 他們都具有理性和良知,應該", // Note: only 49 bytes!
"以兄弟情誼的精神對待彼此。",
},
},
}
// TestClipOrSplitMessage runs every entry of clippingOrSplittingTestCases
// through ClipOrSplitMessage as its own subtest. It checks that the returned
// parts equal the expected ones, and that no part actually returned is longer
// than the requested byte length.
func TestClipOrSplitMessage(t *testing.T) {
	for testname, testcase := range clippingOrSplittingTestCases {
		// t.Run executes the closure synchronously, so using the loop variables is safe here.
		t.Run(testname, func(t *testing.T) {
			actualOutput := ClipOrSplitMessage(testcase.inputText, testcase.clipSplitLength, testcase.clippingMessage, testcase.splitMax)
			assert.Equalf(t, testcase.expectedOutput, actualOutput, "'%s' testcase should give expected lines with clipping+splitting.", testname)
			// Check the length limit on what the function really produced, not on the fixture.
			for _, splitLine := range actualOutput {
				// len on a string already counts bytes; no []byte conversion is needed.
				byteLength := len(splitLine)
				assert.Truef(t, byteLength <= testcase.clipSplitLength, "Split line '%s' of testcase '%s' should not exceed the maximum byte-length (max %d, got %d).", splitLine, testname, testcase.clipSplitLength, byteLength)
			}
		})
	}
}