From be4d0989455e305683be15388b19932953a78a45 Mon Sep 17 00:00:00 2001 From: Shivaram Lingamneni Date: Tue, 18 Jun 2019 02:34:16 -0400 Subject: [PATCH] fix an edge case in skeletonization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'm' skeletonizes to 'rn' (but is exempted by the isBoring check), whereas the fullwidth 'ｍ' (U+FF4D) does not skeletonize to anything. The root cause of this is the (still unexplained) patchiness of the skeleton mapping for fullwidth -> standard-width Latin characters; the fix is to perform width mapping first, before either skeletonization or isBoring. --- irc/strings.go | 8 ++++---- irc/strings_test.go | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/irc/strings.go b/irc/strings.go index 5ef67e78..be671164 100644 --- a/irc/strings.go +++ b/irc/strings.go @@ -163,15 +163,15 @@ func isIdent(name string) bool { // from the original (unfolded) identifier and stored/tracked separately from the // casefolded identifier. func Skeleton(name string) (string, error) { - if !isBoring(name) { - name = confusables.Skeleton(name) - } - // XXX the confusables table includes some, but not all, fullwidth->standard // mappings for latin characters. do a pass of explicit width folding, // same as PRECIS: name = width.Fold.String(name) + if !isBoring(name) { + name = confusables.Skeleton(name) + } + // internationalized lowercasing for skeletons; this is much more lenient than // Casefold. In particular, skeletons are expected to mix scripts (which may // violate the bidi rule). 
We also don't care if they contain runes diff --git a/irc/strings_test.go b/irc/strings_test.go index 757722d4..595ef7f6 100644 --- a/irc/strings_test.go +++ b/irc/strings_test.go @@ -181,6 +181,10 @@ func TestSkeleton(t *testing.T) { t.Errorf("after skeletonizing, we should casefold") } + if skeleton("smt") != "smt" { + t.Errorf("our friend lover successfully tricked the skeleton algorithm!") + } + if skeleton("еvan") != "evan" { t.Errorf("we must protect against cyrillic homoglyph attacks") }