From c34d9e0b7262f2dc93e3117224ada3ed6837aad9 Mon Sep 17 00:00:00 2001 From: Shivaram Lingamneni Date: Sun, 3 Feb 2019 02:45:02 -0500 Subject: [PATCH] more lenient casefolding for skeletons --- irc/strings.go | 25 +++++++++++++++---------- irc/strings_test.go | 2 ++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/irc/strings.go b/irc/strings.go index d475120f..0e5e50ca 100644 --- a/irc/strings.go +++ b/irc/strings.go @@ -9,8 +9,10 @@ import ( "strings" "github.com/oragono/confusables" + "golang.org/x/text/cases" + "golang.org/x/text/language" "golang.org/x/text/secure/precis" - "golang.org/x/text/unicode/norm" + "golang.org/x/text/width" ) const ( @@ -126,14 +128,6 @@ func isBoring(name string) bool { return true } -var skeletonCasefolder = precis.NewIdentifier(precis.FoldWidth, precis.LowerCase(), precis.Norm(norm.NFC)) - -// similar to Casefold, but exempts the bidi rule, because skeletons may -// mix scripts strangely -func casefoldSkeleton(str string) (string, error) { - return iterateFolding(skeletonCasefolder, str) -} - // Skeleton produces a canonicalized identifier that tries to catch // homoglyphic / confusable identifiers. It's a tweaked version of the TR39 // skeleton algorithm. We apply the skeleton algorithm first and only then casefold, @@ -146,5 +140,16 @@ func Skeleton(name string) (string, error) { if !isBoring(name) { name = confusables.Skeleton(name) } - return casefoldSkeleton(name) + + // XXX the confusables table includes some, but not all, fullwidth->standard + // mappings for latin characters. do a pass of explicit width folding, + // same as PRECIS: + name = width.Fold.String(name) + + // internationalized lowercasing for skeletons; this is much more lenient than + // Casefold. In particular, skeletons are expected to mix scripts (which may + // violate the bidi rule). We also don't care if they contain runes + // that are disallowed by PRECIS, because every identifier must independently + // pass PRECIS --- we are just further canonicalizing the skeleton. + return cases.Lower(language.Und).String(name), nil } diff --git a/irc/strings_test.go b/irc/strings_test.go index 0f4c87de..6b60a0f0 100644 --- a/irc/strings_test.go +++ b/irc/strings_test.go @@ -173,4 +173,6 @@ func TestSkeleton(t *testing.T) { t.Errorf("we must protect against cyrillic homoglyph attacks") } + // should not raise an error: + skeleton("けらんぐ") }