fix an edge case in skeletonization

ASCII 'm' skeletonizes to 'rn' (but is exempted by the isBoring check),
but the fullwidth 'ｍ' (U+FF4D) does not skeletonize to anything. The root cause
of this is the (still unexplained) patchiness of the skeleton mapping
for fullwidth -> standard-width Latin characters; the fix is to perform
width mapping first, before either skeletonization or isBoring.
This commit is contained in:
Shivaram Lingamneni 2019-06-18 02:34:16 -04:00
parent 8991846fcf
commit be4d098945
2 changed files with 8 additions and 4 deletions

View File

@@ -163,15 +163,15 @@ func isIdent(name string) bool {
// from the original (unfolded) identifier and stored/tracked separately from the // from the original (unfolded) identifier and stored/tracked separately from the
// casefolded identifier. // casefolded identifier.
func Skeleton(name string) (string, error) { func Skeleton(name string) (string, error) {
if !isBoring(name) {
name = confusables.Skeleton(name)
}
// XXX the confusables table includes some, but not all, fullwidth->standard // XXX the confusables table includes some, but not all, fullwidth->standard
// mappings for latin characters. do a pass of explicit width folding, // mappings for latin characters. do a pass of explicit width folding,
// same as PRECIS: // same as PRECIS:
name = width.Fold.String(name) name = width.Fold.String(name)
if !isBoring(name) {
name = confusables.Skeleton(name)
}
// internationalized lowercasing for skeletons; this is much more lenient than // internationalized lowercasing for skeletons; this is much more lenient than
// Casefold. In particular, skeletons are expected to mix scripts (which may // Casefold. In particular, skeletons are expected to mix scripts (which may
// violate the bidi rule). We also don't care if they contain runes // violate the bidi rule). We also don't care if they contain runes

View File

@@ -181,6 +181,10 @@ func TestSkeleton(t *testing.T) {
t.Errorf("after skeletonizing, we should casefold") t.Errorf("after skeletonizing, we should casefold")
} }
if skeleton("sm") != "smt" {
t.Errorf("our friend lover successfully tricked the skeleton algorithm!")
}
if skeleton("еvan") != "evan" { if skeleton("еvan") != "evan" {
t.Errorf("we must protect against cyrillic homoglyph attacks") t.Errorf("we must protect against cyrillic homoglyph attacks")
} }