mirror of
https://github.com/ergochat/ergo.git
synced 2024-11-23 04:19:25 +01:00
83 lines
2.0 KiB
Go
83 lines
2.0 KiB
Go
|
//go:generate go run maketables.go > tables.go
|
||
|
|
||
|
package confusables
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
|
||
|
"golang.org/x/text/unicode/norm"
|
||
|
)
|
||
|
|
||
|
// TODO: document casefolding approaches
|
||
|
// (suggest force-casefolding strings; explain how to catch paypal vs. pAypal)
|
||
|
// TODO: document that callers might want to store the Skeleton and check against it later
|
||
|
// TODO: implement xidmodifications.txt restricted characters
|
||
|
|
||
|
// lookupFunc maps a single rune to its replacement string. skeletonBase treats
// an empty result as "no replacement" and keeps the rune as it is.
type lookupFunc func(rune) string
|
||
|
|
||
|
func lookupReplacement(r rune) string {
|
||
|
return confusablesMap[r]
|
||
|
}
|
||
|
|
||
|
func lookupReplacementTweaked(r rune) string {
|
||
|
if replacement, ok := tweaksMap[r]; ok {
|
||
|
return replacement
|
||
|
}
|
||
|
return confusablesMap[r]
|
||
|
}
|
||
|
|
||
|
func skeletonBase(s string, lookup lookupFunc) string {
|
||
|
|
||
|
// 1. Converting X to NFD format
|
||
|
s = norm.NFD.String(s)
|
||
|
|
||
|
// 2. Successively mapping each source character in X to the target string
|
||
|
// according to the specified data table
|
||
|
var buf bytes.Buffer
|
||
|
changed := false // fast path: if this remains false, keep s intact
|
||
|
prevPos := 0
|
||
|
var replacement string
|
||
|
for i, r := range s {
|
||
|
if changed && replacement == "" {
|
||
|
buf.WriteString(s[prevPos:i])
|
||
|
}
|
||
|
prevPos = i
|
||
|
replacement = lookup(r)
|
||
|
if replacement != "" {
|
||
|
if !changed {
|
||
|
changed = true
|
||
|
// first replacement: copy over the previously unmodified text
|
||
|
buf.WriteString(s[:i])
|
||
|
}
|
||
|
buf.WriteString(replacement)
|
||
|
}
|
||
|
}
|
||
|
if changed && replacement == "" {
|
||
|
buf.WriteString(s[prevPos:]) // loop-and-a-half
|
||
|
}
|
||
|
if changed {
|
||
|
s = buf.String()
|
||
|
}
|
||
|
|
||
|
// 3. Reapplying NFD
|
||
|
s = norm.NFD.String(s)
|
||
|
|
||
|
return s
|
||
|
}
|
||
|
|
||
|
// Skeleton converts a string to its "skeleton" form
|
||
|
// as described in http://www.unicode.org/reports/tr39/#Confusable_Detection
|
||
|
func Skeleton(s string) string {
|
||
|
return skeletonBase(s, lookupReplacement)
|
||
|
}
|
||
|
|
||
|
// SkeletonTweaked is like Skeleton, but it implements some custom overrides
|
||
|
// to the confusables table (currently it removes the m -> rn mapping):
|
||
|
func SkeletonTweaked(s string) string {
|
||
|
return skeletonBase(s, lookupReplacementTweaked)
|
||
|
}
|
||
|
|
||
|
func Confusable(x, y string) bool {
|
||
|
return Skeleton(x) == Skeleton(y)
|
||
|
}
|