// Copyright (c) 2012-2014 Jeremy Latt
// Copyright (c) 2014-2015 Edmund Huber
// Copyright (c) 2016-2017 Daniel Oaks <daniel@danieloaks.net>
// released under the MIT license
2014-03-09 21:45:36 +01:00
|
|
|
package irc
|
|
|
|
|
|
|
|
import (
|
|
|
|
"strings"
|
2016-10-26 16:44:36 +02:00
|
|
|
|
2019-01-31 03:51:54 +01:00
|
|
|
"github.com/oragono/confusables"
|
2019-02-03 08:45:02 +01:00
|
|
|
"golang.org/x/text/cases"
|
|
|
|
"golang.org/x/text/language"
|
2016-10-26 16:44:36 +02:00
|
|
|
"golang.org/x/text/secure/precis"
|
2019-02-03 08:45:02 +01:00
|
|
|
"golang.org/x/text/width"
|
2014-03-09 21:45:36 +01:00
|
|
|
)
|
|
|
|
|
2017-01-13 17:32:15 +01:00
|
|
|
// casemappingName is the identifier of the casemapping scheme used by this
// package ("rfc8265").
// NOTE(review): presumably this is the value advertised to clients as the
// server's casemapping; confirm at the usage site.
const casemappingName = "rfc8265"
|
|
|
|
|
2019-01-31 00:59:49 +01:00
|
|
|
// Each pass of PRECIS casefolding is a composition of idempotent operations,
|
|
|
|
// but not idempotent itself. Therefore, the spec says "do it four times and hope
|
|
|
|
// it converges" (lolwtf). Golang's PRECIS implementation has a "repeat" option,
|
|
|
|
// which provides this functionality, but unfortunately it's not exposed publicly.
|
|
|
|
func iterateFolding(profile *precis.Profile, oldStr string) (str string, err error) {
|
|
|
|
str = oldStr
|
2017-08-17 10:23:24 +02:00
|
|
|
// follow the stabilizing rules laid out here:
|
|
|
|
// https://tools.ietf.org/html/draft-ietf-precis-7564bis-10.html#section-7
|
|
|
|
for i := 0; i < 4; i++ {
|
2019-01-31 00:59:49 +01:00
|
|
|
str, err = profile.CompareKey(str)
|
2017-08-17 10:23:24 +02:00
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
if oldStr == str {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
oldStr = str
|
|
|
|
}
|
|
|
|
if oldStr != str {
|
|
|
|
return "", errCouldNotStabilize
|
|
|
|
}
|
|
|
|
return str, nil
|
2014-03-09 21:45:36 +01:00
|
|
|
}
|
|
|
|
|
2019-01-31 00:59:49 +01:00
|
|
|
// Casefold returns a casefolded string, without doing any name or channel character checks.
|
|
|
|
func Casefold(str string) (string, error) {
|
|
|
|
return iterateFolding(precis.UsernameCaseMapped, str)
|
|
|
|
}
|
|
|
|
|
2016-10-11 15:51:46 +02:00
|
|
|
// CasefoldChannel returns a casefolded version of a channel name.
|
|
|
|
func CasefoldChannel(name string) (string, error) {
|
2018-12-06 04:35:36 +01:00
|
|
|
if len(name) == 0 {
|
2018-02-03 13:03:36 +01:00
|
|
|
return "", errStringIsEmpty
|
2016-04-21 02:48:15 +02:00
|
|
|
}
|
2014-03-09 21:45:36 +01:00
|
|
|
|
2018-12-06 04:35:36 +01:00
|
|
|
// don't casefold the preceding #'s
|
|
|
|
var start int
|
|
|
|
for start = 0; start < len(name) && name[start] == '#'; start += 1 {
|
|
|
|
}
|
|
|
|
|
|
|
|
if start == 0 {
|
|
|
|
// no preceding #'s
|
2016-10-11 15:51:46 +02:00
|
|
|
return "", errInvalidCharacter
|
|
|
|
}
|
2014-03-09 21:45:36 +01:00
|
|
|
|
2018-12-06 04:35:36 +01:00
|
|
|
lowered, err := Casefold(name[start:])
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
|
2016-10-11 15:51:46 +02:00
|
|
|
// space can't be used
|
|
|
|
// , is used as a separator
|
|
|
|
// * is used in mask matching
|
|
|
|
// ? is used in mask matching
|
2017-07-26 08:02:35 +02:00
|
|
|
if strings.ContainsAny(lowered, " ,*?") {
|
2016-10-11 15:51:46 +02:00
|
|
|
return "", errInvalidCharacter
|
|
|
|
}
|
2014-03-09 21:45:36 +01:00
|
|
|
|
2018-12-06 04:35:36 +01:00
|
|
|
return name[:start] + lowered, err
|
2014-03-09 21:45:36 +01:00
|
|
|
}
|
|
|
|
|
2016-10-11 15:51:46 +02:00
|
|
|
// CasefoldName returns a casefolded version of a nick/user name.
|
|
|
|
func CasefoldName(name string) (string, error) {
|
2017-01-13 17:32:15 +01:00
|
|
|
lowered, err := Casefold(name)
|
2014-03-09 21:45:36 +01:00
|
|
|
|
2016-10-11 15:51:46 +02:00
|
|
|
if err != nil {
|
|
|
|
return "", err
|
2017-01-22 03:44:05 +01:00
|
|
|
} else if len(lowered) == 0 {
|
2018-02-03 13:03:36 +01:00
|
|
|
return "", errStringIsEmpty
|
2016-10-11 15:51:46 +02:00
|
|
|
}
|
2014-03-09 21:45:36 +01:00
|
|
|
|
2016-10-11 15:51:46 +02:00
|
|
|
// space can't be used
|
|
|
|
// , is used as a separator
|
|
|
|
// * is used in mask matching
|
|
|
|
// ? is used in mask matching
|
|
|
|
// . denotes a server name
|
|
|
|
// ! separates nickname from username
|
|
|
|
// @ separates username from hostname
|
|
|
|
// : means trailing
|
|
|
|
// # is a channel prefix
|
|
|
|
// ~&@%+ are channel membership prefixes
|
|
|
|
// - I feel like disallowing
|
2017-07-26 08:02:35 +02:00
|
|
|
if strings.ContainsAny(lowered, " ,*?.!@:") || strings.ContainsAny(string(lowered[0]), "#~&@%+-") {
|
2016-10-11 15:51:46 +02:00
|
|
|
return "", errInvalidCharacter
|
|
|
|
}
|
2014-03-09 21:45:36 +01:00
|
|
|
|
2016-10-11 15:51:46 +02:00
|
|
|
return lowered, err
|
2014-03-09 21:45:36 +01:00
|
|
|
}
|
2019-01-31 00:59:49 +01:00
|
|
|
|
|
|
|
// isBoring reports whether name is "boring", i.e. made up exclusively of ASCII
// alphanumerics and a small set of benign ASCII punctuation. An empty name
// counts as boring.
//
// "boring" names are exempt from skeletonization. This is because
// confusables.txt considers various pure ASCII alphanumeric strings
// confusable: 0 and O, 1 and l, m and rn. IMO this causes more problems than
// it solves.
func isBoring(name string) bool {
	for _, b := range []byte(name) {
		switch b {
		case '$', '%', '^', '&', '(', ')', '{', '}', '[', ']', '<', '>', '=':
			// benign printable ascii characters
			continue
		}

		isLower := 'a' <= b && b <= 'z'
		isUpper := 'A' <= b && b <= 'Z'
		isDigit := '0' <= b && b <= '9'
		if !(isLower || isUpper || isDigit) {
			// potentially confusable ascii like | ' `, or a non-ascii byte
			return false
		}
	}
	return true
}
|
|
|
|
|
2019-02-03 09:49:42 +01:00
|
|
|
// isIdent returns true if the given name is a valid ident, using a mix of Insp
// and Chary's ident restrictions: the name must be non-empty, its first byte
// must be an ASCII letter or digit, and every later byte must be an ASCII
// letter, a digit, or one of [ \ ] ^ _ { | } - . `
func isIdent(name string) bool {
	if name == "" {
		return false
	}

	for pos, b := range []byte(name) {
		alnum := ('a' <= b && b <= 'z') || ('A' <= b && b <= 'Z') || ('0' <= b && b <= '9')
		switch {
		case alnum:
			// alphanumerics are accepted in every position
		case pos == 0:
			// first char must be alnum
			return false
		case b == '[' || b == '\\' || b == ']' || b == '^' || b == '_' || b == '`',
			b == '{' || b == '|' || b == '}' || b == '-' || b == '.':
			// allowed chars
		default:
			// disallowed chars
			return false
		}
	}

	return true
}
|
|
|
|
|
2019-01-31 00:59:49 +01:00
|
|
|
// Skeleton produces a canonicalized identifier that tries to catch
|
|
|
|
// homoglyphic / confusable identifiers. It's a tweaked version of the TR39
|
|
|
|
// skeleton algorithm. We apply the skeleton algorithm first and only then casefold,
|
|
|
|
// because casefolding first would lose some information about visual confusability.
|
|
|
|
// This has the weird consequence that the skeleton is not a function of the
|
|
|
|
// casefolded identifier --- therefore it must always be computed
|
|
|
|
// from the original (unfolded) identifier and stored/tracked separately from the
|
|
|
|
// casefolded identifier.
|
|
|
|
func Skeleton(name string) (string, error) {
|
2019-02-03 08:45:02 +01:00
|
|
|
// XXX the confusables table includes some, but not all, fullwidth->standard
|
|
|
|
// mappings for latin characters. do a pass of explicit width folding,
|
|
|
|
// same as PRECIS:
|
|
|
|
name = width.Fold.String(name)
|
|
|
|
|
2019-06-18 08:34:16 +02:00
|
|
|
if !isBoring(name) {
|
|
|
|
name = confusables.Skeleton(name)
|
|
|
|
}
|
|
|
|
|
2019-02-03 08:45:02 +01:00
|
|
|
// internationalized lowercasing for skeletons; this is much more lenient than
|
|
|
|
// Casefold. In particular, skeletons are expected to mix scripts (which may
|
|
|
|
// violate the bidi rule). We also don't care if they contain runes
|
|
|
|
// that are disallowed by PRECIS, because every identifier must independently
|
|
|
|
// pass PRECIS --- we are just further canonicalizing the skeleton.
|
|
|
|
return cases.Lower(language.Und).String(name), nil
|
2019-01-31 00:59:49 +01:00
|
|
|
}
|