mirror of
https://github.com/42wim/matterbridge.git
synced 2025-01-04 09:32:39 +01:00
196 lines
4.6 KiB
Go
196 lines
4.6 KiB
Go
|
package charset
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"unicode/utf8"
|
||
|
)
|
||
|
|
||
|
// init registers fromCP932 as the constructor for the "cp932" class.
// nil is passed as registerClass's third argument.
// NOTE(review): presumably that slot is the constructor for the opposite
// (to-CP932) direction, which this file does not provide - confirm
// against registerClass.
func init() {
|
||
|
registerClass("cp932", fromCP932, nil)
|
||
|
}
|
||
|
|
||
|
// encoding details
|
||
|
// (Traditional) Shift-JIS
|
||
|
//
|
||
|
// 00..1f control characters
|
||
|
// 20 space
|
||
|
// 21..7f JIS X 0201:1976/1997 roman (see notes)
|
||
|
// 80 undefined
|
||
|
// 81..9f lead byte of JIS X 0208-1983 or JIS X 0208:1990/1997
|
||
|
// a0 undefined
|
||
|
// a1..df JIS X 0201:1976/1997 katakana
|
||
|
// e0..ea lead byte of JIS X 0208-1983 or JIS X 0208:1990/1997
|
||
|
// eb..ff undefined
|
||
|
//
|
||
|
// CP932 (windows-31J)
|
||
|
//
|
||
|
// this encoding scheme extends Shift-JIS in the following way
|
||
|
//
|
||
|
// eb..ec undefined (marked as lead bytes - see notes below)
|
||
|
// ed..ee lead byte of NEC-selected IBM extended characters
|
||
|
// ef undefined (marked as lead byte - see notes below)
|
||
|
// f0..f9 lead byte of User defined GAIJI (see note below)
|
||
|
// fa..fc lead byte of IBM extended characters
|
||
|
// fd..ff undefined
|
||
|
//
|
||
|
//
|
||
|
// Notes
|
||
|
//
|
||
|
// JISX 0201:1976/1997 roman
|
||
|
// this is the same as ASCII but with 0x5c (ASCII code for '\')
|
||
|
// representing the Yen currency symbol '¥' (U+00a5)
|
||
|
// This mapping is contentious, some conversion packages implement it
|
||
|
// others do not.
|
||
|
// The mapping files from The Unicode Consortium show cp932 mapping
|
||
|
// plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen
|
||
|
// symbol (¥) and 0x7e ('~') to overline (¯)
|
||
|
//
|
||
|
// CP932 double-byte character codes:
|
||
|
//
|
||
|
// eb-ec, ef, f0-f9:
|
||
|
// Marked as DBCS LEAD BYTEs in the unicode mapping data
|
||
|
// obtained from:
|
||
|
// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
|
||
|
//
|
||
|
// but there are no defined mappings for codes in this range.
|
||
|
// It is not clear whether or not an implementation should
|
||
|
// consume one or two bytes before emitting an error char.
|
||
|
|
||
|
// Geometry of the mapping tables loaded by fromCP932. Each table is a flat
// run of pages; jisGetMap checks that a data file holds exactly
// page size * page count runes.
const (
|
||
|
// kanaPages is the number of pages in the JIS X 0201 katakana map.
kanaPages = 1
|
||
|
// kanaPageSize is the number of codes in the katakana page (a1..df).
kanaPageSize = 63
|
||
|
// kanaChar0 is the byte value that the first katakana entry is mapped to.
kanaChar0 = 0xa1
|
||
|
|
||
|
// cp932Pages is the number of double-byte pages, one per lead byte
// that fromCP932 assigns a page number to.
cp932Pages = 45 // 81..84, 87..9f, e0..ea, ed..ee, fa..fc
|
||
|
// cp932PageSize is the number of trail byte slots in each page.
cp932PageSize = 189 // 40..fc (including 7f)
|
||
|
// cp932Char0 is the trail byte value that selects the first slot of a page.
cp932Char0 = 0x40
|
||
|
)
|
||
|
|
||
|
// jisTables holds the lookup tables used to decode CP932 or Shift-JIS
// input. It is built by fromCP932 and read by translateFromCP932.
type jisTables struct {
|
||
|
// page0 is indexed by the first byte of a code. An entry of -1 marks
// the byte as a double-byte lead byte; any other entry is the rune
// emitted for that single byte.
page0 [256]rune
|
||
|
// dbcsoff is indexed by lead byte and gives the number of that lead
// byte's page within cp932.
dbcsoff [256]int
|
||
|
// cp932 is the double-byte map: consecutive pages of cp932PageSize
// runes, indexed by page number * cp932PageSize + (trail byte - cp932Char0).
cp932 []rune
|
||
|
}
|
||
|
|
||
|
// translateFromCP932 decodes CP932 or Shift-JIS bytes to UTF-8 using the
// tables prepared by fromCP932.
type translateFromCP932 struct {
|
||
|
// tables holds the lookup tables used for decoding.
tables *jisTables
|
||
|
// scratch is the output buffer; Translate truncates and refills it on
// every call and returns it to the caller.
scratch []byte
|
||
|
}
|
||
|
|
||
|
func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) {
|
||
|
tables := p.tables
|
||
|
p.scratch = p.scratch[:0]
|
||
|
n := 0
|
||
|
for i := 0; i < len(data); i++ {
|
||
|
b := data[i]
|
||
|
r := tables.page0[b]
|
||
|
if r != -1 {
|
||
|
p.scratch = appendRune(p.scratch, r)
|
||
|
n++
|
||
|
continue
|
||
|
}
|
||
|
// DBCS
|
||
|
i++
|
||
|
if i >= len(data) {
|
||
|
break
|
||
|
}
|
||
|
pnum := tables.dbcsoff[b]
|
||
|
ix := int(data[i]) - cp932Char0
|
||
|
if pnum == -1 || ix < 0 || ix >= cp932PageSize {
|
||
|
r = utf8.RuneError
|
||
|
} else {
|
||
|
r = tables.cp932[pnum*cp932PageSize+ix]
|
||
|
}
|
||
|
p.scratch = appendRune(p.scratch, r)
|
||
|
n += 2
|
||
|
}
|
||
|
return n, p.scratch, nil
|
||
|
}
|
||
|
|
||
|
// cp932Key is the key type fromCP932 passes to cache for its tables; the
// value is true for the "shiftjis" variant and false otherwise.
type cp932Key bool
|
||
|
|
||
|
func fromCP932(arg string) (Translator, error) {
|
||
|
shiftJIS := arg == "shiftjis"
|
||
|
tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) {
|
||
|
tables := new(jisTables)
|
||
|
kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// jisx0201kana is mapped into 0xA1..0xDF
|
||
|
for i := 0; i < kanaPageSize; i++ {
|
||
|
tables.page0[i+kanaChar0] = kana[i]
|
||
|
}
|
||
|
|
||
|
// 00..7f same as ascii in cp932
|
||
|
for i := rune(0); i < 0x7f; i++ {
|
||
|
tables.page0[i] = i
|
||
|
}
|
||
|
|
||
|
if shiftJIS {
|
||
|
// shift-jis uses JIS X 0201 for the ASCII range
|
||
|
// this is the same as ASCII apart from
|
||
|
// 0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯)
|
||
|
tables.page0['\\'] = '¥'
|
||
|
tables.page0['~'] = '¯'
|
||
|
}
|
||
|
|
||
|
// pre-calculate DBCS page numbers to mapping file page numbers
|
||
|
// and mark codes in page0 that are DBCS lead bytes
|
||
|
pnum := 0
|
||
|
for i := 0x81; i <= 0x84; i++ {
|
||
|
tables.page0[i] = -1
|
||
|
tables.dbcsoff[i] = pnum
|
||
|
pnum++
|
||
|
}
|
||
|
for i := 0x87; i <= 0x9f; i++ {
|
||
|
tables.page0[i] = -1
|
||
|
tables.dbcsoff[i] = pnum
|
||
|
pnum++
|
||
|
}
|
||
|
for i := 0xe0; i <= 0xea; i++ {
|
||
|
tables.page0[i] = -1
|
||
|
tables.dbcsoff[i] = pnum
|
||
|
pnum++
|
||
|
}
|
||
|
if shiftJIS {
|
||
|
return tables, nil
|
||
|
}
|
||
|
// add in cp932 extensions
|
||
|
for i := 0xed; i <= 0xee; i++ {
|
||
|
tables.page0[i] = -1
|
||
|
tables.dbcsoff[i] = pnum
|
||
|
pnum++
|
||
|
}
|
||
|
for i := 0xfa; i <= 0xfc; i++ {
|
||
|
tables.page0[i] = -1
|
||
|
tables.dbcsoff[i] = pnum
|
||
|
pnum++
|
||
|
}
|
||
|
return tables, nil
|
||
|
})
|
||
|
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
return &translateFromCP932{tables: tables.(*jisTables)}, nil
|
||
|
}
|
||
|
|
||
|
func jisGetMap(name string, pgsize, npages int) ([]rune, error) {
|
||
|
data, err := readFile(name)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
m := []rune(string(data))
|
||
|
if len(m) != pgsize*npages {
|
||
|
return nil, fmt.Errorf("%q: incorrect length data", name)
|
||
|
}
|
||
|
return m, nil
|
||
|
}
|