116 lines
3.1 KiB
Go
116 lines
3.1 KiB
Go
|
// Copyright 2015 The Go Authors. All rights reserved.
|
|||
|
// Use of this source code is governed by a BSD-style
|
|||
|
// license that can be found in the LICENSE file.
|
|||
|
|
|||
|
// +build ignore
|
|||
|
|
|||
|
// This program generates the trie for width operations. The generated table
|
|||
|
// includes width category information as well as the normalization mappings.
|
|||
|
package main
|
|||
|
|
|||
|
import (
|
|||
|
"bytes"
|
|||
|
"fmt"
|
|||
|
"io"
|
|||
|
"log"
|
|||
|
"math"
|
|||
|
"unicode/utf8"
|
|||
|
|
|||
|
"golang.org/x/text/internal/gen"
|
|||
|
"golang.org/x/text/internal/triegen"
|
|||
|
)
|
|||
|
|
|||
|
// See gen_common.go for flags.
|
|||
|
|
|||
|
func main() {
|
|||
|
gen.Init()
|
|||
|
genTables()
|
|||
|
genTests()
|
|||
|
gen.Repackage("gen_trieval.go", "trieval.go", "width")
|
|||
|
gen.Repackage("gen_common.go", "common_test.go", "width")
|
|||
|
}
|
|||
|
|
|||
|
func genTables() {
|
|||
|
t := triegen.NewTrie("width")
|
|||
|
// fold and inverse mappings. See mapComment for a description of the format
|
|||
|
// of each entry. Add dummy value to make an index of 0 mean no mapping.
|
|||
|
inverse := [][4]byte{{}}
|
|||
|
mapping := map[[4]byte]int{[4]byte{}: 0}
|
|||
|
|
|||
|
getWidthData(func(r rune, tag elem, alt rune) {
|
|||
|
idx := 0
|
|||
|
if alt != 0 {
|
|||
|
var buf [4]byte
|
|||
|
buf[0] = byte(utf8.EncodeRune(buf[1:], alt))
|
|||
|
s := string(r)
|
|||
|
buf[buf[0]] ^= s[len(s)-1]
|
|||
|
var ok bool
|
|||
|
if idx, ok = mapping[buf]; !ok {
|
|||
|
idx = len(mapping)
|
|||
|
if idx > math.MaxUint8 {
|
|||
|
log.Fatalf("Index %d does not fit in a byte.", idx)
|
|||
|
}
|
|||
|
mapping[buf] = idx
|
|||
|
inverse = append(inverse, buf)
|
|||
|
}
|
|||
|
}
|
|||
|
t.Insert(r, uint64(tag|elem(idx)))
|
|||
|
})
|
|||
|
|
|||
|
w := &bytes.Buffer{}
|
|||
|
gen.WriteUnicodeVersion(w)
|
|||
|
|
|||
|
sz, err := t.Gen(w)
|
|||
|
if err != nil {
|
|||
|
log.Fatal(err)
|
|||
|
}
|
|||
|
|
|||
|
sz += writeMappings(w, inverse)
|
|||
|
|
|||
|
fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024)
|
|||
|
|
|||
|
gen.WriteGoFile(*outputFile, "width", w.Bytes())
|
|||
|
}
|
|||
|
|
|||
|
// inverseDataComment is the comment text that writeMappings emits verbatim
// ahead of the generated inverseData table.
const inverseDataComment = `
// inverseData contains 4-byte entries of the following format:
// <length> <modified UTF-8-encoded rune> <0 padding>
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
// A -> A (U+FF21 -> U+0041)
// B -> B (U+FF22 -> U+0042)
// ...
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
// { 0x01, 0xE0, 0x00, 0x00 }
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
// E0 ^ A1 = 41.
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
// E0 ^ A2 = 42.
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.`

// writeMappings writes inverseDataComment followed by a Go declaration of the
// inverseData array, one line per entry of data, to w. It returns the size of
// the table in bytes, i.e. four bytes per entry.
func writeMappings(w io.Writer, data [][4]byte) int {
	const entrySize = 4 // bytes occupied by one inverseData entry

	fmt.Fprintln(w, inverseDataComment)
	fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data))
	for i := range data {
		e := &data[i]
		fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", e[0], e[1], e[2], e[3])
	}
	fmt.Fprintln(w, "}")

	return entrySize * len(data)
}
|
|||
|
|
|||
|
func genTests() {
|
|||
|
w := &bytes.Buffer{}
|
|||
|
fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n")
|
|||
|
getWidthData(func(r rune, tag elem, alt rune) {
|
|||
|
if alt != 0 {
|
|||
|
fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag)
|
|||
|
}
|
|||
|
})
|
|||
|
fmt.Fprintln(w, "}")
|
|||
|
gen.WriteGoFile("runes_test.go", "width", w.Bytes())
|
|||
|
}
|