// +build ignore // 本程序生成 zhmakeindex/CJK 使用的汉字数据表。 package main import ( "archive/zip" "bufio" "bytes" "flag" "fmt" "io" "io/ioutil" "log" "net/http" "os" "path" "strconv" "strings" "unicode" ) func init() { log.SetFlags(log.Lshortfile) } func main() { outdir := flag.String("d", ".", "输出目录") output_stroke := flag.Bool("stroke", true, "输出笔顺表") output_reading := flag.Bool("reading", true, "输出读音表") output_radical := flag.Bool("radical", true, "输出部首表") flag.Parse() // 数据文件 Unihan.zip var unihan *zip.Reader if *output_stroke || *output_reading || *output_radical { unihan = readUnihan() } if *output_stroke { make_stroke_table(*outdir, unihan) } if *output_reading { make_reading_table(*outdir, unihan) } if *output_radical { make_radical_table(*outdir, unihan) } } // 读取 Unihan 数据文件 func readUnihan() *zip.Reader { resp, err := http.Get("http://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip") if err != nil { log.Fatalln(err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { log.Fatalf("bad GET status for Unihan.zip: %d", resp.Status) } buffer, err := ioutil.ReadAll(resp.Body) if err != nil { log.Fatalln(err) } unihan, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) if err != nil { log.Fatalln(err) } return unihan } func getUnihanFile(unihan *zip.Reader, file string) io.ReadCloser { var unihan_file io.ReadCloser var err error for _, f := range unihan.File { if f.Name == file { unihan_file, err = f.Open() if err != nil { log.Fatalln(err) } break } } return unihan_file } const MAX_CODEPOINT = 0x40000 // 覆盖 Unicode 第 0、1、2、3 平面 func make_stroke_table(outdir string, unihan *zip.Reader) { var CJKstrokes [MAX_CODEPOINT][]byte var maxStroke int = 0 var unicodeVersion string // 使用海峰五笔码表数据,生成笔顺表 sunwb_file, err := os.Open("sunwb_strokeorder.txt") if err != nil { log.Fatalln(err) } defer sunwb_file.Close() scanner := bufio.NewScanner(sunwb_file) for i := 1; scanner.Scan(); i++ { if scanner.Err() != nil { log.Fatalln(scanner.Err()) } line := scanner.Text() line = strings.TrimSpace(line) if strings.HasPrefix(line, "#") { continue } fields := strings.Split(line, "\t") if len(fields) != 2 || len([]rune(fields[0])) != 1 || strings.IndexFunc(fields[1], isNotDigit) != -1 { log.Printf("笔顺文件第 %d 行语法错误,忽略。\n", i) continue } var r rune = []rune(fields[0])[0] var order []byte for _, rdigit := range fields[1] { digit, _ := strconv.ParseInt(string(rdigit), 10, 8) order = append(order, byte(digit)) } CJKstrokes[r] = order if len(order) > maxStroke { maxStroke = len(order) } } // 使用 Unihan 数据库,读取笔画数补全其他字符 unihan_file := getUnihanFile(unihan, "Unihan_DictionaryLikeData.txt") defer unihan_file.Close() scanner = bufio.NewScanner(unihan_file) for scanner.Scan() { if scanner.Err() != nil { log.Fatalln(scanner.Err()) } line := scanner.Text() if strings.Contains(line, "Unicode version:") { unicodeVersion = strings.TrimPrefix(line, "# ") } if strings.HasPrefix(line, "U+") && strings.Contains(line, "kTotalStrokes") { fields := strings.Split(line, "\t") var r rune fmt.Sscanf(fields[0], "U+%X", &r) var stroke int fmt.Sscanf(fields[2], "%d", &stroke) if CJKstrokes[r] != nil { // 笔顺数据已有,检查一致性 if stroke != len(CJKstrokes[r]) { log.Printf("U+%04X (%c) 的笔顺数据(%d 画)与 unihan 笔画数(%d 画)不一致,跳过 unihan 数据\n", r, r, len(CJKstrokes[r]), stroke) } } else { // 无笔顺数据,假定每个笔画都是 6 号(未知) var order = make([]byte, stroke) for i := range order { order[i] = 6 } CJKstrokes[r] = order if stroke > maxStroke { maxStroke = stroke } } } } // 输出笔顺表 outfile, err := os.Create(path.Join(outdir, "strokes.go")) if err != nil { log.Fatalln(err) } defer outfile.Close() fmt.Fprintln(outfile, `// 这是由程序自动生成的文件,请不要直接编辑此文件 // 笔顺来源:sunwb_strokeorder.txt // 笔画数来源:Unihan_DictionaryLikeData.txt`) fmt.Fprintf(outfile, "// Unicode 版本:%s\n", unicodeVersion) fmt.Fprintln(outfile, "\n"+`package CJK`) fmt.Fprintln(outfile, "\n"+`// Strokes 从字符取得笔顺信息。 var Strokes map[rune]string = strokes var strokes = map[rune]string{`) for r, order := range CJKstrokes { if order == nil { continue } fmt.Fprintf(outfile, "\t%#x: \"", r) for _, s := range order { fmt.Fprintf(outfile, "\\x%02x", s) } fmt.Fprintf(outfile, "\", // %c\n", r) } fmt.Fprintln(outfile, `}`) fmt.Fprintf(outfile, "\nconst MAX_STROKE = %d\n", maxStroke) } func isNotDigit(r rune) bool { return !unicode.IsDigit(r) } func make_reading_table(outdir string, unihan *zip.Reader) { // 读取 Unihan 读音表 reading_table := make(map[rune]*ReadingEntry) reading_file := getUnihanFile(unihan, "Unihan_Readings.txt") defer reading_file.Close() scanner := bufio.NewScanner(reading_file) largest := rune(0) var version string for scanner.Scan() { if scanner.Err() != nil { log.Fatalln(scanner.Err()) } line := scanner.Text() if strings.Contains(line, "Unicode version:") { version = strings.TrimPrefix(line, "# ") } if strings.HasPrefix(line, "U+") { fields := strings.Split(line, "\t") var r rune fmt.Sscanf(fields[0], "U+%X", &r) if reading_table[r] == nil { reading_table[r] = &ReadingEntry{} } switch fields[1] { case "kHanyuPinyin": reading_table[r].HanyuPinyin = fields[2] case "kMandarin": reading_table[r].Mandarin = fields[2] } if r > largest { largest = r } } } // 整理所有汉字的拼音表 out_reading_table := make([]string, largest+1) for k, v := range reading_table { pinyin := v.regular() numbered := NumberedPinyin(pinyin) out_reading_table[k] = numbered } // 单独增加数字“〇”的读音 if out_reading_table['〇'] == "" { out_reading_table['〇'] = "ling2" } // 输出 outfile, err := os.Create(path.Join(outdir, "readings.go")) if err != nil { log.Fatalln(err) } defer outfile.Close() fmt.Fprintln(outfile, `// 这是由程序自动生成的文件,请不要直接编辑此文件 // 来源:Unihan_Readings.txt`) fmt.Fprintln(outfile, `//`, version) fmt.Fprintln(outfile, "\n"+`package CJK`) fmt.Fprintln(outfile, "\n"+`// Readings 从字符取得常用读音。 var Readings map[rune]string = readings var readings = map[rune]string{`) for k, v := range out_reading_table { if v != "" { fmt.Fprintf(outfile, "\t%#x: %s, // %c\n", k, strconv.Quote(v), k) } } fmt.Fprintln(outfile, `}`) } type ReadingEntry struct { HanyuPinyin string Mandarin string } // 取出最常用的一个拼音 // 以 Mandarin 为主,不足的以 HanyuPinyin 补全 func (entry *ReadingEntry) regular() string { // kMandarin Syntax: [a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+ // 如 lüè if entry.Mandarin != "" { // 目前文件中没有多值情况,不过按 UAX #38 允许多值 return strings.Split(entry.Mandarin, " ")[0] } // kHanyuPinyin Syntax: (\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:([a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+,)*[a-z\x{300}-\x{302}\x{304}\x{308}\x{30C}]+ // 如 10093.130:xī,lǔ 74609.020:lǔ,xī if entry.HanyuPinyin != "" { // 第一个冒号后,逗号/空格或词尾前的部分 b := strings.Index(entry.HanyuPinyin, ":") e := strings.IndexAny(entry.HanyuPinyin[b:], " ,") if e > 0 { return entry.HanyuPinyin[b+1 : b+e] } else { return entry.HanyuPinyin[b+1:] } } // 没有汉语读音 return "" } // 把拼音转换为无声调的拼音加数字声调 // 其中 ü 变为 v,轻声调号为 5,如 lǎo 转换为 lao3,lǘ 转换为 lv2 func NumberedPinyin(pinyin string) string { if pinyin == "" { return "" } numbered := []rune{} tone := 5 for _, r := range pinyin { if Vowel[r] == 0 { numbered = append(numbered, r) } else { numbered = append(numbered, Vowel[r]) } if Tones[r] != 0 { tone = Tones[r] } } numbered = append(numbered, []rune(strconv.Itoa(tone))...) return string(numbered) } var Vowel = map[rune]rune{ 'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a', 'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o', 'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e', 'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i', 'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u', 'ǖ': 'v', 'ǘ': 'v', 'ǚ': 'v', 'ǜ': 'v', 'ü': 'v', 'ń': 'n', 'ň': 'n', 'ǹ': 'n', } var Tones = map[rune]int{ 'ā': 1, 'ō': 1, 'ē': 1, 'ī': 1, 'ū': 1, 'ǖ': 1, 'á': 2, 'ó': 2, 'é': 2, 'í': 2, 'ú': 2, 'ǘ': 2, 'ń': 2, 'ǎ': 3, 'ǒ': 3, 'ě': 3, 'ǐ': 3, 'ǔ': 3, 'ǚ': 3, 'ň': 3, 'à': 4, 'ò': 4, 'è': 4, 'ì': 4, 'ù': 4, 'ǜ': 4, 'ǹ': 4, 'ü': 5, } func make_radical_table(outdir string, unihan *zip.Reader) { // 读入部首 CJKRadical := read_radicals() // 读入部首、除部首笔画 version, CJKRadicalStrokes := read_radical_strokes(unihan) // 单独增加数字“〇”的部首、除部首笔画(乙部 0 画) CJKRadicalStrokes['〇'] = MakeRadicalStroke('〇', 5, 0) // 输出 outfile, err := os.Create(path.Join(outdir, "radicalstrokes.go")) if err != nil { log.Fatalln(err) } defer outfile.Close() fmt.Fprintln(outfile, `// 这是由程序自动生成的文件,请不要直接编辑此文件 // 部首来源:CJKRadicals.txt // 部首笔画数来源:Unihan_IRGSources.txt`) fmt.Fprintln(outfile, `//`, version) fmt.Fprintln(outfile, "\n"+`package CJK // Radical 是康熙字典部首类型。 // 此类型未包括 U+2F00 至 U+2FD5 等部首专用符号。 type Radical struct { Origin rune // 原部首的对应汉字 Simplified rune // 简化部首 } const MAX_RADICAL = 214 // Radicals 是所有部首。 var Radicals [MAX_RADICAL + 1]Radical = radicals var radicals = [MAX_RADICAL + 1]Radical{`) for i := 1; i < MAX_RADICAL+1; i++ { fmt.Fprintf(outfile, "\t%d: {%#x, %#x}, // %c", i, CJKRadical[i].Origin, CJKRadical[i].Simplified, CJKRadical[i].Origin) if CJKRadical[i].Simplified == 0 { fmt.Fprintln(outfile) } else { fmt.Fprintf(outfile, " (%c)\n", CJKRadical[i].Simplified) } } fmt.Fprintln(outfile, "}\n") fmt.Fprintln(outfile, `// RadicalStroke 为部首与除部首笔画数。 // 前两个字节分别放部首和除部首笔画数,后面放字符本身的 UTF-8 编码,可直接排序。 type RadicalStroke string // Radical 取得部首编号。 func (rs RadicalStroke) Radical() int { return int(rs[0]) } // Stroke 取得除部首笔画数。 func (rs RadicalStroke) Stroke() int { return int(rs[1]) } // RadicalStrokes 从字符取得部首与除部首笔画数信息。 var RadicalStrokes map[rune]RadicalStroke = radicalStrokes var radicalStrokes = map[rune]RadicalStroke{`) for r, rs := range CJKRadicalStrokes { if rs != "" { fmt.Fprintf(outfile, "\t%#x: %+q, // %c\n", r, rs, r) } } fmt.Fprintln(outfile, "}") } // 康熙字典部首 // 未包括 U+2F00 至 U+2FD5 等部首专用符号 type Radical struct { Origin rune // 原部首的对应汉字 Simplified rune // 简化部首 } const MAX_RADICAL = 214 // 读取 CJKRadicals.txt 获取康熙字典部首表 func read_radicals() [MAX_RADICAL + 1]Radical { resp, err := http.Get("http://www.unicode.org/Public/UCD/latest/ucd/CJKRadicals.txt") if err != nil { log.Fatalln(err) } if resp.StatusCode != http.StatusOK { log.Fatalf("bad GET status for CJKRadicals.txt: %d", resp.Status) } defer resp.Body.Close() var CJKRadical [MAX_RADICAL + 1]Radical scanner := bufio.NewScanner(resp.Body) for scanner.Scan() { if scanner.Err() != nil { log.Fatalln(scanner.Err()) } line := strings.TrimSpace(scanner.Text()) if line == "" || strings.HasPrefix(line, "#") { continue } fields := strings.Split(line, ";") indexstr := fields[0] var index int fmt.Sscanf(fields[0], "%d", &index) var char rune fmt.Sscanf(fields[2], "%X", &char) if strings.HasSuffix(indexstr, "'") { CJKRadical[index].Simplified = char } else { CJKRadical[index].Origin = char } } return CJKRadical } // 部首与除部首笔画数 // 前两个字节分别放部首和除部首笔画数,后面放字符本身的 UTF-8 编码,可直接排序 type RadicalStroke string func (rs RadicalStroke) Radical() int { return int(rs[0]) } func (rs RadicalStroke) Stroke() int { return int(rs[1]) } func MakeRadicalStroke(r rune, radical, stroke int) RadicalStroke { buf := []byte{byte(radical), byte(stroke)} return RadicalStroke(buf) + RadicalStroke(r) } // 读取 Unihan_IRGSources.txt 获取部首笔画数表 func read_radical_strokes(unihan *zip.Reader) (version string, CJKRadicalStrokes []RadicalStroke) { radical_file := getUnihanFile(unihan, "Unihan_IRGSources.txt") defer radical_file.Close() CJKRadicalStrokes = make([]RadicalStroke, MAX_CODEPOINT) scanner := bufio.NewScanner(radical_file) for scanner.Scan() { if scanner.Err() != nil { log.Fatalln(scanner.Err()) } line := strings.TrimSpace(scanner.Text()) if strings.Contains(line, "Unicode version:") { version = strings.TrimPrefix(line, "# ") } if strings.HasPrefix(line, "U+") { fields := strings.Split(line, "\t") // kRSUnicode 语法:[1-9][0-9]{0,2}\'?\.-?[0-9]{1,2} // 点前面是部首编号,加撇表示简化字部首; // 点后面是除部首笔画数,可能为负数表示是部首简化的部分,但这里将负笔画计为 0 if fields[1] == "kRSUnicode" { var r rune fmt.Sscanf(fields[0], "U+%X", &r) var radical, stroke int if strings.ContainsRune(fields[2], '\'') { fmt.Sscanf(fields[2], "%d'.%d", &radical, &stroke) } else { fmt.Sscanf(fields[2], "%d.%d", &radical, &stroke) } if stroke < 0 { stroke = 0 } CJKRadicalStrokes[r] = MakeRadicalStroke(r, radical, stroke) } } } return }