Files
swiftadmin/vendor/overtrue/pinyin/bin/build
2023-04-25 20:11:49 +08:00

97 lines
3.1 KiB
PHP

#!/usr/bin/env php
<?php
require __DIR__ . '/utils.php';
// Comma-separated entries loaded from sources/polyphones.txt.
$polyphonesRaw = file_get_contents(__DIR__ . '/../sources/polyphones.txt');
if ($polyphonesRaw === false) {
    throw new RuntimeException('Unable to read ' . __DIR__ . '/../sources/polyphones.txt');
}
// Trim first so a trailing newline is not glued onto the last entry.
$polyphones = explode(',', trim($polyphonesRaw));

// Upstream data files and the local patch files applied on top of them.
$charsSouce = __DIR__ . '/../sources/chars.txt';
$charsPathes = __DIR__ . '/../sources/pathes/chars.txt';
$wordsSouce = __DIR__ . '/../sources/words.txt';
$wordsPathes = __DIR__ . '/../sources/pathes/words.txt';

// Raw lines of the surname table (line endings are kept by file()).
$surnamesSource = file(__DIR__ . '/../sources/surnames.txt');
if ($surnamesSource === false) {
    throw new RuntimeException('Unable to read ' . __DIR__ . '/../sources/surnames.txt');
}

// Fetch any upstream data file that is not cached locally yet.
$remoteSources = [
    $charsSouce => 'https://raw.githubusercontent.com/mozillazg/pinyin-data/master/pinyin.txt',
    $wordsSouce => 'https://raw.githubusercontent.com/mozillazg/phrase-pinyin-data/master/large_pinyin.txt',
];
foreach ($remoteSources as $target => $url) {
    if (file_exists($target)) {
        continue;
    }

    $content = file_get_contents($url);
    // A failed download must not be cached: an empty file would satisfy the
    // file_exists() check above on every later run and never be re-fetched.
    if ($content === false || $content === '') {
        throw new RuntimeException("Failed to download {$url}");
    }
    if (file_put_contents($target, $content) === false) {
        throw new RuntimeException("Failed to write {$target}");
    }
}
// ------------------------------------------------
// Surname => tab-delimited pinyin ("\t" before, between and after syllables).
$surnames = [];
foreach ($surnamesSource as $line) {
    $entry = trim($line);
    // Blank lines would otherwise produce an empty-key entry and a warning.
    if ($entry === '') {
        continue;
    }

    $fields = explode(',', $entry);
    if (count($fields) < 2) {
        throw new InvalidArgumentException("Malformed surname line (expected \"surname,pinyin\"): {$entry}");
    }

    [$surname, $pinyin] = $fields;
    $surnames[trim($surname)] = join("\t", ["", ...preg_split('/\s+/', trim($pinyin)), ""]);
}
// ------------------------------------------------
// Single characters, keeping every reading (polyphones included).
$charWithPolyphones = [];
// Single characters reduced to their first reading, wrapped in tabs.
$chars = [];
foreach (parse_chars($charsSouce) as $character => $readings) {
    $chars[$character] = "\t" . $readings[0] . "\t";
    $charWithPolyphones[$character] = $readings;
}
// Patch section: entries from the patch file replace the upstream reading.
$patchedChars = parse_chars($charsPathes, fn ($readings) => "\t" . $readings[0] . "\t");
foreach ($patchedChars as $character => $patchedReading) {
    $chars[$character] = $patchedReading;
}
// ------------------------------------------------
// Word => tab-delimited pinyin for the words that must stay in the dictionary.
$words = [];
foreach (parse_words($wordsSouce) as $word => $pinyin) {
    // Split the word into individual UTF-8 characters.
    $wordChars = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);
    $readings = is_array($pinyin) ? array_values($pinyin) : null;

    // Every character needs exactly one reading. Report the offending word
    // itself; the previous message interpolated a stale variable left over
    // from the surname loop.
    if ($wordChars === false || $readings === null || count($wordChars) !== count($readings)) {
        throw new Exception("行解析错误:{$word}");
    }

    // Polyphone handling.
    // NOTE(review): array_intersect_key() matches on array keys (character
    // positions against list indexes of $polyphones), not on the characters
    // themselves, so this keeps every character whose position is a valid
    // index of $polyphones rather than only polyphonic characters.
    // array_intersect() may have been intended, but switching changes which
    // words are emitted — confirm before changing.
    $polyphoneChars = array_intersect_key($wordChars, $polyphones);

    foreach ($polyphoneChars as $position => $char) {
        // Keep the word when any checked character is read differently from
        // its most common (first listed) reading; otherwise drop the word.
        // Readings are looked up by position so that a character repeated
        // within the word is checked against its own reading each time.
        if (isset($charWithPolyphones[$char]) && $readings[$position] != $charWithPolyphones[$char][0]) {
            $words[$word] = join("\t", ["", ...$pinyin, ""]);
            break;
        }
    }
}
// Patch words are always included and replace any reading computed above.
foreach (parse_words($wordsPathes) as $phrase => $syllables) {
    $cells = ['', ...$syllables, ''];
    $words[$phrase] = implode("\t", $cells);
}
// Directory that receives the generated PHP data files.
$dataDir = __DIR__ . '/../data';

// Writes $data to $path as a PHP file that returns the array.
$writeDataFile = static function (string $path, array $data): void {
    $code = "<?php\nreturn " . var_export($data, true) . ";\n";
    if (file_put_contents($path, $code) === false) {
        throw new RuntimeException("Unable to write {$path}");
    }
};

// Clean up previously generated files.
if (!is_dir($dataDir) && !mkdir($dataDir, 0777, true) && !is_dir($dataDir)) {
    throw new RuntimeException("Unable to create {$dataDir}");
}
// Quote the directory so paths containing spaces or shell metacharacters are
// safe; the trailing /* stays unquoted so the shell still expands the glob.
exec('rm -rf ' . escapeshellarg($dataDir) . '/*');

// Surnames.
$writeDataFile($dataDir . '/surnames.php', $surnames);
echo count($surnames) . " surnames saved.\n";

// Single characters, including polyphonic readings.
$writeDataFile($dataDir . '/chars.php', $charWithPolyphones);
// Report the size of the array that was actually written.
echo count($charWithPolyphones) . " chars saved.\n";

// Words ordered from longest to shortest key (byte length), plus single characters.
$words = array_merge($words, $chars);
uksort($words, fn ($a, $b) => strlen($b) <=> strlen($a));

// Split into files of at most 8000 entries each, preserving keys.
foreach (array_chunk($words, 8000, true) as $index => $group) {
    $path = $dataDir . "/words-{$index}.php";
    $writeDataFile($path, $group);
    echo count($group) . " words saved in " . $path . " \n";
}