97 lines
3.1 KiB
PHP
97 lines
3.1 KiB
PHP
#!/usr/bin/env php
|
|
<?php
|
|
|
|
require __DIR__ . '/utils.php';

// Contents of polyphones.txt, split on commas.
$polyphones = explode(',', file_get_contents(__DIR__ . '/../sources/polyphones.txt'));

// Upstream source files and their local patch files. The variable names
// ("Souce", "Pathes") are kept as-is because the rest of the script refers to them.
$charsSouce = __DIR__ . '/../sources/chars.txt';
$charsPathes = __DIR__.'/../sources/pathes/chars.txt';
$wordsSouce = __DIR__ . '/../sources/words.txt';
$wordsPathes = __DIR__ . '/../sources/pathes/words.txt';

// Surname list, one raw line (including its line ending) per array element.
$surnamesSource = file(__DIR__.'/../sources/surnames.txt');

if ($surnamesSource === false) {
    throw new RuntimeException('Unable to read ' . __DIR__ . '/../sources/surnames.txt');
}

// Local file => remote origin. A file is only fetched when it is missing locally.
$remoteSources = [
    $charsSouce => 'https://raw.githubusercontent.com/mozillazg/pinyin-data/master/pinyin.txt',
    $wordsSouce => 'https://raw.githubusercontent.com/mozillazg/phrase-pinyin-data/master/large_pinyin.txt',
];

foreach ($remoteSources as $localPath => $remoteUrl) {
    if (file_exists($localPath)) {
        continue;
    }

    $downloaded = file_get_contents($remoteUrl);

    // Never persist a failed download: an empty file would satisfy file_exists()
    // on the next run and the real data would never be fetched again.
    if ($downloaded === false) {
        throw new RuntimeException("Unable to download {$remoteUrl}");
    }

    if (file_put_contents($localPath, $downloaded) === false) {
        throw new RuntimeException("Unable to write {$localPath}");
    }
}
|
|
|
|
|
|
// ------------------------------------------------
|
|
// Surname => tab-delimited pinyin ("\t" + syllables joined by "\t" + "\t").
$surnames = [];

foreach ($surnamesSource as $line) {
    $record = trim($line);

    // Blank lines carry no data; skipping them avoids inserting an empty-key entry.
    if ($record === '') {
        continue;
    }

    $fields = explode(',', $record);

    // Each record must be "surname,pinyin"; fail loudly instead of emitting a junk entry.
    if (count($fields) < 2) {
        throw new Exception("Malformed surname line: {$record}");
    }

    [$surname, $pinyin] = $fields;

    // Split the pinyin on whitespace and wrap it in leading/trailing tabs.
    $surnames[trim($surname)] = join("\t", ["", ...preg_split('/\s+/', trim($pinyin)), ""]);
}
|
|
|
|
|
|
// ------------------------------------------------
|
|
|
|
// Single characters, with polyphones: each character keeps the full value
// yielded by parse_chars().
$charWithPolyphones = [];

// Single characters, without polyphones: only element [0] of the value,
// wrapped in tabs.
$chars = [];

foreach (parse_chars($charsSouce) as $character => $readings) {
    $charWithPolyphones[$character] = $readings;
    $chars[$character] = "\t" . $readings[0] . "\t";
}

// Patch section: entries from the patch file overwrite the tab-wrapped table only.
$patchedChars = parse_chars(
    $charsPathes,
    function ($p) {
        return "\t" . $p[0] . "\t";
    }
);

foreach ($patchedChars as $character => $patchedReading) {
    $chars[$character] = $patchedReading;
}
|
|
|
|
// ------------------------------------------------
|
|
|
|
// Word => tab-delimited pinyin ("\t" + syllables joined by "\t" + "\t").
$words = [];

foreach (parse_words($wordsSouce) as $word => $pinyin) {
    // Split the word into individual UTF-8 characters.
    $wordChars = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);

    // Pair every character with its syllable. A count mismatch throws on PHP 8
    // and returns false on older versions; both cases are reported against the
    // word being parsed.
    try {
        $pinyinSegments = array_combine($wordChars, $pinyin);
    } catch (Throwable $e) {
        throw new Exception("行解析错误:$word", 0, $e);
    }

    if ($pinyinSegments === false) {
        throw new Exception("行解析错误:$word");
    }

    // Polyphone handling.
    // NOTE(review): array_intersect_key() matches on array keys (positional
    // indexes here), not on the characters themselves, so this keeps the
    // characters whose index also exists in $polyphones rather than only the
    // characters listed there. array_intersect() looks like the intended call;
    // confirm before changing, since it alters which words are emitted.
    $polyphoneChars = array_intersect_key($wordChars, $polyphones);

    foreach ($polyphoneChars as $char) {
        // Keep the word as soon as one of these characters is read differently
        // in the word than its first listed reading; otherwise the word is dropped.
        if (isset($charWithPolyphones[$char]) && $pinyinSegments[$char] != $charWithPolyphones[$char][0]) {
            $words[$word] = join("\t", ["", ...$pinyin, ""]);

            break;
        }
    }
}

// Words from the patch file are always included and overwrite earlier entries.
foreach (parse_words($wordsPathes) as $word => $pinyin) {
    $words[$word] = join("\t", ["", ...$pinyin, ""]);
}
|
|
|
|
// Clean up: make sure the output directory exists, then empty it.
$dataDir = __DIR__ . '/../data';

if (!is_dir($dataDir) && !mkdir($dataDir, 0755, true) && !is_dir($dataDir)) {
    throw new RuntimeException("Unable to create {$dataDir}");
}

// Quote the directory so spaces or shell metacharacters in the path cannot
// alter the command; the glob stays outside the quotes so the shell expands it.
exec('rm -rf ' . escapeshellarg($dataDir) . '/*');

// Surnames.
file_put_contents(__DIR__ . '/../data/surnames.php', "<?php\nreturn ".var_export($surnames, true).";\n");
echo count($surnames)." surnames saved.\n";

// Single characters, with polyphones. The count reported is that of the
// table actually written to disk.
file_put_contents(__DIR__ . '/../data/chars.php', "<?php\nreturn ".var_export($charWithPolyphones, true).";\n");
echo count($charWithPolyphones)." chars saved.\n";

// Words: longest to shortest, plus single characters.
$words = array_merge($words, $chars);

// Order keys by descending byte length.
uksort($words, fn ($a, $b) => strlen($b) <=> strlen($a));

// Write the table in chunks of 8000 entries, one file per chunk, keys preserved.
foreach (array_chunk($words, 8000, true) as $index => $group) {
    file_put_contents(__DIR__ . "/../data/words-{$index}.php", "<?php\nreturn ".var_export($group, true).";\n");
    echo count($group)." words saved in ".__DIR__ . "/../data/words-{$index}.php \n";
}
|