'\p{Z}', 'mark' => '\p{M}', 'tab' => "\t" ];

    /**
     * Character-class fragments (for use inside a PCRE `[...]` class) describing
     * which kinds of characters are kept by convert().
     */
    public const REGEXPS = [
        'number' => '0-9',
        'alphabet' => 'a-zA-Z',
        // Chinese characters, without punctuation
        'hans' => '\x{3007}\x{2E80}-\x{2FFF}\x{3100}-\x{312F}\x{31A0}-\x{31EF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{F900}-\x{FAFF}',
        // Punctuation: !"#$%&'()*+,-./:;<=>?@[\]^_{|}~`
        'punctuation' => '\p{P}',
    ];

    /**
     * Merges the shared REGEXPS fragments into this instance's own fragment list.
     */
    public function __construct()
    {
        $this->regexps = \array_merge($this->regexps, self::REGEXPS);
    }

    /**
     * Named constructor; returns a fresh instance of the called class.
     */
    public static function make(): static
    {
        return new static();
    }

    /**
     * Makes convert() return every reading of each character (see convertAsChars()).
     */
    public function polyphonic(): static
    {
        $this->polyphonic = true;

        return $this;
    }

    /**
     * Makes convert() run the surname replacement step before the word dictionaries.
     */
    public function surname(): static
    {
        $this->asSurname = true;

        return $this;
    }

    /**
     * Makes convert() translate character by character instead of using the word dictionaries.
     */
    public function noWords(): static
    {
        $this->noWords = true;

        return $this;
    }

    /**
     * Keeps Chinese characters only: drops letters, digits and punctuation from the kept set.
     */
    public function onlyHans(): static
    {
        // Han characters only, no punctuation
        $this->regexps['hans'] = self::REGEXPS['hans'];

        return $this->noAlpha()->noNumber()->noPunctuation();
    }

    /**
     * Stops keeping ASCII letters.
     */
    public function noAlpha(): static
    {
        unset($this->regexps['alphabet']);

        return $this;
    }

    /**
     * Stops keeping ASCII digits.
     */
    public function noNumber(): static
    {
        unset($this->regexps['number']);

        return $this;
    }

    /**
     * Stops keeping punctuation.
     */
    public function noPunctuation(): static
    {
        unset($this->regexps['punctuation']);

        return $this;
    }

    /**
     * Sets the tone style used when formatting pinyin.
     *
     * @param string $toneStyle presumably one of the TONE_STYLE_* constants declared on this class
     */
    public function withToneStyle(string $toneStyle): static
    {
        $this->toneStyle = $toneStyle;

        return $this;
    }

    /**
     * Shortcut for withToneStyle(self::TONE_STYLE_NONE).
     */
    public function noTone(): static
    {
        $this->toneStyle = self::TONE_STYLE_NONE;

        return $this;
    }

    /**
     * Shortcut for withToneStyle(self::TONE_STYLE_NUMBER).
     */
    public function useNumberTone(): static
    {
        $this->toneStyle = self::TONE_STYLE_NUMBER;

        return $this;
    }

    /**
     * Renders the ü placeholder as "v" when tones are formatted.
     */
    public function yuToV(): static
    {
        $this->yuTo = 'v';

        return $this;
    }

    /**
     * Renders the ü placeholder as "u" when tones are formatted.
     */
    public function yuToU(): static
    {
        $this->yuTo = 'u';

        return $this;
    }

    /**
     * Invokes $callback with this converter when $condition is true.
     *
     * The callback's return value is ignored; the converter itself is always returned.
     */
    public function when(bool $condition, callable $callback): static
    {
        if ($condition) {
            $callback($this);
        }

        return $this;
    }

    /**
     * Converts a string to pinyin.
     *
     * Steps: prefix every run of ASCII letters/digits/_/- with a tab, strip all
     * characters not covered by the configured fragments, then either convert
     * per character (polyphonic / noWords modes) or apply the word dictionaries
     * segment by segment and split the result on whitespace.
     *
     * In polyphonic and noWords modes the method returns early, so neither the
     * surname step nor $beforeSplit is applied.
     *
     * @param string        $string      text to convert
     * @param callable|null $beforeSplit optional hook called with the dictionary-translated
     *                                   string; its return value is what gets split
     */
    public function convert(string $string, ?callable $beforeSplit = null): Collection
    {
        // Separate existing letters/digits from the Chinese text so they are not
        // mistaken for tone numbers during conversion.
        $string = preg_replace_callback(
            '~[a-z0-9_-]+~i',
            static fn (array $matches): string => "\t" . $matches[0],
            $string
        );

        // Strip every character that is not in the kept set.
        $string = \preg_replace(\sprintf('~[^%s]~u', \implode($this->regexps)), '', $string);

        // Polyphonic mode: all readings per character.
        if ($this->polyphonic) {
            return $this->convertAsChars($string, true);
        }

        if ($this->noWords) {
            return $this->convertAsChars($string);
        }

        // Replace a leading surname with its dedicated reading.
        if ($this->asSurname) {
            $string = $this->convertSurname($string);
        }

        for ($i = 0; $i < self::SEGMENTS_COUNT; $i++) {
            $string = strtr($string, require sprintf(self::WORDS_PATH, $i));
        }

        return $this->split($beforeSplit !== null ? $beforeSplit($string) : $string);
    }

    /**
     * Converts a string one character at a time using the character map.
     *
     * Characters missing from the map are skipped. The result is keyed by the
     * character itself, so a character that occurs several times appears once.
     *
     * @param bool $polyphonic when true each value is the list of all readings,
     *                         otherwise only the first reading
     */
    public function convertAsChars(string $string, bool $polyphonic = false): Collection
    {
        $map = require self::CHARS_PATH;

        // Split the string into individual UTF-8 characters.
        $chars = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);

        $items = [];

        foreach ($chars as $char) {
            if (!isset($map[$char])) {
                continue;
            }

            $items[$char] = $polyphonic
                ? \array_map(fn ($pinyin) => $this->formatTone($pinyin, $this->toneStyle), $map[$char])
                : $this->formatTone($map[$char][0], $this->toneStyle);
        }

        return new Collection($items);
    }

    /**
     * Replaces a leading surname with its pinyin, using the first matching
     * entry of the surname map; returns the name unchanged when none matches.
     */
    protected function convertSurname(string $name): string
    {
        // Loaded once and reused across calls.
        static $surnames = null;
        $surnames ??= require self::SURNAMES_PATH;

        foreach ($surnames as $surname => $pinyin) {
            if (\str_starts_with($name, $surname)) {
                return $pinyin . \mb_substr($name, \mb_strlen($surname));
            }
        }

        return $name;
    }

    /**
     * Splits a converted string on whitespace and formats the tone of each token.
     *
     * Empty tokens are discarded via PREG_SPLIT_NO_EMPTY rather than array_filter(),
     * which would also discard the literal token "0".
     */
    protected function split(string $item): Collection
    {
        $tokens = preg_split('/\s+/i', $item, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        $items = \array_map(
            fn (string $token): string => $this->formatTone($token, $this->toneStyle),
            $tokens
        );

        return new Collection($items);
    }

    /**
     * Rewrites tone-marked vowels in a pinyin syllable according to $style.
     *
     * - Symbol style: plain tone-marked vowels are left as they are; only the
     *   tone-marked ü glyphs (ǖ ǘ ǚ ǜ) are replaced.
     * - Other styles: each tone-marked vowel is replaced by its plain letter(s).
     * - Number style: the tone number is additionally appended to the syllable.
     *
     * The configured ü rendering ($this->yuTo) is applied only to replacements
     * whose plain form is the ü placeholder 'yu', never to other vowels.
     *
     * @param string $pinyin syllable, possibly containing tone-marked vowels
     * @param string $style  presumably one of the TONE_STYLE_* constants
     */
    protected function formatTone(string $pinyin, string $style): string
    {
        // tone-marked glyph => [plain replacement, tone number]
        $replacements = [
            'üē' => ['ue', 1], 'üé' => ['ue', 2], 'üě' => ['ue', 3], 'üè' => ['ue', 4],
            'ā' => ['a', 1], 'ē' => ['e', 1], 'ī' => ['i', 1], 'ō' => ['o', 1], 'ū' => ['u', 1], 'ǖ' => ['yu', 1],
            'á' => ['a', 2], 'é' => ['e', 2], 'í' => ['i', 2], 'ó' => ['o', 2], 'ú' => ['u', 2], 'ǘ' => ['yu', 2],
            'ǎ' => ['a', 3], 'ě' => ['e', 3], 'ǐ' => ['i', 3], 'ǒ' => ['o', 3], 'ǔ' => ['u', 3], 'ǚ' => ['yu', 3],
            'à' => ['a', 4], 'è' => ['e', 4], 'ì' => ['i', 4], 'ò' => ['o', 4], 'ù' => ['u', 4], 'ǜ' => ['yu', 4],
        ];

        foreach ($replacements as $unicode => [$plain, $tone]) {
            if (!\str_contains($pinyin, $unicode)) {
                continue;
            }

            $isUmlaut = $plain === 'yu';

            if (!$isUmlaut && $style === self::TONE_STYLE_SYMBOL) {
                continue;
            }

            // https://zh.wikipedia.org/wiki/%C3%9C
            if ($isUmlaut && $this->yuTo !== 'yu') {
                $plain = $this->yuTo;
            }

            $pinyin = \str_replace($unicode, $plain, $pinyin);

            if ($style === self::TONE_STYLE_NUMBER) {
                $pinyin .= $tone;
            }
        }

        return $pinyin;
    }
}