328 lines
8.5 KiB
PHP
328 lines
8.5 KiB
PHP
|
|
<?php
|
|||
|
|
declare (strict_types = 1);
|
|||
|
|
/**
|
|||
|
|
* 敏感词类库.
|
|||
|
|
* User: Lustre
|
|||
|
|
* Date: 17/3/9
|
|||
|
|
* Time: 上午9:11
|
|||
|
|
*/
|
|||
|
|
namespace SensitiveHelper;
|
|||
|
|
|
|||
|
|
class SensitiveHelper
|
|||
|
|
{
|
|||
|
|
/**
|
|||
|
|
* 待检测语句长度
|
|||
|
|
*
|
|||
|
|
* @var int
|
|||
|
|
*/
|
|||
|
|
protected $contentLength = 0;
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 敏感词单例
|
|||
|
|
*
|
|||
|
|
* @var object|null
|
|||
|
|
*/
|
|||
|
|
private static $_instance = null;
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 敏感词库树
|
|||
|
|
*
|
|||
|
|
* @var HashMap|null
|
|||
|
|
*/
|
|||
|
|
protected $wordTree = null;
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 存放待检测语句敏感词
|
|||
|
|
*
|
|||
|
|
* @var array|null
|
|||
|
|
*/
|
|||
|
|
protected static $badWordList = null;
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 获取单例
|
|||
|
|
*
|
|||
|
|
* @return self
|
|||
|
|
*/
|
|||
|
|
public static function instance()
|
|||
|
|
{
|
|||
|
|
if (!self::$_instance instanceof self) {
|
|||
|
|
self::$_instance = new self();
|
|||
|
|
}
|
|||
|
|
return self::$_instance;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 构建敏感词树【文件模式】
|
|||
|
|
* @param string $filepath
|
|||
|
|
* @return $this
|
|||
|
|
* @throws \Exception
|
|||
|
|
*/
|
|||
|
|
public function setTreeByFile($filepath = null)
|
|||
|
|
{
|
|||
|
|
if (!file_exists($filepath)) {
|
|||
|
|
throw new \Exception('没有词库');
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 词库树初始化
|
|||
|
|
$this->wordTree = $this->wordTree ?: new HashMap();
|
|||
|
|
foreach ($this->yieldToReadFile($filepath) as $word) {
|
|||
|
|
$this->buildWordToTree(trim($word));
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return $this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 构建敏感词树【数组模式】
|
|||
|
|
* @param null $sensitiveWords
|
|||
|
|
* @return $this
|
|||
|
|
* @throws \Exception
|
|||
|
|
*/
|
|||
|
|
public function setTree($sensitiveWords = null, bool $type = true)
|
|||
|
|
{
|
|||
|
|
$this->wordTree = new HashMap();
|
|||
|
|
foreach ($sensitiveWords as $word) {
|
|||
|
|
$this->buildWordToTree($word);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return $this;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 检测文字中的敏感词
|
|||
|
|
*
|
|||
|
|
* @param string $content 待检测内容
|
|||
|
|
* @param int $matchType 匹配类型 [默认为最小匹配规则]
|
|||
|
|
* @param int $wordNum 需要获取的敏感词数量 [默认获取全部]
|
|||
|
|
* @return array
|
|||
|
|
*/
|
|||
|
|
public function getBadWord($content, $matchType = 1, $wordNum = 0)
|
|||
|
|
{
|
|||
|
|
$this->contentLength = mb_strlen($content, 'utf-8');
|
|||
|
|
$badWordList = array();
|
|||
|
|
for ($length = 0; $length < $this->contentLength; $length++) {
|
|||
|
|
$matchFlag = 0;
|
|||
|
|
$flag = false;
|
|||
|
|
$tempMap = $this->wordTree;
|
|||
|
|
for ($i = $length; $i < $this->contentLength; $i++) {
|
|||
|
|
|
|||
|
|
$keyChar = mb_substr($content, $i, 1, 'utf-8');
|
|||
|
|
|
|||
|
|
// 获取指定节点树
|
|||
|
|
$nowMap = $tempMap->get($keyChar);
|
|||
|
|
|
|||
|
|
// 不存在节点树,直接返回
|
|||
|
|
if (empty($nowMap)) {
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 存在,则判断是否为最后一个
|
|||
|
|
$tempMap = $nowMap;
|
|||
|
|
|
|||
|
|
// 找到相应key,偏移量+1
|
|||
|
|
$matchFlag++;
|
|||
|
|
|
|||
|
|
// 如果为最后一个匹配规则,结束循环,返回匹配标识数
|
|||
|
|
if (false === $nowMap->get('ending')) {
|
|||
|
|
continue;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
$flag = true;
|
|||
|
|
|
|||
|
|
// 最小规则,直接退出
|
|||
|
|
if (1 === $matchType) {
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (!$flag) {
|
|||
|
|
$matchFlag = 0;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 找到相应key
|
|||
|
|
if ($matchFlag <= 0) {
|
|||
|
|
continue;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
$badWordList[] = mb_substr($content, $length, $matchFlag, 'utf-8');
|
|||
|
|
|
|||
|
|
// 有返回数量限制
|
|||
|
|
if ($wordNum > 0 && count($badWordList) == $wordNum) {
|
|||
|
|
return $badWordList;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 需匹配内容标志位往后移
|
|||
|
|
$length = $length + $matchFlag - 1;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return $badWordList;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 替换敏感字字符
|
|||
|
|
*
|
|||
|
|
* @param $content 文本内容
|
|||
|
|
* @param string $replaceChar 替换字符
|
|||
|
|
* @param bool $repeat true=>重复替换为敏感词相同长度的字符
|
|||
|
|
* @param int $matchType
|
|||
|
|
* @return mixed
|
|||
|
|
*/
|
|||
|
|
public function replace($content, $replaceChar = '', $repeat = false, $matchType = 1)
|
|||
|
|
{
|
|||
|
|
if (empty($content)) {
|
|||
|
|
throw new \Exception('请填写检测的内容');
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
$badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType);
|
|||
|
|
|
|||
|
|
// 未检测到敏感词,直接返回
|
|||
|
|
if (empty($badWordList)) {
|
|||
|
|
return $content;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
foreach ($badWordList as $badWord) {
|
|||
|
|
$hasReplacedChar = $replaceChar;
|
|||
|
|
if ($repeat) {
|
|||
|
|
$hasReplacedChar = $this->dfaBadWordConversChars($badWord, $replaceChar);
|
|||
|
|
}
|
|||
|
|
$content = str_replace($badWord, $hasReplacedChar, $content);
|
|||
|
|
}
|
|||
|
|
return $content;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 标记敏感词
|
|||
|
|
* @param $content 文本内容
|
|||
|
|
* @param string $sTag 标签开头,如<mark>
|
|||
|
|
* @param string $eTag 标签结束,如</mark>
|
|||
|
|
* @param int $matchType
|
|||
|
|
* @return mixed
|
|||
|
|
*/
|
|||
|
|
public function mark($content, $sTag, $eTag, $matchType = 1)
|
|||
|
|
{
|
|||
|
|
if (empty($content)) {
|
|||
|
|
throw new \Exception('请填写检测的内容');
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
$badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType);
|
|||
|
|
|
|||
|
|
// 未检测到敏感词,直接返回
|
|||
|
|
if (empty($badWordList)) {
|
|||
|
|
return $content;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
foreach ($badWordList as $badWord) {
|
|||
|
|
$replaceChar = $sTag . $badWord . $eTag;
|
|||
|
|
$content = str_replace($badWord, $replaceChar, $content);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return $content;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 被检测内容是否合法
|
|||
|
|
* @param $content
|
|||
|
|
* @return bool
|
|||
|
|
*/
|
|||
|
|
public function islegal($content)
|
|||
|
|
{
|
|||
|
|
$this->contentLength = mb_strlen($content, 'utf-8');
|
|||
|
|
for ($length = 0; $length < $this->contentLength; $length++) {
|
|||
|
|
|
|||
|
|
$matchFlag = 0;
|
|||
|
|
$tempMap = $this->wordTree;
|
|||
|
|
for ($i = $length; $i < $this->contentLength; $i++) {
|
|||
|
|
|
|||
|
|
$keyChar = mb_substr($content, $i, 1, 'utf-8');
|
|||
|
|
|
|||
|
|
// 获取指定节点树
|
|||
|
|
$nowMap = $tempMap->get($keyChar);
|
|||
|
|
|
|||
|
|
// 不存在节点树,直接返回
|
|||
|
|
if (empty($nowMap)) {
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 找到相应key,偏移量+1
|
|||
|
|
$tempMap = $nowMap;
|
|||
|
|
$matchFlag++;
|
|||
|
|
|
|||
|
|
// 如果为最后一个匹配规则,结束循环,返回匹配标识数
|
|||
|
|
if (false === $nowMap->get('ending')) {
|
|||
|
|
continue;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 找到相应key
|
|||
|
|
if ($matchFlag <= 0) {
|
|||
|
|
continue;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 需匹配内容标志位往后移
|
|||
|
|
$length = $length + $matchFlag - 1;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
protected function yieldToReadFile($filepath)
|
|||
|
|
{
|
|||
|
|
$fp = fopen($filepath, 'r');
|
|||
|
|
while (!feof($fp)) {
|
|||
|
|
yield fgets($fp);
|
|||
|
|
}
|
|||
|
|
fclose($fp);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 将单个敏感词构建成树结构
|
|||
|
|
protected function buildWordToTree($word = '')
|
|||
|
|
{
|
|||
|
|
if ('' === $word) {
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
$tree = $this->wordTree;
|
|||
|
|
$wordLength = mb_strlen($word, 'utf-8');
|
|||
|
|
for ($i = 0; $i < $wordLength; $i++) {
|
|||
|
|
$keyChar = mb_substr($word, $i, 1, 'utf-8');
|
|||
|
|
|
|||
|
|
// 获取子节点树结构
|
|||
|
|
$tempTree = $tree->get($keyChar);
|
|||
|
|
|
|||
|
|
if ($tempTree) {
|
|||
|
|
$tree = $tempTree;
|
|||
|
|
} else {
|
|||
|
|
// 设置标志位
|
|||
|
|
$newTree = new HashMap();
|
|||
|
|
$newTree->put('ending', false);
|
|||
|
|
|
|||
|
|
// 添加到集合
|
|||
|
|
$tree->put($keyChar, $newTree);
|
|||
|
|
$tree = $newTree;
|
|||
|
|
}
|
|||
|
|
// 到达最后一个节点
|
|||
|
|
if ($i == $wordLength - 1) {
|
|||
|
|
$tree->put('ending', true);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 敏感词替换为对应长度的字符
|
|||
|
|
* @param $word
|
|||
|
|
* @param $char
|
|||
|
|
* @return string
|
|||
|
|
*/
|
|||
|
|
protected function dfaBadWordConversChars($word, $char)
|
|||
|
|
{
|
|||
|
|
$str = '';
|
|||
|
|
$length = mb_strlen($word, 'utf-8');
|
|||
|
|
for ($counter = 0; $counter < $length; ++$counter) {
|
|||
|
|
$str .= $char;
|
|||
|
|
}
|
|||
|
|
return $str;
|
|||
|
|
}
|
|||
|
|
}
|