您的位置:首页 > 编程语言 > Java开发

[Java Web]敏感词过滤算法

2015-01-19 13:26 381 查看
1.DFA算法

DFA算法的原理可以参考这里,简单来说就是通过Map构造出一棵敏感词树,从根节点出发、到任意一个带结束标记(isEnd)的节点为止的路径构成一个敏感词(结束标记不一定在叶子节点上,例如一个敏感词是另一个敏感词的前缀时),例如下图:



代码简单实现如下:

/**
 * Sensitive-word filter backed by a character trie (the "DFA" approach).
 *
 * <p>Each sensitive word is stored as a path of characters starting at the root level;
 * the node reached by the last character of a word carries an end marker. Scanning a
 * text walks the trie from every start position and reports every marked node it passes.
 *
 * <p>The trie is built lazily on the first call to {@code checkSensitiveWord}. No
 * synchronization is performed in this class.
 */
public class TextFilterUtil {

    // Logger
    private static final Logger LOG = LoggerFactory.getLogger(TextFilterUtil.class);
    // Root level of the sensitive-word trie (first character -> node); null until init() has run
    private static Map<Character, TrieNode> sensitiveWordMap = null;
    // Encoding used to decode the word-list file
    private static final String ENCODING = "gbk";
    // Classpath location of the word list, one sensitive word per line
    private static final String KEY_WORDS_RESOURCE = "sensitive/keyWords.txt";

    /**
     * One node of the trie: the outgoing edges keyed by next character, plus a flag
     * telling whether the path leading to this node spells a complete sensitive word.
     */
    private static final class TrieNode {
        private final Map<Character, TrieNode> children = new HashMap<>();
        private boolean end;
    }

    /**
     * Loads the word list and builds the trie from it.
     */
    private static void init() {
        // Read the word list
        Set<String> keyWords = readSensitiveWords();
        // Build the trie
        sensitiveWordMap = new HashMap<>(keyWords.size());
        for (String keyWord : keyWords) {
            createKeyWord(keyWord);
        }
    }

    /**
     * Inserts one sensitive word into the trie.
     *
     * <p>Logs an error and does nothing when the trie has not been created yet.
     * {@code null} and empty words are ignored.
     *
     * @param keyWord the sensitive word to add
     */
    private static void createKeyWord(String keyWord) {
        if (sensitiveWordMap == null) {
            LOG.error("sensitiveWordMap 未初始化!");
            return;
        }
        if (keyWord == null || keyWord.isEmpty()) {
            return;
        }
        Map<Character, TrieNode> level = sensitiveWordMap;
        TrieNode node = null;
        for (int i = 0; i < keyWord.length(); i++) {
            Character c = keyWord.charAt(i);
            node = level.get(c);
            if (node == null) {
                node = new TrieNode();
                level.put(c, node);
            }
            level = node.children;
        }
        // keyWord is non-empty here, so node is the node of its last character
        node.end = true;
    }

    /**
     * Reads the sensitive-word list from the classpath resource.
     *
     * <p>Every line is trimmed; blank lines are skipped. Failures are logged and the
     * words read so far (possibly none) are returned.
     *
     * @return the set of sensitive words, never {@code null}
     */
    private static Set<String> readSensitiveWords() {
        Set<String> keyWords = new HashSet<>();
        // Open the stream here (not in a static field) so it is always closed and a
        // missing resource is reported instead of causing a NullPointerException.
        try (InputStream in = TextFilterUtil.class.getClassLoader().getResourceAsStream(KEY_WORDS_RESOURCE)) {
            if (in == null) {
                LOG.error("敏感词库文件不存在!");
                return keyWords;
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING));
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (!word.isEmpty()) {
                    keyWords.add(word);
                }
            }
        } catch (UnsupportedEncodingException e) {
            LOG.error("敏感词库文件转码失败!", e);
        } catch (IOException e) {
            LOG.error("敏感词库文件读取失败!", e);
        }
        return keyWords;
    }

    /**
     * Finds every occurrence of a sensitive word in the given text.
     *
     * <p>All matches are reported, including overlapping ones and shorter words that
     * are prefixes of longer matches, in order of start position and then length.
     *
     * @param text the text to scan; {@code null} or empty yields an empty result
     * @return the matched words in the order they were found, never {@code null}
     */
    private static List<String> checkSensitiveWord(String text) {
        List<String> sensitiveWords = new ArrayList<>();
        if (text == null || text.isEmpty()) {
            return sensitiveWords;
        }
        if (sensitiveWordMap == null) {
            init();
        }
        for (int start = 0; start < text.length(); start++) {
            Map<Character, TrieNode> level = sensitiveWordMap;
            for (int pos = start; pos < text.length(); pos++) {
                TrieNode node = level.get(text.charAt(pos));
                if (node == null) {
                    break;
                }
                // Test the end marker right after consuming the character, so a word
                // that ends exactly at the end of the text is still reported.
                if (node.end) {
                    sensitiveWords.add(text.substring(start, pos + 1));
                }
                level = node.children;
            }
        }
        return sensitiveWords;
    }
}


2.TTMP算法
TTMP算法由网友原创,关于它的起源可以查看这里,TTMP算法的原理是将敏感词拆分成“脏字”的序列,只有待比对字符串完全由“脏字”组成时,才去判断它是否为敏感词,减少了比对次数。这个算法的简单实现如下:

/**
 * Sensitive-word filter using the "TTMP" approach.
 *
 * <p>Every character that occurs in any sensitive word is recorded as a "dirty"
 * character. When scanning a text, a candidate substring is only looked up in the
 * word set while it consists entirely of dirty characters.
 *
 * <p>The word and character sets are built lazily on the first call to
 * {@code checkSensitiveWord}. No synchronization is performed in this class.
 */
public class TextFilterUtil {

    // Logger
    private static final Logger LOG = LoggerFactory.getLogger(TextFilterUtil.class);
    // Encoding used to decode the word-list file
    private static final String ENCODING = "gbk";
    // Classpath location of the word list, one sensitive word per line
    private static final String KEY_WORDS_RESOURCE = "sensitive/keyWords.txt";
    // Every character that appears in at least one sensitive word
    private static Set<Character> sensitiveCharSet = null;
    // The sensitive words themselves
    private static Set<String> sensitiveWordSet = null;

    /**
     * Creates the two lookup sets and fills them from the word list.
     */
    private static void init() {
        // Create the containers
        sensitiveCharSet = new HashSet<>();
        sensitiveWordSet = new HashSet<>();
        // Read the file and populate both sets
        readSensitiveWords();
    }

    /**
     * Reads the word list from the classpath resource into the word set and the
     * dirty-character set.
     *
     * <p>Every line is trimmed; blank lines are skipped. Failures are logged and
     * whatever was read so far is kept. Expects both sets to have been created
     * already (see {@code init}).
     */
    private static void readSensitiveWords() {
        // Open the stream here (not in a static field) so it is always closed and a
        // missing resource is reported instead of causing a NullPointerException.
        try (InputStream in = TextFilterUtil.class.getClassLoader().getResourceAsStream(KEY_WORDS_RESOURCE)) {
            if (in == null) {
                LOG.error("敏感词库文件不存在!");
                return;
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING));
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (word.isEmpty()) {
                    continue;
                }
                sensitiveWordSet.add(word);
                for (int i = 0; i < word.length(); i++) {
                    sensitiveCharSet.add(word.charAt(i));
                }
            }
        } catch (UnsupportedEncodingException e) {
            LOG.error("敏感词库文件转码失败!", e);
        } catch (IOException e) {
            LOG.error("敏感词库文件读取失败!", e);
        }
    }

    /**
     * Finds every occurrence of a sensitive word in the given text.
     *
     * <p>All matches are reported, including overlapping ones, in order of start
     * position and then length.
     *
     * @param text the text to scan; {@code null} or empty yields an empty result
     * @return the matched words in the order they were found, never {@code null}
     */
    private static List<String> checkSensitiveWord(String text) {
        List<String> sensitiveWords = new ArrayList<>();
        if (text == null || text.isEmpty()) {
            return sensitiveWords;
        }
        if (sensitiveWordSet == null || sensitiveCharSet == null) {
            init();
        }
        for (int start = 0; start < text.length(); start++) {
            for (int pos = start; pos < text.length(); pos++) {
                // Test the character at the current end position (not the one at
                // start): the first clean character ends the run, and no longer
                // substring from this start can be a sensitive word.
                if (!sensitiveCharSet.contains(text.charAt(pos))) {
                    break;
                }
                String key = text.substring(start, pos + 1);
                if (sensitiveWordSet.contains(key)) {
                    sensitiveWords.add(key);
                }
            }
        }
        return sensitiveWords;
    }
}


注:以上代码实现仅用于展示思路,在实际使用中还有很多地方可以优化。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: