您的位置:首页 > 编程语言

敏感词过滤代码

2016-07-12 11:39 309 查看
/**
 * A single keyword fragment together with the {@link UnionPattern} it belongs to.
 */
public class AtomicPattern {

    /** The text fragment this atom represents. */
    private Pattern pattern;

    /** The union pattern this atom is part of; read directly by other classes in this file. */
    public UnionPattern belongUnionPattern;

    /**
     * Creates an atom wrapping the given pattern.
     *
     * @param pattern the text fragment to wrap
     */
    AtomicPattern(Pattern pattern) {
        this.pattern = pattern;
    }

    /**
     * Tells whether {@code str} ends with this atom's text, ignoring letter case.
     *
     * @param str the text whose tail is compared with the pattern
     * @return {@code true} when the last characters of {@code str} equal the pattern text
     *         (case-insensitively); {@code false} when they differ or {@code str} is shorter
     *         than the pattern
     */
    public boolean findMatchInString(String str) {
        String needle = this.pattern.str;
        // Offset at which the tail of str has exactly the pattern's length.
        int tailStart = str.length() - needle.length();
        return tailStart >= 0 && needle.equalsIgnoreCase(str.substring(tailStart));
    }

    /** @return the union pattern this atom belongs to */
    public UnionPattern getBelongUnionPattern() {
        return this.belongUnionPattern;
    }

    /** @param belongUnionPattern the union pattern this atom belongs to */
    public void setBelongUnionPattern(UnionPattern belongUnionPattern) {
        this.belongUnionPattern = belongUnionPattern;
    }

    /** @return the wrapped pattern */
    public Pattern getPattern() {
        return this.pattern;
    }

    /** @param pattern the pattern to wrap from now on */
    public void setPattern(Pattern pattern) {
        this.pattern = pattern;
    }

}

/**
 * Multi-keyword scanner: keywords are registered with {@link #addFilterKeyWord}, then
 * {@link #parse} reports the level of every keyword found in a piece of text.
 *
 * <p>A keyword may consist of several space-separated fragments; each fragment becomes an
 * {@link AtomicPattern} and the fragments together form one {@link UnionPattern}. Scanning uses
 * two tables indexed by character: {@code shiftTable} tells how far the scan position may
 * advance, and {@code hashTable} lists the fragments whose last character is the index.
 *
 * <p>Fragments are compared case-insensitively (see {@link AtomicPattern#findMatchInString}), so
 * both tables are populated for the upper- and lower-case variants of each indexed character.
 */
public class MutiPatternParser {

    /** One table slot per possible {@code char} value (0..65535). */
    private static final int TABLE_SIZE = Character.MAX_VALUE + 1;

    /** True once the lookup tables have been built; no keywords can be added afterwards. */
    private boolean initFlag = false;

    /** For each character, how far the scan position may advance when that character is seen. */
    private final int[] shiftTable = new int[TABLE_SIZE];

    /** For each character, the fragments ending in it. Entries are {@code null} until the first parse. */
    @SuppressWarnings("unchecked") // generic array creation; only Vector<AtomicPattern> is ever stored
    public Vector<AtomicPattern> hashTable[] = new Vector[TABLE_SIZE];

    /** Keywords registered so far. */
    private UnionPatternSet tmpUnionPatternSet = new UnionPatternSet();

    /** Length of the longest registered fragment; bounds the text window handed to the matcher. */
    private int maxPatternLength = 0;

    /**
     * Registers a keyword. Fragments are separated by single spaces; empty fragments (caused by
     * leading, trailing or repeated spaces) are ignored because they could never match and would
     * make the whole keyword unmatchable.
     *
     * @param keyWord the keyword, optionally made of several space-separated fragments
     * @param level   the level reported by {@link #parse} when this keyword is found
     * @return {@code true} if the keyword was registered; {@code false} if the tables have
     *         already been built, or if {@code keyWord} is {@code null} or has no non-empty
     *         fragment
     */
    public boolean addFilterKeyWord(String keyWord, int level) {
        if (initFlag || keyWord == null) {
            return false;
        }
        UnionPattern unionPattern = new UnionPattern();
        unionPattern.setLevel(level);
        for (String fragment : keyWord.split(" ")) {
            if (fragment.isEmpty()) {
                continue;
            }
            AtomicPattern atomicPattern = new AtomicPattern(new Pattern(fragment));
            atomicPattern.setBelongUnionPattern(unionPattern);
            unionPattern.addNewAtomicPattrn(atomicPattern);
        }
        if (unionPattern.getSet().isEmpty()) {
            return false;
        }
        tmpUnionPatternSet.addNewUnionPattrn(unionPattern);
        return true;
    }

    /**
     * Tells whether a character takes part in matching: ASCII digits and letters, and the CJK
     * Unified Ideographs block U+4E00..U+9FFF. Everything else is dropped before scanning.
     */
    private boolean isValidChar(char ch) {
        if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) {
            return true;
        }
        // The previous upper bound 0x952f excluded common ideographs located above it.
        return ch >= 0x4e00 && ch <= 0x9fff;
    }

    /**
     * Scans {@code content} for registered keywords and appends the level of each keyword found
     * to {@code levelSet}. The lookup tables are built on the first call.
     *
     * @param content  the text to scan; a {@code null} text is returned as is without scanning
     * @param levelSet receives one entry per keyword occurrence detected
     * @return {@code content}, unmodified
     */
    public String parse(String content, Vector<Integer> levelSet) {
        if (content == null) {
            return null;
        }
        if (!initFlag) {
            init();
        }
        String preContent = preConvert(content);
        Vector<AtomicPattern> aps = new Vector<AtomicPattern>();
        int i = 0;
        while (i < preContent.length()) {
            char checkChar = preContent.charAt(i);
            int shift = shiftTable[checkChar];
            if (shift == 0) {
                // Some fragment ends with this character. Only the last maxPatternLength
                // characters can take part in a match, so hand over just that window
                // instead of copying the whole prefix every time.
                int end = i + 1;
                int start = Math.max(0, end - maxPatternLength);
                collectMatches(preContent.substring(start, end), hashTable[checkChar], aps);
                i++;
            } else {
                i += shift;
            }
        }
        parseAtomicPatternSet(aps, levelSet);
        return content;
    }

    /**
     * Consumes the matched fragments front to back; whenever all fragments of the first
     * fragment's keyword are still present in the remaining list, that keyword's level is
     * recorded. The list is empty afterwards.
     */
    private void parseAtomicPatternSet(Vector<AtomicPattern> aps, Vector<Integer> levelSet) {
        while (!aps.isEmpty()) {
            UnionPattern up = aps.get(0).belongUnionPattern;
            if (up != null && up.isIncludeAllAp(aps)) {
                levelSet.add(Integer.valueOf(up.getLevel()));
            }
            aps.remove(0);
        }
    }

    /** Appends to {@code matches} every candidate whose text is a suffix of {@code src}. */
    private void collectMatches(String src, Vector<AtomicPattern> candidates,
            Vector<AtomicPattern> matches) {
        for (AtomicPattern ap : candidates) {
            if (ap.findMatchInString(src)) {
                matches.add(ap);
            }
        }
    }

    /** Returns {@code content} with every character rejected by {@link #isValidChar} removed. */
    private String preConvert(String content) {
        StringBuilder kept = new StringBuilder(content.length());
        for (int i = 0; i < content.length(); i++) {
            char ch = content.charAt(i);
            if (isValidChar(ch)) {
                kept.append(ch);
            }
        }
        return kept.toString();
    }

    /** Builds the shift table and the hash table from the registered keywords. */
    private void init() {
        initFlag = true;
        for (int i = 0; i < TABLE_SIZE; i++) {
            hashTable[i] = new Vector<AtomicPattern>();
        }

        Vector<AtomicPattern> atoms = nonEmptyAtoms();
        int longest = 0;
        boolean hasSingleCharFragment = false;
        for (AtomicPattern ap : atoms) {
            int length = ap.getPattern().getStr().length();
            longest = Math.max(longest, length);
            if (length == 1) {
                hasSingleCharFragment = true;
            }
        }
        maxPatternLength = longest;

        // Skipping two positions is only safe when every fragment has at least two characters;
        // a one-character fragment may end at the very next position.
        shiftTableInit(atoms, hasSingleCharFragment ? 1 : 2);
        hashTableInit(atoms);
    }

    /**
     * Forgets all registered keywords and allows new ones to be added; the tables are rebuilt
     * on the next {@link #parse}.
     */
    public void clear() {
        tmpUnionPatternSet.clear();
        initFlag = false;
    }

    /** Collects every registered fragment that has a non-empty text. */
    private Vector<AtomicPattern> nonEmptyAtoms() {
        Vector<AtomicPattern> atoms = new Vector<AtomicPattern>();
        for (UnionPattern up : tmpUnionPatternSet.getSet()) {
            for (AtomicPattern ap : up.getSet()) {
                String text = ap.getPattern().getStr();
                if (text != null && !text.isEmpty()) {
                    atoms.add(ap);
                }
            }
        }
        return atoms;
    }

    /**
     * Fills the shift table: 0 for characters that end some fragment, 1 for characters that are
     * second-to-last in some fragment, {@code defaultShift} for all others.
     */
    private void shiftTableInit(Vector<AtomicPattern> atoms, int defaultShift) {
        for (int i = 0; i < TABLE_SIZE; i++) {
            shiftTable[i] = defaultShift;
        }
        for (AtomicPattern ap : atoms) {
            Pattern pattern = ap.getPattern();
            if (pattern.getStr().length() >= 2) {
                lowerShift(pattern.charAtEnd(1), 1);
            }
            lowerShift(pattern.charAtEnd(0), 0);
        }
    }

    /** Lowers the shift of {@code ch} and of its upper/lower-case variants to at most {@code shift}. */
    private void lowerShift(char ch, int shift) {
        char upper = Character.toUpperCase(ch);
        char lower = Character.toLowerCase(ch);
        shiftTable[ch] = Math.min(shiftTable[ch], shift);
        shiftTable[upper] = Math.min(shiftTable[upper], shift);
        shiftTable[lower] = Math.min(shiftTable[lower], shift);
    }

    /**
     * Files every fragment under its last character and under that character's upper/lower-case
     * variants, each bucket receiving the fragment at most once.
     */
    private void hashTableInit(Vector<AtomicPattern> atoms) {
        for (AtomicPattern ap : atoms) {
            char last = ap.getPattern().charAtEnd(0);
            char upper = Character.toUpperCase(last);
            char lower = Character.toLowerCase(last);
            hashTable[last].add(ap);
            if (upper != last) {
                hashTable[upper].add(ap);
            }
            if (lower != last && lower != upper) {
                hashTable[lower].add(ap);
            }
        }
    }

}

/**
 * Thin wrapper around a keyword fragment's text with access to characters counted from the end.
 */
public class Pattern {

    /** The fragment text; read directly by other classes in this file. */
    public String str;

    /**
     * Creates a pattern for the given text.
     *
     * @param str the fragment text
     */
    Pattern(String str) {
        this.str = str;
    }

    /**
     * Returns the character located {@code index} positions before the last one.
     *
     * @param index distance from the end; 0 denotes the last character
     * @return the character at that position, or the NUL character (0) when the text has
     *         {@code index} characters or fewer
     */
    public char charAtEnd(int index) {
        int length = str.length();
        if (index >= length) {
            return 0;
        }
        return str.charAt(length - 1 - index);
    }

    /** @return the fragment text */
    public String getStr() {
        return this.str;
    }

}

/**
 * Holder for a list of {@link AtomicPattern} objects.
 */
public class SameAtomicPatternSet {

    /** The held atoms; starts out empty. */
    public Vector<AtomicPattern> SAPS = new Vector<AtomicPattern>();

    /** Creates a holder with an empty list. */
    SameAtomicPatternSet() {
    }

}

public class TxtReader {
public TxtReader() {
super();
}

public static BufferedReader keywordReader(String fileName) {
File file = new File(fileName);
BufferedReader br = null;
try {
FileInputStream in = new FileInputStream(file);
InputStreamReader inReader = new InputStreamReader(in, "UTF-8");

br = new BufferedReader(inReader);

} catch (FileNotFoundException e) {
System.out.println("你想加载的文件没有找到!!!");
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
System.out.println("你指定的编码类型不支持哦!!!");
e.printStackTrace();
}
return br;

}

}

/**
 * A keyword made of one or more {@link AtomicPattern} fragments, tagged with a level.
 */
public class UnionPattern {

    /** The fragments that make up this keyword; read directly by other classes in this file. */
    public Vector<AtomicPattern> apSet;

    /** Level associated with this keyword. */
    private int level;

    /** Creates a keyword with no fragments yet. */
    UnionPattern() {
        this.apSet = new Vector<AtomicPattern>();
    }

    /** Appends a fragment to this keyword. */
    public void addNewAtomicPattrn(AtomicPattern ap) {
        this.apSet.add(ap);
    }

    /** @return the live list of fragments of this keyword */
    public Vector<AtomicPattern> getSet() {
        return this.apSet;
    }

    /**
     * Tells whether every fragment of this keyword has a counterpart in {@code inAps}.
     * Fragments are compared by text, ignoring letter case.
     *
     * @param inAps the fragments to search in
     * @return {@code true} when all fragments are found; {@code false} when at least one is
     *         missing or {@code inAps} holds fewer entries than this keyword has fragments
     */
    public boolean isIncludeAllAp(Vector<AtomicPattern> inAps) {
        // Fewer candidates than fragments: fail fast without comparing.
        if (inAps.size() < apSet.size()) {
            return false;
        }
        for (AtomicPattern required : apSet) {
            if (!isInAps(required, inAps)) {
                return false;
            }
        }
        return true;
    }

    /** Tells whether {@code inAps} holds a fragment whose text equals that of {@code ap}, ignoring case. */
    private boolean isInAps(AtomicPattern ap, Vector<AtomicPattern> inAps) {
        for (AtomicPattern candidate : inAps) {
            String wanted = ap.getPattern().str;
            if (wanted.equalsIgnoreCase(candidate.getPattern().str)) {
                return true;
            }
        }
        return false;
    }

    /** @param level the level to associate with this keyword */
    public void setLevel(int level) {
        this.level = level;
    }

    /** @return the level associated with this keyword */
    public int getLevel() {
        return this.level;
    }

}

/**
 * Ordered collection of {@link UnionPattern} keywords.
 */
public class UnionPatternSet {

    /** The stored keywords, in insertion order. */
    public Vector<UnionPattern> unionPatternSet;

    /** Creates an empty collection. */
    UnionPatternSet() {
        this.unionPatternSet = new Vector<UnionPattern>();
    }

    /** Appends a keyword to the collection. */
    public void addNewUnionPattrn(UnionPattern up) {
        unionPatternSet.add(up);
    }

    /** @return the live list of stored keywords */
    public Vector<UnionPattern> getSet() {
        return this.unionPatternSet;
    }

    /** Removes every stored keyword. */
    public void clear() {
        this.unionPatternSet.clear();
    }

}

public class FilterTest {
public static void main(String args[]) {
MutiPatternParser filterEngine = new MutiPatternParser();
BufferedReader brKeyword = TxtReader.keywordReader("D://file/illegalkeyword.txt");//关键字的文件,文件太肮脏了,这里就不上传了
BufferedReader brArticle = TxtReader.keywordReader("D://file/article.txt");//待验证的文章
String keyword = null;
String article = null;
StringBuffer buffer = new StringBuffer();
Vector<Integer> levelSet = new Vector<Integer>();
try {
while ((keyword = brKeyword.readLine()) != null) {
filterEngine.addFilterKeyWord(keyword, 1);
}
while ((article = brArticle.readLine()) != null) {
buffer.append(article);
}
} catch (IOException e) {
System.out.println("读取文件IO异常!!!");
e.printStackTrace();
}

String content = filterEngine.parse(buffer.toString(), levelSet);

levelSet.clear();
filterEngine.parse(content, levelSet);
System.out.println("有违法字符" + levelSet.size()+"处

levelSet.clear();
}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  敏感词过滤