您的位置:首页 > 编程语言 > Java开发

java实现正向最大匹配分词

2017-03-05 17:38 633 查看
1、下载mmseg4j-1.8.5分词器,取其中的words.dic词典,放到项目classpath下的MyMM目录中(下面的代码通过资源路径MyMM/words.dic加载该词典)
下载地址

package com.yj.nlp_common.seg.MyMMSeg;

import java.util.HashMap;

/**
* 构建内存词典的Trie树结点
*
*/
public class TrieNode {
/** 结点关键字,其值为中文词中的一个字 */
public char key = '\0';

/** 如果该字在词语的末尾,则bound=true */
public boolean bound = false;

/** 指向下一个结点的指针结构,用来存放当前字在词中的下一个字的位置 */
public HashMap<Character, TrieNode> childs = new HashMap<Character, TrieNode>();

public TrieNode() {}

public TrieNode(char key) {
this.key = key;
}
}
package com.yj.nlp_common.seg.MyMMSeg;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;

/**
 * In-memory word dictionary stored as a character trie and exposed as a
 * lazily created singleton.
 *
 * <p>The dictionary file is read as one word per line. Each word is inserted
 * character by character below {@link #getRoot()}, and the node of its last
 * character is flagged with {@code bound = true}.
 *
 * <p>Instance creation is not synchronized.
 */
public class TrieDictionary {

    /** Classpath location of the bundled dictionary used when no file is named. */
    private static final String DEFAULT_RESOURCE = "MyMM//words.dic";

    /** Encoding used to decode the dictionary file (assumes the mmseg4j words.dic is UTF-8). */
    private static final String DICTIONARY_ENCODING = "UTF-8";

    private static TrieDictionary trieDictionary = null;

    /** Root of the trie; it carries no character itself and is never {@code null}. */
    private final TrieNode root = new TrieNode();

    /**
     * Returns the root node of the trie.
     *
     * @return the root node; its children are the first characters of all loaded words
     */
    public TrieNode getRoot() {
        return root;
    }

    /**
     * Returns the shared dictionary, loading it from the given file on first use.
     * Once an instance exists, the argument is not consulted again.
     *
     * @param dictionaryName path of the dictionary file; {@code null} or empty
     *                       selects the bundled classpath dictionary
     * @return the shared dictionary instance
     */
    public static TrieDictionary getInstance(String dictionaryName) {
        if (trieDictionary == null) {
            trieDictionary = new TrieDictionary(dictionaryName);
        }
        return trieDictionary;
    }

    /**
     * Returns the shared dictionary, loading the bundled classpath dictionary on first use.
     *
     * @return the shared dictionary instance
     * @throws IllegalStateException if the bundled dictionary is not on the classpath
     */
    public static TrieDictionary getInstance() {
        if (trieDictionary == null) {
            trieDictionary = new TrieDictionary(defaultDictionaryPath());
        }
        return trieDictionary;
    }

    /**
     * Builds the trie from the named dictionary file.
     *
     * <p>If the file cannot be read, the failure is reported on standard error
     * and the dictionary is left empty rather than leaving the root unset.
     *
     * @param dictionaryName path of the dictionary file; {@code null} or empty
     *                       selects the bundled classpath dictionary
     */
    private TrieDictionary(String dictionaryName) {
        String filePath = (dictionaryName == null || dictionaryName.isEmpty())
                ? defaultDictionaryPath()
                : dictionaryName;
        try {
            // Decode explicitly instead of relying on the platform default charset.
            List<String> words = FileUtils.readLines(new File(filePath), DICTIONARY_ENCODING);
            for (String word : words) {
                addWord(word);
            }
        } catch (IOException e) {
            System.err.println("Failed to load dictionary from " + filePath
                    + "; continuing with an empty dictionary");
            e.printStackTrace();
        }
    }

    /**
     * Resolves the file-system path of the bundled dictionary resource.
     *
     * @return the path component of the resource URL
     * @throws IllegalStateException if the resource is not on the classpath
     */
    private static String defaultDictionaryPath() {
        java.net.URL resource = TrieDictionary.class.getClassLoader().getResource(DEFAULT_RESOURCE);
        if (resource == null) {
            throw new IllegalStateException(
                    "Dictionary resource not found on classpath: " + DEFAULT_RESOURCE);
        }
        return resource.getPath();
    }

    /**
     * Inserts one word into the trie, reusing nodes of any shared prefix.
     *
     * @param word the word to insert; an empty string is ignored
     */
    private void addWord(String word) {
        TrieNode current = root;
        for (int i = 0; i < word.length(); ++i) {
            char c = word.charAt(i);
            TrieNode next = current.childs.get(c);
            if (next == null) {
                next = new TrieNode(c);
                current.childs.put(c, next);
            }
            current = next;
        }
        // Flag the final node even when it already existed, so a word that is a
        // prefix of a previously inserted longer word is still marked as a word end.
        if (current != root) {
            current.bound = true;
        }
    }

}



package com.yj.nlp_common.seg.MyMMSeg;

/**
 * Character classification helpers: every character is exactly one of
 * separator, Chinese, or "other".
 */
public class CharacterType {

    /**
     * All characters treated as separators: the escaped CJK punctuation marks,
     * ASCII punctuation and brackets, quotes, newline, carriage return, tab and space.
     */
    private static final String SEPARATORS = "\u3002\uFF01\uFF1F\uFF1A\uFF1B\u3001\uFF0C\uFF08\uFF09\u300A\u300B\u3010\u3011{}\u201C\u201D\u2018\u2019!?:;,()<>[]{}\"'\n\r\t ";

    /** First code unit of the range accepted as Chinese. */
    private static final char CHINESE_FIRST = '\u4E00';

    /** Last code unit of the range accepted as Chinese. */
    private static final char CHINESE_LAST = '\u9FBF';

    /**
     * Tells whether the character is a separator.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} occurs in the separator set
     */
    public static boolean isCharSeperator(char c) {
        return SEPARATORS.indexOf(c) >= 0;
    }

    /**
     * Tells whether the character is Chinese.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} lies within the accepted Chinese range, bounds included
     */
    public static boolean isCharChinese(char c) {
        if (c < CHINESE_FIRST) {
            return false;
        }
        return c <= CHINESE_LAST;
    }

    /**
     * Tells whether the character is neither a separator nor Chinese.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} is not a separator and not Chinese
     */
    public static boolean isCharOther(char c) {
        return !(isCharSeperator(c) || isCharChinese(c));
    }
}
package com.yj.nlp_common.seg.MyMMSeg;

import java.io.IOException;
/**
 * Word segmenter that splits a sentence into tokens by walking the dictionary
 * trie from left to right and writing a {@code '|'} after every token.
 */
public class MMSegmenter {
    /** Shared dictionary, loaded once when this class is initialized. */
    public static TrieDictionary dict = null;

    static { // load the dictionary
        dict = TrieDictionary.getInstance();
    }

    /**
     * Segments a sentence.
     *
     * <p>Three kinds of input are handled:
     * <ul>
     *   <li>A run of Chinese characters is matched against the trie; the token
     *       extends for as long as the trie has a child for the next Chinese
     *       character. A Chinese character with no entry directly under the
     *       root becomes a one-character token.</li>
     *   <li>A run of "other" characters (neither Chinese nor separator) becomes
     *       one token.</li>
     *   <li>Separator characters are dropped.</li>
     * </ul>
     *
     * <p>NOTE(review): matching follows the trie path only and never reads
     * {@code TrieNode.bound}, so a token may be a prefix of a dictionary word
     * rather than a complete word.
     *
     * @param sentence the text to segment; {@code null} is treated as empty
     * @return the tokens in order, each followed by {@code '|'}; empty for empty input
     * @throws IllegalStateException if Chinese text is encountered while the
     *                               dictionary has no root node
     */
    public String segment(String sentence) {
        if (sentence == null || sentence.isEmpty()) {
            return "";
        }

        final TrieNode root = dict.getRoot();
        final int length = sentence.length();
        final StringBuilder segBuffer = new StringBuilder(length * 2);

        int i = 0;
        while (i < length) {
            char c = sentence.charAt(i);
            int end;
            if (CharacterType.isCharChinese(c)) {
                end = scanChineseToken(sentence, i, root);
            } else if (CharacterType.isCharOther(c)) {
                end = scanOtherToken(sentence, i);
            } else {
                // Separator: emits nothing, so consecutive separators collapse.
                ++i;
                continue;
            }
            segBuffer.append(sentence, i, end).append('|'); // token plus segmentation mark
            i = end;
        }

        return segBuffer.toString();
    }

    /**
     * Finds the end of the Chinese token that starts at {@code start}.
     *
     * @return the exclusive end index; always at least {@code start + 1}
     */
    private static int scanChineseToken(String sentence, int start, TrieNode root) {
        if (root == null) {
            throw new IllegalStateException("Dictionary is not loaded: trie root is null");
        }
        int end = start + 1; // the first character is always consumed
        TrieNode node = root.childs.get(sentence.charAt(start));
        if (node == null) {
            return end; // character has no entry under the root
        }
        final int length = sentence.length();
        while (end < length) {
            char c = sentence.charAt(end);
            if (!CharacterType.isCharChinese(c)) {
                break;
            }
            node = node.childs.get(c);
            if (node == null) {
                break; // trie path ends here; this character starts the next token
            }
            ++end;
        }
        return end;
    }

    /**
     * Finds the end of the run of "other" characters that starts at {@code start}.
     *
     * @return the exclusive end index; always at least {@code start + 1}
     */
    private static int scanOtherToken(String sentence, int start) {
        final int length = sentence.length();
        int end = start + 1;
        while (end < length && CharacterType.isCharOther(sentence.charAt(end))) {
            ++end;
        }
        return end;
    }

    /**
     * Demo entry point: segments a sample sentence and prints the result.
     *
     * @param args unused
     */
    public static void main(String[] args) throws IOException {
        MMSegmenter mmsegger = new MMSegmenter();
        System.out.println(mmsegger.segment("中华人民共和国是一个伟大的国家hello中国人,,,我是中国人"));
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: