您的位置:首页 > 编程语言 > Java开发

一种中文文本的快速分词方法(二)

2014-02-05 16:39 357 查看
package org.zhukovasky.chineseSeg;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Reader;

import org.zhukovasky.HashBinaryClass.HashBinaryContainer;
import org.zhukovasky.HashBinaryClass.Maps;
import org.zhukovasky.fileutil.WordCount;
import org.zhukovasky.fileutil.WordDictUtil;
import org.zhukovasky.invertedindex.MapWords;

/**
* 以下是中文文本的分词工具,
* 文本的编码为UTF-8
* @author zhukovasky
* @version 1.0
* @since 2013.12
* @email zhukovasky@163.com
* */
public class chineseSeg {
/**
* 以下方法为对中文文本的分词写入到倒排索引中
* @param afterprocess经过预处理后的文本
* @param invertedIndex 存放倒排索引的地址
* @param 字典所在的地址
*
* */
public final static int MAXLENGTH=10;
public static void FileSeg(File afterprocess,File invertedIndex,File dict){
MapWords mapwords=new MapWords();
Reader r=null;
BufferedReader bf=null;
ObjectOutputStream oos=null;
OutputStream output=null;
String Line=null;
Maps map=WordCount.getDict(dict);
int i=0;
try {
r=new FileReader(afterprocess);
bf=new BufferedReader(r);
Line=bf.readLine();
int Kase=0;
if(Line.length()<=MAXLENGTH+1){
Kase=1;
}else{
Kase=2;
}
switch(Kase){
case 1:{
while(i<=Line.length()-1){
String str=null;
String str1=null;
String str2=null;
str=Line.substring(0);
str1=Line.substring(0, 1);
str2=Line.substring(1, 2);
int seek=0;
if(map.isCwordExist(str1)){
if(map.getHBC(str1).isSecondWordExist(str2)){
HashBinaryContainer hbc=map.getHBC(str1);
String[] temp=hbc.getMatchArray(str2);
String[] maxletemp=WordDictUtil.getStringLengthArray(temp);
if(maxletemp[0].length()==1){
String segword=str1+str2;
seek=2;
mapwords.addNewNodeElement(segword, afterprocess.getName(), i);
i=i+2;
}else{
int length=maxletemp[0].length();
String str3=str.substring(1, length+1);
String segword=str1+str3;
if(WordDictUtil.isWordMatched(str3, maxletemp)){
mapwords.addNewNodeElement(segword, afterprocess.getName(), i);
seek=segword.length();
i=i+seek;
}else{
i=i+2;
segword=str1+str2;
mapwords.addNewNodeElement(segword, afterprocess.getName(), i);
}
}
}else{
mapwords.addNewNodeElement(str1, afterprocess.getName(), i);
i++;
}
}else{
mapwords.addNewNodeElement(str1, afterprocess.getName(), i);
i++;
}
}
};
break;
case 2:{
while(i<=Line.length()-1){
String str=null;
String str1=null;
String str2=null;
if(i>=Line.length()-1-MAXLENGTH&&Line.length()-1-MAXLENGTH>0){
str=Line.substring(i);
if(i>=Line.length()-1&&Line.length()-1>0){
if(i>Line.length()){
break;
}
str1=Line.substring(i);
str2=null;
break;
}else{
str1=str.substring(0,1);
str2=str.substring(1,2);
int seek=0;
if(map.isCwordExist(str1)){
if(map.getHBC(str1).isSecondWordExist(str2)){
HashBinaryContainer hbc=map.getHBC(str1);
String[] temp=hbc.getMatchArray(str2);
String[] MaxLeTemp=WordDictUtil.getStringLengthArray(temp);
if(MaxLeTemp[0].length()==1){
String segword=str1+str2;
seek=2;
mapwords.addNewNodeElement(segword, afterprocess.getName(), i);
i=i+2;
}else{
int length=MaxLeTemp[0].length();
if(str.length()<length){
break;
}
String str3=str.substring(1, length+1);

String segword=str1+str3;
if(WordDictUtil.isWordMatched(str3, MaxLeTemp)){
mapwords.addNewNodeElement(segword, afterprocess.getName(), i);
seek=segword.length();
i=i+seek;
}else{
i=i+2;
segword=str1+str2;
mapwords.addNewNodeElement(segword, afterprocess.getName(), i);
}
}
}else{
mapwords.addNewNodeElement(str1, afterprocess.getName(), i);
seek=1;
i=i+seek;
}
}else{
mapwords.addNewNodeElement(str1, afterprocess.getName(), i);
seek=1;
i=i+seek;
}
}
}else{
str=Line.substring(i, i+MAXLENGTH);
str1=str.substring(0, 1);
str2=str.substring(1, 2);
int seek=0;
if(map.isCwordExist(str1)){
if(map.getHBC(str1).isSecondWordExist(str2)){
HashBinaryContainer hbc=map.getHBC(str1);
String[] temp=hbc.getMatchArray(str2);
String[] MaxLeTemp=WordDictUtil.getStringLengthArray(temp);
if(MaxLeTemp[0].length()==1){
String segword=str1+str2;
seek=2;
mapwords.addNewNodeElement(segword, afterprocess.getName(), i);
i=i+2;
}else{
int length=MaxLeTemp[0].length();
String str3=str.substring(1, length+1);
String segword=str1+str3;
if(WordDictUtil.isWordMatched(str3, MaxLeTemp)){
mapwords.addNewNodeElement(segword, afterprocess.getName(), i);
seek=segword.length();
i=i+seek;
}else{
i=i+2;
segword=str1+str2;
mapwords.addNewNodeElement(segword, afterprocess.getName(), i);
}
}
}else{
mapwords.addNewNodeElement(str1, afterprocess.getName(), i);
seek=1;
i=i+seek;
}
}else{
mapwords.addNewNodeElement(str1, afterprocess.getName(), i);
seek=1;
i=i+seek;
}
}
}
};
break;
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}finally{
try {
r.close();
bf.close();
} catch (IOException e) {
e.printStackTrace();
}
}

try {
output=new FileOutputStream(invertedIndex);
oos=new ObjectOutputStream(output);
oos.writeObject(mapwords);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {

e.printStackTrace();
}finally{
try {
oos.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* 以下方法为对中文文本的分词写入到倒排索引中
* @param afterprocess[k]经过预处理后的文本
* @param invertedIndex 存放倒排索引的地址
* @param 字典所在的地址
* */
public static void FileArraysSeg(File[] afterprocess,File invertedIndex,File dict){
MapWords mapwords=new MapWords();
Reader r=null;
BufferedReader bf=null;
ObjectOutputStream oos=null;
OutputStream output=null;
String Line=null;
Maps map=WordCount.getDict(dict);
int i=0;
int MAXLENGTH=9;			//取决于词典中最大长度词条
for(int k=0;k<afterprocess.length;k++){
try {
r=new FileReader(afterprocess[k]);
bf=new BufferedReader(r);
Line=bf.readLine();
int Kase=0;
if(Line.length()<=MAXLENGTH+1){
Kase=1;
}else{
Kase=2;
}
switch(Kase){
case 1:{
while(i<=Line.length()-1){
String str=null;
String str1=null;
String str2=null;
str=Line.substring(0);
str1=Line.substring(0, 1);
str2=Line.substring(1, 2);
int seek=0;
if(map.isCwordExist(str1)){
if(map.getHBC(str1).isSecondWordExist(str2)){
HashBinaryContainer hbc=map.getHBC(str1);
String[] temp=hbc.getMatchArray(str2);
String[] maxletemp=WordDictUtil.getStringLengthArray(temp);
if(maxletemp[0].length()==1){
String segword=str1+str2;
seek=2;
mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i);
i=i+2;
}else{
int length=maxletemp[0].length();
String str3=str.substring(1, length+1);
String segword=str1+str3;
if(WordDictUtil.isWordMatched(str3, maxletemp)){
mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i);
seek=segword.length();
i=i+seek;
}else{
i=i+2;
segword=str1+str2;
mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i);
}
}
}else{
mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i);
i++;
}
}else{
mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i);
i++;
}
}
};
break;
case 2:{
while(i<=Line.length()-1){
String str=null;
String str1=null;
String str2=null;
if(i>=Line.length()-1-MAXLENGTH&&Line.length()-1-MAXLENGTH>0){
str=Line.substring(i);
if(i>=Line.length()-1&&Line.length()-1>0){
if(i>Line.length()){
break;
}
str1=Line.substring(i);
str2=null;
break;
}else{
str1=str.substring(0,1);
str2=str.substring(1,2);
int seek=0;
if(map.isCwordExist(str1)){
if(map.getHBC(str1).isSecondWordExist(str2)){
HashBinaryContainer hbc=map.getHBC(str1);
String[] temp=hbc.getMatchArray(str2);
String[] MaxLeTemp=WordDictUtil.getStringLengthArray(temp);
if(MaxLeTemp[0].length()==1){
String segword=str1+str2;
seek=2;
mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i);
i=i+2;
}else{
int length=MaxLeTemp[0].length();
if(str.length()<length){
break;
}
String str3=str.substring(1, length+1);

String segword=str1+str3;
if(WordDictUtil.isWordMatched(str3,
ad0e
MaxLeTemp)){
mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i);
seek=segword.length();
i=i+seek;
}else{
i=i+2;
segword=str1+str2;
mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i);
}
}
}else{
mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i);
seek=1;
i=i+seek;
}
}else{
mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i);
seek=1;
i=i+seek;
}
}
}else{
str=Line.substring(i, i+MAXLENGTH);
str1=str.substring(0, 1);
str2=str.substring(1, 2);
int seek=0;
if(map.isCwordExist(str1)){
if(map.getHBC(str1).isSecondWordExist(str2)){
HashBinaryContainer hbc=map.getHBC(str1);
String[] temp=hbc.getMatchArray(str2);
String[] MaxLeTemp=WordDictUtil.getStringLengthArray(temp);
if(MaxLeTemp[0].length()==1){
String segword=str1+str2;
seek=2;
mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i);
i=i+2;
}else{
int length=MaxLeTemp[0].length();
String str3=str.substring(1, length+1);
String segword=str1+str3;
if(WordDictUtil.isWordMatched(str3, MaxLeTemp)){
mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i);
seek=segword.length();
i=i+seek;
}else{
i=i+2;
segword=str1+str2;
mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i);
}
}
}else{
mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i);
seek=1;
i=i+seek;
}
}else{
mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i);
seek=1;
i=i+seek;
}
}
}
};
break;
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}finally{
try {
r.close();
bf.close();
} catch (IOException e) {
e.printStackTrace();
}
}

}
try {
output=new FileOutputStream(invertedIndex);
oos=new ObjectOutputStream(output);
oos.writeObject(mapwords);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {

e.printStackTrace();
}finally{
try {
oos.close();
} catch (IOException e) {
e.printStackTrace();
}
}

}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  java map 索引 string