一种中文文本的快速分词方法(二)
2014-02-05 16:39
357 查看
package org.zhukovasky.chineseSeg; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.ObjectOutputStream; import java.io.OutputStream; import java.io.Reader; import org.zhukovasky.HashBinaryClass.HashBinaryContainer; import org.zhukovasky.HashBinaryClass.Maps; import org.zhukovasky.fileutil.WordCount; import org.zhukovasky.fileutil.WordDictUtil; import org.zhukovasky.invertedindex.MapWords; /** * 以下是中文文本的分词工具, * 文本的编码为UTF-8 * @author zhukovasky * @version 1.0 * @since 2013.12 * @email zhukovasky@163.com * */ public class chineseSeg { /** * 以下方法为对中文文本的分词写入到倒排索引中 * @param afterprocess经过预处理后的文本 * @param invertedIndex 存放倒排索引的地址 * @param 字典所在的地址 * * */ public final static int MAXLENGTH=10; public static void FileSeg(File afterprocess,File invertedIndex,File dict){ MapWords mapwords=new MapWords(); Reader r=null; BufferedReader bf=null; ObjectOutputStream oos=null; OutputStream output=null; String Line=null; Maps map=WordCount.getDict(dict); int i=0; try { r=new FileReader(afterprocess); bf=new BufferedReader(r); Line=bf.readLine(); int Kase=0; if(Line.length()<=MAXLENGTH+1){ Kase=1; }else{ Kase=2; } switch(Kase){ case 1:{ while(i<=Line.length()-1){ String str=null; String str1=null; String str2=null; str=Line.substring(0); str1=Line.substring(0, 1); str2=Line.substring(1, 2); int seek=0; if(map.isCwordExist(str1)){ if(map.getHBC(str1).isSecondWordExist(str2)){ HashBinaryContainer hbc=map.getHBC(str1); String[] temp=hbc.getMatchArray(str2); String[] maxletemp=WordDictUtil.getStringLengthArray(temp); if(maxletemp[0].length()==1){ String segword=str1+str2; seek=2; mapwords.addNewNodeElement(segword, afterprocess.getName(), i); i=i+2; }else{ int length=maxletemp[0].length(); String str3=str.substring(1, length+1); String segword=str1+str3; if(WordDictUtil.isWordMatched(str3, maxletemp)){ mapwords.addNewNodeElement(segword, 
afterprocess.getName(), i); seek=segword.length(); i=i+seek; }else{ i=i+2; segword=str1+str2; mapwords.addNewNodeElement(segword, afterprocess.getName(), i); } } }else{ mapwords.addNewNodeElement(str1, afterprocess.getName(), i); i++; } }else{ mapwords.addNewNodeElement(str1, afterprocess.getName(), i); i++; } } }; break; case 2:{ while(i<=Line.length()-1){ String str=null; String str1=null; String str2=null; if(i>=Line.length()-1-MAXLENGTH&&Line.length()-1-MAXLENGTH>0){ str=Line.substring(i); if(i>=Line.length()-1&&Line.length()-1>0){ if(i>Line.length()){ break; } str1=Line.substring(i); str2=null; break; }else{ str1=str.substring(0,1); str2=str.substring(1,2); int seek=0; if(map.isCwordExist(str1)){ if(map.getHBC(str1).isSecondWordExist(str2)){ HashBinaryContainer hbc=map.getHBC(str1); String[] temp=hbc.getMatchArray(str2); String[] MaxLeTemp=WordDictUtil.getStringLengthArray(temp); if(MaxLeTemp[0].length()==1){ String segword=str1+str2; seek=2; mapwords.addNewNodeElement(segword, afterprocess.getName(), i); i=i+2; }else{ int length=MaxLeTemp[0].length(); if(str.length()<length){ break; } String str3=str.substring(1, length+1); String segword=str1+str3; if(WordDictUtil.isWordMatched(str3, MaxLeTemp)){ mapwords.addNewNodeElement(segword, afterprocess.getName(), i); seek=segword.length(); i=i+seek; }else{ i=i+2; segword=str1+str2; mapwords.addNewNodeElement(segword, afterprocess.getName(), i); } } }else{ mapwords.addNewNodeElement(str1, afterprocess.getName(), i); seek=1; i=i+seek; } }else{ mapwords.addNewNodeElement(str1, afterprocess.getName(), i); seek=1; i=i+seek; } } }else{ str=Line.substring(i, i+MAXLENGTH); str1=str.substring(0, 1); str2=str.substring(1, 2); int seek=0; if(map.isCwordExist(str1)){ if(map.getHBC(str1).isSecondWordExist(str2)){ HashBinaryContainer hbc=map.getHBC(str1); String[] temp=hbc.getMatchArray(str2); String[] MaxLeTemp=WordDictUtil.getStringLengthArray(temp); if(MaxLeTemp[0].length()==1){ String segword=str1+str2; seek=2; 
mapwords.addNewNodeElement(segword, afterprocess.getName(), i); i=i+2; }else{ int length=MaxLeTemp[0].length(); String str3=str.substring(1, length+1); String segword=str1+str3; if(WordDictUtil.isWordMatched(str3, MaxLeTemp)){ mapwords.addNewNodeElement(segword, afterprocess.getName(), i); seek=segword.length(); i=i+seek; }else{ i=i+2; segword=str1+str2; mapwords.addNewNodeElement(segword, afterprocess.getName(), i); } } }else{ mapwords.addNewNodeElement(str1, afterprocess.getName(), i); seek=1; i=i+seek; } }else{ mapwords.addNewNodeElement(str1, afterprocess.getName(), i); seek=1; i=i+seek; } } } }; break; } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); }finally{ try { r.close(); bf.close(); } catch (IOException e) { e.printStackTrace(); } } try { output=new FileOutputStream(invertedIndex); oos=new ObjectOutputStream(output); oos.writeObject(mapwords); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); }finally{ try { oos.close(); } catch (IOException e) { e.printStackTrace(); } } } /** * 以下方法为对中文文本的分词写入到倒排索引中 * @param afterprocess[k]经过预处理后的文本 * @param invertedIndex 存放倒排索引的地址 * @param 字典所在的地址 * */ public static void FileArraysSeg(File[] afterprocess,File invertedIndex,File dict){ MapWords mapwords=new MapWords(); Reader r=null; BufferedReader bf=null; ObjectOutputStream oos=null; OutputStream output=null; String Line=null; Maps map=WordCount.getDict(dict); int i=0; int MAXLENGTH=9; //取决于词典中最大长度词条 for(int k=0;k<afterprocess.length;k++){ try { r=new FileReader(afterprocess[k]); bf=new BufferedReader(r); Line=bf.readLine(); int Kase=0; if(Line.length()<=MAXLENGTH+1){ Kase=1; }else{ Kase=2; } switch(Kase){ case 1:{ while(i<=Line.length()-1){ String str=null; String str1=null; String str2=null; str=Line.substring(0); str1=Line.substring(0, 1); str2=Line.substring(1, 2); int seek=0; if(map.isCwordExist(str1)){ if(map.getHBC(str1).isSecondWordExist(str2)){ 
HashBinaryContainer hbc=map.getHBC(str1); String[] temp=hbc.getMatchArray(str2); String[] maxletemp=WordDictUtil.getStringLengthArray(temp); if(maxletemp[0].length()==1){ String segword=str1+str2; seek=2; mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i); i=i+2; }else{ int length=maxletemp[0].length(); String str3=str.substring(1, length+1); String segword=str1+str3; if(WordDictUtil.isWordMatched(str3, maxletemp)){ mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i); seek=segword.length(); i=i+seek; }else{ i=i+2; segword=str1+str2; mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i); } } }else{ mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i); i++; } }else{ mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i); i++; } } }; break; case 2:{ while(i<=Line.length()-1){ String str=null; String str1=null; String str2=null; if(i>=Line.length()-1-MAXLENGTH&&Line.length()-1-MAXLENGTH>0){ str=Line.substring(i); if(i>=Line.length()-1&&Line.length()-1>0){ if(i>Line.length()){ break; } str1=Line.substring(i); str2=null; break; }else{ str1=str.substring(0,1); str2=str.substring(1,2); int seek=0; if(map.isCwordExist(str1)){ if(map.getHBC(str1).isSecondWordExist(str2)){ HashBinaryContainer hbc=map.getHBC(str1); String[] temp=hbc.getMatchArray(str2); String[] MaxLeTemp=WordDictUtil.getStringLengthArray(temp); if(MaxLeTemp[0].length()==1){ String segword=str1+str2; seek=2; mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i); i=i+2; }else{ int length=MaxLeTemp[0].length(); if(str.length()<length){ break; } String str3=str.substring(1, length+1); String segword=str1+str3; if(WordDictUtil.isWordMatched(str3, ad0e MaxLeTemp)){ mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i); seek=segword.length(); i=i+seek; }else{ i=i+2; segword=str1+str2; mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i); } } }else{ mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i); seek=1; 
i=i+seek; } }else{ mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i); seek=1; i=i+seek; } } }else{ str=Line.substring(i, i+MAXLENGTH); str1=str.substring(0, 1); str2=str.substring(1, 2); int seek=0; if(map.isCwordExist(str1)){ if(map.getHBC(str1).isSecondWordExist(str2)){ HashBinaryContainer hbc=map.getHBC(str1); String[] temp=hbc.getMatchArray(str2); String[] MaxLeTemp=WordDictUtil.getStringLengthArray(temp); if(MaxLeTemp[0].length()==1){ String segword=str1+str2; seek=2; mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i); i=i+2; }else{ int length=MaxLeTemp[0].length(); String str3=str.substring(1, length+1); String segword=str1+str3; if(WordDictUtil.isWordMatched(str3, MaxLeTemp)){ mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i); seek=segword.length(); i=i+seek; }else{ i=i+2; segword=str1+str2; mapwords.addNewNodeElement(segword, afterprocess[k].getName(), i); } } }else{ mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i); seek=1; i=i+seek; } }else{ mapwords.addNewNodeElement(str1, afterprocess[k].getName(), i); seek=1; i=i+seek; } } } }; break; } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); }finally{ try { r.close(); bf.close(); } catch (IOException e) { e.printStackTrace(); } } } try { output=new FileOutputStream(invertedIndex); oos=new ObjectOutputStream(output); oos.writeObject(mapwords); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); }finally{ try { oos.close(); } catch (IOException e) { e.printStackTrace(); } } } }
相关文章推荐
- 一种中文文本的快速分词方法(一)(未完待续)
- 一种中文文本的快速分词方法(三)
- KTDictSeg 一种简单快速准确的中文分词方法
- 中文文本分词,关键词提取工具jcseg使用方法
- Python点滴02_Python3打开中文文本时报错的一种处理方法
- MFC用CStdioFile类读取中文文本时乱码问题的一种解决方法
- 用 FPGA 产生高斯白噪声序列的一种快速方法
- R文本挖掘-中文分词Rwordseg
- 一种没有语料字典的分词方法
- 概率中国一种没有语料字典的分词方法
- 一种超级快速的图像二值化方法
- 快速导入Android Studio项目的一种方法
- 推荐一种快速提高英语口语的方法
- 一种快速可预制的随机数组产生方法
- 一种快速刷新richedit中内嵌动画的方法的实现
- 一种快速自适应的图像二值化方法介绍 (Wellner 1993)
- 从GIMP的Retinex算法里发现了一种高斯模糊的快速实现方法
- 文本分析之中文分词
- 文本分词方法