您的位置:首页 > 编程语言 > Java开发

Java读取中文分词工具(四)

2014-08-01 20:52 537 查看
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.StringTokenizer;

/**
 * Reads pre-segmented Chinese text and hands it out as fixed-size word windows.
 *
 * <p>Expected input: a UTF-8 text file with one paragraph per line, the words of a
 * paragraph separated by spaces. Each call to {@link #getNextWords(int)} returns the next
 * window of words inside the current paragraph; the window slides forward one word per
 * call and never spans two paragraphs. A whole line is tokenized into memory at once, so
 * the class is meant for files whose individual lines are reasonably short.
 *
 * <p>Three modes are supported (see {@link #setMode(int)}):
 * <ul>
 *   <li>{@code normalMode} (0): stop when the end of the file is reached;</li>
 *   <li>{@code againMode} (1): at the end of the file, rewind and start over;</li>
 *   <li>{@code paraAgainMode} (2): replay every paragraph several times, then stop when
 *       the end of the file is reached.</li>
 * </ul>
 *
 * <p>The reader holds an open file handle; call {@link #close()} when done.
 */
public class ParaWordReader implements Reader, java.io.Closeable
{
    static final int normalMode = 0;     // stop at the end of the file
    static final int againMode = 1;      // at the end of the file, rewind and continue
    static final int paraAgainMode = 2;  // replay each paragraph several times; stop at the end of the file

    private int currentMode = normalMode;

    private final RandomAccessFile raf;
    private final File file;
    /** Words of the paragraph currently being served. */
    private final ArrayList<String> paraWords;

    /** Zero-based index of the current paragraph, -1 before the first one is read. */
    private int currentPara = -1;
    /** Index in {@link #paraWords} of the first word of the next window. */
    private int paraPos = 0;
    /** Number of replays already started for the current paragraph. */
    private int paraIter = 0;
    /** Number of replays allowed per paragraph in {@code paraAgainMode}. */
    private int paraIters = 1;
    /** How many times the file has been rewound to its beginning. */
    private long rewindCount = 0;

    /**
     * Opens the given file for reading.
     *
     * @param fileName path of the segmented text file
     * @throws IOException if the file cannot be opened
     */
    public ParaWordReader(String fileName) throws IOException
    {
        file = new File(fileName);
        raf = new RandomAccessFile(file, "r");
        paraWords = new ArrayList<String>();
    }

    /**
     * Selects the reading mode.
     *
     * @param m one of {@code normalMode}, {@code againMode}, {@code paraAgainMode}
     */
    public void setMode(int m)
    {
        currentMode = m;
    }

    /**
     * Sets how many times each paragraph is replayed and switches to {@code paraAgainMode}.
     * A paragraph is walked once and then replayed {@code iters} more times, i.e.
     * {@code iters + 1} passes in total; values of zero or less mean no replay.
     *
     * @param iters number of replays per paragraph
     */
    public void setParaIters(int iters)
    {
        paraIters = iters;
        setMode(paraAgainMode);
    }

    /**
     * @return zero-based index of the paragraph currently being served, or -1 if no
     *         paragraph has been read yet
     */
    public int paraIndex()
    {
        return currentPara;
    }

    /**
     * Loads the next line of the file into {@link #paraWords}.
     *
     * <p>At the end of the file the method returns {@code false} in {@code normalMode} and
     * {@code paraAgainMode}; in any other mode it rewinds to the beginning and reads the
     * first line again (at most one rewind per call, so an empty file cannot loop forever).
     *
     * @return {@code true} if a paragraph was loaded, {@code false} if no more input is available
     * @throws IOException if reading or seeking fails
     */
    private boolean readPara() throws IOException
    {
        String line = raf.readLine();
        if (line == null) // end of file
        {
            if (currentMode == normalMode || currentMode == paraAgainMode)
            {
                return false;
            }
            // Diagnostic message kept from the original implementation; printed on every rewind.
            System.out.println("文件太大可能不支持");
            raf.seek(0);
            currentPara = -1;
            rewindCount++;
            line = raf.readLine();
            if (line == null) // the file is empty: rewinding again would never produce data
            {
                return false;
            }
        }
        paraWords.clear();
        // RandomAccessFile.readLine() widens every byte to one char, so recover the raw
        // bytes through ISO-8859-1 and decode them as UTF-8.
        line = new String(line.getBytes("iso8859-1"), "utf-8");
        StringTokenizer tokenizer = new StringTokenizer(line, " ");
        while (tokenizer.hasMoreTokens())
        {
            paraWords.add(tokenizer.nextToken());
        }
        currentPara++;
        paraPos = 0;
        return true;
    }

    /**
     * Returns the next window of {@code count} consecutive words of the current paragraph
     * and advances the window start by one word. When the current paragraph has no full
     * window left, the paragraph is replayed ({@code paraAgainMode}) or the next paragraph
     * is loaded; paragraphs with fewer than {@code count} words are skipped.
     *
     * @param count number of words per window, must be at least 1
     * @return an array of exactly {@code count} words, or {@code null} when the input is
     *         exhausted (in {@code againMode}: when the whole file contains no paragraph
     *         with at least {@code count} words)
     * @throws IllegalArgumentException if {@code count} is less than 1
     * @throws IOException if reading the file fails
     */
    public String[] getNextWords(int count) throws IOException
    {
        if (count < 1)
        {
            throw new IllegalArgumentException("count must be at least 1: " + count);
        }
        final long rewindsAtEntry = rewindCount;
        // A window [paraPos, paraPos + count) fits while paraPos + count <= size; the
        // comparison is written as a subtraction so a huge count cannot overflow.
        while (paraPos > paraWords.size() - count)
        {
            boolean replay = currentMode == paraAgainMode
                    && paraIter < paraIters
                    && paraWords.size() >= count; // replaying a too-short paragraph yields nothing
            if (replay)
            {
                paraPos = 0;
                paraIter++;
            }
            else
            {
                paraIter = 0;
                if (!readPara())
                {
                    return null;
                }
                // Two rewinds within one call mean the whole file was scanned without
                // finding a paragraph long enough; give up instead of looping forever.
                if (rewindCount - rewindsAtEntry >= 2)
                {
                    return null;
                }
            }
        }
        String[] words = paraWords.subList(paraPos, paraPos + count).toArray(new String[count]);
        paraPos++;
        return words;
    }

    /**
     * Closes the underlying file.
     *
     * @throws IOException if closing the file fails
     */
    @Override
    public void close() throws IOException
    {
        raf.close();
    }

    /**
     * Small demo: prints 5-word windows from a segmented text file in {@code againMode}.
     *
     * @param args optional; {@code args[0]} is the path of the file to read, otherwise a
     *             built-in default path is used
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException
    {
        String path = args.length > 0
                ? args[0]
                : "/media/linger/G/sources/ParaModel/electronic_seg.txt";
        ParaWordReader wordReader = new ParaWordReader(path);
        try
        {
            wordReader.setMode(ParaWordReader.againMode);
            // The sample file has 614005 lines; bound the loop instead of running forever.
            for (int i = 0; i < 614005 * 2; i++)
            {
                String[] words = wordReader.getNextWords(5);
                if (words == null)
                {
                    break;
                }
                System.out.printf("%s,%s,%s,%s,%s \n", words[0], words[1], words[2], words[3], words[4]);
            }
            System.out.println(wordReader.paraIndex());
        }
        finally
        {
            wordReader.close();
        }
    }

}


本文作者:linger

本文链接:http://blog.csdn.net/lingerlanlan/article/details/38337707
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息