您的位置:首页 > 编程语言 > Java开发

Java过滤停用词源码

2016-12-04 16:13 106 查看
package SimilarityCompution;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.InputStreamReader;

import java.io.OutputStreamWriter;

import java.util.HashSet;

import java.util.Set;

import ICTCLAS.I3S.AC.ICTCLAS50;

public class FileExcludeStopWord {

//停用词词表

public static final String stopWordTable = "." + File.separator + "srcFile" + File.separator + "StopWordTable.txt";

public static void main(String[] args) {

//源文件和目的文件

String srcFile = "." + File.separator + "srcFile" + File.separator + "如何正确的使用化妆品效.txt";

String destFile = "." + File.separator + "destFile" + File.separator + "如何正确的使用化妆品效.txt";

new FileExcludeStopWord().fileExcludeStopWord(srcFile, destFile);

}

public void fileExcludeStopWord(String srcFile,StringdestFile){

try {

//读取原文件和停用词表

BufferedReadersrcFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(srcFile))));

BufferedReaderStopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(stopWordTable))));

//将去除停用词的文本信息存入输出文件

BufferedWriterdestFileBw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(destFile))));

//用来存放停用词的集合 Set stopWordSet = new HashSet<String>(); //初如化停用词集 String stopWord = null; for(; (stopWord = StopWordFileBr.readLine()) != null;){ stopWordSet.add(stopWord); } //分词工具 ICTCLAS50 ICTCLAS = new ICTCLAS50();

// 初始化分词所用库的路径

String argu = ".";

if (ICTCLAS.ICTCLAS_Init(argu.getBytes("gb2312")) == false) {

System.out.println("分词所用库初始化失败。");

return;

}

String paragraph = null;

for(; (paragraph = srcFileBr.readLine()) != null;){

//对读入的文本进行分词

byte[] spiltResult = ICTCLAS.ICTCLAS_ParagraphProcess(paragraph.getBytes("gb2312"), 2, 0);

String spiltResultStr = new String(spiltResult,0,spiltResult.length,"gb2312");

//得到分词后的词汇数组,以便后续比较

String[] resultArray = spiltResultStr.split(" ");

//过滤停用词

for(int i = 0; i<resultArray.length; i++){

if(stopWordSet.contains(resultArray[i])){

resultArray[i] = null;

}

}

//把过滤后的字符串数组存入到一个字符串中

StringBufferfinalStr = new StringBuffer();

for(int i = 0; i<resultArray.length; i++){

if(resultArray[i] != null){

finalStr = finalStr.append(resultArray[i]).append(" ");

}

}
} } } //将过滤后的文本信息写入到指定文件中 destFileBw.write(finalStr.toString()); destFileBw.newLine(); //关闭输入流 destFileBw.close(); StopWordFileBr.close();
srcFileBr.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch(Exception e){ e.printStackTrace(); }

对于学习有困难不知道如何提升自己可以加扣:578024144进行交流得到帮助,获取学习资料
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息