Java过滤停用词源码
2016-12-04 16:13
106 查看
package SimilarityCompution;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Set;
import ICTCLAS.I3S.AC.ICTCLAS50;
public class FileExcludeStopWord {
//停用词词表
public static final String stopWordTable = "." + File.separator + "srcFile" + File.separator + "StopWordTable.txt";
public static void main(String[] args) {
//源文件和目的文件
String srcFile = "." + File.separator + "srcFile" + File.separator + "如何正确的使用化妆品效.txt";
String destFile = "." + File.separator + "destFile" + File.separator + "如何正确的使用化妆品效.txt";
new FileExcludeStopWord().fileExcludeStopWord(srcFile, destFile);
}
public void fileExcludeStopWord(String srcFile,StringdestFile){
try {
//读取原文件和停用词表
BufferedReadersrcFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(srcFile))));
BufferedReaderStopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(stopWordTable))));
//将去除停用词的文本信息存入输出文件
BufferedWriterdestFileBw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(destFile))));
//用来存放停用词的集合 Set stopWordSet = new HashSet<String>(); //初如化停用词集 String stopWord = null; for(; (stopWord = StopWordFileBr.readLine()) != null;){ stopWordSet.add(stopWord); } //分词工具 ICTCLAS50 ICTCLAS = new ICTCLAS50();
// 初始化分词所用库的路径
String argu = ".";
if (ICTCLAS.ICTCLAS_Init(argu.getBytes("gb2312")) == false) {
System.out.println("分词所用库初始化失败。");
return;
}
String paragraph = null;
for(; (paragraph = srcFileBr.readLine()) != null;){
//对读入的文本进行分词
byte[] spiltResult = ICTCLAS.ICTCLAS_ParagraphProcess(paragraph.getBytes("gb2312"), 2, 0);
String spiltResultStr = new String(spiltResult,0,spiltResult.length,"gb2312");
//得到分词后的词汇数组,以便后续比较
String[] resultArray = spiltResultStr.split(" ");
//过滤停用词
for(int i = 0; i<resultArray.length; i++){
if(stopWordSet.contains(resultArray[i])){
resultArray[i] = null;
}
}
//把过滤后的字符串数组存入到一个字符串中
StringBufferfinalStr = new StringBuffer();
for(int i = 0; i<resultArray.length; i++){
if(resultArray[i] != null){
finalStr = finalStr.append(resultArray[i]).append(" ");
}
}
} } } //将过滤后的文本信息写入到指定文件中 destFileBw.write(finalStr.toString()); destFileBw.newLine(); //关闭输入流 destFileBw.close(); StopWordFileBr.close();
srcFileBr.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch(Exception e){ e.printStackTrace(); }
学习上遇到困难、不知道如何提升自己的读者,可以加扣:578024144 进行交流,获得帮助并获取学习资料。
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Set;
import ICTCLAS.I3S.AC.ICTCLAS50;
public class FileExcludeStopWord {
//停用词词表
public static final String stopWordTable = "." + File.separator + "srcFile" + File.separator + "StopWordTable.txt";
public static void main(String[] args) {
//源文件和目的文件
String srcFile = "." + File.separator + "srcFile" + File.separator + "如何正确的使用化妆品效.txt";
String destFile = "." + File.separator + "destFile" + File.separator + "如何正确的使用化妆品效.txt";
new FileExcludeStopWord().fileExcludeStopWord(srcFile, destFile);
}
public void fileExcludeStopWord(String srcFile,StringdestFile){
try {
//读取原文件和停用词表
BufferedReadersrcFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(srcFile))));
BufferedReaderStopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(stopWordTable))));
//将去除停用词的文本信息存入输出文件
BufferedWriterdestFileBw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(destFile))));
//用来存放停用词的集合 Set stopWordSet = new HashSet<String>(); //初如化停用词集 String stopWord = null; for(; (stopWord = StopWordFileBr.readLine()) != null;){ stopWordSet.add(stopWord); } //分词工具 ICTCLAS50 ICTCLAS = new ICTCLAS50();
// 初始化分词所用库的路径
String argu = ".";
if (ICTCLAS.ICTCLAS_Init(argu.getBytes("gb2312")) == false) {
System.out.println("分词所用库初始化失败。");
return;
}
String paragraph = null;
for(; (paragraph = srcFileBr.readLine()) != null;){
//对读入的文本进行分词
byte[] spiltResult = ICTCLAS.ICTCLAS_ParagraphProcess(paragraph.getBytes("gb2312"), 2, 0);
String spiltResultStr = new String(spiltResult,0,spiltResult.length,"gb2312");
//得到分词后的词汇数组,以便后续比较
String[] resultArray = spiltResultStr.split(" ");
//过滤停用词
for(int i = 0; i<resultArray.length; i++){
if(stopWordSet.contains(resultArray[i])){
resultArray[i] = null;
}
}
//把过滤后的字符串数组存入到一个字符串中
StringBufferfinalStr = new StringBuffer();
for(int i = 0; i<resultArray.length; i++){
if(resultArray[i] != null){
finalStr = finalStr.append(resultArray[i]).append(" ");
}
}
} } } //将过滤后的文本信息写入到指定文件中 destFileBw.write(finalStr.toString()); destFileBw.newLine(); //关闭输入流 destFileBw.close(); StopWordFileBr.close();
srcFileBr.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch(Exception e){ e.printStackTrace(); }
学习上遇到困难、不知道如何提升自己的读者,可以加扣:578024144 进行交流,获得帮助并获取学习资料。
相关文章推荐
- Java过滤停用词源码
- Java版色情图像过滤入门示例及源码-0.1.0 (模拟GreenDam过滤机制)
- Java权限框架Shiro过滤连源码解读
- Java版色情图像过滤入门示例及源码-0.1.0 (模拟***过滤机制)
- Java权限框架Shiro过滤连源码解读
- Java版色情图像过滤入门示例及源码-0.1.0 (模拟绿坝过滤机制)
- 源码分析:LinkedList和Java中的指针操作
- JAVA开放源码项目与工具在企业应用开发中的运用(ZZ)
- Java 2源码解读:java.util.ArrayList
- Java 2源码解读:java.util.ArrayList
- Java源码分析:深入探讨Iterator模式(转自http://blog.csdn.net/kalex)
- 再谈将C++语言源码转成html的方法(vim实现,可用于java,perl,python等等多种语言)
- 实例源码:利用Java调用可执行命令
- 开放源码-SMTP发信客户端 for Java
- Java 2源码解读:java.util.ArrayList
- Java 2源码解读1:java.util.ArrayList (版本:2.0)
- enoeht的Java源码系列(6)--调试信息与日志文件
- 使DataGrid中的CheckBox列全选(javacript)源码
- 做了一个简陋的javaIDE,附源码
- enoeht的Java源码系列(5)--字符串加解密 enoeht [原作]