您的位置:首页 > 编程语言 > Java开发

一种中文文本的快速分词方法(一)(未完待续)

2014-02-05 14:49 363 查看
这是本人梦寐以求的东西,终于搞出来了。这是写智能程序的第一步啊!
下面先给出中文分词之前的文本清洗工具类(去除标点、符号、空白以及英文字母和数字),供大家参考。

package org.zhukovasky.fileutil;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
/**
 * Text-cleaning utilities for Chinese text: removes punctuation, symbols,
 * whitespace and ASCII letters/digits from every line of a file.
 * Files are read and written as UTF-8.
 *
 * <p>All non-ASCII characters in this class are written as Unicode escapes so
 * that the tables do not depend on the encoding the source file is stored or
 * compiled with.
 *
 * @author zhukovasky
 * @version 1.0
 * @since 2013.12
 * @email zhukovasky@163.com
 */
public class FileProcess {
    /**
     * Normalisation table: maps a single full-width / CJK punctuation mark or
     * an enclosed / Roman numeral to its ASCII replacement (for example
     * U+3002 IDEOGRAPHIC FULL STOP to "." and U+2460 CIRCLED DIGIT ONE to "1").
     * Every key is exactly one character long.
     *
     * <p>The map is public and mutable for backward compatibility; callers
     * should treat it as read-only.
     */
    public final static Map<String,String> mapDemo;

    /**
     * Every character that {@link #replacePunctuation(String)} deletes:
     * ASCII punctuation, digits and the space, followed by assorted symbols
     * (bullets, stars, arrows, currency and section signs, corner brackets,
     * curly single quotes, ellipsis, ...).
     */
    private static final String STRIP_CHARS =
            ",.<>|[]?\":()-~!'0123456789=;{}%+ "
            + "\u25CF\u2192\u203B\u00B7\u2605\u32A3\u00B6\u222E\u20AC\u2600"
            + "\u0398\u25CB\u2116\u2237\u2642\u2640\u00A7\u300C\u300D\u2502"
            + "\u00D7\u2606\u2019\u2026\u2018";

    /**
     * Full-width punctuation whose ASCII counterpart sits exactly 0xFEE0
     * below it: comma, question mark, colon, parentheses, tilde, exclamation
     * mark, backslash, solidus and full stop.
     */
    private static final String FULL_WIDTH_PUNCTUATION =
            "\uFF0C\uFF1F\uFF1A\uFF08\uFF09\uFF5E\uFF01\uFF3C\uFF0F\uFF0E";

    /** Distance between a full-width form (U+FF01..U+FF5E) and its ASCII twin. */
    private static final int FULL_WIDTH_OFFSET = 0xFEE0;

    /** Compiled once; matches any run of regex whitespace. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Compiled once; matches a single ASCII letter or digit. */
    private static final Pattern ASCII_ALNUM = Pattern.compile("[a-zA-Z0-9]");

    /** Charset used for both input and output, as stated in the class contract. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    static {
        mapDemo = new HashMap<String,String>();

        // CJK / general punctuation -> ASCII.
        map('\u3002', ".");   // ideographic full stop
        map('\u3001', ",");   // ideographic comma
        map('\u3008', "<");   // angle brackets
        map('\u3009', ">");
        map('\u300A', "<");   // double angle brackets
        map('\u300B', ">");
        map('\u2016', "|");   // double vertical line
        map('\u3014', "[");   // tortoise shell brackets
        map('\u3015', "]");
        map('\u3010', "[");   // black lenticular brackets
        map('\u3011', "]");
        map('\uFE56', "?");   // small question mark
        map('\u201C', "\"");  // curly double quotes
        map('\u201D', "\"");
        map('\u2014', "-");   // em dash
        map('\u2035', "'");   // reversed prime

        // Full-width forms -> ASCII. In the listing this class was recovered
        // from, these keys (and the full-width digits below) appeared as their
        // ASCII look-alikes, which turned each entry into a no-op and made the
        // backslash entry a syntax error; they are assumed to have been
        // full-width originally.
        for (int i = 0; i < FULL_WIDTH_PUNCTUATION.length(); i++) {
            mapFullWidth(FULL_WIDTH_PUNCTUATION.charAt(i));
        }
        for (char digit = '\uFF10'; digit <= '\uFF19'; digit++) {
            mapFullWidth(digit);
        }

        // Enclosed and Roman numerals -> "1", "2", ... in order.
        mapRun('\u2460', 9);   // circled digits 1-9
        mapRun('\u3220', 10);  // parenthesized ideographs 1-10
        mapRun('\u2474', 10);  // parenthesized digits 1-10
        mapRun('\u2488', 10);  // digits with full stop 1-10
        mapRun('\u2160', 10);  // Roman numerals I-X
        mapRun('\u2170', 10);  // small Roman numerals i-x
    }

    /** Registers a single-character key in {@link #mapDemo}. */
    private static void map(char key, String value) {
        mapDemo.put(String.valueOf(key), value);
    }

    /** Registers a full-width form as mapping to its ASCII counterpart. */
    private static void mapFullWidth(char fullWidth) {
        map(fullWidth, String.valueOf((char) (fullWidth - FULL_WIDTH_OFFSET)));
    }

    /** Maps {@code count} consecutive code units starting at {@code first} to "1", "2", ... */
    private static void mapRun(char first, int count) {
        for (int i = 0; i < count; i++) {
            map((char) (first + i), String.valueOf(i + 1));
        }
    }

    /**
     * Replaces every character that has an entry in {@link #mapDemo} with its
     * ASCII replacement; all other characters are copied unchanged.
     *
     * @param line text to normalise, must not be null
     * @return the normalised text
     */
    private static String replace(String line) {
        StringBuilder out = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); i++) {
            String ch = line.substring(i, i + 1);
            String mapped = mapDemo.get(ch);
            out.append(mapped != null ? mapped : ch);
        }
        return out.toString();
    }

    /**
     * Deletes every character listed in {@link #STRIP_CHARS}.
     *
     * <p>The result is built in a separate buffer: deleting from the string
     * that is being indexed makes the scan skip the character that follows
     * each deletion, so the second of two adjacent marks would survive.
     *
     * @param line text to strip, must not be null
     * @return the text without the listed characters
     */
    private static String replacePunctuation(String line) {
        StringBuilder kept = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (STRIP_CHARS.indexOf(c) < 0) {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /**
     * Removes all regex whitespace (spaces, tabs, line breaks, ...).
     *
     * @param line text to process, may be null
     * @return the text without whitespace, or "" when {@code line} is null
     */
    private static String dropBlank(String line) {
        if (line == null) {
            return "";
        }
        return WHITESPACE.matcher(line).replaceAll("");
    }

    /**
     * Removes all ASCII letters and digits (not only digits, despite the name).
     *
     * @param line text to process, may be null
     * @return the text without ASCII letters/digits, or "" when {@code line} is null
     */
    private static String dropNumber(String line) {
        if (line == null) {
            return "";
        }
        return ASCII_ALNUM.matcher(line).replaceAll("");
    }

    /**
     * Runs the whole cleaning pipeline on one line: normalise punctuation to
     * ASCII, drop whitespace, drop ASCII letters/digits, then delete the
     * remaining punctuation and symbols.
     */
    private static String cleanLine(String line) {
        String normalised = replace(line).trim();
        String compact = dropNumber(dropBlank(normalised));
        // '/' is not part of STRIP_CHARS and is removed separately.
        String stripped = replacePunctuation(compact.replace("/", ""));
        return stripped.trim();
    }

    /** Closes a resource if it was opened, reporting (not propagating) a failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Cleans punctuation, symbols, whitespace and ASCII letters/digits out of
     * the old text file and writes the result to the new file.
     *
     * <p>Both files are treated as UTF-8. The cleaned lines are written one
     * after another with no line separator between them. I/O failures
     * (including a missing input file) are reported on standard error and do
     * not propagate; the output file may then be missing or incomplete.
     *
     * @param origin  the old text file to read
     * @param newFile the new text file to write
     * @return {@code newFile}, the same object that was passed in
     */
    public static File processFile(File origin, File newFile) {
        BufferedReader reader = null;
        BufferedWriter writer = null;
        try {
            reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(origin), UTF8));
            writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(newFile), UTF8));
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(cleanLine(line));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closed independently so that a failure on one side (or a stream
            // that was never opened) cannot leak the other.
            closeQuietly(reader);
            closeQuietly(writer);
        }
        return newFile;
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  java utf-8 map