一种中文文本的快速分词方法(一)(未完待续)
2014-02-05 14:49
363 查看
这是本人梦寐以求的东西,终于搞出来了。这是写智能程序的第一步啊!
下面先给出分词之前的文本清洗(预处理)代码,供大家参考。
package org.zhukovasky.fileutil; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.Reader; import java.io.Writer; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringU 4000 tils; /** * 以下是中文文本的处理工具,用来清洗文本的各种乱码 * 文本的编码为UTF-8 * @author zhukovasky * @version 1.0 * @since 2013.12 * @email zhukovasky@163.com * */ public class FileProcess { public final static Map<String,String> mapDemo; static{ mapDemo=new HashMap<String,String>(); String punctuation1[]={"㈠", "㈡","㈢","㈣","㈤", "㈥","㈦","㈧","㈨","㈩"}; String punctuation2[]={"⑴","⑵","⑶","⑷","⑸","⑹","⑺","⑻","⑼","⑽"}; String punctuation3[]={"⒈","⒉","⒊","⒋","⒌","⒍","⒎","⒏","⒐","⒑"}; String punctuation4[]={"Ⅰ","Ⅱ","Ⅲ", "Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ"}; String punctuation5[]={"ⅰ","ⅱ","ⅲ","ⅳ","ⅴ","ⅵ","ⅶ","ⅷ","ⅸ","ⅹ"}; String punctuation6[]={"1","2","3","4","5","6","7","8","9","10"}; mapDemo.put(",", ","); mapDemo.put("。", "."); mapDemo.put("〈", "<"); mapDemo.put("〉", ">"); mapDemo.put("‖", "|"); mapDemo.put("《", "<"); mapDemo.put("》", ">"); mapDemo.put("〔", "["); mapDemo.put("〕", "]"); mapDemo.put("﹖", "?"); mapDemo.put("?", "?"); mapDemo.put("“", "\""); mapDemo.put("”", "\""); mapDemo.put(":", ":"); mapDemo.put("、", ","); mapDemo.put("(", "("); mapDemo.put(")", ")"); mapDemo.put("【", "["); mapDemo.put("】", "]"); mapDemo.put("—", "-"); mapDemo.put("~", "~"); mapDemo.put("!", "!"); mapDemo.put("‵", "'"); mapDemo.put("①", "1"); mapDemo.put("②", "2"); mapDemo.put("③", "3"); mapDemo.put("④", "4"); mapDemo.put("⑤", "5"); mapDemo.put("⑥", "6"); mapDemo.put("⑦", "7"); mapDemo.put("⑧", "8"); mapDemo.put("⑨", "9"); mapDemo.put("\", "\\"); mapDemo.put("0", "0"); mapDemo.put("/", "/"); mapDemo.put(".", "."); mapDemo.put("7", "7"); String[] 
numberArray={"1","2","3","4","5","6","7","8","9","10"}; for(int i=0;i<10;i++){ mapDemo.put(punctuation1[i], numberArray[i]); } for(int i=0;i<10;i++){ mapDemo.put(punctuation2[i], numberArray[i]); } for(int i=0;i<10;i++){ mapDemo.put(punctuation3[i], numberArray[i]); } for(int i=0;i<10;i++){ mapDemo.put(punctuation4[i], numberArray[i]); } for(int i=0;i<10;i++){ mapDemo.put(punctuation5[i], numberArray[i]); } for(int i=0;i<10;i++){ mapDemo.put(punctuation6[i], numberArray[i]); } } private static String replace(String line){ for(int i=0;i<line.length();i++){ String charat=line.substring(i, i+1); if(mapDemo.get(charat) != null){ line=line.replace(charat,(String)mapDemo.get(charat)); } } return line; } private static String replacePunctuation(String line){ Map<Integer,String> mapDemo=new HashMap<Integer,String>(); mapDemo.put(1, ","); mapDemo.put(2, "."); mapDemo.put(3, "<"); mapDemo.put(4, ">"); mapDemo.put(5, "|"); mapDemo.put(6, "<"); mapDemo.put(7, ">"); mapDemo.put(8, "["); mapDemo.put(9, "]"); mapDemo.put(10, "?"); mapDemo.put(11, "?"); mapDemo.put(12, "\""); mapDemo.put(13, "\""); mapDemo.put(14, ":"); mapDemo.put(15, ","); mapDemo.put(16, "("); mapDemo.put(17, ")"); mapDemo.put(18, "["); mapDemo.put(19, "]"); mapDemo.put(20, "-"); mapDemo.put(21, "~"); mapDemo.put(22, "!"); mapDemo.put(23, "'"); mapDemo.put(24, "1"); mapDemo.put(25, "2"); mapDemo.put(26, "3"); mapDemo.put(27, "4"); mapDemo.put(28, "5"); mapDemo.put(29, "6"); mapDemo.put(30, "7"); mapDemo.put(31, "8"); mapDemo.put(32, "9"); mapDemo.put(33, "0"); mapDemo.put(34, "●"); mapDemo.put(35, "→"); mapDemo.put(36, "※"); mapDemo.put(37,"·"); mapDemo.put(38,"="); mapDemo.put(39, "=="); mapDemo.put(40, "'"); mapDemo.put(41, "'"); mapDemo.put(42, ";"); mapDemo.put(43, "("); mapDemo.put(44, "-"); mapDemo.put(45, ""); mapDemo.put(46, " "); mapDemo.put(47, " "); mapDemo.put(48, ","); mapDemo.put(49, "("); mapDemo.put(50, ")"); mapDemo.put(51, "{"); mapDemo.put(52, "}"); mapDemo.put(53, "★"); mapDemo.put(54, "㊣"); 
mapDemo.put(55, "¶"); mapDemo.put(56, "∮"); mapDemo.put(57, "€"); mapDemo.put(58, "☀"); mapDemo.put(59, "Θ"); mapDemo.put(60, "○"); mapDemo.put(61, "№"); mapDemo.put(62,"∷"); mapDemo.put(63, "♂"); mapDemo.put(64,"♀"); mapDemo.put(65, "§"); mapDemo.put(66,";"); mapDemo.put(67, "「"); mapDemo.put(68, "」"); mapDemo.put(69, "!"); mapDemo.put(70, "!"); mapDemo.put(71, "│"); mapDemo.put(72,"|"); mapDemo.put(73, " ("); mapDemo.put(74, " )"); mapDemo.put(75,"%"); mapDemo.put(76, "——"); mapDemo.put(77, "+"); mapDemo.put(78, "×"); mapDemo.put(79, "☆"); mapDemo.put(80,"."); mapDemo.put(81, "’"); mapDemo.put(82, "…"); mapDemo.put(83, "‘"); mapDemo.put(84, ","); mapDemo.put(85, "?"); for(int i=0;i<line.length();i++){ String charat=line.substring(i, i+1); if(mapDemo.containsValue(charat)){ line=line.replace(charat,""); } } return line; } private static String dropBlank(String line){ String dest=""; if(line!=null){ Pattern p=Pattern.compile("\\s*|\t|\r|\n"); Matcher m=p.matcher(line); dest=m.replaceAll(""); } return dest; } private static String dropNumber(String line){ String dest=""; if(line!=null){ Pattern p=Pattern.compile("[a-zA-Z0-9]"); Matcher m=p.matcher(line); dest=m.replaceAll(""); } return dest; } /** * 该方法实现清除旧文本中的字符标点等处理 * @param origin 旧文本 * @param newFile 新文本 * @return 返回新文本 * */ public static File processFile(File origin,File newFile){ Reader fr=null; Writer wr=null; BufferedReader reader=null; BufferedWriter writer=null; try { fr = new FileReader(origin); wr=new FileWriter(newFile); reader=new BufferedReader(fr); writer=new BufferedWriter(wr); String line=reader.readLine(); while(line!=null){ String newLine1=replace(line); String newLine2=newLine1.trim(); String newLine3=dropBlank(newLine2); String newLine4=dropNumber(newLine3); String newLine6=newLine4.replace("●", ""); String newLine7=newLine6.replace("[", ""); String newLine8=newLine7.replace("]",""); String newLine9=newLine8.replace("/", ""); String newLine10=replacePunctuation(newLine9.replaceAll(" +","")); 
String newLine=StringUtils.trim(newLine10); writer.write(newLine); line=reader.readLine(); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); }finally{ try { reader.close(); writer.close(); } catch (IOException e) { e.printStackTrace(); } } return newFile; } }
相关文章推荐
- 一种中文文本的快速分词方法(二)
- 一种中文文本的快速分词方法(三)
- KTDictSeg 一种简单快速准确的中文分词方法
- Python点滴02_Python3打开中文文本时报错的一种处理方法
- MFC用CStdioFile类读取中文文本时乱码问题的一种解决方法
- 中文文本分词,关键词提取工具jcseg使用方法
- 中文文本挖掘课程笔记之jieba分词(1)
- 日期差值——一种快速的求解方法(Hash的思想)
- 文本分词方法
- 数据库分词查询的优缺点以及英文和中文各自的分词方法(二)
- Python处理中文文本字符时提取某个汉字或字符的方法
- 一种快速可预制的随机数组产生方法
- j2me里面读取txt文本的方法(支持中文)
- 一种快速文件传输的方法
- 中文分词方法总结
- [python] jieba 库 -- 给中文文本分词
- 快速排序的一种方法
- Android开发布局中margin和padding的区别以及文本的快速居中的方法
- 一种利用Cmake,使得低版本Visual Studio IDE快速运行高版本VS项目的方法~