您的位置:首页 > 其它

[NLP自然语言处理]读取UTF8字符并实现汉字和单词的识别,计算熵和KL距离

2016-12-08 15:58 375 查看
原帖:http://www.tuicool.com/articles/6nyQje

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
10
/**
 * Token statistics for one UTF-8 text file.
 *
 * <p>The file is split into units: every run of ASCII letters is one English
 * word, and every other code point (a Chinese character, a punctuation mark,
 * whitespace, ...) is a unit of its own. Units that are neither an English
 * word nor a CJK Unified Ideograph are moved into {@link #Punctuations}; the
 * remaining ones are used for the probability and entropy figures.
 *
 * <p>Constructing an instance also writes two report files next to the input
 * file (see the constructor).
 */
public class NLPFileUnit {
    /** One run of ASCII letters, i.e. one English word. */
    private static final Pattern EN_WORD = Pattern.compile("[A-Za-z]+");
    /** The byte-order mark as it appears after UTF-8 decoding. */
    private static final String BOM = "\uFEFF";

    /** Occurrence count of every Chinese character and English word in the file. */
    public HashMap<String, Integer> WordOccurrenceNumber;
    /** Relative frequency of every key of {@link #WordOccurrenceNumber}. */
    public HashMap<String, Float> WordProbability;
    /** Occurrence count of every unit that was screened out as punctuation. */
    public HashMap<String, Integer> Punctuations;
    /** Entropy (natural logarithm) of {@link #WordProbability}. */
    public float entropy;
    private String filePath;

    /**
     * Reads and analyses the file, then saves the punctuation table to
     * {@code <name>_punctuation<.ext>} and the word table to
     * {@code <name>_AllWords<.ext>} beside it.
     *
     * @param filePath path of a UTF-8 encoded text file (a leading BOM is optional)
     * @throws Exception if the file cannot be read, is not valid UTF-8, or a
     *                   report file cannot be written
     */
    public NLPFileUnit(String filePath) throws Exception {
        this.filePath = filePath;
        WordOccurrenceNumber = createHash(createReader(filePath));
        Punctuations = filterPunctuation(WordOccurrenceNumber);
        WordProbability = calProbability(WordOccurrenceNumber);
        this.entropy = calEntropy(this.WordProbability);

        String punctuationPath = derivedPath(filePath, "_punctuation");
        String allWordsPath = derivedPath(filePath, "_AllWords");
        System.out.println("all punctuations were saved at " + punctuationPath + "!");
        this.saveFile(Punctuations, punctuationPath);
        System.out.println("all words(En & Ch) were saved at " + allWordsPath + "!");
        this.saveFile(this.WordOccurrenceNumber, allWordsPath);
    }

    /**
     * Adds the occurrence count of every English word in the file to {@code hash}.
     *
     * <p>The file is matched line by line, so a word never spans a line break.
     *
     * @param hash map that receives the counts; existing counts are incremented
     * @param path path of a UTF-8 encoded text file
     * @throws Exception if the file cannot be read or is not valid UTF-8
     */
    public void getEnWords(HashMap<String, Integer> hash, String path) throws Exception {
        BufferedReader br = createReader(path);
        try {
            String line;
            while ((line = br.readLine()) != null) {
                Matcher matcher = EN_WORD.matcher(line);
                while (matcher.find()) {
                    increment(hash, matcher.group());
                }
            }
        } finally {
            br.close();
        }
    }

    /**
     * Builds the path of a report file by inserting {@code suffix} in front of
     * the extension of the file name. Only the last dot of the file name is
     * treated as the extension separator; dots in directory names are left
     * alone, and a name without extension simply gets the suffix appended, so
     * the result never equals the input path.
     */
    private static String derivedPath(String path, String suffix) {
        int nameStart = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\')) + 1;
        int dot = path.lastIndexOf('.');
        if (dot <= nameStart) {
            // No extension (or a dot-file such as ".profile").
            return path + suffix;
        }
        return path.substring(0, dot) + suffix + path.substring(dot);
    }

    /** Adds one to the count stored under {@code key}, starting at 1 for a new key. */
    private static void increment(HashMap<String, Integer> hash, String key) {
        Integer seen = hash.get(key);
        hash.put(key, seen == null ? 1 : seen + 1);
    }

    /**
     * Tells whether a unit is "punctuation", i.e. neither an English word nor a
     * single character of the CJK Unified Ideographs block.
     */
    private boolean isPunctuation(String tmp) {
        if (EN_WORD.matcher(tmp).matches()) {
            return false;
        }
        if (tmp.length() > 0 && tmp.codePointCount(0, tmp.length()) == 1) {
            return Character.UnicodeBlock.of(tmp.codePointAt(0))
                    != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
        }
        return true;
    }

    /** Tells whether the unit is exactly one ASCII letter. */
    private static boolean isAsciiLetter(String unit) {
        if (unit.length() != 1) {
            return false;
        }
        char c = unit.charAt(0);
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }

    /**
     * Reads the next code point from the reader.
     *
     * @return the code point as a string (two chars for a supplementary
     *         character), or {@code null} at end of input
     */
    private String readUnit(Reader reader) throws Exception {
        int first = reader.read();
        if (first == -1) {
            return null;
        }
        char head = (char) first;
        if (Character.isHighSurrogate(head)) {
            // A supplementary character arrives as a surrogate pair.
            int second = reader.read();
            if (second != -1) {
                return new String(new char[] {head, (char) second});
            }
        }
        return String.valueOf(head);
    }

    /**
     * Counts every unit of the input: each non-letter code point individually,
     * and each English word as a whole (via {@link #getEnWords}).
     *
     * <p>ASCII letters are deliberately not counted one by one here; they are
     * accounted for by the word pass, otherwise every letter would be counted
     * in addition to the word that contains it. A leading byte-order mark is
     * skipped. The reader is always closed.
     */
    private HashMap<String, Integer> createHash(Reader reader) throws Exception {
        HashMap<String, Integer> hash = new HashMap<String, Integer>();
        try {
            boolean atStart = true;
            String unit;
            while ((unit = readUnit(reader)) != null) {
                boolean leadingBom = atStart && BOM.equals(unit);
                atStart = false;
                if (leadingBom || isAsciiLetter(unit)) {
                    continue;
                }
                increment(hash, unit);
            }
        } finally {
            reader.close();
        }
        getEnWords(hash, this.filePath);
        return hash;
    }

    /**
     * Opens the file as strictly decoded UTF-8 text. Malformed byte sequences
     * make the returned reader throw an {@code IOException} on read instead of
     * being silently replaced, so a non-UTF-8 file is reported clearly.
     */
    private BufferedReader createReader(String path) throws Exception {
        CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        return new BufferedReader(new InputStreamReader(new FileInputStream(path), decoder));
    }

    /**
     * Moves every punctuation entry out of {@code hash}.
     *
     * @param hash unit counts; punctuation entries are removed from it
     * @return the removed entries
     */
    private HashMap<String, Integer> filterPunctuation(
            HashMap<String, Integer> hash) {
        HashMap<String, Integer> puncs = new HashMap<String, Integer>();
        Iterator<Entry<String, Integer>> iterator = hash.entrySet().iterator();
        while (iterator.hasNext()) {
            Entry<String, Integer> entry = iterator.next();
            if (isPunctuation(entry.getKey())) {
                puncs.put(entry.getKey(), entry.getValue());
                iterator.remove();
            }
        }
        return puncs;
    }

    /**
     * Converts occurrence counts into relative frequencies.
     *
     * @param hash occurrence counts
     * @return for every key, its count divided by the sum of all counts
     */
    private HashMap<String, Float> calProbability(HashMap<String, Integer> hash) {
        float count = countWords(hash);
        HashMap<String, Float> prob = new HashMap<String, Float>();
        for (Entry<String, Integer> entry : hash.entrySet()) {
            prob.put(entry.getKey(), entry.getValue() / count);
        }
        return prob;
    }

    /**
     * Writes {@code hash.toString()} to the given file, encoded as UTF-8.
     *
     * @param hash table to save
     * @param path destination file; an existing file is overwritten
     * @throws Exception if the file cannot be written
     */
    private void saveFile(HashMap<String, Integer> hash, String path)
            throws Exception {
        Writer fw = new OutputStreamWriter(new FileOutputStream(path), "UTF-8");
        try {
            fw.write(hash.toString());
        } finally {
            fw.close();
        }
    }

    /**
     * Sums all occurrence counts.
     *
     * @param hash occurrence counts
     * @return total number of counted units
     */
    private int countWords(HashMap<String, Integer> hash) {
        int count = 0;
        for (Entry<String, Integer> entry : hash.entrySet()) {
            count += entry.getValue();
        }
        return count;
    }

    /**
     * Computes the entropy {@code -sum(p * ln p)} of a probability table.
     *
     * @param hash probability of every unit
     * @return the entropy in nats; 0 for an empty table
     */
    private float calEntropy(HashMap<String, Float> hash) {
        // Accumulate in double so that many small terms do not lose precision.
        double sum = 0;
        for (Float prob : hash.values()) {
            sum -= prob * Math.log(prob);
        }
        return (float) sum;
    }
}
237
238
239
240
241
242
243
244 import java.io.BufferedReader;
245 import java.io.FileNotFoundException;
246 import java.io.IOException;
247 import java.io.InputStreamReader;
248 import java.util.HashMap;
249 import java.util.Iterator;
250 import java.util.Map.Entry;
251
252 public class NLPWork {
253
254     /**
255      * calculate the KL distance form file u1 to file u2
256      * @param u1
257      * @param u2
258      * @return
259      */
260     public static float calKL(NLPFileUnit u1, NLPFileUnit u2) {
261         HashMap<String, Float> hash1 = u1.WordProbability;
262         HashMap<String, Float> hash2 = u2.WordProbability;
263         float KLdistance = 0;
264         Iterator<Entry<String, Float>> iterator = hash1.entrySet().iterator();
265         while (iterator.hasNext()) {
266             Entry<String, Float> entry = iterator.next();
267             String key = entry.getKey().toString();
268
269             if (hash2.containsKey(key)) {
270                 Float value1 = entry.getValue();
271                 Float value2 = hash2.get(key);
272                 KLdistance += value1 * Math.log(value1 / value2);
273             }
274         }
275         return KLdistance;
276     }
277
278     public static void main(String[] args) throws IOException, Exception {
279         //all punctuation will be saved under working directory
280         System.out.println("Now only UTF8 encoded file is supported!!!");
281         System.out.println("PLS input file 1 path:");
282         BufferedReader cin = new BufferedReader(
283                 new InputStreamReader(System.in));
284         String file1 = cin.readLine();
285         System.out.println("PLS input file 2 path:");
286         String file2 = cin.readLine();
287         NLPFileUnit u1 = null;
288         NLPFileUnit u2 = null;
289         try{
290             u1 = new NLPFileUnit(file1);//NLP:Nature Language Processing
291             u2 = new NLPFileUnit(file2);
292         }
293         catch(FileNotFoundException e){
294             System.out.println("File Not Found!!");
295             e.printStackTrace();
296             return;
297         }
298         float KLdistance = calKL(u1, u2);
299         System.out.println("KLdistance is :" + KLdistance);
300         System.out.println("File 1 Entropy: " + u1.entropy);
301         System.out.println("File 2 Entropy: " + u2.entropy);
302     }
303 }
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: