您的位置:首页 > 其它

基于.Net Framework 3.5的Lucene.Net 中文词组匹配分词器

2008-03-25 01:32 531 查看
可以自己看看是不是很高效。为了加快速度,尽量精简了算法。测试表明,精确度还可以。
由于没有实现一套完整的字典机制,而是使用普通的文本字典,所以就不提供完整源码下载了,只贴出核心的源码。从版本完整度上来说只能算是0.6版。
另外,本分词系统使用的词库是ShootAnalyzer的词库。

使用方法:

参考以下代码

/// <summary>
/// Usage demo: times Participle.TextSpliter and Participle.TextArray on a sample sentence,
/// then prints every token (text, start offset, end offset) produced by YurowAnalyzer.
/// </summary>
[TestMethod]
public void TestMethod1()
{
    // TODO: add real test logic (assertions) here.

    const string dictPath = @"D:\labs\xxxx";
    string txt = @"天下真的有神吗?我不是呀";

    Participle p = new Participle();
    p.Init(dictPath);

    // Time the string-returning splitter.
    Stopwatch splitTimer = Stopwatch.StartNew();
    string outstr = p.TextSpliter(txt);
    splitTimer.Stop();

    // Time the list-returning splitter; only the elapsed time is reported, the result is discarded.
    Stopwatch arrayTimer = Stopwatch.StartNew();
    p.TextArray(txt);
    arrayTimer.Stop();

    Console.WriteLine(outstr);
    Console.WriteLine(splitTimer.ElapsedMilliseconds.ToString("f2"));
    Console.WriteLine(arrayTimer.ElapsedMilliseconds.ToString("f2"));

    // Run the same text through the Lucene.Net analyzer and dump each token.
    YurowAnalyzer.YurowAnalyzer analyzer = new YurowAnalyzer.YurowAnalyzer(dictPath);
    TokenStream stream = analyzer.TokenStream(null, new StringReader(txt));
    for (Token token = stream.Next(); token != null; token = stream.Next())
    {
        Console.WriteLine(token.TermText() + "\t" + token.StartOffset() + "\t" + token.EndOffset());
    }
    stream.Close();
}
在Lucene.Net 索引或者搜索中直接使用YurowAnalyzer.YurowAnalyzer 分析器。

下载地址:
http://files.cnblogs.com/birdshover/YurowAnalyzer.rar

下面贴上些关键源码:
Participle类(分词类)

1
/// <summary>
/// Start offsets of the tokens returned by the most recent <see cref="TextArray"/> call:
/// entry k is the index in the input text at which token k begins.
/// A new list is assigned on every call.
/// </summary>
public List<int> StartArr;

/// <summary>
/// Splits <paramref name="text"/> into tokens and records each token's start offset in
/// <see cref="StartArr"/>.
/// </summary>
/// <remarks>
/// Scanning left to right:
/// a maximal run of characters contained in DataCatch.EnglishChar is one token;
/// a maximal run of characters contained in DataCatch.Num is one token;
/// the space character ' ' is skipped;
/// otherwise the current and next character are looked up in DataCatch.GetDict() and the longest
/// dictionary word that continues the pair is emitted, falling back to the pair itself, and
/// finally to the single character when the pair is not in the dictionary.
/// </remarks>
/// <param name="text">Text to split. Null or empty yields an empty result.</param>
/// <returns>The tokens, in order of appearance.</returns>
public List<string> TextArray(string text)
{
    List<string> hs = new List<string>();
    StartArr = new List<int>();
    if (string.IsNullOrEmpty(text))
    {
        return hs;
    }

    int i = 0;
    while (i < text.Length)
    {
        char nowchar = text[i];

        // Run of DataCatch.EnglishChar characters: scan to the end of the run and emit it whole.
        // (The previous version used start == 0 as a "no run" sentinel, cut the last character
        // off the run and could skip past the end of a run without emitting it.)
        if (DataCatch.EnglishChar.Contains(nowchar))
        {
            int end = i + 1;
            while (end < text.Length && DataCatch.EnglishChar.Contains(text[end]))
            {
                end++;
            }
            hs.Add(text.Substring(i, end - i));
            StartArr.Add(i);
            i = end;
            continue;
        }

        // Run of DataCatch.Num characters: same handling as above.
        if (DataCatch.Num.Contains(nowchar))
        {
            int end = i + 1;
            while (end < text.Length && DataCatch.Num.Contains(text[end]))
            {
                end++;
            }
            hs.Add(text.Substring(i, end - i));
            StartArr.Add(i);
            i = end;
            continue;
        }

        if (nowchar == ' ')
        {
            i++;
            continue;
        }

        // '\0' stands for "no next character" when nowchar is the last one.
        char nextchar = (i == text.Length - 1) ? '\0' : text[i + 1];
        if (nextchar == ' ' || nextchar == '\0')
        {
            // Isolated character: emit it and step over the following space / end marker.
            hs.Add(nowchar.ToString());
            StartArr.Add(i);
            i += 2;
            continue;
        }

        var dict = DataCatch.GetDict();
        if (dict.ContainsKey(nowchar) && dict[nowchar].ContainsKey(nextchar))
        {
            // "list" holds the tails (everything after the first two characters) of the
            // dictionary words that start with nowchar + nextchar.
            // NOTE(review): the pair is emitted as a token whenever this entry exists, even if
            // only longer words share the prefix; confirm that is intended.
            HashSet<string> list = dict[nowchar][nextchar];
            int tailStart = i + 2;
            int remaining = text.Length - tailStart;

            // Longest tail that literally follows the pair in the text; 0 means "pair only".
            int maxnum = 0;
            foreach (string item in list)
            {
                if (item.Length > maxnum
                    && item.Length <= remaining
                    && string.CompareOrdinal(text, tailStart, item, 0, item.Length) == 0)
                {
                    maxnum = item.Length;
                }
            }

            hs.Add(text.Substring(i, 2 + maxnum));
            StartArr.Add(i);
            i += 2 + maxnum;
        }
        else
        {
            // Pair unknown to the dictionary: emit the single character.
            hs.Add(nowchar.ToString());
            StartArr.Add(i);
            i++;
        }
    }
    return hs;
}

DefaultDict类(加载词典的具体实现)

// In-memory dictionary index filled by DoFormat: first character of a word -> second character ->
// set of the remaining characters ("tails") of every dictionary line that starts with that pair.
// A line of exactly two characters creates the pair entry without adding a tail.
// NOTE(review): DataCatch.InitPage is passed as the constructor argument, presumably the initial capacity; confirm.
private Dictionary<char, Dictionary<char, HashSet<string>>> dictMemory = new Dictionary<char, Dictionary<char, HashSet<string>>>(DataCatch.InitPage);

/// <summary>
/// Loads the plain-text dictionary at dictSourcePath (one word per line) into dictMemory,
/// indexed by first character, then second character, then the rest of the word.
/// </summary>
/// <remarks>
/// Lines shorter than two characters are ignored. A two-character line only registers the pair;
/// longer lines also add everything after the second character to the pair's set.
/// The file is decoded with Encoding.Default, so the result depends on the machine's default
/// encoding. NOTE(review): confirm the dictionary file's encoding matches on deployment machines.
/// </remarks>
protected virtual void DoFormat()
{
    // "using" guarantees the file handle is released even if reading throws;
    // the previous version never closed the stream or the reader.
    using (Stream stream = new FileStream(dictSourcePath, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (StreamReader sr = new StreamReader(stream, Encoding.Default))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            if (line.Length < 2)
            {
                continue;
            }

            char charfirst = line[0];
            char charseconde = line[1];

            // Find or create the second-level map for the first character.
            Dictionary<char, HashSet<string>> bySecond;
            if (!dictMemory.TryGetValue(charfirst, out bySecond))
            {
                bySecond = new Dictionary<char, HashSet<string>>();
                dictMemory.Add(charfirst, bySecond);
            }

            // Find or create the tail set for the (first, second) pair.
            HashSet<string> list;
            if (!bySecond.TryGetValue(charseconde, out list))
            {
                list = new HashSet<string>();
                bySecond.Add(charseconde, list);
            }

            // HashSet.Add ignores duplicates, so no Contains check is needed.
            if (line.Length > 2)
            {
                list.Add(line.Substring(2));
            }
        }
    }
}
转换到Lucene接口

/// <summary>
/// Lucene.Net tokenizer that reads the whole input up front and serves the tokens produced by
/// Participle.TextArray one at a time.
/// </summary>
/// <remarks>
/// A single Participle instance is shared by all tokenizers. It is created and initialised by the
/// first tokenizer constructed, so the <c>path</c> of later instances is not used for loading.
/// NOTE(review): the reader is not passed to the base Tokenizer constructor; confirm how
/// Close() behaves in the Lucene.Net version in use.
/// </remarks>
public class YurowTokenizer : Tokenizer
{
    // Guards creation of the shared Participle and the TextArray/StartArr pair, which mutates it.
    private static readonly object sync = new object();
    private static Participle p;

    private string text;
    private string path;
    private List<string> list;
    // Start offsets belonging to "list"; captured once so other tokenizers cannot swap them out.
    private List<int> starts;
    private int current = 0;
    private bool isfirstrun = true;

    /// <summary>Reads all text from <paramref name="textreader"/> and prepares the shared splitter.</summary>
    /// <param name="textreader">Source of the text to tokenize.</param>
    /// <param name="path">Path handed to Participle.Init when the shared instance is first created.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="textreader"/> is null.</exception>
    public YurowTokenizer(TextReader textreader, string path)
    {
        if (textreader == null)
        {
            throw new System.ArgumentNullException("textreader");
        }

        text = textreader.ReadToEnd();
        this.path = path;

        lock (sync)
        {
            if (p == null)
            {
                // Publish the instance only after Init succeeded, so a failed Init
                // does not leave a half-initialised splitter behind for later tokenizers.
                Participle created = new Participle();
                created.Init(path);
                p = created;
            }
        }
    }

    /// <summary>Returns the next token, or null when the input is empty or exhausted.</summary>
    public override Token Next()
    {
        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        if (isfirstrun)
        {
            // TextArray stores its offsets in the shared p.StartArr and assigns a new list on
            // every call; grab this call's list under the lock before another tokenizer replaces it.
            lock (sync)
            {
                list = p.TextArray(text);
                starts = p.StartArr;
            }
            isfirstrun = false;
        }

        if (current >= list.Count)
        {
            return null;
        }

        int start = starts[current];
        string currentstr = list[current];
        current++;
        return new Token(currentstr, start, start + currentstr.Length);
    }
}

有兴趣的朋友可以自己反编译查看源码。暂时不提供完整源码。

http://www.cnblogs.com/birdshover/ by yurow
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: