在DotLucene/Lucene.net中, 增加自己的中文分词Analyzer
2008-12-25 09:37
441 查看
一种非常简单,但是不是很优化的方法,继承Lucene.Net.Analysis.Analyzer,实现了Lucene.Net.Analysis.Analyzer,Lucene.Net.Analysis.Tokenizer,Lucene.Net.Analysis.TokenFilter的子类.参考了Lucene.Net.Analysis.Cn的实现,该项目采用对汉语进行一元分词. ChineseAnalyzer类,继承自Lucene.Net.Analysis.Analyzer

using System;
using System.IO;
using System.Text;
using System.Collections;
using ShootSeg; // word-segmentation component from http://www.shootsoft.net (open source; thanks to the author)
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.CnByKing
{
    /// <summary>
    /// Analyzer that plugs the ShootSeg Chinese word segmenter into Lucene.Net.
    /// Produces a ChineseTokenizer wrapped in a ChineseFilter.
    /// </summary>
    public class ChineseAnalyzer : Analyzer
    {
        private Segment segment = new Segment(); // the Chinese word-segmentation engine

        public ChineseAnalyzer()
        {
            segment.InitWordDics();  // load the dictionaries once, in the constructor
            segment.Separator = "|"; // separator the segmenter places between words
        }

        /// <summary>
        /// Builds the token stream for a field: segment the text, then filter it.
        /// </summary>
        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new ChineseTokenizer(reader, segment); // share the segmenter instance
            result = new ChineseFilter(result);                         // drop stop words / stray symbols
            return result;
        }
    }
}

ChineseTokenizer类继承自Lucene.Net.Analysis.Tokenizer

using System;
using System.IO;
using System.Text;
using System.Collections;
using System.Globalization;
using ShootSeg;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.CnByKing
{
    /// <summary>
    /// Tokenizer that reads the whole input, segments it with ShootSeg,
    /// and emits one Token per segmented word with its offsets in the text.
    /// </summary>
    public sealed class ChineseTokenizer : Tokenizer
    {
        private Segment segment;
        private string[] Wordlist; // the segmented words
        private string Allstr;     // the entire input, read into one string
        private int offset = 0;    // search position inside Allstr
        int start = 0;             // start offset of the current word
        int step = 0;              // index of the next word in Wordlist

        public ChineseTokenizer(TextReader _in, Segment segment)
        {
            input = _in;
            Allstr = input.ReadToEnd(); // pull the whole stream into Allstr
            this.segment = segment;     // keep the shared segmenter reference
            Wordlist = segment.SegmentText(Allstr).Split('|'); // split on the configured separator
        }

        /// <summary>
        /// Wraps a non-empty word in a Token carrying its start/end offsets;
        /// returns null for an empty string.
        /// </summary>
        private Token Flush(string str)
        {
            if (str.Length > 0)
            {
                return new Token(str, start, start + str.Length);
            }
            else
                return null;
        }

        /// <summary>
        /// Returns the next token, or null when the word list is exhausted.
        /// </summary>
        public override Token Next()
        {
            // BUG FIX: the original condition was 'step <= Wordlist.Length', which
            // reads Wordlist[Wordlist.Length] on the final call and throws
            // IndexOutOfRangeException. It also returned null for an empty entry
            // (e.g. produced by a trailing separator), ending the stream early.
            while (step < Wordlist.Length)
            {
                string word = Wordlist[step];
                step = step + 1; // advance to the next word

                if (word.Length == 0)
                    continue; // skip empty entries instead of terminating the stream

                int pos = Allstr.IndexOf(word, offset); // locate the word in the source text
                if (pos < 0)
                    continue; // defensive: never emit a token with a negative offset

                start = pos;
                offset = start + 1; // next search starts just past this match
                return Flush(word);
            }
            return null; // no words left
        }
    }
}

这个ChineseFilter继承自Lucene.Net.Analysis.TokenFilter,完全照抄Lucene.Net.Analysis.Cn工程的同名类(此类过滤了数字及符号,英文助词,需要过滤其他相应增加代码)

using System;
using System.IO;
using System.Collections;
using System.Globalization;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.CnByKing
{
    /// <summary>
    /// Title: ChineseFilter
    /// Description: Filter with a stop word table
    /// Rule: No digital is allowed.
    /// English word/token should larger than 1 character.
    /// One Chinese character as one Chinese word.
    /// TO DO:
    /// 1. Add Chinese stop words, such as \ue400
    /// 2. Dictionary based Chinese word extraction
    /// 3. Intelligent Chinese word extraction
    ///
    /// Copyright: Copyright (c) 2001
    /// Company:
    /// @author Yiyi Sun
    /// @version $Id: ChineseFilter.java, v 1.4 2003/01/23 12:49:33 ehatcher Exp $
    /// </summary>
    public sealed class ChineseFilter : TokenFilter
    {
        // Only English now, Chinese to be added later.
        public static String[] STOP_WORDS =
        {
            "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is", "it",
            "no", "not", "of", "on", "or", "such",
            "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with"
        };

        private Hashtable stopTable;

        public ChineseFilter(TokenStream _in)
            : base(_in)
        {
            stopTable = new Hashtable(STOP_WORDS.Length);
            for (int i = 0; i < STOP_WORDS.Length; i++)
                stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
        }

        /// <summary>
        /// Returns the next token that survives the stop-word / category rules,
        /// or null when the input stream is exhausted.
        /// </summary>
        public override Token Next()
        {
            for (Token token = input.Next(); token != null; token = input.Next())
            {
                String text = token.TermText();

                // why not key off token type here assuming ChineseTokenizer comes first?
                if (stopTable[text] == null)
                {
                    switch (Char.GetUnicodeCategory(text[0]))
                    {
                        case UnicodeCategory.LowercaseLetter:
                        case UnicodeCategory.UppercaseLetter:
                            // English word/token should larger than 1 character.
                            if (text.Length > 1)
                            {
                                return token;
                            }
                            break;

                        case UnicodeCategory.OtherLetter:
                            // One Chinese character as one Chinese word.
                            // Chinese word extraction to be added later here.
                            return token;
                    }
                }
            }
            return null;
        }
    }
}
以上基本没什么技术含量,好处就是增加新的中文分词不管什么算法,只需要简单几行代码搞定.中文分词完全和DotLucene/Lucene.net本身无关. 使用的时候用ChineseAnalyzer替换 StandardAnalyzer就OK了.
Click Here To Download 提供编译好的 lucene.net 1.91、Lucene.Net.Analysis.CnByKing.dll 和 ShootSeg.dll,引用这三个程序集就可以实现简单的中文搜索了。
相关文章推荐
- 在DotLucene/Lucene.net中, 增加自己的中文分词Analyzer
- 在DotLucene/Lucene.net中, 增加自己的中文分词Analyzer
- 给Lucene.NET增加中文分词
- 给lucene.net增加SCWS中文分词功能
- 应用工具 .NET Portability Analyzer 分析迁移dotnet core
- 通过Swashbukle给DotNet Core Web API 增加自动文档功能
- DotNetNuke的可增加模块
- Lucene.Net+盘古分词->开发自己的搜索引擎
- DotLucene类库(1) : Lucene.Net.Analysis
- DotNetNuke(DNN)从入门到进阶(1)-怎样写自己的模块
- 如何往自己的网站增加Asp.net Ajax
- relaxlife.net发布一个自己开发的中文分词程序
- Lucene.net(4.8.0)+PanGu分词器 问题记录一 分词器Analyzer的构造和内部成员ReuseStategy
- 红五月重镑奉献: ASP.NET 2.0 AJAX 与开发自己的搜索引擎Lucene2.0+Heritrix
- DotLucene(Lucene.Net)研究[转]
- 自己封装的数据通信服务组件DotNet.ServiceModel
- 用Lucene.net建立自己的网站搜索
- 强! .NET程序搜索引擎:DotLucene(Open Source)
- 修复bug,增加BasePath参数 - dotnetFlexGrid 1.21beta更新 争取做ASP.NET中最好的Ajax开源表格控件
- relaxlife.net发布一个自己开发的中文分词程序