Lucene基础(三)-- 中文分词及高亮显示
2015-05-31 15:42
459 查看
Lucene分词器及高亮
分词器
在 Lucene 中,文档是按照分词器切分出来的词建立索引的,不同的分词器索引的效果不太一样。之前的例子使用的都是标准分词器,对于英文的效果很好,但是对中文的分词效果就不怎么样:它会按照汉字逐字切分,没有词语的概念。要换用其他分词器,只需要在使用分词器的地方把 Analyzer 实例化成第三方的分词器即可。
中文分词器有很多,这里以 IKAnalyzer 为例。
下载地址:https://git.oschina.net/wltea/IK-Analyzer-2012FF ,下载下来后里面有一篇教程。
高亮
导入 lucene-highlighter-xxx.jar,再对查询出来的结果实现高亮显示:
// 关键字高亮显示的html标签,需要导入lucene-highlighter-xxx.jar
SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
for (int i = 0; i < hits.length; i++) {
    Document doc = isearcher.doc(hits[i].doc);
    // 内容增加高亮显示
    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(doc.get("content")));
    String content = highlighter.getBestFragment(tokenStream, doc.get("content"));
    System.out.println(content);
}
Lucene中文分词器
实例:
package lucene_demo04;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Demo of Chinese word segmentation with IKAnalyzer: a handful of sample
 * sentences are indexed into an in-memory directory, searched with a
 * {@link QueryParser}, and the matched terms of every hit are wrapped in an
 * HTML highlight tag before being printed to standard output.
 *
 * <p>Call {@link #close()} when the instance is no longer needed so the index
 * reader and the directory are released.
 *
 * @author YipFun
 */
public class LuceneDemo04 {

    private static final Version LUCENE_VERSION = Version.LUCENE_4_9;

    /** Name of the single field every sample document is indexed under. */
    private static final String CONTENT_FIELD = "content";

    /** Query text used as the result filter by {@link #searchByTermFilter}. */
    private static final String FILTER_KEYWORD = "全文检索";

    /** Opening and closing HTML tags wrapped around every highlighted term. */
    private static final String HIGHLIGHT_PRE_TAG = "<span style='color:red'>";
    private static final String HIGHLIGHT_POST_TAG = "</span>";

    private final Directory directory;
    private DirectoryReader ireader = null;
    private IKAnalyzer analyzer;

    // Sample data that createIndex() writes into the index.
    private final String[] content = {
        "你好,我是中共人",
        "中华人民共和国",
        "中国人民从此站起来了",
        "Lucene是一个不错的全文检索的工具",
        "全文检索中文分词"
    };

    /**
     * Creates the demo backed by an empty in-memory directory.
     */
    public LuceneDemo04() {
        directory = new RAMDirectory();
    }

    /**
     * Returns the analyzer shared by indexing and searching, creating it on
     * first use. The instance is stored in the field so later calls reuse it
     * instead of building a new analyzer every time.
     */
    private IKAnalyzer getAnalyzer() {
        if (analyzer == null) {
            analyzer = new IKAnalyzer();
        }
        return analyzer;
    }

    /**
     * Writes every sample sentence into the index as one stored, tokenized
     * document. The writer is opened in CREATE_OR_APPEND mode, so calling this
     * method again appends the same documents a second time. I/O failures are
     * reported on standard error and not propagated.
     */
    public void createIndex() {
        IndexWriterConfig iwConfig = new IndexWriterConfig(LUCENE_VERSION, getAnalyzer());
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);

        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, iwConfig);
            for (String text : content) {
                Document doc = new Document();
                doc.add(new TextField(CONTENT_FIELD, text, Field.Store.YES));
                writer.addDocument(doc);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always close the writer so the documents are committed and the
            // write lock is released, even when adding a document failed.
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Returns a searcher over the current state of the index. The underlying
     * reader is opened on first use and replaced by a fresh one whenever the
     * index has changed since it was opened.
     *
     * @return a searcher, or {@code null} when the index could not be opened
     *         (the failure is printed to standard error)
     */
    public IndexSearcher getSearcher() {
        try {
            if (ireader == null) {
                ireader = DirectoryReader.open(directory);
            } else {
                // openIfChanged returns null when the reader is still current.
                DirectoryReader refreshed = DirectoryReader.openIfChanged(ireader);
                if (refreshed != null) {
                    ireader.close();
                    ireader = refreshed;
                }
            }
            return new IndexSearcher(ireader);
        } catch (IOException e) {
            // CorruptIndexException is an IOException, so one handler covers both.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Searches {@code field} for {@code keyword} and prints the highlighted
     * stored value of that field for each of the top {@code num} hits.
     *
     * @param field   field to search and to highlight
     * @param keyword query text in QueryParser syntax
     * @param num     maximum number of hits to print
     * @throws InvalidTokenOffsetsException if the highlighter meets a token
     *         whose offsets lie outside the stored text
     */
    public void searchByTerm(String field, String keyword, int num) throws InvalidTokenOffsetsException {
        search(field, keyword, null, num);
    }

    /**
     * Same as {@link #searchByTerm}, but only hits that also match the fixed
     * filter query {@code 全文检索} on the same field are returned.
     *
     * @param field   field to search and to highlight
     * @param keyword query text in QueryParser syntax
     * @param num     maximum number of hits to print
     * @throws InvalidTokenOffsetsException if the highlighter meets a token
     *         whose offsets lie outside the stored text
     */
    public void searchByTermFilter(String field, String keyword, int num) throws InvalidTokenOffsetsException {
        search(field, keyword, FILTER_KEYWORD, num);
    }

    /**
     * Shared implementation of both search entry points: parses the keyword,
     * runs the (optionally filtered) search and prints each hit highlighted.
     *
     * @param filterKeyword query text restricting the result set, or
     *                      {@code null} for an unfiltered search
     */
    private void search(String field, String keyword, String filterKeyword, int num)
            throws InvalidTokenOffsetsException {
        IndexSearcher isearcher = getSearcher();
        if (isearcher == null) {
            // getSearcher() already reported why the index could not be opened.
            System.err.println("search skipped: index is not available");
            return;
        }

        Analyzer analyzer = getAnalyzer();
        QueryParser qp = new QueryParser(LUCENE_VERSION, field, analyzer);
        // Terms that are not joined by an explicit operator are combined with OR.
        qp.setDefaultOperator(QueryParser.OR_OPERATOR);

        try {
            Query query = qp.parse(keyword);
            QueryWrapperFilter filter =
                    filterKeyword == null ? null : new QueryWrapperFilter(qp.parse(filterKeyword));
            ScoreDoc[] hits = isearcher.search(query, filter, num).scoreDocs;

            // HTML tags for the matched terms; needs lucene-highlighter-xxx.jar.
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(HIGHLIGHT_PRE_TAG, HIGHLIGHT_POST_TAG);
            Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));

            for (ScoreDoc hit : hits) {
                Document doc = isearcher.doc(hit.doc);
                String text = doc.get(field);
                if (text == null) {
                    // The searched field was not stored, so there is nothing to highlight.
                    continue;
                }
                TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(text));
                String fragment = highlighter.getBestFragment(tokenStream, text);
                // getBestFragment yields null when no term could be highlighted;
                // print the plain text rather than the word "null".
                System.out.println(fragment != null ? fragment : text);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Releases the index reader, the analyzer and the directory. The instance
     * must not be used for indexing or searching afterwards.
     */
    public void close() {
        try {
            if (ireader != null) {
                ireader.close();
                ireader = null;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (analyzer != null) {
            analyzer.close();
            analyzer = null;
        }
        try {
            directory.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws InvalidTokenOffsetsException {
        System.out.println("start");
        LuceneDemo04 ld = new LuceneDemo04();
        try {
            ld.createIndex();
            long start = System.currentTimeMillis();
            ld.searchByTerm("content", "人民", 500);
            System.out.println("end search use " + (System.currentTimeMillis() - start) + "ms");
        } finally {
            ld.close();
        }
    }
}
运行结果:
start
加载扩展词典:ext.dic
加载扩展停止词典:stopword.dic
中华<span style='color:red'>人民</span>共和国
中国<span style='color:red'>人民</span>从此站起来了
end search use 129ms
相关文章推荐
- excel vba 高亮显示当前行代码
- ASP 高亮显示不区分大小写的关键字
- PHPAnalysis中文分词类详解
- 奇偶行高亮显示及鼠标划过高亮显示类
- javascript实现table选中的行以指定颜色高亮显示的方法
- Jquery如何实现点击时高亮显示代码
- javascript简单实现表格行间隔显示颜色并高亮显示
- jquery实现带复选框的表格行选中删除时高亮显示
- js兼容IE6,IE7菜单高亮显示效果代码
- php站内搜索并高亮显示关键字的实现代码
- JS实现简洁(隔行换色、高亮显示)表格特效
- Gridview使用CheckBox全选与单选采用js实现同时高亮显示选择行
- 几款开源的中文分词系统
- 使用正则表达式的格式化与高亮显示json字符串
- 中文搜索引擎关键技术:中文分词技术
- lucene集成IK实现中文分词检索
- MySQL中文索引插件mysqlcft安装及使用
- CRF中文分词开源版发布啦
- MMSEG中文分词算法
- 表格行间隔显示颜色,鼠标移入时当前行高亮显示