TermVector用法:相关搜索功能及提高高亮显示性能
2010-07-08 22:44
471 查看
转自:http://hi.baidu.com/z57354658/blog/item/b80f524b2c92e1fa82025cbd.html
/**
 * Demonstrates two uses of Lucene term vectors on a small in-memory index:
 * building a "related documents" query from the terms stored for one document,
 * and feeding the highlighter from a stored term vector instead of re-analyzing
 * the field text.
 *
 * NOTE(review): an identical TermVectorTest declaration appears a second time in
 * this file; two top-level classes with the same name cannot share a compilation
 * unit, so one copy should be removed.
 */
public class TermVectorTest {
    Analyzer analyzer = new SimpleAnalyzer();
    Directory ramDir = new RAMDirectory();

    /**
     * Builds the three-document demo index in {@link #ramDir}.
     *
     * Each document has "title", "author" and "subject" fields; only "subject"
     * stores a term vector (with positions and offsets).
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws LockObtainFailedException if the write lock cannot be obtained
     * @throws IOException on any other low-level I/O failure
     */
    public void createRamIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriter writer = new IndexWriter(ramDir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
        try {
            // NOTE(review): SimpleAnalyzer splits on non-letter characters only, so "java"
            // directly followed by CJK characters presumably stays inside one larger token;
            // confirm that the subject:java query in highterLightSearch() can match at all.
            writer.addDocument(buildDocument("java", "callan",
                    "java一门编程语言,用java的人很多,编程语言也不少,但是java最流行"));
            writer.addDocument(buildDocument("english", "wcq", "英语用的人很多"));
            writer.addDocument(buildDocument("asp", "ca", "英语用的人很多"));
            writer.optimize();
        } finally {
            // Close even when indexing fails so the writer does not stay open on ramDir.
            writer.close();
        }
    }

    /** Creates one demo document; only the subject field carries a term vector. */
    private static Document buildDocument(String title, String author, String subject) {
        Document doc = new Document();
        doc.add(new Field("title", title, Store.YES, Index.ANALYZED));
        doc.add(new Field("author", author, Store.YES, Index.ANALYZED));
        doc.add(new Field("subject", subject, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        return doc;
    }

    /**
     * Looks up documents whose title contains the term "java", prints the title
     * and subject of each hit, and runs a related-documents search for it.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on low-level I/O failure
     */
    public void search() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                // Query the "title" field for the term "java".
                TermQuery query = new TermQuery(new Term("title", "java"));
                Hits hits = searcher.search(query);
                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i);
                    System.out.println(doc.get("title"));
                    System.out.println(doc.get("subject"));
                    System.out.println("moreLike search: ");
                    morelikeSearch(reader, hits.id(i));
                }
            } finally {
                searcher.close();
            }
        } finally {
            // A searcher built on an existing reader does not own it, so close it here.
            reader.close();
        }
    }

    /**
     * Builds an OR query from every term stored in the "subject" term vector of
     * the given document and runs it against the same reader.
     *
     * Does nothing when the document has no "subject" term vector.
     *
     * @param reader open reader over the index; not closed by this method
     * @param id     internal Lucene document number
     * @throws IOException on low-level I/O failure
     */
    private void morelikeSearch(IndexReader reader, int id) throws IOException {
        // The term vector holds the analyzed terms of the field for this document,
        // together with their frequencies (and positions/offsets when stored).
        TermFreqVector vector = reader.getTermFreqVector(id, "subject");
        if (vector == null) {
            return;
        }

        // Fetch the term array once instead of on every loop iteration.
        String[] terms = vector.getTerms();
        BooleanQuery query = new BooleanQuery();
        for (int i = 0; i < terms.length; i++) {
            query.add(new TermQuery(new Term("subject", terms[i])), BooleanClause.Occur.SHOULD);
        }

        // Reuse the caller's reader rather than opening (and leaking) a new one per hit.
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
            Hits hits = searcher.search(query);
            // Result display omitted.
        } finally {
            searcher.close();
        }
    }

    /**
     * Searches the "subject" field for "java" and prints each hit's title with
     * its best highlighted fragment, rebuilding the token stream from the stored
     * term vector rather than re-analyzing the text.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on low-level I/O failure
     */
    public void highterLightSearch() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                TermQuery query = new TermQuery(new Term("subject", "java"));
                Hits hits = searcher.search(query);

                // Highlighter setup: matched terms are wrapped in a red <font> tag.
                SimpleHTMLFormatter simpleHTMLFormatter =
                        new SimpleHTMLFormatter("<font color='red'>", "</font>");
                Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
                // 100 is the fragment size around a match; tune as needed, since
                // returning the whole field text is not practical.
                highlighter.setTextFragmenter(new SimpleFragmenter(100));

                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i);
                    // Fetch the vector once; the token stream can only be rebuilt when
                    // positions/offsets were stored, so check instead of casting blindly.
                    TermFreqVector vector = reader.getTermFreqVector(hits.id(i), "subject");
                    String result = null;
                    if (vector instanceof TermPositionVector) {
                        TokenStream tokenStream = TokenSources.getTokenStream((TermPositionVector) vector);
                        result = highlighter.getBestFragment(tokenStream, doc.get("subject"));
                    }
                    System.out.println(doc.get("title"));
                    System.out.println(result);
                }
            } finally {
                searcher.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Builds the demo index and runs the title search.
     *
     * @param args unused
     */
    public static void main(String[] args) throws CorruptIndexException, IOException {
        TermVectorTest t = new TermVectorTest();
        t.createRamIndex();
        t.search();
    }
}
/**
 * Demonstrates two uses of Lucene term vectors on a small in-memory index:
 * building a "related documents" query from the terms stored for one document,
 * and feeding the highlighter from a stored term vector instead of re-analyzing
 * the field text.
 *
 * NOTE(review): an identical TermVectorTest declaration appears earlier in this
 * file; two top-level classes with the same name cannot share a compilation
 * unit, so one copy should be removed.
 */
public class TermVectorTest {
    Analyzer analyzer = new SimpleAnalyzer();
    Directory ramDir = new RAMDirectory();

    /**
     * Builds the three-document demo index in {@link #ramDir}.
     *
     * Each document has "title", "author" and "subject" fields; only "subject"
     * stores a term vector (with positions and offsets).
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws LockObtainFailedException if the write lock cannot be obtained
     * @throws IOException on any other low-level I/O failure
     */
    public void createRamIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriter writer = new IndexWriter(ramDir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
        try {
            // NOTE(review): SimpleAnalyzer splits on non-letter characters only, so "java"
            // directly followed by CJK characters presumably stays inside one larger token;
            // confirm that the subject:java query in highterLightSearch() can match at all.
            writer.addDocument(buildDocument("java", "callan",
                    "java一门编程语言,用java的人很多,编程语言也不少,但是java最流行"));
            writer.addDocument(buildDocument("english", "wcq", "英语用的人很多"));
            writer.addDocument(buildDocument("asp", "ca", "英语用的人很多"));
            writer.optimize();
        } finally {
            // Close even when indexing fails so the writer does not stay open on ramDir.
            writer.close();
        }
    }

    /** Creates one demo document; only the subject field carries a term vector. */
    private static Document buildDocument(String title, String author, String subject) {
        Document doc = new Document();
        doc.add(new Field("title", title, Store.YES, Index.ANALYZED));
        doc.add(new Field("author", author, Store.YES, Index.ANALYZED));
        doc.add(new Field("subject", subject, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        return doc;
    }

    /**
     * Looks up documents whose title contains the term "java", prints the title
     * and subject of each hit, and runs a related-documents search for it.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on low-level I/O failure
     */
    public void search() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                // Query the "title" field for the term "java".
                TermQuery query = new TermQuery(new Term("title", "java"));
                Hits hits = searcher.search(query);
                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i);
                    System.out.println(doc.get("title"));
                    System.out.println(doc.get("subject"));
                    System.out.println("moreLike search: ");
                    morelikeSearch(reader, hits.id(i));
                }
            } finally {
                searcher.close();
            }
        } finally {
            // A searcher built on an existing reader does not own it, so close it here.
            reader.close();
        }
    }

    /**
     * Builds an OR query from every term stored in the "subject" term vector of
     * the given document and runs it against the same reader.
     *
     * Does nothing when the document has no "subject" term vector.
     *
     * @param reader open reader over the index; not closed by this method
     * @param id     internal Lucene document number
     * @throws IOException on low-level I/O failure
     */
    private void morelikeSearch(IndexReader reader, int id) throws IOException {
        // The term vector holds the analyzed terms of the field for this document,
        // together with their frequencies (and positions/offsets when stored).
        TermFreqVector vector = reader.getTermFreqVector(id, "subject");
        if (vector == null) {
            return;
        }

        // Fetch the term array once instead of on every loop iteration.
        String[] terms = vector.getTerms();
        BooleanQuery query = new BooleanQuery();
        for (int i = 0; i < terms.length; i++) {
            query.add(new TermQuery(new Term("subject", terms[i])), BooleanClause.Occur.SHOULD);
        }

        // Reuse the caller's reader rather than opening (and leaking) a new one per hit.
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
            Hits hits = searcher.search(query);
            // Result display omitted.
        } finally {
            searcher.close();
        }
    }

    /**
     * Searches the "subject" field for "java" and prints each hit's title with
     * its best highlighted fragment, rebuilding the token stream from the stored
     * term vector rather than re-analyzing the text.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on low-level I/O failure
     */
    public void highterLightSearch() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                TermQuery query = new TermQuery(new Term("subject", "java"));
                Hits hits = searcher.search(query);

                // Highlighter setup: matched terms are wrapped in a red <font> tag.
                SimpleHTMLFormatter simpleHTMLFormatter =
                        new SimpleHTMLFormatter("<font color='red'>", "</font>");
                Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
                // 100 is the fragment size around a match; tune as needed, since
                // returning the whole field text is not practical.
                highlighter.setTextFragmenter(new SimpleFragmenter(100));

                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i);
                    // Fetch the vector once; the token stream can only be rebuilt when
                    // positions/offsets were stored, so check instead of casting blindly.
                    TermFreqVector vector = reader.getTermFreqVector(hits.id(i), "subject");
                    String result = null;
                    if (vector instanceof TermPositionVector) {
                        TokenStream tokenStream = TokenSources.getTokenStream((TermPositionVector) vector);
                        result = highlighter.getBestFragment(tokenStream, doc.get("subject"));
                    }
                    System.out.println(doc.get("title"));
                    System.out.println(result);
                }
            } finally {
                searcher.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Builds the demo index and runs the title search.
     *
     * @param args unused
     */
    public static void main(String[] args) throws CorruptIndexException, IOException {
        TermVectorTest t = new TermVectorTest();
        t.createRamIndex();
        t.search();
    }
}
相关文章推荐
- ElasticSearch 高亮显示大文档搜索结果的策略和性能对比(译)
- 1.读取excel文件,将输入存储到数据库中(JXL) 2.完成商品的检索相关功能 1.根据分类,显示分类下所有的商品信息,按照库存量从低到高排序(提供补货依据) 2.模糊搜索,根据商品信息(名
- JavaScript实现的搜索及高亮显示功能示例
- Ztree 实现搜索节点高亮显示,重置功能
- 高亮显示搜索到的关键字-jquery.highlight.js的用法!
- JavaScript简单实现关键字文本搜索高亮显示功能示例
- bind命令_Linux bind 命令用法详解:显示或设置键盘按键与其相关的功能
- C#+AE实现类似IDentify功能及对高亮显示相关接口的总结
- C#+AE实现类似IDentify功能及对高亮显示相关接口的总结
- Android 搜索结果匹配关键字且高亮显示功能
- JS实现静态页面搜索并高亮显示功能完整示例
- 前端html中jQuery实现对文本的搜索功能并把搜索相关内容显示出来
- java简单的实现搜索框的下拉显示相关搜索功能
- C#+AE实现类似IDentify功能及对高亮显示相关接口的总结
- 搜索结果高亮显示
- 给CuteEditor5增加了高亮代码显示功能(C#版)
- 搜索关键字高亮显示方法
- Repeater 的操作,数据的显示,绑定,动态取控件中的值和Repeater1_ItemCommand事件的相关用法
- 对JavaScript的全文搜索实现相关度评分的功能的方法
- 原生js实现音乐播放器功能,可以实时显示歌词并且高亮当前句