您的位置:首页 > 其它

TermVector用法:相关搜索功能及提高高亮显示性能

2010-07-08 22:44 471 查看
转自:http://hi.baidu.com/z57354658/blog/item/b80f524b2c92e1fa82025cbd.html

/**
 * Demonstrates two uses of Lucene term vectors on a small in-memory index:
 * building a "more like this" query from the terms of a hit, and feeding the
 * highlighter from stored term vectors instead of re-analyzing the field text.
 *
 * <p>The {@code subject} field is indexed with
 * {@code TermVector.WITH_POSITIONS_OFFSETS}, which is what both demos read back.
 */
public class TermVectorTest {

    Analyzer analyzer = new SimpleAnalyzer();

    Directory ramDir = new RAMDirectory();

    /**
     * Builds the in-memory index with three sample documents (title, author, subject).
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws LockObtainFailedException if the write lock cannot be obtained
     * @throws IOException on any low-level I/O error
     */
    public void createRamIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriter writer = new IndexWriter(ramDir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
        try {
            Document doc1 = new Document();
            doc1.add(new Field("title", "java", Store.YES, Index.ANALYZED));
            doc1.add(new Field("author", "callan", Store.YES, Index.ANALYZED));
            doc1.add(new Field("subject", "java一门编程语言,用java的人很多,编程语言也不少,但是java最流行", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));

            Document doc2 = new Document();
            doc2.add(new Field("title", "english", Store.YES, Index.ANALYZED));
            doc2.add(new Field("author", "wcq", Store.YES, Index.ANALYZED));
            doc2.add(new Field("subject", "英语用的人很多", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));

            Document doc3 = new Document();
            doc3.add(new Field("title", "asp", Store.YES, Index.ANALYZED));
            doc3.add(new Field("author", "ca", Store.YES, Index.ANALYZED));
            doc3.add(new Field("subject", "英语用的人很多", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));

            writer.addDocument(doc1);
            writer.addDocument(doc2);
            writer.addDocument(doc3);

            writer.optimize();
        } finally {
            // Always release the writer (and its write lock), even if indexing fails.
            writer.close();
        }
    }

    /**
     * Searches the {@code title} field for the term "java", prints title and
     * subject of every hit, and runs a "more like this" search for each hit.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on any low-level I/O error
     */
    public void search() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                // Look up the term "java" in the title field.
                Term term = new Term("title", "java");
                TermQuery query = new TermQuery(term);
                Hits hits = searcher.search(query);

                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i);
                    System.out.println(doc.get("title"));
                    System.out.println(doc.get("subject"));
                    System.out.println("moreLike search: ");

                    morelikeSearch(reader, hits.id(i));
                }
            } finally {
                searcher.close();
            }
        } finally {
            // The searcher was built on top of this reader, so the reader is closed here.
            reader.close();
        }
    }

    /**
     * Builds a disjunctive query from every term in the {@code subject} term
     * vector of the given document and runs it against the index.
     *
     * @param reader reader used to fetch the term vector
     * @param id     Lucene document number whose subject terms seed the query
     * @throws IOException on any low-level I/O error
     */
    private void morelikeSearch(IndexReader reader, int id) throws IOException {
        // Fetch the term vector of the subject field for this document: the terms
        // produced by analysis together with their frequency/position information.
        TermFreqVector vector = reader.getTermFreqVector(id, "subject");
        if (vector == null) {
            // No term vector was stored for this document/field, so there is nothing to build a query from.
            return;
        }

        BooleanQuery query = new BooleanQuery();

        // Read the term array once instead of on every iteration.
        String[] terms = vector.getTerms();
        for (int i = 0; i < terms.length; i++) {
            // One optional clause per term stored in the vector.
            TermQuery tq = new TermQuery(new Term("subject", terms[i]));
            query.add(tq, BooleanClause.Occur.SHOULD);
        }

        IndexSearcher searcher = new IndexSearcher(ramDir);
        try {
            Hits hits = searcher.search(query);

            // Display code omitted; it must run here, while the searcher is still open.
        } finally {
            searcher.close();
        }
    }

    /**
     * Uses stored term vectors to speed up highlighting: the token stream passed
     * to the highlighter is rebuilt from the term vector rather than by analyzing
     * the stored text again.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on any low-level I/O error
     */
    public void highterLightSearch() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                TermQuery query = new TermQuery(new Term("subject", "java"));

                Hits hits = searcher.search(query);

                // Highlighting setup.
                SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");

                Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

                // 100 is the length of the context fragment around the keyword; adjust as
                // needed, since returning the whole body text is not practical.
                highlighter.setTextFragmenter(new SimpleFragmenter(100));

                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i);
                    String text = doc.get("subject");

                    // Fetch the term vector once; it only carries positions/offsets when the
                    // field was indexed that way, so check before casting.
                    TermFreqVector vector = reader.getTermFreqVector(hits.id(i), "subject");

                    TokenStream tokenStream;
                    if (vector instanceof TermPositionVector) {
                        tokenStream = TokenSources.getTokenStream((TermPositionVector) vector);
                    } else {
                        // Fall back to re-analyzing the stored text when no usable vector exists.
                        tokenStream = analyzer.tokenStream("subject", new java.io.StringReader(text));
                    }

                    String result = highlighter.getBestFragment(tokenStream, text);

                    System.out.println(doc.get("title"));
                    System.out.println(result);
                }
            } finally {
                searcher.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Entry point: builds the in-memory index and runs the title search demo.
     */
    public static void main(String[] args) throws CorruptIndexException, IOException {
        TermVectorTest t = new TermVectorTest();
        t.createRamIndex();
        t.search();
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐