您的位置:首页 > 其它

一个lucene索引初始化,添加,删除,修改功能的实现

2012-12-28 17:17 771 查看
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.springframework.stereotype.Service;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;
import org.xml.sax.InputSource;

import javax.annotation.Resource;
import java.io.*;
import java.util.*;

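/**
 * Lucene-backed site search service.
 *
 * <p>Keeps an on-disk index (see {@code index_FILE_PATH}) of the resources supplied by
 * {@code ResourceDAO} and offers index creation, add / update / delete of single
 * documents, lookup by id, and paged keyword search with highlighted results.
 *
 * Created with IntelliJ IDEA.
 * User: R
 * Date: 12-12-20
 * Time: 10:05 AM
 */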
@Service
public class SiteSearchService implements ISiteSearchService {

/** Class-wide commons-logging logger. */
private static final Log logger = LogFactory.getLog(SiteSearchService.class);

/** DAO that supplies the resources to index; injected by bean name. */
@Resource(name = "resourceDAO")
private ResourceDAO resourceDAO;

/** Filesystem directory that holds the Lucene index. */
private static final String index_FILE_PATH = "d:/indexDir";
/** Maximum number of hits requested from the searcher for one query. */
private static final int MAXNUM = 100;
/** Lucene compatibility version passed to the index writer configuration. */
private static final Version VERSION = Version.LUCENE_35;
/** IK analyzer used both when writing documents and when highlighting hits. */
private final Analyzer analyzer = new IKAnalyzer();
/**
 * NekoHTML fragment parser used to strip markup from resource content.
 * NOTE(review): this single instance is shared by every caller of the service;
 * confirm it is safe for concurrent use or confine it per call.
 */
private final DOMFragmentParser parser = new DOMFragmentParser();

/**
 * Builds the on-disk Lucene index from every resource returned by the DAO.
 *
 * <p>The index is only built when the index directory does not exist yet; when the
 * DAO returns {@code null} nothing is written. Failures are logged with their stack
 * trace and never propagated to the caller.
 *
 * <p>NOTE(review): if indexing fails half-way the directory may already exist, so a
 * later call will skip the rebuild — confirm whether a manual cleanup is expected.
 */
public void createSiteIndex() {
    logger.info("--------------lucene-----------createSiteIndex[start:"+(new Date()).toString()+"]");
    File file = new File(index_FILE_PATH);

    Directory directory = null;
    IndexWriter iwriter = null;
    try {
        if (!file.exists()) {
            logger.info("---------lucene--------filecreate------filepath:"+index_FILE_PATH);
            // The element type has to be what addtoDoc() accepts. The unqualified name
            // `Resource` would resolve to the imported javax.annotation.Resource annotation.
            // NOTE(review): assumes ResourceDAO.getResourceList() returns
            // List<ResouceIndexData> — confirm against the DAO.
            List<ResouceIndexData> resourceList = resourceDAO.getResourceList();
            if (resourceList != null) {
                // Open the filesystem-backed index directory.
                directory = FSDirectory.open(file);
                IndexWriterConfig config = new IndexWriterConfig(VERSION, analyzer);
                config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
                // Build the index with the IK Chinese analyzer.
                iwriter = new IndexWriter(directory, config);
                for (ResouceIndexData tempResource : resourceList) {
                    iwriter.addDocument(addtoDoc(tempResource));
                }
            }
        }
    } catch (Exception e) {
        // Pass the throwable separately so the stack trace reaches the log.
        logger.error("lucene createSiteIndex failed, path=" + index_FILE_PATH, e);
    } finally {
        if (iwriter != null) {
            try {
                iwriter.close();
            } catch (IOException e) {
                logger.error("lucene createSiteIndex: closing IndexWriter failed", e);
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                logger.error("lucene createSiteIndex: closing Directory failed", e);
            }
        }
    }
    logger.info("--------------lucene-----------createSiteIndex[end:"+(new Date()).toString()+"]");
}

/**
 * Converts a resource into the Lucene document stored in the index.
 *
 * <p>Identifier-like fields (id, siteid, typeid, price, pictureUrl, status) are indexed
 * as single untokenized terms; title, content and description are analyzed. A
 * {@code null} value is stored as the empty string. HTML markup is removed from the
 * content before it is indexed.
 *
 * @param tempsiteResource the resource to convert
 * @return a document holding every indexed field of the resource
 */
private Document addtoDoc(ResouceIndexData tempsiteResource) {
    Document doc = new Document();
    // Field.Index.NO           -> not indexed
    // Field.Index.ANALYZED     -> tokenized and indexed
    // Field.Index.NOT_ANALYZED -> indexed as one term, not tokenized
    doc.add(new Field("id", String.valueOf(tempsiteResource.getId()),
            Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("siteid", emptyIfNull(tempsiteResource.getSiteid()),
            Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("typeid", emptyIfNull(tempsiteResource.getTypeid()),
            Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("price", emptyIfNull(tempsiteResource.getPrice()),
            Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("pictureUrl", emptyIfNull(tempsiteResource.getPictureUrl()),
            Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("status", emptyIfNull(tempsiteResource.getStatus()),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("title", emptyIfNull(tempsiteResource.getTitle()),
            Field.Store.YES, Field.Index.ANALYZED));

    String rawContent = tempsiteResource.getContent();
    String content = rawContent == null ? "" : stripHtml(rawContent);
    doc.add(new Field("content", content,
            Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("description", emptyIfNull(tempsiteResource.getDescription()),
            Field.Store.YES, Field.Index.ANALYZED));
    return doc;
}

/** Returns {@code value}, or the empty string when {@code value} is {@code null}. */
private String emptyIfNull(String value) {
    return value == null ? "" : value;
}

/**
 * Extracts the text of an HTML fragment by parsing it and concatenating its text nodes.
 *
 * <p>When parsing fails the failure is logged and the unmodified input is returned, so
 * the artificial wrapper element used for parsing never ends up in the index.
 */
private String stripHtml(String html) {
    try {
        // Wrap the fragment in a single root element before parsing.
        String wrapped = "<div>" + html + "</div>";
        DocumentFragment fragment = new HTMLDocumentImpl().createDocumentFragment();
        InputStream inputStream = new ByteArrayInputStream(wrapped.getBytes("UTF-8"));
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "utf-8");
        parser.parse(new InputSource(inputStream), fragment);
        StringBuffer sb = new StringBuffer();
        getText(sb, fragment);
        return sb.toString();
    } catch (Exception e) {
        logger.error("lucene addtoDoc: stripping HTML from content failed", e);
        return html;
    }
}

/**
 * Appends the value of every text node found under {@code node} (the node itself
 * included) to {@code sb}, visiting the tree depth-first in document order.
 *
 * @param sb   buffer that receives the text
 * @param node root of the subtree to read
 */
private void getText(StringBuffer sb, Node node) {
    if (Node.TEXT_NODE == node.getNodeType()) {
        // A text node carries the characters between an opening and a closing tag.
        sb.append(node.getNodeValue());
    }
    // Walk the children through their sibling links and recurse into each one.
    for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
        getText(sb, child);
    }
}

/**
 * Adds one resource to the index, building the index first when its directory is missing.
 *
 * <p>The writer and the directory are always closed, even when indexing fails, so the
 * index write lock is not left behind. Failures are logged and not propagated.
 *
 * @param tempsiteResource the resource to index
 */
public void addDoc(ResouceIndexData tempsiteResource) {
    Directory directory = null;
    IndexWriter writer = null;
    try {
        File file = new File(index_FILE_PATH);
        if (!file.exists()) {
            createSiteIndex();
        }
        directory = FSDirectory.open(file);
        IndexWriterConfig writerConfig = new IndexWriterConfig(VERSION, analyzer);
        writer = new IndexWriter(directory, writerConfig);
        writer.addDocument(addtoDoc(tempsiteResource));
    } catch (Exception e) {
        // Pass the throwable separately so the stack trace reaches the log.
        logger.error("lucene addDoc failed", e);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                logger.error("lucene addDoc: closing IndexWriter failed", e);
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                logger.error("lucene addDoc: closing Directory failed", e);
            }
        }
    }
}

/**
 * Replaces the indexed document whose {@code id} term matches the resource's id with a
 * freshly built document, building the index first when its directory is missing.
 *
 * <p>The writer and the directory are always closed, even when the update fails, so the
 * index write lock is not left behind. Failures are logged and not propagated.
 *
 * @param tempsiteResource the resource carrying the new field values
 */
public void updateDoc(ResouceIndexData tempsiteResource) {
    Directory directory = null;
    IndexWriter writer = null;
    try {
        File file = new File(index_FILE_PATH);
        if (!file.exists()) {
            createSiteIndex();
        }
        directory = FSDirectory.open(file);
        IndexWriterConfig writerConfig = new IndexWriterConfig(VERSION, analyzer);
        writer = new IndexWriter(directory, writerConfig);

        Document doc = addtoDoc(tempsiteResource);
        // The id field is indexed untokenized, so an exact term identifies the document.
        Term term = new Term("id", String.valueOf(tempsiteResource.getId()));
        writer.updateDocument(term, doc);
    } catch (Exception e) {
        // Pass the throwable separately so the stack trace reaches the log.
        logger.error("lucene updateDoc failed", e);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                logger.error("lucene updateDoc: closing IndexWriter failed", e);
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                logger.error("lucene updateDoc: closing Directory failed", e);
            }
        }
    }
}

/**
 * Deletes every indexed document whose {@code id} term equals the given id, building
 * the index first when its directory is missing.
 *
 * <p>The writer and the directory are always closed, even when the deletion fails, so
 * the index write lock is not left behind. Failures are logged and not propagated.
 *
 * @param id identifier of the resource to remove from the index
 */
public void deleteDoc(int id) {
    Directory directory = null;
    IndexWriter writer = null;
    try {
        File file = new File(index_FILE_PATH);
        if (!file.exists()) {
            createSiteIndex();
        }
        directory = FSDirectory.open(file);
        IndexWriterConfig writerConfig = new IndexWriterConfig(VERSION, analyzer);
        writer = new IndexWriter(directory, writerConfig);
        writer.deleteDocuments(new Term("id", String.valueOf(id)));
    } catch (Exception e) {
        // Pass the throwable separately so the stack trace reaches the log.
        logger.error("lucene deleteDoc failed, id=" + id, e);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                logger.error("lucene deleteDoc: closing IndexWriter failed", e);
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                logger.error("lucene deleteDoc: closing Directory failed", e);
            }
        }
    }
}

/**
 * Looks up a single resource in the index by its id.
 *
 * <p>The index is built first when its directory is missing. Searcher, reader and
 * directory are always closed. Failures are logged and not propagated.
 *
 * @param id identifier of the resource
 * @return the resource rebuilt from the stored fields, or {@code null} when no document
 *         has that id or the lookup failed
 */
@Override
public Resouce getDoc(int id) {
    Resouce resource = null;
    Directory directory = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        File file = new File(index_FILE_PATH);
        if (!file.exists()) {
            createSiteIndex();
        }
        directory = FSDirectory.open(file);
        ireader = IndexReader.open(directory);
        isearcher = new IndexSearcher(ireader);
        Query query = new TermQuery(new Term("id", String.valueOf(id)));
        TopDocs docs = isearcher.search(query, 1);
        if (docs.totalHits > 0) {
            Document targetDoc = isearcher.doc(docs.scoreDocs[0].doc);
            // Instantiate the result only on a hit so that a miss yields null.
            resource = new Resouce();
            resource.setId(targetDoc.get("id"));
            resource.setSiteid(targetDoc.get("siteid"));
            resource.setTypeid(targetDoc.get("typeid"));
            resource.setPrice(targetDoc.get("price"));
            resource.setPictureUrl(targetDoc.get("pictureUrl"));
            resource.setDescription(targetDoc.get("description"));
            resource.setTitle(targetDoc.get("title"));
            resource.setContent(targetDoc.get("content"));
        }
    } catch (Exception e) {
        // Pass the throwable separately so the stack trace reaches the log.
        logger.error("lucene getDoc failed, id=" + id, e);
    } finally {
        if (isearcher != null) {
            try {
                isearcher.close();
            } catch (IOException e) {
                logger.error("lucene getDoc: closing IndexSearcher failed", e);
            }
        }
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                logger.error("lucene getDoc: closing IndexReader failed", e);
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                logger.error("lucene getDoc: closing Directory failed", e);
            }
        }
    }
    return resource;
}

/**
 * Runs a keyword search restricted to one site and one type and returns one page of hits.
 *
 * <p>The keyword is matched against title, content and description; {@code siteid} and
 * {@code typeid} must match exactly. At most {@code MAXNUM} hits are fetched, ordered by
 * the {@code id} field in reverse, and the requested page is cut out of them. Title,
 * content and description of each returned resource carry red {@code <font>} highlight
 * markup around matched terms where a highlighted fragment is available.
 *
 * <p>Failures are logged and not propagated; the hits collected so far are returned.
 *
 * @param queryWord keyword(s) to search for
 * @param siteid    exact site id the hits must have
 * @param typeid    exact type id the hits must have
 * @param startPage 1-based page number; values below 1 are treated as the first page
 * @param pageSize  number of hits per page
 * @return the resources on the requested page, possibly empty, never {@code null}
 */
public List<Resouce> search(String queryWord,String siteid,String typeid,int startPage,int pageSize){
    List<Resouce> siteResourceList = new ArrayList<Resouce>();
    Directory directory = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        File file = new File(index_FILE_PATH);
        if (!file.exists()) {
            createSiteIndex();
        }
        directory = FSDirectory.open(file);
        ireader = IndexReader.open(directory);
        isearcher = new IndexSearcher(ireader);
        // Score with the IK similarity implementation.
        isearcher.setSimilarity(new IKSimilarity());

        BooleanQuery query = new BooleanQuery();
        String[] keys = {queryWord,queryWord,queryWord};
        String[] fields = {"title","content","description"};
        Query ikquery = IKQueryParser.parseMultiField(fields, keys);
        query.add(new TermQuery(new Term("siteid",siteid)), BooleanClause.Occur.MUST);
        query.add(new TermQuery(new Term("typeid",typeid)), BooleanClause.Occur.MUST);
        query.add(ikquery, BooleanClause.Occur.MUST);
        logger.info("---------lucene------queryword:"+query.toString());

        // Order by resource id, descending.
        // NOTE(review): SortField.STRING compares the ids as strings, so "9" sorts after
        // "10"; confirm whether numeric ordering was intended.
        Sort sort = new Sort(new SortField("id",SortField.STRING,true));
        logger.info("---------lucene------sort:"+sort.toString());

        // HTML markup wrapped around highlighted keywords.
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>","</font>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

        TopDocs topDocs = isearcher.search(query, null,MAXNUM,sort);
        int allcount = topDocs.totalHits;
        logger.info("---------lucene------search:"+allcount);

        // Cut the requested page out of the fetched hits. The upper bound is the number
        // of documents actually returned (capped at MAXNUM), not totalHits, which can be
        // larger and would index past the end of the array.
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        int startRow = startPage < 1 ? 0 : Math.max(0, (startPage - 1) * pageSize);
        int endRow = Math.min(startPage * pageSize, scoreDocs.length);
        for (int i = startRow; i < endRow; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            Resouce resource = new Resouce();
            resource.setId(targetDoc.get("id"));
            resource.setSiteid(targetDoc.get("siteid"));
            resource.setTypeid(targetDoc.get("typeid"));
            resource.setPrice(targetDoc.get("price"));
            resource.setPictureUrl(targetDoc.get("pictureUrl"));
            resource.setTitle(highlightOrOriginal(highlighter, "title", targetDoc.get("title")));
            resource.setContent(highlightOrOriginal(highlighter, "content", targetDoc.get("content")));
            resource.setDescription(highlightOrOriginal(highlighter, "description", targetDoc.get("description")));
            siteResourceList.add(resource);
        }
    } catch (Exception e) {
        // Pass the throwable separately so the stack trace reaches the log.
        logger.error("lucene search failed, queryWord=" + queryWord, e);
    } finally {
        if (isearcher != null) {
            try {
                isearcher.close();
            } catch (IOException e) {
                logger.error("lucene search: closing IndexSearcher failed", e);
            }
        }
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                logger.error("lucene search: closing IndexReader failed", e);
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                logger.error("lucene search: closing Directory failed", e);
            }
        }
    }

    return siteResourceList;
}

/**
 * Returns the best highlighted fragment of {@code text} for the given field, or
 * {@code text} itself when the highlighter produces no fragment.
 */
private String highlightOrOriginal(Highlighter highlighter, String field, String text)
        throws IOException, org.apache.lucene.search.highlight.InvalidTokenOffsetsException {
    TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(text));
    String fragment = highlighter.getBestFragment(tokenStream, text);
    return fragment == null ? text : fragment;
}

/**
 * Counts the documents matching a keyword search restricted to one site and one type.
 *
 * <p>The query is built exactly like the one used for paged searching: the keyword is
 * matched against title, content and description, while {@code siteid} and
 * {@code typeid} must match exactly. Only the total hit count is needed, so no sort is
 * applied and a single hit is requested. Failures are logged and not propagated.
 *
 * @param queryWord keyword(s) to search for
 * @param siteid    exact site id the hits must have
 * @param typeid    exact type id the hits must have
 * @return the total number of matching documents as a decimal string, or {@code "0"}
 *         when the search failed
 */
@Override
public String searchCount(String queryWord, String siteid, String typeid) {
    String allcount = "0";
    Directory directory = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        File file = new File(index_FILE_PATH);
        if (!file.exists()) {
            createSiteIndex();
        }
        directory = FSDirectory.open(file);
        ireader = IndexReader.open(directory);
        isearcher = new IndexSearcher(ireader);
        // Score with the IK similarity implementation.
        isearcher.setSimilarity(new IKSimilarity());

        BooleanQuery query = new BooleanQuery();
        String[] keys = {queryWord,queryWord,queryWord};
        String[] fields = {"title","content","description"};
        Query ikquery = IKQueryParser.parseMultiField(fields, keys);
        query.add(new TermQuery(new Term("siteid",siteid)), BooleanClause.Occur.MUST);
        query.add(new TermQuery(new Term("typeid",typeid)), BooleanClause.Occur.MUST);
        query.add(ikquery, BooleanClause.Occur.MUST);
        logger.info("---------lucene--count----queryword:"+query.toString());

        // totalHits does not depend on ordering or on how many documents are fetched.
        TopDocs topDocs = isearcher.search(query, 1);
        allcount = String.valueOf(topDocs.totalHits);
        logger.info("---------lucene--count----search:"+allcount);
    } catch (Exception e) {
        // Pass the throwable separately so the stack trace reaches the log.
        logger.error("lucene searchCount failed, queryWord=" + queryWord, e);
    } finally {
        if (isearcher != null) {
            try {
                isearcher.close();
            } catch (IOException e) {
                logger.error("lucene searchCount: closing IndexSearcher failed", e);
            }
        }
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                logger.error("lucene searchCount: closing IndexReader failed", e);
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                logger.error("lucene searchCount: closing Directory failed", e);
            }
        }
    }
    return allcount;
}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐