您的位置:首页 > 其它

Lucene第一个入门学习例子

2014-10-14 11:16 471 查看
看Lucene in Action的时候,练习的一个入门例子。 在使用Lucene进行文本内容搜索前,需要先对指定的目录下的文件进行建立索引,代码如下:
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Indexer {
public static void main(String[] args) {
if(args.length != 2) {
throw new IllegalArgumentException("Usage : java " + Indexer.class.getName() + " <index dir><data dir>");
}
String indexDir = args[0];
String dataDir = args[1];

long start = System.currentTimeMillis();
Indexer indexer = null;
try {
indexer = new Indexer(indexDir);
} catch (IOException e) {
e.printStackTrace();
}
int numIndexed = 0;
try {
numIndexed = indexer.index(dataDir, new TextFilesFilter());
} catch (Exception e) {
e.printStackTrace();
}finally {
try {
indexer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
long end = System.currentTimeMillis();
System.out.println("Indexing " + numIndexed + " files took " + (end - start) + "mi");
}

private IndexWriter writer;

public Indexer(String indexDir) throws IOException {
Directory dir = FSDirectory.open(new File(indexDir));
writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_30, new StandardAnalyzer(Version.LUCENE_30)));
}

public void close() throws IOException {
writer.close();
}

public int index(String dataDir, FileFilter filter) throws Exception{
File[] files = new File(dataDir).listFiles();
for(File file : files) {
if(!file.isDirectory() && !file.isHidden() && file.exists() && file.canRead() && (filter == null || filter.accept(file))) {
indexFile(file);
}
}
return writer.numDocs();
}

private static class TextFilesFilter implements FileFilter {
@Override
public boolean accept(File path) {
return path.getName().toLowerCase().endsWith(".txt");
}
}

/**
* 声明三个索引的查询域,一个contents,一个filename,一个fullpath
* @param f
* @return
* @throws Exception
*/
protected Document getDocument(File f) throws Exception {
Document doc = new Document();
doc.add(new Field("contents", new FileReader(f)));
doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("fullpath", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
return doc;
}

private void indexFile(File f) throws Exception {
System.out.println("Indexing " + f.getCanonicalPath());
Document doc = getDocument(f);
writer.addDocument(doc);
}
}
在使用命令方式执行后在传入的目录下会生成如图的文件:

成功生成索引后,便可以进行基本的搜索了,检索代码如下:
import java.io.File;
import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searcher {
public static void main(String[] args) throws IOException, ParseException {
if(args.length != 2) {
throw new IllegalArgumentException("Usage: java " + Searcher.class.getName() + " <index dir><query>");
}
String indexDir = args[0];
String q = args[1];
search(indexDir, q);
}

private static void search(String indexDir, String q) throws IOException, ParseException {
Directory dir = FSDirectory.open(new File(indexDir));
DirectoryReader reader = DirectoryReader.open(dir);
IndexSearcher is = new IndexSearcher(reader);
// 每个Term都对应一个Field域
Query query = new TermQuery(new Term("contents", q));
long start = System.currentTimeMillis();
TopDocs hits = is.search(query, 10);
long end = System.currentTimeMillis();
System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start) + " mi) that matched query '" + q + "' :");
for(ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = is.doc(scoreDoc.doc);
System.out.println(doc.get("fullpath"));
System.out.println(Arrays.toString(doc.getValues("filename")));
System.out.println(doc.get("contents"));
}

Query qu = new TermQuery(new Term("filename", "1.txt"));
TopDocs hits1 = is.search(qu, 10);
for(ScoreDoc score : hits1.scoreDocs) {
System.out.println(is.doc(score.doc).get("fullpath"));
}
reader.close();
dir.close();
}
}
命令中执行如搜索目录下是否有“Hi”的单词,返回结果为:

本文出自 “菜鸟学习” 博客,请务必保留此出处http://biyusheng.blog.51cto.com/2626140/1563809
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: