您的位置：首页 > 其它

lucene4.0入门实例

2015-09-08 13:38 513 查看

先要导入lucene包：
lucene-core-4.0.0.jar
lucene-analyzers-common-4.0.0.jar
这里使用Lucene提供的默认分词器，不支持中文分词

创建索引的代码如下：
import java.io.File;
import java.io.FileReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a small demo index of three documents under {@code D:\lucene\index}.
 *
 * <p>Documents 1 and 2 carry the fields key1..key3; document 3 carries key4..key6.
 * A search always targets one field name, so a document that lacks that field can
 * never match a query on it.
 */
public class IndexFile {

    /**
     * Creates (or appends to) the on-disk index and writes the three sample documents.
     *
     * @param args unused
     * @throws Exception if the index directory cannot be opened or written
     */
    public static void main(String[] args) throws Exception {
        // Index location on disk. For an in-memory index use a RAMDirectory instead.
        Directory dir = FSDirectory.open(new File("D:\\lucene\\index"));
        try {
            // The standard analyzer shipped with Lucene; other languages (e.g. Chinese)
            // may need a different analyzer.
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);

            // In Lucene 4.0 the create/append choice is made on the config, not through a
            // constructor argument: iwc.setOpenMode(OpenMode.CREATE) wipes the existing index
            // first, OpenMode.APPEND only appends. The default (CREATE_OR_APPEND) is kept here,
            // so running this program repeatedly adds the same documents again.
            IndexWriter writer = new IndexWriter(dir, iwc);
            try {
                // First record. TextField's first argument is the field name that a search
                // must name, the second is the field content; Store.YES keeps the original
                // value retrievable from search results.
                Document document1 = new Document();
                document1.add(new TextField("key1", "Amsterdam has lost of add cancals", Store.YES));
                document1.add(new TextField("key2", "add now and tom do at our", Store.YES));
                document1.add(new TextField("key3", "are you ok jack new", Store.YES));
                writer.addDocument(document1);

                // Second record, same field names as the first.
                Document document2 = new Document();
                document2.add(new TextField("key1", "tom jack ok home when has", Store.YES));
                document2.add(new TextField("key2", "do you need coffee now ", Store.YES));
                document2.add(new TextField("key3", "add at new ", Store.YES));
                writer.addDocument(document2);

                // Third record uses different field names; the index now contains the six
                // fields key1..key6.
                Document document3 = new Document();
                document3.add(new TextField("key4", "tom jack ok home when has", Store.YES));
                document3.add(new TextField("key5", "do you need coffee now ", Store.YES));
                document3.add(new TextField("key6", "add at new ", Store.YES));
                writer.addDocument(document3);
            } finally {
                // Always close the writer so pending changes are committed and the
                // index write lock is released, even when adding a document fails.
                writer.close();
            }
        } finally {
            dir.close();
        }
    }
}

如果是想通过索引找到某个文件可以用

// Fragment: index every regular file of a folder. It must be placed where `writer`
// (an open IndexWriter) is in scope, inside a method that may throw IOException.
// Besides the imports of IndexFile it needs java.io.FileReader and
// org.apache.lucene.document.StringField.
File folder = new File("E:/lucene/example"); // folder whose files are indexed
File[] files = folder.listFiles();
if (files != null) { // listFiles() returns null when the path is not a readable directory
    for (File file : files) {
        if (!file.isFile()) {
            continue; // FileReader cannot open sub-directories
        }
        FileReader content = new FileReader(file);
        try {
            Document doc = new Document();
            // File content: tokenized and indexed but not stored. In Lucene 4.0 the
            // Reader-based TextField constructor takes no Store argument.
            doc.add(new TextField("content", content));
            // File name and path: indexed as single un-tokenized terms and stored.
            // StringField is the Lucene 4.0 replacement for Index.NOT_ANALYZED.
            doc.add(new StringField("fileName", file.getName(), Store.YES));
            doc.add(new StringField("path", file.getAbsolutePath(), Store.YES));

            writer.addDocument(doc);
        } finally {
            // The reader is consumed during addDocument; release the file handle afterwards.
            content.close();
        }
    }
}

域索引选项:

Index.ANALYZED:进行分词和索引，适用于标题、内容等
Index.NOT_ANALYZED:进行索引，但是不进行分词，如身份证号，姓名，ID等，适用于精确搜索
Index.ANALYZED_NO_NORMS:进行分词但是不存储norms信息，norms中保存的是索引时的加权(boost)和字段长度归一化等评分信息
Index.NOT_ANALYZED_NO_NORMS:既不进行分词也不存储norms信息
Index.NO:不进行索引

域存储选项:

Store.YES:将会存储域值，原始字符串的值会保存在索引，以此可以进行相应的恢复操作，即可以用document .get("xxx")取到相关的值，对于主键，标题，文件路径可以是这种方式存储

Store.NO：不会存储域值，但是可以被索引，通常与Index.ANALYZED合起来使用，索引一些如文章正文等不需要恢复的文档,不能用document .get("xxx")取到相关的值，如果是文档，我们可以通过文档的路径再次读出文档，虽然要多打开一次文件，但可以节省很多空间，因为文档的内容可能很大，如果全部保存在索引里浪费空间，但是文件内容还是会被分词，还是可以索引，否则索引就没什么意义了。

查询代码如下：
import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Runs a single-term query against the index under {@code D:\lucene\index} and prints
 * every hit together with its stored fields key1..key6.
 */
public class IndexSearch {

    /**
     * Searches field {@code key4} for the term {@code has} and prints up to 100 hits.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        // Where to search. For an in-memory index use a RAMDirectory instead.
        Directory dir = FSDirectory.open(new File("D:\\lucene\\index"));
        try {
            IndexReader reader = DirectoryReader.open(dir);
            try {
                IndexSearcher searcher = new IndexSearcher(reader);

                // "key4" is the field to search, "has" the term a document must contain in
                // that field. A TermQuery is not analyzed, so the term has to match the
                // indexed token exactly.
                Term term = new Term("key4", "has");
                TermQuery query = new TermQuery(term);
                TopDocs topdocs = searcher.search(query, 100);
                System.out.println("查询结果总数---" + topdocs.totalHits + "最大的评分--" + topdocs.getMaxScore());

                String[] fieldNames = {"key1", "key2", "key3", "key4", "key5", "key6"};
                for (ScoreDoc hit : topdocs.scoreDocs) {
                    Document document = searcher.doc(hit.doc);
                    // Fields that the matched document does not have print as null.
                    for (String fieldName : fieldNames) {
                        System.out.println(fieldName + "====" + document.get(fieldName));
                    }
                    System.out.println("id--" + hit.doc + "---scors--" + hit.score + "---index--" + hit.shardIndex);
                }
            } finally {
                // Release the index files even when the search fails.
                reader.close();
            }
        } finally {
            dir.close();
        }
    }
}

索引文件的检索：
索引表规模相对较小，文档集合规模较大。进行检索时，先从检索索引表开始，然后找到相对应的文档。如果查询中仅包含一个关键词，则在索引表中找到该单词，并取出他对应的文档就可以了。如果查询中包含多个关键词，则需要将各个关键字检索出的记录进行合并(即所有结果都列出)，关键是要将一个句子分词。
索引文件的维护：
维护索引用三个操作：插入、删除和更新文档。但是更新操作需要较高的代价，因为文档修改后（即使是很小的修改），就可能会造成文档中的很多的关键词的位置发生了变化，这时需要频繁的读取和修改记录，这种代价是相当高的。因此，一般不进行更新操作，而是使用“先删除，后创建”的方式代替更新操作。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航