您的位置:首页 > 其它

lucene + IKAnalyzer 中文分词及索引,简单实例

2009-08-26 18:43 351 查看
import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.DateTools;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.mira.lucene.analysis.IK_CAnalyzer;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.io.FileReader;

import java.util.Date;

/**
 * Minimal Lucene + IK analyzer example: builds a full-text index over every
 * readable file under {@link #DOC_DIR}, then runs a single query against the
 * "contents" field and prints the path of each matching file.
 */
public class Searcher {

    /** Directory the index is written to; note that it lives below {@link #DOC_DIR}. */
    private static final String INDEX_DIR =
            Searcher.class.getResource("/").getPath() + "/data/index"; // e.g. "c://lucene//index"

    /** Root directory whose files are indexed (the classpath root). */
    private static final String DOC_DIR =
            Searcher.class.getResource("/").getPath(); // e.g. "c://lucene//doc"

    /**
     * Rebuilds the index from scratch and then searches it for a fixed query string.
     *
     * @param args unused
     * @throws Exception if the index directory is missing after indexing, or if searching fails
     */
    public static void main(String[] args) throws Exception {
        String queryString = "测试";

        File indexDir = new File(INDEX_DIR);
        File docDir = new File(DOC_DIR);

        long startMillis = System.currentTimeMillis();

        // The index must be built before it can be searched.
        try {
            IndexWriter writer = new IndexWriter(INDEX_DIR, new IK_CAnalyzer(), true);
            try {
                System.out.println("Indexing to directory '" + INDEX_DIR + "'...");
                indexDocs(writer, docDir);

                System.out.println("Optimizing...");
                writer.optimize();
            } finally {
                // Always close the writer, even when indexing fails, so it is not left open.
                writer.close();
            }

            long endMillis = System.currentTimeMillis();
            System.out.println(endMillis - startMillis + " total milliseconds");
        } catch (IOException e) {
            // Indexing problems are reported but do not abort the run; the search
            // below still executes if an index directory exists.
            System.out.println(" caught a " + e.getClass()
                    + "\n with message: " + e.getMessage());
        }

        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir + " does not exist or is not a directory.");
        }

        search(indexDir, queryString);
    }

    /**
     * Recursively adds {@code file} (or, for a directory, every readable file
     * below it) to the index. Unreadable entries are skipped, and the index
     * directory itself is never descended into.
     *
     * @param writer open index writer that receives the documents
     * @param file   file or directory to index
     * @throws IOException if the writer fails or a path cannot be canonicalized
     */
    private static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (!file.canRead()) {
            return;
        }

        if (file.isDirectory()) {
            // INDEX_DIR is located under DOC_DIR; without this guard the walk would
            // feed the index's own files (currently being written) back into the index.
            if (isIndexDirectory(file)) {
                return;
            }

            String[] children = file.list();
            if (children == null) {
                // list() returns null on an I/O error; nothing to index here.
                return;
            }
            for (int i = 0; i < children.length; i++) {
                indexDocs(writer, new File(file, children[i]));
            }
            return;
        }

        System.out.println("adding " + file);
        try {
            writer.addDocument(getDocument(file));
        } catch (FileNotFoundException fnfe) {
            // The file could not be opened (e.g. it vanished or access was denied
            // between canRead() and open). Skip it, but say so instead of hiding it.
            System.out.println("skipping " + file + ": " + fnfe.getMessage());
        }
    }

    /**
     * Tells whether {@code dir} is the directory the index is stored in.
     *
     * @param dir directory encountered during the document walk
     * @return {@code true} if {@code dir} resolves to the same canonical path as {@link #INDEX_DIR}
     * @throws IOException if a canonical path cannot be resolved
     */
    private static boolean isIndexDirectory(File dir) throws IOException {
        return dir.getCanonicalFile().equals(new File(INDEX_DIR).getCanonicalFile());
    }

    /**
     * Builds the Lucene document for a single file with three fields:
     * "path" and "modified" (stored, not tokenized) and "contents" (read from the file).
     *
     * @param f file to wrap
     * @return the document describing {@code f}
     * @throws java.io.FileNotFoundException if {@code f} cannot be opened for reading
     */
    private static Document getDocument(File f) throws java.io.FileNotFoundException {
        Document doc = new Document();

        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));

        // Last-modified time, truncated to minute resolution.
        String modified = DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE);
        doc.add(new Field("modified", modified, Field.Store.YES, Field.Index.UN_TOKENIZED));

        // NOTE(review): FileReader decodes with the platform default charset, which may
        // not match the encoding of Chinese text files - confirm. The reader is not
        // closed here; presumably the IndexWriter consumes and closes it - confirm.
        doc.add(new Field("contents", new FileReader(f)));

        return doc;
    }

    /**
     * Searches the "contents" field of the index in {@code indexDir} for {@code q}
     * and prints the stored path of every hit.
     *
     * @param indexDir directory containing the index
     * @param q        query string, parsed with the IK analyzer
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static void search(File indexDir, String q) throws Exception {
        Directory fsDir = FSDirectory.getDirectory(indexDir);
        try {
            IndexSearcher searcher = new IndexSearcher(fsDir); // (1) open the index
            try {
                Query query = new QueryParser("contents", new IK_CAnalyzer()).parse(q); // (2) parse the query

                long startMillis = System.currentTimeMillis();
                Hits hits = searcher.search(query); // (3) search the index
                long endMillis = System.currentTimeMillis();

                System.err.println("Found " + hits.length() + " document(s) (in "
                        + (endMillis - startMillis) + " milliseconds) that matched query '" + q + "':");

                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i); // (4) fetch the matching document
                    System.out.println("file: " + doc.get("path"));
                }
            } finally {
                searcher.close();
            }
        } finally {
            fsDir.close();
        }
    }
}

正向全切分分词器:org.mira.lucene.analysis.IK_CAnalyzer(适合建索引时使用)。

正向最大全切分分词器:org.mira.lucene.analysis.MIK_CAnalyzer(适合用户输入检索时使用)。上面的示例为了简单,在建索引和检索时都使用了 IK_CAnalyzer;检索时也可以换成 MIK_CAnalyzer。

来源:http://hi.baidu.com/happy19840402/blog/item/7f48ce2e462aff554fc226d6.html
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: