lucene + IKAnalyzer 中文分词及索引,简单实例
2009-08-26 18:43
351 查看
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.mira.lucene.analysis.IK_CAnalyzer;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.FileReader;
import java.util.Date;
/**
 * Minimal Lucene + IKAnalyzer example: builds an index over every readable
 * file below {@link #DOC_DIR}, then runs one hard-coded query against it and
 * prints the path of each matching file.
 */
public class Searcher {
    /** Directory the index is written to (e.g. "c://lucene//index"). */
    private static final String INDEX_DIR = Searcher.class.getResource("/").getPath() + "/data/index";
    /** Root directory whose files are indexed (e.g. "c://lucene//doc"). */
    private static final String DOC_DIR = Searcher.class.getResource("/").getPath();

    /**
     * Rebuilds the index from scratch and then searches it.
     *
     * @param args ignored
     * @throws Exception if the index directory is missing after indexing, or if
     *                   searching fails
     */
    public static void main(String[] args) throws Exception {
        String queryString = "测试";
        File indexDir = new File(INDEX_DIR);
        File docDir = new File(DOC_DIR);
        long start = System.currentTimeMillis();
        // The index must be built before it can be searched.
        try {
            // 'true' => create a fresh index, replacing any existing one.
            IndexWriter writer = new IndexWriter(INDEX_DIR, new IK_CAnalyzer(), true);
            try {
                System.out.println("Indexing to directory '" + INDEX_DIR + "'...");
                indexDocs(writer, docDir);
                System.out.println("Optimizing...");
                writer.optimize();
            } finally {
                // Always release the writer, even when indexing fails part-way.
                writer.close();
            }
            long end = System.currentTimeMillis();
            System.out.println(end - start + " total milliseconds");
        } catch (IOException e) {
            // Best effort: report the failure and still attempt the search below.
            System.out.println(" caught a " + e.getClass() +
                    "\n with message: " + e.getMessage());
        }
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir
                    + " does not exist or is not a directory.");
        }
        search(indexDir, queryString);
    }

    /**
     * Recursively adds {@code file} (or, for a directory, everything below it)
     * to the index. Unreadable entries are skipped.
     *
     * @param writer open index writer that receives the documents
     * @param file   file or directory to index
     * @throws IOException if writing to the index fails
     */
    private static void indexDocs(IndexWriter writer, File file)
            throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            // INDEX_DIR is located below DOC_DIR; do not walk into the index
            // that is being written, or its own files would be indexed too.
            if (file.getCanonicalFile().equals(new File(INDEX_DIR).getCanonicalFile())) {
                return;
            }
            String[] children = file.list();
            if (children == null) {
                // list() returns null when the directory cannot be read.
                return;
            }
            for (int i = 0; i < children.length; i++) {
                indexDocs(writer, new File(file, children[i]));
            }
            return;
        }
        System.out.println("adding " + file);
        try {
            writer.addDocument(getDocument(file));
        } catch (FileNotFoundException fnfe) {
            // canRead() succeeded but the file could not be opened (e.g. it was
            // removed or locked in the meantime): report it and keep going.
            System.err.println("skipping " + file + ": " + fnfe.getMessage());
        }
    }

    /**
     * Builds the Lucene document for one file with three fields: "path" and
     * "modified" (stored, not tokenized) and "contents" (fed from a reader
     * over the file).
     *
     * @param f file to describe
     * @return the document to add to the index
     * @throws java.io.FileNotFoundException if the file cannot be opened
     */
    private static Document getDocument(File f)
            throws java.io.FileNotFoundException {
        Document doc = new Document();
        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.add(new Field("modified",
                DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm that this matches the encoding of the indexed files.
        doc.add(new Field("contents", new FileReader(f)));
        return doc;
    }

    /**
     * Runs query {@code q} against the "contents" field of the index in
     * {@code indexDir} and prints the stored path of every hit.
     *
     * @param indexDir directory containing the index
     * @param q        query string, parsed with {@code QueryParser}
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static void search(File indexDir, String q) throws Exception {
        Directory fsDir = FSDirectory.getDirectory(indexDir);
        try {
            IndexSearcher is = new IndexSearcher(fsDir); // 1. open the index
            try {
                // 2. parse the query with the same analyzer used at index time
                Query query = new QueryParser("contents", new IK_CAnalyzer()).parse(q);
                long start = System.currentTimeMillis();
                Hits hits = is.search(query); // 3. search the index
                long end = System.currentTimeMillis();
                System.err.println("Found " + hits.length() + " document(s) (in "
                        + (end - start) + " milliseconds) that matched query " + q + ":");
                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i); // 4. fetch the matching document
                    System.out.println("file: " + doc.get("path"));
                }
            } finally {
                is.close();
            }
        } finally {
            fsDir.close();
        }
    }
}
正向全切分分词器:org.mira.lucene.analysis.IK_CAnalyzer(适合建索引时使用)
正向最大全切分分词器:org.mira.lucene.analysis.MIK_CAnalyzer(适合用户输入检索时使用)
来源:http://hi.baidu.com/happy19840402/blog/item/7f48ce2e462aff554fc226d6.html
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.mira.lucene.analysis.IK_CAnalyzer;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.FileReader;
import java.util.Date;
/**
 * Minimal Lucene + IKAnalyzer example: builds an index over every readable
 * file below {@link #DOC_DIR}, then runs one hard-coded query against it and
 * prints the path of each matching file.
 */
public class Searcher {
    /** Directory the index is written to (e.g. "c://lucene//index"). */
    private static final String INDEX_DIR = Searcher.class.getResource("/").getPath() + "/data/index";
    /** Root directory whose files are indexed (e.g. "c://lucene//doc"). */
    private static final String DOC_DIR = Searcher.class.getResource("/").getPath();

    /**
     * Rebuilds the index from scratch and then searches it.
     *
     * @param args ignored
     * @throws Exception if the index directory is missing after indexing, or if
     *                   searching fails
     */
    public static void main(String[] args) throws Exception {
        String queryString = "测试";
        File indexDir = new File(INDEX_DIR);
        File docDir = new File(DOC_DIR);
        long start = System.currentTimeMillis();
        // The index must be built before it can be searched.
        try {
            // 'true' => create a fresh index, replacing any existing one.
            IndexWriter writer = new IndexWriter(INDEX_DIR, new IK_CAnalyzer(), true);
            try {
                System.out.println("Indexing to directory '" + INDEX_DIR + "'...");
                indexDocs(writer, docDir);
                System.out.println("Optimizing...");
                writer.optimize();
            } finally {
                // Always release the writer, even when indexing fails part-way.
                writer.close();
            }
            long end = System.currentTimeMillis();
            System.out.println(end - start + " total milliseconds");
        } catch (IOException e) {
            // Best effort: report the failure and still attempt the search below.
            System.out.println(" caught a " + e.getClass() +
                    "\n with message: " + e.getMessage());
        }
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir
                    + " does not exist or is not a directory.");
        }
        search(indexDir, queryString);
    }

    /**
     * Recursively adds {@code file} (or, for a directory, everything below it)
     * to the index. Unreadable entries are skipped.
     *
     * @param writer open index writer that receives the documents
     * @param file   file or directory to index
     * @throws IOException if writing to the index fails
     */
    private static void indexDocs(IndexWriter writer, File file)
            throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            // INDEX_DIR is located below DOC_DIR; do not walk into the index
            // that is being written, or its own files would be indexed too.
            if (file.getCanonicalFile().equals(new File(INDEX_DIR).getCanonicalFile())) {
                return;
            }
            String[] children = file.list();
            if (children == null) {
                // list() returns null when the directory cannot be read.
                return;
            }
            for (int i = 0; i < children.length; i++) {
                indexDocs(writer, new File(file, children[i]));
            }
            return;
        }
        System.out.println("adding " + file);
        try {
            writer.addDocument(getDocument(file));
        } catch (FileNotFoundException fnfe) {
            // canRead() succeeded but the file could not be opened (e.g. it was
            // removed or locked in the meantime): report it and keep going.
            System.err.println("skipping " + file + ": " + fnfe.getMessage());
        }
    }

    /**
     * Builds the Lucene document for one file with three fields: "path" and
     * "modified" (stored, not tokenized) and "contents" (fed from a reader
     * over the file).
     *
     * @param f file to describe
     * @return the document to add to the index
     * @throws java.io.FileNotFoundException if the file cannot be opened
     */
    private static Document getDocument(File f)
            throws java.io.FileNotFoundException {
        Document doc = new Document();
        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.add(new Field("modified",
                DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm that this matches the encoding of the indexed files.
        doc.add(new Field("contents", new FileReader(f)));
        return doc;
    }

    /**
     * Runs query {@code q} against the "contents" field of the index in
     * {@code indexDir} and prints the stored path of every hit.
     *
     * @param indexDir directory containing the index
     * @param q        query string, parsed with {@code QueryParser}
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static void search(File indexDir, String q) throws Exception {
        Directory fsDir = FSDirectory.getDirectory(indexDir);
        try {
            IndexSearcher is = new IndexSearcher(fsDir); // 1. open the index
            try {
                // 2. parse the query with the same analyzer used at index time
                Query query = new QueryParser("contents", new IK_CAnalyzer()).parse(q);
                long start = System.currentTimeMillis();
                Hits hits = is.search(query); // 3. search the index
                long end = System.currentTimeMillis();
                System.err.println("Found " + hits.length() + " document(s) (in "
                        + (end - start) + " milliseconds) that matched query " + q + ":");
                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i); // 4. fetch the matching document
                    System.out.println("file: " + doc.get("path"));
                }
            } finally {
                is.close();
            }
        } finally {
            fsDir.close();
        }
    }
}
正向全切分分词器:org.mira.lucene.analysis.IK_CAnalyzer(适合建索引时使用)
正向最大全切分分词器:org.mira.lucene.analysis.MIK_CAnalyzer(适合用户输入检索时使用)
来源:http://hi.baidu.com/happy19840402/blog/item/7f48ce2e462aff554fc226d6.html
相关文章推荐
- lucene + IKAnalyzer 中文分词及索引,简单实例
- PHP简单实现中文分词全文索引实例(tag专题)
- Beta笔记——搜索引擎的设计与实现(1):使用Lucene.Net建立索引与中文分词
- lucene3.0+版本中文分词测试+搜索结果+创建索引测试
- lucene3.1.0 简单分词实例
- 让中科院中文分词系统ICTCLAS为lucene所用的简单程序(C#版)
- lucene2.4集成几种中文分词实例代码
- Lucene3.3、Lucene3.4中文分词——庖丁解牛分词实例
- 【lucene系列学习一】实现Lucene索引,查询以及中文分词功能
- lucene的建立索引,搜索,中文分词
- Lucene3.3、Lucene3.4中文分词——庖丁解牛分词实例
- [置顶] Lucene 5.2.1 + jcseg 1.9.6中文分词索引(Lucene 学习序列2)
- java之全文索引搜索lucene之增删改查文档与中文分词搜索
- lucene与sql server数据库实现索引的简单实例(vs.net2008)
- 让中科院中文分词系统ICTCLAS为lucene所用的简单程序(C#版)
- lucene初探(二):中文分词,以及系统自带分词简单比较
- 04_java Lucene学习——分词Analyzer(02):lucene4.0_学写简单的中文同义词分词器
- lucene3.6 中文分词 文件索引
- mmseg4j 中文分词简单代码实例
- 【Lucene3.6.2入门系列】第14节_SolrJ操作索引和搜索文档以及整合中文分词