
Building Your Own Search Engine (常搜吧 Journey, Part 4: Tokenization) (Java, Lucene, Hadoop)

2013-03-18 09:38
The basic principles of tokenization:

1. Tokenization is a technique for filtering and grouping text algorithmically, according to the features of a language.

2. The object of tokenization is text, not images, animations, scripts, and the like.

3. Tokenization works in two steps: filtering and grouping.

4. Filtering removes the characters and words in the text that carry no real meaning.

5. Grouping matches the text against the words already added to the "tokenization dictionary" (a toy sketch of this idea follows the list).
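
To make the filter-and-match idea concrete, here is a minimal sketch of dictionary-based forward maximum matching with a stop-word filter. It only illustrates the principle above, not how Lucene or Paoding actually implement tokenization; the dictionary and stop-word list are made-up examples:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class SimpleSegmenter {

    // Toy "tokenization dictionary"; a real analyzer loads tens of thousands of entries.
    private static final Set<String> DICT = new HashSet<String>(
            Arrays.asList("北京", "天安门"));

    // Words filtered out because they carry no real meaning (stop words).
    private static final Set<String> STOP_WORDS = new HashSet<String>(
            Arrays.asList("的", "了", "是"));

    // Forward maximum matching: at each position, take the longest dictionary word.
    public static void segment(String text) {
        int i = 0;
        while (i < text.length()) {
            int end = i + 1;
            // try the longest candidate first (longest dictionary entry here is 3 chars)
            for (int j = Math.min(text.length(), i + 3); j > i + 1; j--) {
                if (DICT.contains(text.substring(i, j))) {
                    end = j;
                    break;
                }
            }
            String word = text.substring(i, end);
            if (!STOP_WORDS.contains(word))   // the filtering step
                System.out.println(word);     // the grouping result
            i = end;
        }
    }

    public static void main(String[] args) {
        segment("我爱北京的天安门");  // prints 我 爱 北京 天安门, one per line
    }
}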

Now let's look at how to use Lucene's analyzers:

package com.qianyan.analyzer;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class TestAnalyzer {

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();      // standard: also filters stop words
        //Analyzer analyzer = new SimpleAnalyzer();      // simple: splits on whitespace and punctuation
        //Analyzer analyzer = new WhitespaceAnalyzer();  // splits on whitespace only
        //Analyzer analyzer = new ChineseAnalyzer();     // Lucene's Chinese analyzer: one token per character, punctuation filtered
        //Analyzer analyzer = new CJKAnalyzer();         // Chinese: two-character tokens; same as standard for English
        String input = "this is test lucene analyzer class!";
        TokenStream tokenStream = analyzer.tokenStream("", new StringReader(input));
        Token token = new Token();
        while (null != (token = tokenStream.next(token)))
            System.out.println(token.term());
    }
}
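
A quick comparison to make the comments concrete: for a Chinese input such as 我爱北京, ChineseAnalyzer emits one token per character (我 / 爱 / 北 / 京), while CJKAnalyzer emits overlapping two-character tokens (我爱 / 爱北 / 北京); for pure English input like the string above, CJKAnalyzer and StandardAnalyzer behave the same.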


For beginners, mastering these classic analyzers is quite enough.

In real-world development, though, what actually meets our needs are third-party Chinese tokenization packages built on top of Lucene. Here we will introduce only the Paoding package (庖丁分词), whose name borrows from the idiom 庖丁解牛.

庖丁解牛 ("the cook carves the ox") is an ancient Chinese idiom from the Zhuangzi; it describes someone who, through repeated practice, has grasped the underlying patterns of a task and performs it with effortless skill.

Download link: http://code.google.com/p/paoding/

After unzipping, we need to add two jars to the project: paoding-analysis.jar from the root of the archive and commons-logging.jar from its lib directory. In addition, copy the dic folder into our project's src directory.
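
If you prefer not to copy the dic folder into src, Paoding can also be told where the dictionary lives through the PAODING_DIC_HOME environment variable or the paoding.dic.home property (see the bundled Chinese manual for details), for example via a paoding-dic-home.properties file on the classpath; the path below is just an example:

# paoding-dic-home.properties
paoding.dic.home=E:/paoding/dic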

package com.qianyan.analyzer;

import java.io.IOException;
import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class TestPaodingAnalyzer {

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new PaodingAnalyzer();
        String input = "我爱北京天安门!";
        TokenStream ts = analyzer.tokenStream("", new StringReader(input));
        Token token = new Token();
        // pass the token in for reuse; next() returns null when the stream is exhausted
        while (null != (token = ts.next(token)))
            System.out.println(token.term());
    }
}
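
If the dictionary is found, the output should be whole words (roughly 我 / 爱 / 北京 / 天安门, one per line; the exact tokens depend on the dictionary) rather than the single characters or rigid bigrams produced by the built-in analyzers.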


As this example shows, the Paoding analyzer is quite powerful. We won't go into its query grammar here; interested readers can consult the Chinese manual included in the archive.

Now let's see how it is used in practice.

First, build an index with the Paoding analyzer:

package com.qianyan.index;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestPaodingIndex {

    public static void main(String[] args) throws IOException {

        String[] ids = {"1", "2", "3", "4"};
        String[] names = {"张三", "李四", "李五", "赵六"};
        String[] addresses = {"居住在北京", "南京", "北京海淀", "南宁"};
        String[] birthdays = {"19820720", "19840203", "19770409", "19830130"};
        Analyzer analyzer = new PaodingAnalyzer();
        String indexDir = "E:/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        // true: create the index from scratch, overwriting any existing one
        IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
        for (int i = 0; i < ids.length; i++) {
            Document document = new Document();
            document.add(new Field("id", ids[i], Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("name", names[i], Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("address", addresses[i], Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("birthday", birthdays[i], Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(document);
        }
        writer.optimize();   // merge segments for faster searching
        writer.close();
    }

}
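
Note that the birthdays are stored as fixed-width yyyyMMdd strings. Because such strings sort correctly as plain text, this format is exactly what makes the range query in the QueryParser example further below (birthday:[19820720 TO 19840203]) work.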


Next comes a simple search class:

package com.qianyan.search;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestPaodingSearch {

    public static void main(String[] args) throws IOException {
        String indexDir = "E:/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexSearcher searcher = new IndexSearcher(dir);
        ScoreDoc[] hits = null;

        /* exact term match:
        Term term = new Term("address", "北京");
        TermQuery query = new TermQuery(term);
        */

        /* prefix match:
        Term term = new Term("name", "张");
        PrefixQuery query = new PrefixQuery(term); */

        // wildcard match
        Term term = new Term("name", "李*");
        WildcardQuery query = new WildcardQuery(term);

        TopDocs topDocs = searcher.search(query, 100);
        hits = topDocs.scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.print(hits[i].score + " ");
            System.out.print(doc.get("id") + " ");
            System.out.print(doc.get("name") + " ");
            System.out.print(doc.get("address") + " ");
            System.out.println(doc.get("birthday") + " ");
        }

        searcher.close();
        dir.close();
    }
}
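
A caveat on WildcardQuery: a pattern with a leading wildcard (e.g. *四) forces Lucene to scan the entire term dictionary and is far more expensive than a trailing-wildcard pattern like the 李* used here, so put the wildcard at the end whenever you can.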


Next, let's look at a QueryParser-based search class:

package com.qianyan.search;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestQueryParser {
    public static void main(String[] args) throws IOException, ParseException {
        Analyzer analyzer = new PaodingAnalyzer();
        String indexDir = "E:/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexSearcher searcher = new IndexSearcher(dir);
        ScoreDoc[] hits = null;

        QueryParser parser = new QueryParser("address", analyzer);  // "address" is the default search field
        Query query = parser.parse("北京");
        //Query query = parser.parse("birthday:[19820720 TO 19840203]"); // range query; [] includes the endpoints, {} excludes them
        //Query query = parser.parse("张~");    // fuzzy search
        //Query query = parser.parse("上海 北京");    // either term (default operator is OR)
        //Query query = parser.parse("(居住 OR 北京) AND 海淀");
        //Query query = parser.parse("上海 北京 AND NOT name:李四");
        //Query query = parser.parse("name:李*");    // wildcard (prefix) search on the name field
        TopDocCollector topdoc = new TopDocCollector(100);

        searcher.search(query, topdoc);
        hits = topdoc.topDocs().scoreDocs;

        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            //System.out.println(hits[i].score);
            System.out.print(doc.get("id") + " ");
            System.out.print(doc.get("name") + " ");
            System.out.print(doc.get("address") + " ");
            System.out.println(doc.get("birthday") + " ");
        }

        searcher.close();
        dir.close();
    }
}
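
Two details worth remembering about QueryParser: its default operator is OR, so a query like 上海 北京 matches documents containing either term, and the boolean operators AND, OR, NOT must be written in uppercase to be recognized as operators.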


Finally, let's look at indexing files with Paoding:

package com.qianyan.file;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestFileIndex {

    public static void main(String[] args) throws IOException {
        String dataDir = "E:\\lucenedata";
        String indexDir = "E:\\luceneindex";

        File[] files = new File(dataDir).listFiles();
        Analyzer analyzer = new PaodingAnalyzer();
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
        for (int i = 0; i < files.length; i++) {
            // read the whole file into memory, line by line
            StringBuffer strBuffer = new StringBuffer();
            FileInputStream fs = new FileInputStream(files[i]);
            BufferedReader reader = new BufferedReader(new InputStreamReader(fs));
            String line = reader.readLine();
            while (null != line) {
                strBuffer.append(line).append("\n");
                line = reader.readLine();
            }
            Document document = new Document();
            document.add(new Field("fileName", files[i].getName(), Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("contents", strBuffer.toString(), Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(document);
            reader.close();   // closing the reader also closes the underlying stream
        }
        writer.close();
        dir.close();
    }
}
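
One caveat about this class: the InputStreamReader is created with the platform default encoding. If your data files are saved in a different charset (say UTF-8 while the platform default is GBK), pass the charset explicitly, e.g. new InputStreamReader(fs, "UTF-8"), or the indexed Chinese text will be garbled.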


And here is a search class for the index we just built:

package com.qianyan.file;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TestSearch {

    public static void main(String[] args) throws IOException {
        String indexDir = "E:/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexSearcher searcher = new IndexSearcher(dir);
        ScoreDoc[] hits = null;

        Term term = new Term("fileName", "星期");
        TermQuery query = new TermQuery(term);

        TopDocs topDocs = searcher.search(query, 100);
        hits = topDocs.scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.print(hits[i].score + " ");
            System.out.println(doc.get("fileName") + " ");
            System.out.println(doc.get("contents") + " ");
        }

        searcher.close();
        dir.close();
    }
}