您的位置:首页 > 其它

Lucene 实现 Word、PDF 全文检索源码

2015-08-25 15:15 387 查看
创建索引:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.RichTextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
/**
* 创建索引 Lucene 3.0+
* @author Administrator
*
*/
public class indexer {

/**
 * Builds a fresh Lucene index from every regular file in the data directory.
 *
 * <p>Files ending in {@code .doc}, {@code .ppt}, {@code .xls} or {@code .pdf}
 * (matched case-insensitively) are routed to their dedicated extractors; every
 * other regular file is indexed as plain text. Sub-directories are skipped.
 *
 * @param args optional overrides: {@code args[0]} is the index directory and
 *             {@code args[1]} the directory holding the files to index; the
 *             original hard-coded locations are used when they are absent
 * @throws Exception if the data directory cannot be listed, or if text
 *                   extraction or index writing fails
 */
public static void main(String[] args) throws Exception {
    // Where the index files are stored.
    String indexDir = (args != null && args.length > 0) ? args[0] : "data\\test\\indexDir";
    // Where the files to be indexed are read from.
    String dateDir = (args != null && args.length > 1) ? args[1] : "data\\test\\dateDir";

    // listFiles() returns null when the path is missing or is not a directory.
    // Check before the writer is opened with create=true, so that a bad data
    // path does not wipe an existing index.
    File[] files = new File(dateDir).listFiles();
    if (files == null) {
        throw new FileNotFoundException("Cannot list data directory: " + dateDir);
    }

    Directory dir = new SimpleFSDirectory(new File(indexDir));
    try {
        // Arguments: the directory, the analyzer, create=true (start a new
        // index instead of appending to an existing one), and the cap on the
        // number of terms indexed per field (UNLIMITED = no cap).
        IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            for (File file : files) {
                if (!file.isFile()) {
                    // Opening a directory with FileReader would abort the whole run.
                    continue;
                }
                Document doc;
                String name = file.getName();
                if (hasExtension(name, ".doc")) {
                    doc = getDocument(file);
                } else if (hasExtension(name, ".ppt")) {
                    doc = getPPT(file);
                } else if (hasExtension(name, ".xls")) {
                    doc = getExcel(file);
                } else if (hasExtension(name, ".pdf")) {
                    doc = getPdf(file);
                } else {
                    // .txt and every unrecognised type are indexed as plain text.
                    doc = plainTextDocument(file);
                }
                if (doc != null) {
                    indexWriter.addDocument(doc);
                }
            }
            // Report how many documents the index now holds.
            System.out.println("numDocs:" + indexWriter.numDocs());
        } finally {
            // Always release the write lock, even when a file fails to index.
            indexWriter.close();
        }
    } finally {
        dir.close();
    }
}

/** Returns whether {@code fileName} ends with {@code extension}, ignoring case. */
private static boolean hasExtension(String fileName, String extension) {
    int offset = fileName.length() - extension.length();
    return offset >= 0 && fileName.regionMatches(true, offset, extension, 0, extension.length());
}

/** Builds the index document for a file whose content is read as plain text. */
private static Document plainTextDocument(File file) throws FileNotFoundException {
    Document doc = new Document();
    // FileReader decodes with the platform default charset.
    // NOTE(review): the reader is handed to Lucene and not closed here —
    // confirm that addDocument closes it once the field has been consumed.
    doc.add(new Field("contents", new FileReader(file)));
    doc.add(new Field("filename", file.getName(),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    return doc;
}

/**
 * Extracts the text of a Word ({@code .doc}) file and wraps it in a Lucene document.
 *
 * <p>Extraction is best-effort: if the file cannot be read or parsed the
 * failure is reported on standard error and the document is still returned,
 * with whatever text was collected before the failure (possibly none).
 *
 * @param file the Word file to read
 * @return a document with stored, analyzed {@code filename} and
 *         {@code contents} fields and a stored {@code indexDate} field
 * @throws Exception declared for callers; extraction failures themselves are
 *                   caught and reported rather than propagated
 */
public static Document getDocument(File file) throws Exception {
    String docPath = file.getAbsolutePath();
    String title = file.getName();

    StringBuilder contents = new StringBuilder();
    FileInputStream fs = null;
    try {
        fs = new FileInputStream(docPath);
        HWPFDocument doc = new HWPFDocument(fs);
        Range range = doc.getRange();
        int paragraphCount = range.numParagraphs();
        // Concatenate the text of every paragraph in document order.
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = range.getParagraph(i);
            contents.append(paragraph.text());
        }
    } catch (Exception e) {
        // Keep indexing the remaining files, but do not hide the failure.
        System.err.println("Failed to extract text from " + docPath);
        e.printStackTrace();
    } finally {
        if (fs != null) {
            try {
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    String cont = contents.toString().trim();

    Document document = new Document();
    document.add(new Field("filename", title, Field.Store.YES,
            Field.Index.ANALYZED));
    document.add(new Field("contents", cont, Field.Store.YES, Field.Index.ANALYZED));
    document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    return document;
}

/**
 * Extracts the text of a PowerPoint ({@code .ppt}) file and wraps it in a Lucene document.
 *
 * @param pptFile the presentation to read
 * @return a document with stored, analyzed {@code filename} and
 *         {@code contents} fields and a stored {@code indexDate} field
 * @throws IOException if the file cannot be opened or parsed
 */
public static Document getPPT(File pptFile) throws IOException {
    String title = pptFile.getName();

    StringBuilder contents = new StringBuilder();
    InputStream is = new FileInputStream(pptFile);
    try {
        SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
        Slide[] slides = ppt.getSlides();
        for (int i = 0; i < slides.length; i++) {
            // Each TextRun holds one block of text on the slide.
            TextRun[] runs = slides[i].getTextRuns();
            for (int j = 0; j < runs.length; j++) {
                String text = runs[j].getText();
                if (text != null) {
                    // Separate runs so the last word of one run is not fused
                    // with the first word of the next when analyzed.
                    contents.append(text).append('\n');
                }
            }
        }
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            // A failed close must not mask an extraction error.
            e.printStackTrace();
        }
    }

    String cont = contents.toString().trim();

    Document document = new Document();
    document.add(new Field("filename", title, Field.Store.YES,
            Field.Index.ANALYZED));
    document.add(new Field("contents", cont, Field.Store.YES, Field.Index.ANALYZED));
    document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    return document;
}

/**
 * Extracts the text of a PDF file and wraps it in a Lucene document.
 *
 * <p>Extraction is best-effort: if parsing fails the stack trace is printed
 * and the returned document has empty {@code contents}.
 *
 * @param pdf the PDF file to read
 * @return a document with stored, analyzed {@code filename} and
 *         {@code contents} fields and a stored {@code indexDate} field
 */
public static Document getPdf(File pdf) {
    String pdfpath = pdf.getAbsolutePath();
    String title = pdf.getName();
    String result = "";
    FileInputStream is = null;
    PDDocument doc = null;
    try {
        is = new FileInputStream(pdf);
        PDFParser parser = new PDFParser(is);
        parser.parse();
        doc = parser.getPDDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        result = stripper.getText(doc);
    } catch (Exception e) {
        System.err.println("Failed to extract text from " + pdfpath);
        e.printStackTrace();
    } finally {
        // Close the parsed document before the stream it was read from.
        if (doc != null) {
            try {
                doc.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (is != null) {
            try {
                is.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    Document document = new Document();
    document.add(new Field("filename", title, Field.Store.YES,
            Field.Index.ANALYZED));
    document.add(new Field("contents", result, Field.Store.YES,
            Field.Index.ANALYZED));
    // Record the indexing day, as the other extractors in this class do.
    document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    return document;
}

/**
 * Extracts the cell text of an Excel ({@code .xls}) workbook and wraps it in a Lucene document.
 *
 * <p>String cells are taken verbatim, date-formatted numeric cells are written
 * as {@code yyyy-MM-dd}, and other numeric cells are written as numbers (whole
 * values without a fractional part). Other cell types are skipped. Cells are
 * separated by a space and rows by a newline.
 *
 * @param fileExcel the workbook to read
 * @return a document with stored, analyzed {@code filename} and
 *         {@code contents} fields and a stored {@code indexDate} field
 * @throws Exception if the file cannot be opened or parsed
 */
public static Document getExcel(File fileExcel) throws Exception {
    StringBuilder content = new StringBuilder();
    // One formatter for the whole workbook instead of one per date cell.
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

    InputStream is = new FileInputStream(fileExcel);
    try {
        HSSFWorkbook workbook = new HSSFWorkbook(is);

        for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
            HSSFSheet aSheet = workbook.getSheetAt(numSheets);
            content.append("\n");
            if (null == aSheet) {
                continue;
            }
            for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
                content.append("\n");
                HSSFRow aRow = aSheet.getRow(rowNum);
                if (null == aRow) {
                    continue;
                }

                // getLastCellNum() is the index of the last cell PLUS ONE
                // (-1 for an empty row), so the bound is exclusive.
                short lastCellNum = aRow.getLastCellNum();
                for (short cellNum = 0; cellNum < lastCellNum; cellNum++) {
                    HSSFCell aCell = aRow.getCell(cellNum);
                    if (null == aCell) {
                        continue;
                    }

                    if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                        content.append(aCell.getRichStringCellValue().getString());
                    } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                        if (HSSFDateUtil.isCellDateFormatted(aCell)) {
                            Date date = aCell.getDateCellValue();
                            content.append(dateFormat.format(date));
                        } else {
                            // Plain numbers are indexed too; whole values are
                            // written without ".0" so "42" matches a cell holding 42.
                            double value = aCell.getNumericCellValue();
                            if (value == Math.rint(value) && Math.abs(value) < 1e15) {
                                content.append((long) value);
                            } else {
                                content.append(value);
                            }
                        }
                    } else {
                        continue;
                    }
                    // Keep neighbouring cells from fusing into a single token.
                    content.append(' ');
                }
            }
        }
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            // A failed close must not mask an extraction error.
            e.printStackTrace();
        }
    }

    String cont = content.toString();
    Document document = new Document();
    document.add(new Field("filename", fileExcel.getName(), Field.Store.YES,
            Field.Index.ANALYZED));
    document.add(new Field("contents", cont, Field.Store.YES,
            Field.Index.ANALYZED));
    // Record the indexing day, as the other extractors in this class do.
    document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    return document;
}

/**
 * Reads a local file as UTF-8 text.
 *
 * <p>Every line is returned terminated by a single {@code '\n'}, whatever
 * line separator the file uses. Reading is best-effort: on any failure the
 * stack trace is printed and the text read so far (possibly empty) is returned.
 *
 * @param urlString path of the file to read (a file-system path, despite the name)
 * @return the file content with normalised line endings; never {@code null}
 */
public static String readHtml(String urlString) {
    StringBuilder content = new StringBuilder();
    FileInputStream fis = null;
    try {
        fis = new FileInputStream(new File(urlString));
        // The charset must match the one the page was saved in, otherwise
        // the text comes out garbled.
        BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));

        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line).append('\n');
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close on every path, including when reading fails part-way.
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return content.toString();
}
}


搜索索引:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
/**
 * Searches the index (Lucene 3.0+).
 *
 * @author Administrator
 */
public class searcher {

    /**
     * Runs a query against the {@code contents} field and prints the stored
     * {@code filename} of up to ten hits.
     *
     * @param args optional: {@code args[0]} is the query text; {@code "arcgis"}
     *             is used when it is absent
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if the query text cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        // Where the index files are stored.
        String indexDir = "data\\test\\indexDir";
        String queryText = (args != null && args.length > 0) ? args[0] : "arcgis";

        Directory dir = new SimpleFSDirectory(new File(indexDir));
        try {
            // Unlike IndexWriter, a searcher only needs the index directory.
            IndexSearcher indexSearch = new IndexSearcher(dir);
            try {
                // Arguments: the Lucene version, the field searched by default,
                // and the analyzer applied to the query text.
                QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                        "contents", new StandardAnalyzer(Version.LUCENE_30));
                Query query = queryParser.parse(queryText);
                // TopDocs.scoreDocs holds the ids of the best-matching documents.
                TopDocs hits = indexSearch.search(query, 10);
                // totalHits is the total number of matches, not just those returned.
                System.out.println("找到了" + hits.totalHits + "个");
                // Load each hit by id and print its stored file name.
                for (ScoreDoc sdoc : hits.scoreDocs) {
                    Document doc = indexSearch.doc(sdoc.doc);
                    System.out.println(doc.get("filename"));
                }
            } finally {
                // Release the searcher even when parsing or searching fails.
                indexSearch.close();
            }
        } finally {
            dir.close();
        }
    }
}


  
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: