您的位置:首页 > 其它

最近学习Lucene,在别人基础上,做了一个小例子

2007-06-21 16:32 375 查看
最近学习Lucene,在别人基础上,做了一个小例子,以便共同学习!




import java.io.InputStream;




import lia.handlingtypes.framework.DocumentHandlerException;




import org.apache.lucene.document.Document;


import org.apache.lucene.document.Field;


import org.textmining.text.extraction.WordExtractor;






public class DocDocumentHandler implements DocumentHandler ...{






public Document getDocument(InputStream is) throws Exception ...{


// TODO Auto-generated method stub


String bodyText = null;






try ...{


bodyText = new WordExtractor().extractText(is);


}




catch (Exception e) ...{


throw new DocumentHandlerException(


"Cannot extract text from a Word document", e);


}






if ((bodyText != null) && (bodyText.trim().length() > 0)) ...{


Document doc = new Document();


doc.add(Field.UnStored("body", bodyText));


return doc;


}


return null;


}




}






import java.io.InputStream;




import org.apache.lucene.document.Document;








public interface DocumentHandler ...{


Document getDocument(InputStream is)


throws Exception;




}




import java.io.InputStream;




import org.apache.lucene.document.Document;


import org.apache.lucene.document.Field;


import org.w3c.dom.Element;


import org.w3c.dom.Node;


import org.w3c.dom.NodeList;


import org.w3c.dom.Text;


import org.w3c.tidy.Tidy;






public class HtmlDocumentHandler implements DocumentHandler ...{






public Document getDocument(InputStream is) throws Exception ...{


// TODO Auto-generated method stub


Tidy tidy = new Tidy();


tidy.setQuiet(true);


tidy.setShowWarnings(false);


org.w3c.dom.Document root = tidy.parseDOM(is, null);


Element rawDoc = root.getDocumentElement();


Document doc = new Document();


String title = getTitle(rawDoc);


String body = getBody(rawDoc);




if ((title != null) && (!title.equals(""))) ...{


doc.add(Field.Text("title", title));


}




if ((body != null) && (!body.equals(""))) ...{


doc.add(Field.Text("body", body));


}




return doc;


}




private String getTitle(Element rawDoc) ...{




if (rawDoc == null) ...{


return null;


}




String title = "";




NodeList children = rawDoc.getElementsByTagName("title");




if (children.getLength() > 0) ...{


Element titleElement = ((Element) children.item(0));


Text text = (Text) titleElement.getFirstChild();




if (text != null) ...{


title = text.getData();


}


}


return title;


}






/** *//**


* Gets the body text of the HTML document.


*


* @rawDoc the DOM Element to extract body Node from


* @return the body text


*/




private String getBody(Element rawDoc) ...{




if (rawDoc == null) ...{


return null;


}




String body = "";


NodeList children = rawDoc.getElementsByTagName("body");




if (children.getLength() > 0) ...{


body = getText(children.item(0));


}


return body;


}






/** *//**


* Extracts text from the DOM node.


*


* @param node a DOM node


* @return the text value of the node


*/




private String getText(Node node) ...{


NodeList children = node.getChildNodes();


StringBuffer sb = new StringBuffer();




for (int i = 0; i < children.getLength(); i++) ...{


Node child = children.item(i);




switch (child.getNodeType()) ...{


case Node.ELEMENT_NODE:


sb.append(getText(child));


sb.append(" ");


break;


case Node.TEXT_NODE:


sb.append(((Text) child).getData());


break;


}


}


return sb.toString();


}


}






import java.io.File;


import java.io.FileInputStream;


import java.io.FileOutputStream;


import java.io.IOException;


import java.io.InputStream;




import lia.handlingtypes.framework.DocumentHandlerException;




import org.apache.lucene.document.Document;


import org.apache.lucene.document.Field;


import org.pdfbox.cos.COSDocument;


import org.pdfbox.encryption.DecryptDocument;


import org.pdfbox.exceptions.CryptographyException;


import org.pdfbox.exceptions.InvalidPasswordException;


import org.pdfbox.pdfparser.PDFParser;


import org.pdfbox.pdmodel.PDDocument;


import org.pdfbox.pdmodel.PDDocumentInformation;


import org.pdfbox.searchengine.lucene.LucenePDFDocument;


import org.pdfbox.util.PDFTextStripper;






public class PdfDocumentHandler implements DocumentHandler ...{




public static String password = "-password";




public Document getDocument(InputStream is) throws Exception ...{


// TODO Auto-generated method stub


COSDocument cosDoc = null;




try ...{


cosDoc = parseDocument(is);


}




catch (IOException e) ...{


closeCOSDocument(cosDoc);


throw new DocumentHandlerException(


"Cannot parse PDF document", e);


}




// decrypt the PDF document, if it is encrypted




try ...{




if (cosDoc.isEncrypted()) ...{


DecryptDocument decryptor = new DecryptDocument(cosDoc);


decryptor.decryptDocument(password);


}


}




catch (CryptographyException e) ...{


closeCOSDocument(cosDoc);


throw new DocumentHandlerException(


"Cannot decrypt PDF document", e);


}




catch (InvalidPasswordException e) ...{


closeCOSDocument(cosDoc);


throw new DocumentHandlerException(


"Cannot decrypt PDF document", e);


}




catch (IOException e) ...{


closeCOSDocument(cosDoc);


throw new DocumentHandlerException(


"Cannot decrypt PDF document", e);


}




// extract PDF document's textual content


String docText = null;




try ...{


PDFTextStripper stripper = new PDFTextStripper();


docText = stripper.getText(new PDDocument(cosDoc));


}




catch (IOException e) ...{


closeCOSDocument(cosDoc);


throw new DocumentHandlerException(


"Cannot parse PDF document", e);


// String errS = e.toString();


// if (errS.toLowerCase().indexOf("font") != -1) {


// }


}




Document doc = new Document();




if (docText != null) ...{


doc.add(Field.UnStored("body", docText));


}




// extract PDF document's meta-data


PDDocument pdDoc = null;




try ...{


pdDoc = new PDDocument(cosDoc);


PDDocumentInformation docInfo =


pdDoc.getDocumentInformation();


String author = docInfo.getAuthor();


String title = docInfo.getTitle();


String keywords = docInfo.getKeywords();


String summary = docInfo.getSubject();




if ((author != null) && (!author.equals(""))) ...{


doc.add(Field.Text("author", author));


}




if ((title != null) && (!title.equals(""))) ...{


doc.add(Field.Text("title", title));


}




if ((keywords != null) && (!keywords.equals(""))) ...{


doc.add(Field.Text("keywords", keywords));


}




if ((summary != null) && (!summary.equals(""))) ...{


doc.add(Field.Text("summary", summary));


}


}




catch (Exception e) ...{


closeCOSDocument(cosDoc);


closePDDocument(pdDoc);


System.err.println("Cannot get PDF document meta-data: "


+ e.getMessage());


}




return doc;


}




private static COSDocument parseDocument(InputStream is)




throws IOException ...{


PDFParser parser = new PDFParser(is);


parser.parse();


return parser.getDocument();


}






private void closeCOSDocument(COSDocument cosDoc) ...{




if (cosDoc != null) ...{




try ...{


cosDoc.close();


}




catch (IOException e) ...{


// eat it, what else can we do?


}


}


}






private void closePDDocument(PDDocument pdDoc) ...{




if (pdDoc != null) ...{




try ...{


pdDoc.close();


}




catch (IOException e) ...{


// eat it, what else can we do?


}


}


}


}






import java.io.IOException;


import java.io.InputStream;




import javax.swing.text.BadLocationException;


import javax.swing.text.DefaultStyledDocument;


import javax.swing.text.rtf.RTFEditorKit;




import lia.handlingtypes.framework.DocumentHandlerException;




import org.apache.lucene.document.Document;


import org.apache.lucene.document.Field;






public class RtfDocumentHandler implements DocumentHandler ...{






public Document getDocument(InputStream is) throws Exception ...{


// TODO Auto-generated method stub


String bodyText = null;




DefaultStyledDocument styledDoc = new DefaultStyledDocument();




try ...{


new RTFEditorKit().read(is, styledDoc, 0);


bodyText = styledDoc.getText(0, styledDoc.getLength());


}




catch (IOException e) ...{


throw new DocumentHandlerException(


"Cannot extract text from a RTF document", e);


}




catch (BadLocationException e) ...{


throw new DocumentHandlerException(


"Cannot extract text from a RTF document", e);


}






if (bodyText != null) ...{


Document doc = new Document();


doc.add(Field.UnStored("body", bodyText));


return doc;


}


return null;


}




}






import java.io.BufferedReader;


import java.io.IOException;


import java.io.InputStream;


import java.io.InputStreamReader;




import lia.handlingtypes.framework.DocumentHandlerException;




import org.apache.lucene.document.Document;


import org.apache.lucene.document.Field;






public class TxtDocumentHandler implements DocumentHandler ...{






public Document getDocument(InputStream is) throws Exception ...{


// TODO Auto-generated method stub


String bodyText = "";






try ...{


BufferedReader br =


new BufferedReader(new InputStreamReader(is));


String line = null;




while ((line = br.readLine()) != null) ...{


bodyText += line;


}


br.close();


}




catch(IOException e) ...{


throw new DocumentHandlerException(


"Cannot read the text document", e);


}






if (!bodyText.equals("")) ...{


Document doc = new Document();


doc.add(Field.UnStored("body", bodyText));


return doc;


}




return null;


}




}






import java.io.File;


import java.util.Date;




import org.apache.lucene.document.Document;


import org.apache.lucene.queryParser.QueryParser;


import org.apache.lucene.search.BooleanQuery;


import org.apache.lucene.search.Hits;


import org.apache.lucene.search.IndexSearcher;


import org.apache.lucene.search.Query;


import org.apache.lucene.store.Directory;


import org.apache.lucene.store.FSDirectory;


import org.mira.lucene.analysis.IK_CAnalyzer;






/** *//**


* This code was originally written for


* Erik's Lucene intro java.net article


*/




public class Searcher ...{






public static void main(String[] args) throws Exception ...{




if (args.length != 1) ...{


throw new Exception("Usage: java " + Searcher.class.getName()


+ " <index dir> <query>");


}




// File indexDir = new File(args[0]);


// String q = args[1];


File indexDir = new File("E:/LUCENE/index");


String q=args[0];




if (!indexDir.exists() || !indexDir.isDirectory()) ...{


throw new Exception(indexDir +


" does not exist or is not a directory.");


}




search(indexDir, q);


}




public static void search(File indexDir, String q)




throws Exception ...{


Directory fsDir = FSDirectory.getDirectory(indexDir, false);


IndexSearcher is = new IndexSearcher(fsDir);




Query query = QueryParser.parse(q, "body",


new IK_CAnalyzer());


//在“body”中查找,必须要已经在create index中已经定义好


//QueryParser .parse(String query, String field, Analyzer analyzer),例如:


//query为检索词, field为检索的字段名, analyzer为分析器




long start = new Date().getTime();




// BooleanQuery m_BooleanQuery = new BooleanQuery();


// m_BooleanQuery.add(query,true,false);


Hits hits = is.search(query); //search


long end = new Date().getTime();




System.err.println("Found " + hits.length() +


" document(s) (in " + (end - start) +


" milliseconds) that matched query '" +


q + "':");






for (int i = 0; i < hits.length(); i++) ...{


Document doc = hits.doc(i);


System.out.println(doc.get("filename"));


// System.out.println(doc.getField("contents"));


}


}


}






import java.io.File;


import java.io.FileInputStream;


import java.io.IOException;


import java.util.Date;




import org.apache.lucene.analysis.standard.StandardAnalyzer;


import org.apache.lucene.document.Document;


import org.apache.lucene.document.Field;


import org.apache.lucene.index.IndexWriter;


//下面两个是网上下载的别人的中文分词器,对文中的中文进行分割


import org.mira.lucene.analysis.MIK_CAnalyzer;//(最大全切分)


import org.mira.lucene.analysis.IK_CAnalyzer;//(细粒度全切分)<------引用类




/** *//**


* This code was originally written for


* Erik's Lucene intro java.net article


*/




public class Indexer ...{


// private static Document doc = new Document();




public static void main(String[] args) throws Exception ...{


// if (args.length != 2) {


// throw new Exception("Usage: java " + Indexer.class.getName()


// + " <index dir> <data dir>");


// }


// File indexDir = new File(args[0]);


// File dataDir = new File(args[1]);


File indexDir = new File("E:/LUCENE/index");


File dataDir = new File("E:/LUCENE/test");


long start = new Date().getTime();


int numIndexed = index(indexDir, dataDir);


long end = new Date().getTime();




System.out.println("Indexing " + numIndexed + " files took "


+ (end - start) + " milliseconds");




// test 分词功能:


// System.out.println(new IK_CAnalyzer().tokenStream("用户本地系统中必须安装有Word的应用程序"," "));






}




public static int index(File indexDir, File dataDir)




throws IOException ...{






if (!dataDir.exists() || !dataDir.isDirectory()) ...{


throw new IOException(dataDir


+ " does not exist or is not a directory");


}




IndexWriter writer = new IndexWriter(indexDir,


new IK_CAnalyzer(), true);


writer.setUseCompoundFile(false);




indexDirectory(writer, dataDir);




int numIndexed = writer.docCount();


writer.optimize();


writer.close();


return numIndexed;


}




private static void indexDirectory(IndexWriter writer, File dir)




throws IOException ...{


Document doc = new Document();


File[] files = dir.listFiles();




for (int i = 0; i < files.length; i++) ...{


File f = files[i];




if (f.isDirectory()) ...{


indexDirectory(writer, f); // recurse


} else




...{




try ...{


doc=Factory(f);




} catch (Exception e) ...{


// TODO Auto-generated catch block


e.printStackTrace();


}


writer.addDocument(doc); //一定要将生成的Document加到Writer中去。


}


}


}








private static Document Factory(File f) throws Exception ...{


Document doc = new Document();


DocumentHandler handler=null;




if (f.getName().endsWith(".txt") || f.getName().endsWith(".java")) ...{


//doc = getTxtDocument(new FileInputStream(f));


handler=new TxtDocumentHandler();






} else if (f.getName().endsWith(".doc")) ...{


// doc = getDocument(new FileInputStream(f));


handler=new DocDocumentHandler();




} else if (f.getName().endsWith(".pdf")) ...{


// doc = LucenePDFDocument.getDocument(f);


handler=new PdfDocumentHandler();




} else if (f.getName().endsWith(".rtf")) ...{


// doc = getRtfDocument(new FileInputStream(f));


handler=new RtfDocumentHandler();


} else if (f.getName().endsWith(".html")




|| f.getName().endsWith(".htm")) ...{


// doc = getHtmlDocument(new FileInputStream(f));


handler=new HtmlDocumentHandler();




}




if(handler!=null)...{


doc=handler.getDocument(new FileInputStream(f));


doc.add(Field.Keyword("filename", f.getCanonicalPath()));


System.out.println("Indexing " + f.getCanonicalPath());


}


return doc;


}






}

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: