Java POI组件——简单提取Word、word转html、text、xml(仅支持doc,不支持docx)
2016-11-20 23:00
841 查看
需要添加的库
poi-3.15.jar
poi-ooxml-3.15.jar
poi-scratchpad-3.15.jar
poi-3.15.jar
poi-ooxml-3.15.jar
poi-scratchpad-3.15.jar
package com.poi.word; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.Writer; import java.util.ArrayList; import java.util.Date; import java.util.LinkedList; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.TransformerFactoryConfigurationError; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.poi.POITextExtractor; import org.apache.poi.hpsf.ClassID; import org.apache.poi.hpsf.CustomProperties; import org.apache.poi.hpsf.DocumentSummaryInformation; import org.apache.poi.hpsf.Property; import org.apache.poi.hpsf.Section; import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.hpsf.Thumbnail; import org.apache.poi.hpsf.wellknown.PropertyIDMap; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.AbstractWordConverter; import org.apache.poi.hwpf.converter.WordToFoConverter; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.converter.WordToTextConverter; import org.apache.poi.hwpf.extractor.WordExtractor; import org.w3c.dom.Document; /* poi对word的支持比较差,像word转html、text、xml仅支持doc,不支持docx */ public class PoiWordClass { private static void extract(String path) { InputStream is = null; WordExtractor extractor = null; try { is = new FileInputStream(path); extractor = new WordExtractor(is); System.out.println("\nextractor.getText()"); System.out.println(extractor.getText()); System.out.println("\nextractor.getTextFromPieces()"); System.out.println(extractor.getTextFromPieces()); 
System.out.println("\nextractor.getHeaderText()"); System.out.println(extractor.getHeaderText()); System.out.println("\nextractor.getFooterText()"); System.out.println(extractor.getFooterText()); System.out.println("\nextractor.getCommentsText()"); String[] commentsText = extractor.getCommentsText(); for (String str : commentsText) { System.out.println(str); } System.out.println("\nextractor.getEndnoteText()"); String[] endnoteText = extractor.getEndnoteText(); for (String str : endnoteText) { System.out.println(str); } System.out.println("\nextractor.getFootnoteText()"); String[] footnoteText = extractor.getFootnoteText(); for (String str : footnoteText) { System.out.println(str); } System.out.println("\nextractor.getMainTextboxText()"); String[] mainTextboxText = extractor.getMainTextboxText(); for (String str : mainTextboxText) { System.out.println(str); } System.out.println("\nextractor.getParagraphText()"); String[] paragraphText = extractor.getParagraphText(); for (String str : paragraphText) { System.out.println(str); } System.out.println("\nextractor.getDocSummaryInformation().toString()"); DocumentSummaryInformation docSummaryInformation = extractor.getDocSummaryInformation(); System.out.println(docSummaryInformation.toString()); System.out.println("\nextractor.getMetadataTextExtractor().toString()"); POITextExtractor metadataTextExtractor = extractor.getMetadataTextExtractor(); System.out.println(metadataTextExtractor.getText()); System.out.println("\nextractor.getSummaryInformation().toString()"); SummaryInformation summaryInformation = extractor.getSummaryInformation(); System.out.println(summaryInformation.toString()); print(docSummaryInformation); } catch (IOException e) { e.printStackTrace(); } finally { if (is != null) { try { is.close(); } catch (IOException e) { e.printStackTrace(); } } } } private static void print(DocumentSummaryInformation docSummaryInformation) { int applicationVersion = docSummaryInformation.getApplicationVersion(); int 
byteCount = docSummaryInformation.getByteCount(); int byteOrder = docSummaryInformation.getByteOrder(); String category = docSummaryInformation.getCategory(); int charCountWithSpaces = docSummaryInformation.getCharCountWithSpaces(); Class<? extends DocumentSummaryInformation> cls = docSummaryInformation.getClass(); String company = docSummaryInformation.getCompany(); String contentStatus = docSummaryInformation.getContentStatus(); String contentType = docSummaryInformation.getContentType(); CustomProperties customProperties = docSummaryInformation.getCustomProperties(); // java.lang.UnsupportedOperationException: Reading byte arrays is not yet implemented. // byte[] docparts = docSummaryInformation.getDocparts(); String documentVersion = docSummaryInformation.getDocumentVersion(); Section section = docSummaryInformation.getFirstSection(); int format = docSummaryInformation.getFormat(); // java.lang.UnsupportedOperationException: Reading byte arrays is not yet implemented. // byte[] headingPair = docSummaryInformation.getHeadingPair(); int hiddenCount = docSummaryInformation.getHiddenCount(); boolean hyperlinksChanged = docSummaryInformation.getHyperlinksChanged(); String language = docSummaryInformation.getLanguage(); int lineCount = docSummaryInformation.getLineCount(); boolean linksDirty = docSummaryInformation.getLinksDirty(); String manager = docSummaryInformation.getManager(); int mmClipCount = docSummaryInformation.getMMClipCount(); int noteCount = docSummaryInformation.getNoteCount(); int osVersion = docSummaryInformation.getOSVersion(); int parCount = docSummaryInformation.getParCount(); String presentationFormat = docSummaryInformation.getPresentationFormat(); Property[] properties = docSummaryInformation.getProperties(); PropertyIDMap propertyIDMap = docSummaryInformation.getPropertySetIDMap(); boolean scale = docSummaryInformation.getScale(); int sectionCount = docSummaryInformation.getSectionCount(); LinkedList<Section> sections = (LinkedList<Section>) 
docSummaryInformation.getSections(); // org.apache.poi.hpsf.NoSingleSectionException: Property set contains 2 sections. // Section singleSection = docSummaryInformation.getSingleSection(); int slideCount = docSummaryInformation.getSlideCount(); byte[] vbaDigitalSignature = docSummaryInformation.getVBADigitalSignature(); //由于内部是HashMap<Long, String>,故反射无效 // System.out.println("反射测试"); // Field[] fields = cls.getFields(); // System.out.println("fields.length = " + fields.length); // for (int i = 0; i < fields.length; i++) { // if (!fields[i].isAccessible()) { // fields[i].setAccessible(true); // } // try { // System.out.println(fields[i].getName() + " = " + fields[i].get(docSummaryInformation)); // } catch (IllegalArgumentException e) { // e.printStackTrace(); // } catch (IllegalAccessException e) { // e.printStackTrace(); // } // } } private static void print(POITextExtractor metadataTextExtractor) { Class<? extends POITextExtractor> cls = metadataTextExtractor.getClass(); POITextExtractor poiTextExtractor = metadataTextExtractor.getMetadataTextExtractor(); String text = metadataTextExtractor.getText(); } private static void print(SummaryInformation summaryInformation) { String applicationName = summaryInformation.getApplicationName(); String author = summaryInformation.getAuthor(); int byteOrder = summaryInformation.getByteOrder(); int charCount = summaryInformation.getCharCount(); Class<? 
extends SummaryInformation> cls = summaryInformation.getClass(); ClassID classID = summaryInformation.getClassID(); String comments = summaryInformation.getComments(); Date createDateTime = summaryInformation.getCreateDateTime(); long editTime = summaryInformation.getEditTime(); Section section = summaryInformation.getFirstSection(); int format = summaryInformation.getFormat(); String keywords = summaryInformation.getKeywords(); String lastAuthor = summaryInformation.getLastAuthor(); Date lastPrinted = summaryInformation.getLastPrinted(); Date lastSaveDateTime = summaryInformation.getLastSaveDateTime(); int osVersion = summaryInformation.getOSVersion(); int pageCount = summaryInformation.getPageCount(); Property[] properties = summaryInformation.getProperties(); PropertyIDMap propertySetIDMap = summaryInformation.getPropertySetIDMap(); String recNumber = summaryInformation.getRevNumber(); int sectionCount = summaryInformation.getSectionCount(); ArrayList<Section> sections = (ArrayList<Section>) summaryInformation.getSections(); int security = summaryInformation.getSecurity(); Section singleSection = summaryInformation.getSingleSection(); String subject = summaryInformation.getSubject(); String template = summaryInformation.getTemplate(); byte[] thumbnail = summaryInformation.getThumbnail(); Thumbnail thumbnailThumbnail = summaryInformation.getThumbnailThumbnail(); String title = summaryInformation.getTitle(); int wordCount = summaryInformation.getWordCount(); } enum ConverterType { HTML, TEXT, XML } private static void convert(String srcPath, String destPathWithoutExtension, ConverterType type) { InputStream is = null; Writer writer = null; try { is = new FileInputStream(srcPath); HWPFDocument hwpfDocument = new HWPFDocument(is); Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); AbstractWordConverter converter = null; String method = null; switch (type) { case HTML: converter = new WordToHtmlConverter(document); method = 
"html"; destPathWithoutExtension += ".html"; break; case TEXT: converter = new WordToTextConverter(document); method = "text"; destPathWithoutExtension += ".txt"; break; case XML: converter = new WordToFoConverter(document); method = "xml"; destPathWithoutExtension += ".xml"; break; } converter.processDocument(hwpfDocument); Transformer transformer = TransformerFactory.newInstance().newTransformer(); writer = new FileWriter(destPathWithoutExtension); transformer.setOutputProperty(OutputKeys.ENCODING, "gbk"); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); transformer.setOutputProperty(OutputKeys.METHOD, method); DOMSource domSource = new DOMSource(converter.getDocument()); StreamResult streamResult = new StreamResult(writer); transformer.transform(domSource, streamResult); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (ParserConfigurationException e) { e.printStackTrace(); } catch (TransformerConfigurationException e) { e.printStackTrace(); } catch (TransformerFactoryConfigurationError e) { e.printStackTrace(); } catch (TransformerException e) { e.printStackTrace(); } finally { if (is != null) { try { is.close(); } catch (IOException e) { e.printStackTrace(); } } if (writer != null) { try { writer.close(); } catch (IOException e) { e.printStackTrace(); } } } } private static void convertToHtml(String srcPath, String dstPath) { convert(srcPath, dstPath, ConverterType.HTML); } private static void convertToText(String srcPath, String dstPath) { convert(srcPath, dstPath, ConverterType.TEXT); } private static void convertToXml(String srcPath, String dstPath) { convert(srcPath, dstPath, ConverterType.XML); } public static void main(String[] args) { String path = "test.doc"; extract(path); convertToHtml(path, "test"); convertToText(path, "test"); convertToXml(path, "test"); } }
相关文章推荐
- java docx4j导出HTML为word(.docx)报错:org.docx4j.org.xhtmlrenderer.load INFO:: SAX XMLReader in use (parse
- JAVA使用POI操作word文档实例,兼容doc,docx(附源码)
- java读取word表格导入数据库,支持doc、docx
- 使用Java的POI工具进行Word的DOC文档转为HTML页面技术简介
- Java 使用jacob ppt文件转pptx,doc转docx;word 转html、pdf等
- Java:封装POI实现word的docx文件的简单模板功能
- poi word 转html (.DOC .DOCX )
- poi修改word文档doc/docx不支持图片
- java word转html(03,07) jacob,openoffice,poi
- 利用POI提取Word(.docx)文件的批注内容
- Java引用POI实现Word转Html方法
- java将html导出成word(利用的poi包导出)
- java将html导出成word(利用的poi包导出)
- JAVA实现word doc docx pdf excel的在线浏览 - 仿百度文库 源码
- poi实现word文档的导入(针对.doc .docx rtf)
- apache poi解析word(doc)文档成xml及导出成html
- poi操作word,简单写docx
- poi将word转换为html (对于word部分格式支持不是很好)
- Using POI to replace elements in WORD(.docx/.doc)(使用POI替换word中的特定字符/文字)【改进】
- 简单的提取html中的TextNode