您的位置:首页 > 编程语言 > Java开发

JAVA提取Word,Excel,PPT,PDF,TXT等文档文字内容

2017-02-24 14:17 776 查看
首先引入Maven依赖:
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.15</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.15</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.15</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.4</version>
</dependency>
public class ParseText {// 判断文档类型,调用不同的解析方法public static String parse(byte[] buffer, String suffix) {String text = "";switch (suffix) {case "doc":text = getTextFromWord(buffer);break;case "docx":text = getTextFromWord2007(buffer);break;case "xls":text = getTextFromExcel(buffer);break;case "xlsx":text = getTextFromExcel2007(buffer);break;case "ppt":text = getTextFromPPT(buffer);break;case "pptx":text = getTextFromPPT2007(buffer);break;case "pdf":text = getTextFormPDF(buffer);break;case "txt":text = getTextFormTxt(buffer);break;default:System.out.println("不支持解析的文档类型");}return text.replaceAll("\\s*", "");}// 读取Word97-2003的全部内容 docprivate static String getTextFromWord(byte[] file) {String text = "";InputStream fis = null;WordExtractor ex = null;try {// word 2003: 图片不会被读取fis = new ByteArrayInputStream(file);ex = new WordExtractor(fis);text = ex.getText();ex.close();} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}return text;}// 读取Word2007+的全部内容 docxprivate static String getTextFromWord2007(byte[] file) {String text = "";InputStream fis = null;XWPFDocument doc = null;XWPFWordExtractor workbook = null;try {fis = new ByteArrayInputStream(file);doc = new XWPFDocument(fis);workbook = new XWPFWordExtractor(doc);text = workbook.getText();workbook.close();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}return text;}// 读取Excel97-2003的全部内容 xlsprivate static String getTextFromExcel(byte[] file) {InputStream is = null;HSSFWorkbook wb = null;String text = "";try {is = new ByteArrayInputStream(file);wb = new HSSFWorkbook(new POIFSFileSystem(is));ExcelExtractor extractor = new ExcelExtractor(wb);extractor.setFormulasNotResults(false);extractor.setIncludeSheetNames(false);text = extractor.getText();extractor.close();} catch (IOException e) {e.printStackTrace();}return text;}// 读取Excel2007+的全部内容 xlsxprivate static String getTextFromExcel2007(byte[] file) {InputStream is = null;XSSFWorkbook workBook = null;String text = "";try 
{is = new ByteArrayInputStream(file);workBook = new XSSFWorkbook(is);XSSFExcelExtractor extractor = new XSSFExcelExtractor(workBook);extractor.setIncludeSheetNames(false);text = extractor.getText();extractor.close();} catch (IOException e) {e.printStackTrace();}return text;}// 读取Powerpoint97-2003的全部内容 pptprivate static String getTextFromPPT(byte[] file) {String text = "";InputStream fis = null;PowerPointExtractor ex = null;try {// word 2003: 图片不会被读取fis = new ByteArrayInputStream(file);ex = new PowerPointExtractor(fis);text = ex.getText();ex.close();} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}return text;}// 抽取幻灯片2007+全部内容 pptxprivate static String getTextFromPPT2007(byte[] file) {InputStream is = null;XMLSlideShow slide = null;String text = "";try {is = new ByteArrayInputStream(file);slide = new XMLSlideShow(is);XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(slide);text = extractor.getText();extractor.close();} catch (IOException e) {e.printStackTrace();}return text;}// 读取pdf文件全部内容 pdfprivate static String getTextFormPDF(byte[] file) {String text = "";PDDocument pdfdoc = null;InputStream is = null;try {is = new ByteArrayInputStream(file);pdfdoc = PDDocument.load(is);PDFTextStripper stripper = new PDFTextStripper();text = stripper.getText(pdfdoc);} catch (IOException e) {e.printStackTrace();} finally {try {if (pdfdoc != null) {pdfdoc.close();}} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}return text;}// 读取txt文件全部内容 txtprivate static String getTextFormTxt(byte[] file) {String text = "";try {String encoding = get_charset(file);text = new String(file, encoding);} catch (UnsupportedEncodingException e) {e.printStackTrace();} catch (IOException e1) {e1.printStackTrace();}return text;}// 获得txt文件编码方式private static String get_charset(byte[] file) throws IOException {String charset = "GBK";byte[] first3Bytes = new byte[3];InputStream bis = null;try {boolean checked = false;bis = new 
ByteArrayInputStream(file);bis.mark(0);int read = bis.read(first3Bytes, 0, 3);if (read == -1)return charset;if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {charset = "UTF-16LE";checked = true;} else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {charset = "UTF-16BE";checked = true;} else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB&& first3Bytes[2] == (byte) 0xBF) {charset = "UTF-8";checked = true;}bis.reset();if (!checked) {while ((read = bis.read()) != -1) {if (read >= 0xF0)break;if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBKbreak;if (0xC0 <= read && read <= 0xDF) {read = bis.read();if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)// (0x80 - 0xBF),也可能在GB编码内continue;elsebreak;} else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小read = bis.read();if (0x80 <= read && read <= 0xBF) {read = bis.read();if (0x80 <= read && read <= 0xBF) {charset = "UTF-8";break;} elsebreak;} elsebreak;}}}} catch (Exception e) {e.printStackTrace();} finally {if (bis != null) {bis.close();}}return charset;}}
// 读取pdf文件private static String getTextFormPDF(byte[] file) {String text = "";PDDocument pdfdoc = null;InputStream is = null;try {is = new ByteArrayInputStream(file);pdfdoc = PDDocument.load(is);PDFTextStripper stripper = new PDFTextStripper();text = stripper.getText(pdfdoc);} catch (IOException e) {e.printStackTrace();} finally {try {if (pdfdoc != null) {pdfdoc.close();}} catch (I4000OException e) {// TODO Auto-generated catch blocke.printStackTrace();}}return text;}// 读取txt文件private static String getTextFormTxt(byte[] file) {String text = "";try {String encoding = get_charset(file);text = new String(file, encoding);} catch (UnsupportedEncodingException e) {e.printStackTrace();} catch (IOException e1) {e1.printStackTrace();}return text;}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  java word excel 文档 pdf
相关文章推荐