您的位置:首页 > 编程语言 > Java开发

[java] 获取pdf/word文档文本内容

2013-03-13 16:24 169 查看
package com;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
//import java.io.FileInputStream;
//import java.io.FileNotFoundException;
//import java.io.IOException;
//import java.util.HashMap;
//import java.util.Iterator;
//import java.util.Map;
//
//import org.apache.poi.hwpf.HWPFDocument;
//import org.apache.poi.hwpf.model.FieldsDocumentPart;
//import org.apache.poi.hwpf.usermodel.Field;
//import org.apache.poi.hwpf.usermodel.Fields;
//import org.apache.poi.hwpf.usermodel.Range;

import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;

/**
 * Command-line demo that prints the text content of a PDF file using PDFBox.
 *
 * <p>A Word (.doc) variant based on Apache POI's {@code ExtractorFactory} is kept as a
 * commented-out sample at the top of {@link #main(String[])}.
 */
public class Test {

    /** PDF that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PDF_PATH = "D://知识积累//EL.pdf";

    /**
     * Extracts the text of a PDF file and prints it to standard output between a
     * "begin" and an "end" marker line.
     *
     * <p>If reading or parsing fails, the stack trace is printed and nothing else is
     * attempted, so a single failure no longer cascades into
     * {@code NullPointerException}s in the later steps.
     *
     * @param args optional; when at least one argument is present, {@code args[0]} is
     *             used as the PDF path, otherwise {@link #DEFAULT_PDF_PATH} is read
     */
    public static void main(String[] args) {
        /*
        try {

        // Word format
        String path="D:\\workspace\\MyPlatFileNew\\web\\content\\kent\\a6\\uploadattach\\iplat4j01361351007003_20130220170327.doc";
        System.out.println("========"+path);
        File inputFile = new File(path);

        POITextExtractor extractor = ExtractorFactory.createExtractor(inputFile);
        System.out.println("Document Text: ");
        System.out.println("====================");
        System.out.println(extractor.getText());
        System.out.println("====================");
        }catch (Exception ex) {
        ex.printStackTrace();
        }*/

        // PDF format
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        try {
            String text = extractPdfText(path);
            System.out.println("----------begin------------");
            System.out.println(text);
            System.out.println("-----------end-----------");
        } catch (IOException e) {
            // Covers a missing file (FileNotFoundException) as well as parse/extract errors.
            e.printStackTrace();
        }
    }

    /**
     * Parses the PDF at {@code path} and returns its text content.
     *
     * <p>Both the input stream and the parsed document are released in {@code finally}
     * blocks, whether or not extraction succeeds.
     *
     * @param path file system path of the PDF to read
     * @return the text produced by {@code PDFTextStripper} for the whole document
     * @throws IOException if the file cannot be opened, parsed, or read
     */
    private static String extractPdfText(String path) throws IOException {
        FileInputStream in = new FileInputStream(path);
        try {
            PDFParser parser = new PDFParser(in);
            parser.parse();
            try {
                return new PDFTextStripper().getText(parser.getPDDocument());
            } finally {
                // getPDDocument() wraps the parser's single underlying document, so closing
                // the wrapper obtained here releases the same parsed document used above.
                parser.getPDDocument().close();
            }
        } finally {
            in.close();
        }
    }

}

需要用到的jar包有pdfbox-1.7.1.jar,poi-3.9-20121203.jar,poi-ooxml-3.9-20121203.jar。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: