您的位置:首页 > 编程语言 > Java开发

JAVA中通过poi和pdfbox读取office文件和pdf文件内容

2017-11-15 10:33 1156 查看
          最近做了个文档管理系统,实现了公司文档资源在线化。因为涉及到全文搜索,所以需要读取文件的内容创建全文索引。

本人通过POI读取的office文件和pdfbox读取的pdf文件内容,具体代码如下:

(1)首先在工程中引入需要的jar包文件。如果项目用的是maven工程则需要在pom.xml文件中添加如下代码:

<!--读取pdf内容  -->
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.4</version>
</dependency>

<!--操作office的JAR包  -->
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.9</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.9</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.9</version>
</dependency>

(2)读取文件内容代码:

/**
* 读取ppt内容
* @param file
* @return
* @throws IOException
*/
public static String readPPT(String file) throws IOException {
StringBuilder sb = new StringBuilder();
SlideShow ppt = new SlideShow(new HSLFSlideShow(file));
Slide[] slides = ppt.getSlides();
//提取文本信息 
for (Slide each : slides) {
    TextRun[] textRuns = each.getTextRuns();
    for (int i=0 ;i< textRuns.length; i++ ) {
        RichTextRun[] richTextRuns = textRuns[i].getRichTextRuns();
        for (int j = 0; j < richTextRuns.length; j++) {
            sb.append(richTextRuns[j].getText());
        }
        sb.append("\n");
    }
        sb.append("\n");
}
return sb.toString();
}

/**
* 读取pptx文件内容
* @param file
* @return
* @throws IOException
* @throws XmlException
* @throws OpenXML4JException
*/
 
public static String readPPT2007(String file) throws IOException, XmlException, OpenXML4JException {
     return new XSLFPowerPointExtractor(POIXMLDocument.openPackage(file)).getText();   
}

/**
* 读取xls文件内容
*/
public static String readEXCEL(String file) throws IOException {
StringBuilder content = new StringBuilder();
HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(file));// 创建对Excel工作簿文件的引用
for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
if (null != workbook.getSheetAt(numSheets)) {
HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet
.getLastRowNum(); rowNumOfSheet++) {
if (null != aSheet.getRow(rowNumOfSheet)) {
HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行
for (short cellNumOfRow = 0; cellNumOfRow <= aRow
.getLastCellNum(); cellNumOfRow++) {
if (null != aRow.getCell(cellNumOfRow)) {
HSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值
if (convertCell(aCell).length() > 0) {
content.append(convertCell(aCell));
}
}
content.append("\n");
}
}
}
}
}
return content.toString();
}

/**
* 读取xlsx
* @param file
* @return
* @throws IOException
*/
public static String readEXCEL2007(String file) throws IOException {
StringBuilder content = new StringBuilder();
XSSFWorkbook workbook = new XSSFWorkbook(file);
for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
if (null != workbook.getSheetAt(numSheets)) {
XSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet
.getLastRowNum(); rowNumOfSheet++) {
if (null != aSheet.getRow(rowNumOfSheet)) {
XSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行
for (short cellNumOfRow = 0; cellNumOfRow <= aRow
.getLastCellNum(); cellNumOfRow++) {
if (null != aRow.getCell(cellNumOfRow)) {
XSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值
if (convertCell(aCell).length() > 0) {
content.append(convertCell(aCell));
}
}
content.append("\n");
}
}
}
}
}
return content.toString();
}

private static String convertCell(Cell cell) {
NumberFormat formater = NumberFormat.getInstance();
formater.setGroupingUsed(false);
String cellValue = "";
if (cell == null) {
return cellValue;
}
switch (cell.getCellType()) {
case HSSFCell.CELL_TYPE_NUMERIC:
cellValue = formater.format(cell.getNumericCellValue());
break;
case HSSFCell.CELL_TYPE_STRING:
cellValue = cell.getStringCellValue();
break;
case HSSFCell.CELL_TYPE_BLANK:
cellValue = cell.getStringCellValue();
break;
case HSSFCell.CELL_TYPE_BOOLEAN:
cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();
break;
case HSSFCell.CELL_TYPE_ERROR:
cellValue = String.valueOf(cell.getErrorCellValue());
break;
default:
cellValue = "";
}
return cellValue.trim();
}

/**
* 读取doc文件
* @param file
* @return
* @throws Exception
*/
public static String readWORD(String file) throws Exception {
String returnStr = "";
try {
WordExtractor wordExtractor = new WordExtractor(new FileInputStream(new File(file)));
returnStr = wordExtractor.getText();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return returnStr;
}

/**
* 读取docx文件
* @param file
* @return
* @throws Exception
*/
public static String readWORD2007(String file) throws Exception {
     return new XWPFWordExtractor(POIXMLDocument.openPackage(file)).getText();   
}

/**
* 读取txt文件
* @param file
* @return
* @throws IOException
*/
public  static String readTXT(String file) throws IOException {
String encoding = get_charset(new File(file));
if (encoding.equalsIgnoreCase("GBK")) {
return FileUtils.readFileToString(new File(file), "gbk");
} else {
return FileUtils.readFileToString(new File(file), "utf8");
}
}

private static String get_charset(File file) throws IOException {
String charset = "GBK";
byte[] first3Bytes = new byte[3];
BufferedInputStream bis = null;
try {
boolean checked = false;
bis = new BufferedInputStream(new FileInputStream(file));
bis.mark(0);
int read = bis.read(first3Bytes, 0, 3);
if (read == -1)
return charset;
if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
charset = "UTF-16LE";
checked = true;
} else if (first3Bytes[0] == (byte) 0xFE
&& first3Bytes[1] == (byte) 0xFF) {
charset = "UTF-16BE";
checked = true;
} else if (first3Bytes[0] == (byte) 0xEF
&& first3Bytes[1] == (byte) 0xBB
&& first3Bytes[2] == (byte) 0xBF) {
charset = "UTF-8";
checked = true;
}
bis.reset();
if (!checked) {
// int len = 0;
int loc = 0;

while ((read = bis.read()) != -1) {
loc++;
if (read >= 0xF0)
break;
if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK
break;
if (0xC0 <= read && read <= 0xDF) {
read = bis.read();
if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)
// (0x80
// - 0xBF),也可能在GB编码内
continue;
else
break;
} else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小
read = bis.read();
if (0x80 <= read && read <= 0xBF) {
read = bis.read();
if (0x80 <= read && read <= 0xBF) {
charset = "UTF-8";
break;
} else
break;
} else
break;
}
}
// System.out.println( loc + " " + Integer.toHexString( read )
// );
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if (bis != null) {
bis.close();
}
}
return charset;
}

/**
* 读取pdf文件内容
* @param file
* @return
* @throws IOException
*/
public static String readPDF(String file) throws IOException {
String result = null;
FileInputStream is = null;
PDDocument document = null;
try {
is = new FileInputStream(file);
PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
parser.parse();
document = parser.getPDDocument();
PDFTextStripper stripper = new PDFTextStripper();
result = stripper.getText(document);
} finally {
if (is != null) {
is.close();
}
if (document != null) {
document.close();
}
}
return result;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐