使用POI读取Word2007和Excel2007的例子(来自http://blog.csdn.net/ejbcreate/archive/2009/08/06/4419571.aspx)
2010-09-08 11:49
519 查看
好久没有在CSDN上写东西了。今天在实现全文检索功能时,用POI读取Word2007和Excel2007文档遇到了问题,于是对原来的功能做了一些扩展。以下是使用POI读取Word2007和Excel2007的例子:
package com.test;
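/**
* Required JARs:
* poi-3.0.2-FINAL-20080204.jar
* poi-contrib-3.0.2-FINAL-20080204.jar
* poi-scratchpad-3.0.2-FINAL-20080204.jar
* poi-3.5-beta6-20090622.jar
* geronimo-stax-api_1.0_spec-1.0.jar
* ooxml-schemas-1.0.jar
* openxml4j-bin-beta.jar
* poi-ooxml-3.5-beta6-20090622.jar
* xmlbeans-2.3.0.jar
* dom4j-1.6.1.jar
*
* NOTE(review): both poi-3.0.2 and poi-3.5-beta6 are listed; these look like two
* versions of the same core library on one classpath - confirm which is actually needed.
*/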
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
/**
 * Plain-text extraction helpers for Word (.doc / .docx) and Excel (.xls / .xlsx)
 * documents, built on Apache POI.
 *
 * <p>Cell values of a spreadsheet are concatenated without any separator, in
 * sheet / row / column order.
 * NOTE(review): for full-text indexing a separator between cells is probably
 * desirable; it is not added here so that existing output stays unchanged.
 */
public class WordAndExcelExtractor {

    /**
     * Demo entry point: extracts text from three hard-coded sample files and prints it.
     * Any failure is reported by printing the stack trace.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            String wordFile = "D:/松山血战.docx";
            String wordText2007 = WordAndExcelExtractor.extractTextFromDOC2007(wordFile);
            System.out.println("wordText2007=======" + wordText2007);

            InputStream is = new FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");
            try {
                String excelText = WordAndExcelExtractor.extractTextFromXLS(is);
                System.out.println("text2003==========" + excelText);
            } finally {
                // This method opened the stream, so it is responsible for closing it.
                is.close();
            }

            String excelFile = "D:/Hello2007.xlsx";
            String excelText2007 = WordAndExcelExtractor.extractTextFromXLS2007(excelFile);
            System.out.println("excelText2007==========" + excelText2007);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the plain text of a Word 97-2003 (.doc) document.
     *
     * @param is stream positioned at the start of the .doc file; the caller
     *           remains responsible for closing it
     * @return the document text as returned by {@code WordExtractor.getText()}
     * @throws IOException if the stream cannot be read as a Word document
     */
    public static String extractTextFromDOC(InputStream is) throws IOException {
        WordExtractor extractor = new WordExtractor(is);
        return extractor.getText();
    }

    /**
     * Extracts the plain text of a Word 2007 (.docx) document.
     *
     * @param fileName path of the .docx file
     * @return the document text as returned by {@code XWPFWordExtractor.getText()}
     * @throws IOException        if the file cannot be read
     * @throws OpenXML4JException if the OOXML package is malformed
     * @throws XmlException       if the document XML cannot be parsed
     */
    public static String extractTextFromDOC2007(String fileName) throws IOException, OpenXML4JException, XmlException {
        OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
        try {
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText();
        } finally {
            // revert() releases the package without writing anything back; the
            // file is only read here, so nothing must be saved.
            opcPackage.revert();
        }
    }

    /**
     * Extracts the plain text of an Excel 97-2003 (.xls) workbook.
     *
     * @param is stream positioned at the start of the .xls file; the caller
     *           remains responsible for closing it
     * @return all cell values concatenated in sheet / row / column order
     * @throws IOException if the stream cannot be read as a workbook
     */
    @SuppressWarnings("deprecation")
    private static String extractTextFromXLS(InputStream is)
            throws IOException {
        StringBuilder content = new StringBuilder();
        HSSFWorkbook workbook = new HSSFWorkbook(is);
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            // getLastRowNum() is the index of the last row, hence the inclusive bound.
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                HSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE, hence the exclusive bound.
                for (short cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    HSSFCell cell = row.getCell(cellIndex);
                    if (cell != null) {
                        appendCellText(content, cell);
                    }
                }
            }
        }
        return content.toString();
    }

    /**
     * Extracts the plain text of an Excel 2007 (.xlsx) workbook.
     *
     * @param fileName path of the .xlsx file
     * @return all cell values concatenated in sheet / row / column order
     * @throws Exception if the file cannot be opened or parsed
     */
    private static String extractTextFromXLS2007(String fileName) throws Exception {
        StringBuilder content = new StringBuilder();
        // Open the package explicitly so that it can be released afterwards.
        OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
        try {
            XSSFWorkbook workbook = new XSSFWorkbook(opcPackage);
            for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
                XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
                if (sheet == null) {
                    continue;
                }
                // getLastRowNum() is the index of the last row, hence the inclusive bound.
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    XSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is the last cell index PLUS ONE, hence the exclusive bound.
                    for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                        XSSFCell cell = row.getCell(cellIndex);
                        if (cell != null) {
                            appendCellText(content, cell);
                        }
                    }
                }
            }
        } finally {
            // Read-only use: release the file without saving anything back.
            opcPackage.revert();
        }
        return content.toString();
    }

    /**
     * Appends the value of an .xls cell to {@code out}. Formula cells contribute
     * their cached result; blank and error cells contribute nothing.
     */
    private static void appendCellText(StringBuilder out, HSSFCell cell) {
        int type = cell.getCellType();
        if (type == HSSFCell.CELL_TYPE_FORMULA) {
            // Calling getStringCellValue() on a formula whose cached result is
            // numeric or boolean throws, so dispatch on the cached result type.
            type = cell.getCachedFormulaResultType();
        }
        if (type == HSSFCell.CELL_TYPE_NUMERIC) {
            out.append(cell.getNumericCellValue());
        } else if (type == HSSFCell.CELL_TYPE_BOOLEAN) {
            out.append(cell.getBooleanCellValue());
        } else if (type == HSSFCell.CELL_TYPE_STRING) {
            out.append(cell.getStringCellValue());
        }
        // CELL_TYPE_BLANK and CELL_TYPE_ERROR carry no extractable text.
    }

    /**
     * Appends the value of an .xlsx cell to {@code out}. Formula cells contribute
     * their cached result; blank and error cells contribute nothing.
     */
    private static void appendCellText(StringBuilder out, XSSFCell cell) {
        int type = cell.getCellType();
        if (type == XSSFCell.CELL_TYPE_FORMULA) {
            // Same reasoning as for .xls cells: use the cached result type.
            type = cell.getCachedFormulaResultType();
        }
        if (type == XSSFCell.CELL_TYPE_NUMERIC) {
            out.append(cell.getNumericCellValue());
        } else if (type == XSSFCell.CELL_TYPE_BOOLEAN) {
            out.append(cell.getBooleanCellValue());
        } else if (type == XSSFCell.CELL_TYPE_STRING) {
            out.append(cell.getStringCellValue());
        }
        // CELL_TYPE_BLANK and CELL_TYPE_ERROR carry no extractable text.
    }
}
package com.test;
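/**
* Required JARs:
* poi-3.0.2-FINAL-20080204.jar
* poi-contrib-3.0.2-FINAL-20080204.jar
* poi-scratchpad-3.0.2-FINAL-20080204.jar
* poi-3.5-beta6-20090622.jar
* geronimo-stax-api_1.0_spec-1.0.jar
* ooxml-schemas-1.0.jar
* openxml4j-bin-beta.jar
* poi-ooxml-3.5-beta6-20090622.jar
* xmlbeans-2.3.0.jar
* dom4j-1.6.1.jar
*
* NOTE(review): both poi-3.0.2 and poi-3.5-beta6 are listed; these look like two
* versions of the same core library on one classpath - confirm which is actually needed.
*/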
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
/**
 * Plain-text extraction helpers for Word (.doc / .docx) and Excel (.xls / .xlsx)
 * documents, built on Apache POI.
 *
 * <p>Cell values of a spreadsheet are concatenated without any separator, in
 * sheet / row / column order.
 * NOTE(review): for full-text indexing a separator between cells is probably
 * desirable; it is not added here so that existing output stays unchanged.
 */
public class WordAndExcelExtractor {

    /**
     * Demo entry point: extracts text from three hard-coded sample files and prints it.
     * Any failure is reported by printing the stack trace.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            String wordFile = "D:/松山血战.docx";
            String wordText2007 = WordAndExcelExtractor.extractTextFromDOC2007(wordFile);
            System.out.println("wordText2007=======" + wordText2007);

            InputStream is = new FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");
            try {
                String excelText = WordAndExcelExtractor.extractTextFromXLS(is);
                System.out.println("text2003==========" + excelText);
            } finally {
                // This method opened the stream, so it is responsible for closing it.
                is.close();
            }

            String excelFile = "D:/Hello2007.xlsx";
            String excelText2007 = WordAndExcelExtractor.extractTextFromXLS2007(excelFile);
            System.out.println("excelText2007==========" + excelText2007);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the plain text of a Word 97-2003 (.doc) document.
     *
     * @param is stream positioned at the start of the .doc file; the caller
     *           remains responsible for closing it
     * @return the document text as returned by {@code WordExtractor.getText()}
     * @throws IOException if the stream cannot be read as a Word document
     */
    public static String extractTextFromDOC(InputStream is) throws IOException {
        WordExtractor extractor = new WordExtractor(is);
        return extractor.getText();
    }

    /**
     * Extracts the plain text of a Word 2007 (.docx) document.
     *
     * @param fileName path of the .docx file
     * @return the document text as returned by {@code XWPFWordExtractor.getText()}
     * @throws IOException        if the file cannot be read
     * @throws OpenXML4JException if the OOXML package is malformed
     * @throws XmlException       if the document XML cannot be parsed
     */
    public static String extractTextFromDOC2007(String fileName) throws IOException, OpenXML4JException, XmlException {
        OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
        try {
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText();
        } finally {
            // revert() releases the package without writing anything back; the
            // file is only read here, so nothing must be saved.
            opcPackage.revert();
        }
    }

    /**
     * Extracts the plain text of an Excel 97-2003 (.xls) workbook.
     *
     * @param is stream positioned at the start of the .xls file; the caller
     *           remains responsible for closing it
     * @return all cell values concatenated in sheet / row / column order
     * @throws IOException if the stream cannot be read as a workbook
     */
    @SuppressWarnings("deprecation")
    private static String extractTextFromXLS(InputStream is)
            throws IOException {
        StringBuilder content = new StringBuilder();
        HSSFWorkbook workbook = new HSSFWorkbook(is);
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            // getLastRowNum() is the index of the last row, hence the inclusive bound.
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                HSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE, hence the exclusive bound.
                for (short cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    HSSFCell cell = row.getCell(cellIndex);
                    if (cell != null) {
                        appendCellText(content, cell);
                    }
                }
            }
        }
        return content.toString();
    }

    /**
     * Extracts the plain text of an Excel 2007 (.xlsx) workbook.
     *
     * @param fileName path of the .xlsx file
     * @return all cell values concatenated in sheet / row / column order
     * @throws Exception if the file cannot be opened or parsed
     */
    private static String extractTextFromXLS2007(String fileName) throws Exception {
        StringBuilder content = new StringBuilder();
        // Open the package explicitly so that it can be released afterwards.
        OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
        try {
            XSSFWorkbook workbook = new XSSFWorkbook(opcPackage);
            for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
                XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
                if (sheet == null) {
                    continue;
                }
                // getLastRowNum() is the index of the last row, hence the inclusive bound.
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    XSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is the last cell index PLUS ONE, hence the exclusive bound.
                    for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                        XSSFCell cell = row.getCell(cellIndex);
                        if (cell != null) {
                            appendCellText(content, cell);
                        }
                    }
                }
            }
        } finally {
            // Read-only use: release the file without saving anything back.
            opcPackage.revert();
        }
        return content.toString();
    }

    /**
     * Appends the value of an .xls cell to {@code out}. Formula cells contribute
     * their cached result; blank and error cells contribute nothing.
     */
    private static void appendCellText(StringBuilder out, HSSFCell cell) {
        int type = cell.getCellType();
        if (type == HSSFCell.CELL_TYPE_FORMULA) {
            // Calling getStringCellValue() on a formula whose cached result is
            // numeric or boolean throws, so dispatch on the cached result type.
            type = cell.getCachedFormulaResultType();
        }
        if (type == HSSFCell.CELL_TYPE_NUMERIC) {
            out.append(cell.getNumericCellValue());
        } else if (type == HSSFCell.CELL_TYPE_BOOLEAN) {
            out.append(cell.getBooleanCellValue());
        } else if (type == HSSFCell.CELL_TYPE_STRING) {
            out.append(cell.getStringCellValue());
        }
        // CELL_TYPE_BLANK and CELL_TYPE_ERROR carry no extractable text.
    }

    /**
     * Appends the value of an .xlsx cell to {@code out}. Formula cells contribute
     * their cached result; blank and error cells contribute nothing.
     */
    private static void appendCellText(StringBuilder out, XSSFCell cell) {
        int type = cell.getCellType();
        if (type == XSSFCell.CELL_TYPE_FORMULA) {
            // Same reasoning as for .xls cells: use the cached result type.
            type = cell.getCachedFormulaResultType();
        }
        if (type == XSSFCell.CELL_TYPE_NUMERIC) {
            out.append(cell.getNumericCellValue());
        } else if (type == XSSFCell.CELL_TYPE_BOOLEAN) {
            out.append(cell.getBooleanCellValue());
        } else if (type == XSSFCell.CELL_TYPE_STRING) {
            out.append(cell.getStringCellValue());
        }
        // CELL_TYPE_BLANK and CELL_TYPE_ERROR carry no extractable text.
    }
}
相关文章推荐
- 哈哈,收藏源代码(本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/hakule/archive/2008/09/01/2861688.aspx)
- 本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/jinjazz/archive/2009/04/29/4138000.aspx
- .net 中viewstate的原理和使用 转 http://blog.csdn.net/greyls/archive/2007/07/08/1682570.aspx
- 使用POI读取Word207和Excel2007的例子
- 我的log4net使用手册(转自 http://blog.csdn.net/lyjcn/archive/2009/08/11/4432833.aspx)
- 生成100万条8位不重复数据的示例--来自http://blog.csdn.net/zjcxc/archive/2006/08/20/1099215.aspx
- FCKeditor使用说明http://blog.csdn.net/jhaij/archive/2007/05/01/1594124.aspx
- 在C#中使用异步Socket编程实现TCP网络服务的C/S的通讯构架(一)----基础类库部分(来源:http://blog.csdn.net/yangjundeng/archive/2005/03/17/321920.aspx)
- asp.net中生成、读取Rss http://blog.csdn.net/wszhoho/archive/2007/05/09/1601888.aspx
- 使用POI读取Word207和Excel2007的例子
- c#读取并修改App.config文件实例(转载:http://blog.csdn.net/abuhome/archive/2010/01/13/5184467.aspx)
- 使用信号实现异步通知机制的例子 http://blog.csdn.net/buaa_shang/article/details/9103155
- 使用 CL 编译器选项查看 C++ 类内存布局 (转载)http://blog.csdn.net/zhangcunli/archive/2009/10/23/4720781.aspx
- 在窗口应用中使用printf,cout 等将输出重定向到console收藏 http://blog.csdn.net/royer/archive/2007/01/25/1492968.aspx
- CImageList使用指南(http://blog.csdn.net/panfei10000/archive/2006/12/21/1452278.aspx)
- vs2005使用记趣(引用于http://blog.csdn.net/flyingjsj/archive/2008/08/02/2756496.aspx)
- group、grouping、rollup、cube的用法和区别(本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/junmail/archive/2006/12/26/1463132.aspx)
- Android应用程序使用Google Map (转http://blog.csdn.net/iefreer/archive/2009/09/20/4572879.aspx)
- [转] 一个分组查询的SQL 常用算法(附源码可直接执行) [来自--http://blog.csdn.net/rainbowsoftware/archive/2007/04/26/1585355.aspx]
- 成功的背后!(给所有IT人)(转载来自http://blog.csdn.net/ysuncn/archive/2007/10/07/1814127.aspx)