Java引用POI实现Word转Html方法
2012-12-17 16:54
597 查看
原文:
/article/9239771.html
1.
下载POI工具并引用
2.
读取整个doc文档,获得该文档的所有字符串。
3.
从该字符串中得到标题,把该标题构成一个HTML格式的字符串,如<html><head><title>测试文档</title></head><body>。
4.
从该文档中判断是否有表格,如有,把每个表格的开始偏移量,结束偏移量记录下来,同时根据每个表格的行,列读取表格的内容,并构造出表格的HTML字符串。
5.
从该字符串的第一个字符开始逐个字符循环,得到字符的字体,字号大小,直到下一个字符的字体,字号不一样时,把这些字符内容构造成一个HTML格式的字符串。
6.
如果碰到字符为回车符,制表符,把回车符,制表符构造成HTML格式的字符串。
7.
如果碰到字符为图片,读取图片,把图片保存到指定路径,再把该路径构造成HTML字符串,如<img src='c://test//1.jpg'/>。
8.
当读取位置等于某个表格的开始偏移量时,插入前面已构造出的该表格的HTML字符串,同时跳到该表格的结束偏移量,继续往下循环读取字符。
9.
由于以上读取是按字符串逐个读取,并且根据字符的变化同时构造出HTML字符串,所以当字符串读取完毕后,即构造出一个完整的HTML字符串。
10.
举例
Word文件
HTML文件
11.源代码
WordExcelToHtml.java
[java]
view plaincopyprint?
package com;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
public class WordExcelToHtml {
/**
* ASCII code of the carriage-return character.
*/
private static final short ENTER_ASCII = 13;
/**
* ASCII code of the space character.
*/
private static final short SPACE_ASCII = 32;
/**
* ASCII code of the horizontal-tab character.
*/
private static final short TABULATION_ASCII = 9;
// HTML produced for the document being converted; written by getWordAndStyle and readPicture.
public static String htmlText = "";
// HTML of the table most recently rendered by readTable.
public static String htmlTextTbl = "";
// Index of the last table recorded by readTable (readTable resets it to -1 before iterating).
public static int counter=0;
// Start offset of the table most recently visited by readTable.
public static int beginPosi=0;
// End offset of the table most recently visited by readTable.
public static int endPosi=0;
// Parallel arrays filled by readTable and indexed by table number:
// start offset, end offset and rendered HTML of each table.
public static int beginArray[];
public static int endArray[];
public static String htmlTextArray[];
// Set to true by readTable once at least one table has been found.
public static boolean tblExist=false;
// Path of the Word document that main converts.
public static final String inputFile="c://bb.doc";
/**
 * Command-line entry point: converts the document at {@code inputFile}.
 * Any failure is printed to standard error; it is not rethrown.
 *
 * @param argv command-line arguments (not used)
 */
public static void main(String[] argv) {
    try {
        getWordAndStyle(inputFile);
    } catch (Exception conversionFailure) {
        conversionFailure.printStackTrace();
    }
}
/**
* 读取每个文字样式
*
* @param fileName
* @throws Exception
*/
public static void getWordAndStyle(String fileName) throws Exception {
FileInputStream in = new FileInputStream(new File(fileName));
HWPFDocument doc = new HWPFDocument(in);
Range rangetbl = doc.getRange();//得到文档的读取范围
TableIterator it = new TableIterator(rangetbl);
int num=100;
beginArray=new int[num];
endArray=new int[num];
htmlTextArray=new String[num];
// 取得文档中字符的总数
int length = doc.characterLength();
// 创建图片容器
PicturesTable pTable = doc.getPicturesTable();
htmlText = "<html><head><title>" + doc.getSummaryInformation().getTitle() + "</title></head><body>";
// 创建临时字符串,好加以判断一串字符是否存在相同格式
if(it.hasNext())
{
readTable(it,rangetbl);
}
int cur=0;
String tempString = "";
for (int i = 0; i < length - 1; i++) {
// 整篇文章的字符通过一个个字符的来判断,range为得到文档的范围
Range range = new Range(i, i + 1, doc);
CharacterRun cr = range.getCharacterRun(0);
//beginArray=new int[num];
//endArray=new int[num];
//htmlTextArray=new String[num];
if(tblExist)
{
if(i==beginArray[cur])
{
htmlText+=tempString+htmlTextArray[cur];
tempString="";
i=endArray[cur]-1;
cur++;
continue;
}
}
if (pTable.hasPicture(cr)) {
htmlText += tempString ;
// 读写图片
readPicture(pTable, cr);
tempString = "";
}
else {
Range range2 = new Range(i + 1, i + 2, doc);
// 第二个字符
CharacterRun cr2 = range2.getCharacterRun(0);
char c = cr.text().charAt(0);
System.out.println(i+"::"+range.getEndOffset()+"::"+range.getStartOffset()+"::"+c);
// 判断是否为回车符
if (c == ENTER_ASCII)
{
tempString += "<br/>";
}
// 判断是否为空格符
else if (c == SPACE_ASCII)
tempString += " ";
// 判断是否为水平制表符
else if (c == TABULATION_ASCII)
tempString += " ";
// 比较前后2个字符是否具有相同的格式
boolean flag = compareCharStyle(cr, cr2);
if (flag)
tempString += cr.text();
else {
String fontStyle = "<span style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
if (cr.isBold())
fontStyle += "font-weight:bold;";
if (cr.isItalic())
fontStyle += "font-style:italic;";
htmlText += fontStyle + "" mce_style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
if (cr.isBold())
fontStyle += "font-weight:bold;";
if (cr.isItalic())
fontStyle += "font-style:italic;";
htmlText += fontStyle + "">" + tempString + cr.text() + "</span>";
tempString = "";
}
}
}
htmlText += tempString+"</body></html>";
writeFile(htmlText);
}
/**
 * Renders every table supplied by the iterator as an HTML {@code <table>} and records
 * where each one sits in the document.
 *
 * <p>For the n-th table (0-based, in iteration order) the start offset is stored in
 * {@code beginArray[n]}, the end offset in {@code endArray[n]} and the markup in
 * {@code htmlTextArray[n]}. On return {@code counter} holds the index of the last table
 * (or -1 if there was none), {@code tblExist} is true if at least one table was found,
 * and {@code beginPosi}, {@code endPosi} and {@code htmlTextTbl} describe the last table.
 *
 * <p>Each cell becomes exactly one {@code <td>}; the paragraphs inside a cell are joined
 * with {@code <br/>}, and a cell without text receives {@code &nbsp;}.
 *
 * @param it       iterator over the tables to render
 * @param rangetbl range the iterator was created from (not used by this method)
 * @throws Exception if a table cannot be read
 */
public static void readTable(TableIterator it, Range rangetbl) throws Exception {
    htmlTextTbl = "";
    counter = -1;
    while (it.hasNext()) {
        tblExist = true;
        Table tb = (Table) it.next();
        beginPosi = tb.getStartOffset();
        endPosi = tb.getEndOffset();
        counter = counter + 1;

        // Grow the bookkeeping arrays instead of overflowing them when the document has
        // more tables than the caller allocated room for. Assumes the three arrays were
        // allocated with the same length.
        if (counter >= beginArray.length) {
            int capacity = Math.max(1, beginArray.length * 2);
            int[] begins = new int[capacity];
            int[] ends = new int[capacity];
            String[] htmls = new String[capacity];
            System.arraycopy(beginArray, 0, begins, 0, counter);
            System.arraycopy(endArray, 0, ends, 0, counter);
            System.arraycopy(htmlTextArray, 0, htmls, 0, counter);
            beginArray = begins;
            endArray = ends;
            htmlTextArray = htmls;
        }
        beginArray[counter] = beginPosi;
        endArray[counter] = endPosi;

        StringBuilder table = new StringBuilder("<table border>");
        for (int i = 0; i < tb.numRows(); i++) {
            TableRow tr = tb.getRow(i);
            table.append("<tr>");
            for (int j = 0; j < tr.numCells(); j++) {
                TableCell td = tr.getCell(j);
                int cellWidth = td.getWidth();

                // Collect all paragraphs of the cell into a single <td>.
                StringBuilder cell = new StringBuilder();
                for (int k = 0; k < td.numParagraphs(); k++) {
                    Paragraph para = td.getParagraph(k);
                    String s = para.text().trim();
                    if (s.length() == 0) {
                        continue;
                    }
                    if (cell.length() > 0) {
                        cell.append("<br/>");
                    }
                    cell.append(s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;"));
                }
                if (cell.length() == 0) {
                    // A plain space would collapse; a non-breaking space keeps the cell rendered.
                    cell.append("&nbsp;");
                }
                table.append("<td width=").append(cellWidth).append(">").append(cell).append("</td>");
            }
            table.append("</tr>");
        }
        table.append("</table>");

        htmlTextTbl = table.toString();
        htmlTextArray[counter] = htmlTextTbl;
    }
}
/**
* 读写文档中的图片
*
* @param pTable
* @param cr
* @throws Exception
*/
public static void readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
// 提取图片
Picture pic = pTable.extractPicture(cr, false);
// 返回POI建议的图片文件名
String afileName = pic.suggestFullFileName();
OutputStream out = new FileOutputStream(new File("c://test" + File.separator + afileName));
pic.writeImageContent(out);
htmlText += "<img src="c://test//" + afileName + "" mce_src="c://test//" + afileName + ""/>";
}
/**
 * Tells whether two character runs share the same bold flag, italic flag, font name
 * and font size.
 *
 * @param cr1 first character run
 * @param cr2 second character run
 * @return true if all four properties match, false otherwise
 */
public static boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2) {
    if (cr1.isBold() != cr2.isBold()) {
        return false;
    }
    if (cr1.isItalic() != cr2.isItalic()) {
        return false;
    }
    if (!cr1.getFontName().equals(cr2.getFontName())) {
        return false;
    }
    return cr1.getFontSize() == cr2.getFontSize();
}
/**
 * Writes the given text to {@code c://abc.html}, replacing any existing file.
 * I/O failures are printed to standard error; they are not rethrown.
 *
 * @param s text to write
 */
public static void writeFile(String s) {
    FileOutputStream fos = null;
    BufferedWriter bw = null;
    try {
        File file = new File("c://abc.html");
        fos = new FileOutputStream(file);
        // NOTE(review): OutputStreamWriter without a charset uses the platform default
        // encoding; confirm this is what consumers of the HTML file expect.
        bw = new BufferedWriter(new OutputStreamWriter(fos));
        bw.write(s);
        // Push the buffered text out here so a failed write is reported by the catch below
        // rather than surfacing only when the writer is closed.
        bw.flush();
    } catch (FileNotFoundException fnfe) {
        fnfe.printStackTrace();
    } catch (IOException ioe) {
        ioe.printStackTrace();
    } finally {
        // Close each resource in its own try so that a failure on the writer neither
        // goes unreported nor prevents the stream from being closed.
        if (bw != null) {
            try {
                bw.close();
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        }
        if (fos != null) {
            try {
                fos.close();
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        }
    }
}
/article/9239771.html
1.
下载POI工具并引用
2.
读取整个doc文档,获得该文档的所有字符串。
3.
从该字符串中得到标题,把该标题构成一个HTML格式的字符串,如<html><head><title>测试文档</title></head><body>。
4.
从该文档中判断是否有表格,如有,把每个表格的开始偏移量,结束偏移量记录下来,同时根据每个表格的行,列读取表格的内容,并构造出表格的HTML字符串。
5.
从该字符串的第一个字符开始逐个字符循环,得到字符的字体,字号大小,直到下一个字符的字体,字号不一样时,把这些字符内容构造成一个HTML格式的字符串。
6.
如果碰到字符为回车符,制表符,把回车符,制表符构造成HTML格式的字符串。
7.
如果碰到字符为图片,读取图片,把图片保存到指定路径,再把该路径构造成HTML字符串,如<img src='c://test//1.jpg'/>。
8.
当读取位置等于某个表格的开始偏移量时,插入前面已构造出的该表格的HTML字符串,同时跳到该表格的结束偏移量,继续往下循环读取字符。
9.
由于以上读取是按字符串逐个读取,并且根据字符的变化同时构造出HTML字符串,所以当字符串读取完毕后,即构造出一个完整的HTML字符串。
10.
举例
Word文件
HTML文件
11.源代码
WordExcelToHtml.java
[java]
view plaincopyprint?
package com;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
public class WordExcelToHtml {
/**
* ASCII code of the carriage-return character.
*/
private static final short ENTER_ASCII = 13;
/**
* ASCII code of the space character.
*/
private static final short SPACE_ASCII = 32;
/**
* ASCII code of the horizontal-tab character.
*/
private static final short TABULATION_ASCII = 9;
// HTML produced for the document being converted; written by getWordAndStyle and readPicture.
public static String htmlText = "";
// HTML of the table most recently rendered by readTable.
public static String htmlTextTbl = "";
// Index of the last table recorded by readTable (readTable resets it to -1 before iterating).
public static int counter=0;
// Start offset of the table most recently visited by readTable.
public static int beginPosi=0;
// End offset of the table most recently visited by readTable.
public static int endPosi=0;
// Parallel arrays filled by readTable and indexed by table number:
// start offset, end offset and rendered HTML of each table.
public static int beginArray[];
public static int endArray[];
public static String htmlTextArray[];
// Set to true by readTable once at least one table has been found.
public static boolean tblExist=false;
// Path of the Word document that main converts.
public static final String inputFile="c://bb.doc";
/**
 * Command-line entry point: converts the document at {@code inputFile}.
 * Any failure is printed to standard error; it is not rethrown.
 *
 * @param argv command-line arguments (not used)
 */
public static void main(String[] argv) {
    try {
        getWordAndStyle(inputFile);
    } catch (Exception conversionFailure) {
        conversionFailure.printStackTrace();
    }
}
/**
* 读取每个文字样式
*
* @param fileName
* @throws Exception
*/
public static void getWordAndStyle(String fileName) throws Exception {
FileInputStream in = new FileInputStream(new File(fileName));
HWPFDocument doc = new HWPFDocument(in);
Range rangetbl = doc.getRange();//得到文档的读取范围
TableIterator it = new TableIterator(rangetbl);
int num=100;
beginArray=new int[num];
endArray=new int[num];
htmlTextArray=new String[num];
// 取得文档中字符的总数
int length = doc.characterLength();
// 创建图片容器
PicturesTable pTable = doc.getPicturesTable();
htmlText = "<html><head><title>" + doc.getSummaryInformation().getTitle() + "</title></head><body>";
// 创建临时字符串,好加以判断一串字符是否存在相同格式
if(it.hasNext())
{
readTable(it,rangetbl);
}
int cur=0;
String tempString = "";
for (int i = 0; i < length - 1; i++) {
// 整篇文章的字符通过一个个字符的来判断,range为得到文档的范围
Range range = new Range(i, i + 1, doc);
CharacterRun cr = range.getCharacterRun(0);
//beginArray=new int[num];
//endArray=new int[num];
//htmlTextArray=new String[num];
if(tblExist)
{
if(i==beginArray[cur])
{
htmlText+=tempString+htmlTextArray[cur];
tempString="";
i=endArray[cur]-1;
cur++;
continue;
}
}
if (pTable.hasPicture(cr)) {
htmlText += tempString ;
// 读写图片
readPicture(pTable, cr);
tempString = "";
}
else {
Range range2 = new Range(i + 1, i + 2, doc);
// 第二个字符
CharacterRun cr2 = range2.getCharacterRun(0);
char c = cr.text().charAt(0);
System.out.println(i+"::"+range.getEndOffset()+"::"+range.getStartOffset()+"::"+c);
// 判断是否为回车符
if (c == ENTER_ASCII)
{
tempString += "<br/>";
}
// 判断是否为空格符
else if (c == SPACE_ASCII)
tempString += " ";
// 判断是否为水平制表符
else if (c == TABULATION_ASCII)
tempString += " ";
// 比较前后2个字符是否具有相同的格式
boolean flag = compareCharStyle(cr, cr2);
if (flag)
tempString += cr.text();
else {
String fontStyle = "<span style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
if (cr.isBold())
fontStyle += "font-weight:bold;";
if (cr.isItalic())
fontStyle += "font-style:italic;";
htmlText += fontStyle + "" mce_style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
if (cr.isBold())
fontStyle += "font-weight:bold;";
if (cr.isItalic())
fontStyle += "font-style:italic;";
htmlText += fontStyle + "">" + tempString + cr.text() + "</span>";
tempString = "";
}
}
}
htmlText += tempString+"</body></html>";
writeFile(htmlText);
}
/**
 * Renders every table supplied by the iterator as an HTML {@code <table>} and records
 * where each one sits in the document.
 *
 * <p>For the n-th table (0-based, in iteration order) the start offset is stored in
 * {@code beginArray[n]}, the end offset in {@code endArray[n]} and the markup in
 * {@code htmlTextArray[n]}. On return {@code counter} holds the index of the last table
 * (or -1 if there was none), {@code tblExist} is true if at least one table was found,
 * and {@code beginPosi}, {@code endPosi} and {@code htmlTextTbl} describe the last table.
 *
 * <p>Each cell becomes exactly one {@code <td>}; the paragraphs inside a cell are joined
 * with {@code <br/>}, and a cell without text receives {@code &nbsp;}.
 *
 * @param it       iterator over the tables to render
 * @param rangetbl range the iterator was created from (not used by this method)
 * @throws Exception if a table cannot be read
 */
public static void readTable(TableIterator it, Range rangetbl) throws Exception {
    htmlTextTbl = "";
    counter = -1;
    while (it.hasNext()) {
        tblExist = true;
        Table tb = (Table) it.next();
        beginPosi = tb.getStartOffset();
        endPosi = tb.getEndOffset();
        counter = counter + 1;

        // Grow the bookkeeping arrays instead of overflowing them when the document has
        // more tables than the caller allocated room for. Assumes the three arrays were
        // allocated with the same length.
        if (counter >= beginArray.length) {
            int capacity = Math.max(1, beginArray.length * 2);
            int[] begins = new int[capacity];
            int[] ends = new int[capacity];
            String[] htmls = new String[capacity];
            System.arraycopy(beginArray, 0, begins, 0, counter);
            System.arraycopy(endArray, 0, ends, 0, counter);
            System.arraycopy(htmlTextArray, 0, htmls, 0, counter);
            beginArray = begins;
            endArray = ends;
            htmlTextArray = htmls;
        }
        beginArray[counter] = beginPosi;
        endArray[counter] = endPosi;

        StringBuilder table = new StringBuilder("<table border>");
        for (int i = 0; i < tb.numRows(); i++) {
            TableRow tr = tb.getRow(i);
            table.append("<tr>");
            for (int j = 0; j < tr.numCells(); j++) {
                TableCell td = tr.getCell(j);
                int cellWidth = td.getWidth();

                // Collect all paragraphs of the cell into a single <td>.
                StringBuilder cell = new StringBuilder();
                for (int k = 0; k < td.numParagraphs(); k++) {
                    Paragraph para = td.getParagraph(k);
                    String s = para.text().trim();
                    if (s.length() == 0) {
                        continue;
                    }
                    if (cell.length() > 0) {
                        cell.append("<br/>");
                    }
                    cell.append(s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;"));
                }
                if (cell.length() == 0) {
                    // A plain space would collapse; a non-breaking space keeps the cell rendered.
                    cell.append("&nbsp;");
                }
                table.append("<td width=").append(cellWidth).append(">").append(cell).append("</td>");
            }
            table.append("</tr>");
        }
        table.append("</table>");

        htmlTextTbl = table.toString();
        htmlTextArray[counter] = htmlTextTbl;
    }
}
/**
* 读写文档中的图片
*
* @param pTable
* @param cr
* @throws Exception
*/
public static void readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
// 提取图片
Picture pic = pTable.extractPicture(cr, false);
// 返回POI建议的图片文件名
String afileName = pic.suggestFullFileName();
OutputStream out = new FileOutputStream(new File("c://test" + File.separator + afileName));
pic.writeImageContent(out);
htmlText += "<img src="c://test//" + afileName + "" mce_src="c://test//" + afileName + ""/>";
}
/**
 * Tells whether two character runs share the same bold flag, italic flag, font name
 * and font size.
 *
 * @param cr1 first character run
 * @param cr2 second character run
 * @return true if all four properties match, false otherwise
 */
public static boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2) {
    if (cr1.isBold() != cr2.isBold()) {
        return false;
    }
    if (cr1.isItalic() != cr2.isItalic()) {
        return false;
    }
    if (!cr1.getFontName().equals(cr2.getFontName())) {
        return false;
    }
    return cr1.getFontSize() == cr2.getFontSize();
}
/**
 * Writes the given text to {@code c://abc.html}, replacing any existing file.
 * I/O failures are printed to standard error; they are not rethrown.
 *
 * @param s text to write
 */
public static void writeFile(String s) {
    FileOutputStream fos = null;
    BufferedWriter bw = null;
    try {
        File file = new File("c://abc.html");
        fos = new FileOutputStream(file);
        // NOTE(review): OutputStreamWriter without a charset uses the platform default
        // encoding; confirm this is what consumers of the HTML file expect.
        bw = new BufferedWriter(new OutputStreamWriter(fos));
        bw.write(s);
        // Push the buffered text out here so a failed write is reported by the catch below
        // rather than surfacing only when the writer is closed.
        bw.flush();
    } catch (FileNotFoundException fnfe) {
        fnfe.printStackTrace();
    } catch (IOException ioe) {
        ioe.printStackTrace();
    } finally {
        // Close each resource in its own try so that a failure on the writer neither
        // goes unreported nor prevents the stream from being closed.
        if (bw != null) {
            try {
                bw.close();
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        }
        if (fos != null) {
            try {
                fos.close();
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        }
    }
}
相关文章推荐
- Java引用POI实现Word转Html方法
- Java引用POI实现Word转Html方法
- java实现在线预览----poi操作word转html及03、07版本兼容问题
- java引用POI将Word转化为HTML
- java实现在线预览--poi实现word、excel、ppt转html
- java+poi实现word转html显示
- java+poi实现word转html显示
- java实现在线预览--poi实现word、excel、ppt转html
- 使用java的lambda表达式实现word count的两种方法
- java 调用jacob和java2word导出word,并实现将带有格式的html文本和图片插入word
- java poi word转html 报错
- POI实现java从Word中读取数据
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
- java 用poi 将远程共享文件夹中word转html
- php实现word转html的方法
- Java 中POI 导入EXCEL2003 和EXCEL2007的实现方法
- java poi解析word的方法
- java导出word实现方式二,poi
- Java使用poi将word转换为html
- Java中Excel导入功能实现、excel导入公共方法_POI -