您的位置:首页 > 编程语言 > Java开发

Java引用POI实现Word转Html方法

2012-12-17 16:54 597 查看
原文:
/article/9239771.html
1.
下载POI工具并引用



2.
读取整个doc文档,获得该文档的所有字符串。

3.
从该字符串中得到标题,把该标题构成一个HTML格式的字符串,如<html><head><title>测试文档</title></head><body>。

4.
从该文档中判断是否有表格,如有,把每个表格的开始偏移量,结束偏移量记录下来,同时根据每个表格的行,列读取表格的内容,并构造出表格的HTML字符串。

5.
从该字符串的第一个字符开始逐个字符循环,得到字符的字体,字号大小,直到下一个字符的字体,字号不一样时,把这些字符内容构造成一个HTML格式的字符串。

6.
如果碰到字符为回车符,制表符,把回车符,制表符构造成HTML格式的字符串。

7.
如果碰到字符为图片,读取图片,把图片放在指定路径,再把这一路径的信息构造成HTML字符串,如<img src='c://test//1.jpg'/>。

8.
如读取字符串的位置等于表格的开始偏移量时,插入前面已构造出的表格HTML字符串,同时跳过表格的结束偏移量,继续往下循环读取字符。

9.
由于以上读取是按字符串逐个读取,并且根据字符的变化同时构造出HTML字符串,所以当字符串读取完毕后,即构造出一个完整的HTML字符串。

10.
举例



Word文件



HTML文件

11.源代码

WordExcelToHtml.java

[java]
view plaincopyprint?

package com;

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.OutputStream;

import java.io.OutputStreamWriter;

import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.model.PicturesTable;

import org.apache.poi.hwpf.usermodel.CharacterRun;

import org.apache.poi.hwpf.usermodel.Picture;

import org.apache.poi.hwpf.usermodel.Range;

import org.apache.poi.hwpf.extractor.WordExtractor;

import org.apache.poi.hwpf.usermodel.Paragraph;

import org.apache.poi.hwpf.usermodel.Table;

import org.apache.poi.hwpf.usermodel.TableCell;

import org.apache.poi.hwpf.usermodel.TableIterator;

import org.apache.poi.hwpf.usermodel.TableRow;

public class WordExcelToHtml {

/**
 * ASCII code of the carriage-return character.
 */

private static final short ENTER_ASCII = 13;

/**
 * ASCII code of the space character.
 */

private static final short SPACE_ASCII = 32;

/**
 * ASCII code of the horizontal-tab character.
 */

private static final short TABULATION_ASCII = 9;

// HTML accumulated for the whole document; written out by writeFile at the end of getWordAndStyle.
public static String htmlText = "";

// HTML of the table currently being built by readTable.
public static String htmlTextTbl = "";

// Index of the most recently recorded table (readTable resets it to -1 and increments per table).
public static int counter=0;

// Start offset of the table currently being processed by readTable.
public static int beginPosi=0;

// End offset of the table currently being processed by readTable.
public static int endPosi=0;

// Start offset of each recorded table, indexed by table number.
public static int beginArray[];

// End offset of each recorded table, indexed by table number.
public static int endArray[];

// Pre-rendered HTML of each recorded table, indexed by table number.
public static String htmlTextArray[];

// Set to true by readTable once at least one table has been found.
public static boolean tblExist=false;

// Path of the Word document that main passes to getWordAndStyle.
public static final String inputFile="c://bb.doc";

/**
 * Program entry point: converts the document at {@link #inputFile} to HTML.
 * Any exception raised by the conversion is printed to standard error.
 *
 * @param argv command-line arguments (not used)
 */
public static void main(String[] argv) {
    try {
        getWordAndStyle(inputFile);
    } catch (Exception conversionFailure) {
        conversionFailure.printStackTrace();
    }
}

/**

* 读取每个文字样式

*

* @param fileName

* @throws Exception

*/

public static void getWordAndStyle(String fileName) throws Exception {

FileInputStream in = new FileInputStream(new File(fileName));

HWPFDocument doc = new HWPFDocument(in);

Range rangetbl = doc.getRange();//得到文档的读取范围

TableIterator it = new TableIterator(rangetbl);

int num=100;

beginArray=new int[num];

endArray=new int[num];

htmlTextArray=new String[num];

// 取得文档中字符的总数

int length = doc.characterLength();

// 创建图片容器

PicturesTable pTable = doc.getPicturesTable();

htmlText = "<html><head><title>" + doc.getSummaryInformation().getTitle() + "</title></head><body>";

// 创建临时字符串,好加以判断一串字符是否存在相同格式

if(it.hasNext())

{
readTable(it,rangetbl);
}

int cur=0;

String tempString = "";

for (int i = 0; i < length - 1; i++) {

// 整篇文章的字符通过一个个字符的来判断,range为得到文档的范围

Range range = new Range(i, i + 1, doc);

CharacterRun cr = range.getCharacterRun(0);

//beginArray=new int[num];

//endArray=new int[num];

//htmlTextArray=new String[num];

if(tblExist)

{
if(i==beginArray[cur])

{
htmlText+=tempString+htmlTextArray[cur];
tempString="";

i=endArray[cur]-1;

cur++;
continue;

}
}
if (pTable.hasPicture(cr)) {

htmlText += tempString ;
// 读写图片

readPicture(pTable, cr);
tempString = "";

}
else {

Range range2 = new Range(i + 1, i + 2, doc);

// 第二个字符

CharacterRun cr2 = range2.getCharacterRun(0);

char c = cr.text().charAt(0);

System.out.println(i+"::"+range.getEndOffset()+"::"+range.getStartOffset()+"::"+c);

// 判断是否为回车符

if (c == ENTER_ASCII)

{
tempString += "<br/>";

}
// 判断是否为空格符

else if (c == SPACE_ASCII)

tempString += " ";

// 判断是否为水平制表符

else if (c == TABULATION_ASCII)

tempString += " ";

// 比较前后2个字符是否具有相同的格式

boolean flag = compareCharStyle(cr, cr2);

if (flag)

tempString += cr.text();
else {

String fontStyle = "<span style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";

if (cr.isBold())

fontStyle += "font-weight:bold;";

if (cr.isItalic())

fontStyle += "font-style:italic;";

htmlText += fontStyle + "" mce_style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";

if (cr.isBold())

fontStyle += "font-weight:bold;";

if (cr.isItalic())

fontStyle += "font-style:italic;";

htmlText += fontStyle + "">" + tempString + cr.text() + "</span>";

tempString = "";

}
}
}

htmlText += tempString+"</body></html>";

writeFile(htmlText);
}

/**
 * Renders every table produced by {@code it} as an HTML {@code <table>} and
 * records it for later insertion into the document HTML.
 *
 * <p>For the table with index {@code n} (0-based, in iteration order) the
 * start offset is stored in {@code beginArray[n]}, the end offset in
 * {@code endArray[n]} and the rendered HTML in {@code htmlTextArray[n]}.
 * {@code counter} is left at the index of the last table (or -1 if there were
 * none) and {@code tblExist} is set to {@code true} when at least one table is
 * found. Each table cell becomes exactly one {@code <td>}; multiple paragraphs
 * inside a cell are separated with {@code <br/>}.
 *
 * @param it       iterator over the document's tables
 * @param rangetbl range the iterator was created from (not used by this method)
 * @throws Exception declared for callers; propagates any failure raised while
 *                   reading the tables
 */
public static void readTable(TableIterator it, Range rangetbl) throws Exception {

    htmlTextTbl = "";
    counter = -1;

    // Iterate over the tables in the document.
    while (it.hasNext()) {
        tblExist = true;

        Table tb = (Table) it.next();
        beginPosi = tb.getStartOffset();
        endPosi = tb.getEndOffset();

        System.out.println("............"+beginPosi+"...."+endPosi);

        counter = counter + 1;
        ensureTableCapacity(counter);
        beginArray[counter] = beginPosi;
        endArray[counter] = endPosi;

        StringBuilder tableHtml = new StringBuilder("<table border>");

        // Iterate over rows, starting at 0.
        for (int i = 0; i < tb.numRows(); i++) {
            TableRow tr = tb.getRow(i);
            tableHtml.append("<tr>");

            // Iterate over cells, starting at 0.
            for (int j = 0; j < tr.numCells(); j++) {
                TableCell td = tr.getCell(j);
                int cellWidth = td.getWidth();

                // Collect the text of all paragraphs of this cell into one <td>.
                StringBuilder cellText = new StringBuilder();
                for (int k = 0; k < td.numParagraphs(); k++) {
                    Paragraph para = td.getParagraph(k);
                    String s = para.text().toString().trim();
                    if (k > 0) {
                        cellText.append("<br/>");
                    }
                    cellText.append(s);
                    System.out.println(s);
                    System.out.println(i+":"+j+":"+cellWidth+":"+s);
                }

                // Empty cell: emit a placeholder so the cell still has content.
                // NOTE(review): the published listing uses a plain space here; it was
                // presumably &nbsp; before the page was rendered - confirm.
                if (cellText.length() == 0) {
                    cellText.append(" ");
                }

                tableHtml.append("<td width=").append(cellWidth).append(">")
                        .append(cellText).append("</td>");
            }

            tableHtml.append("</tr>");
        }

        tableHtml.append("</table>");

        htmlTextTbl = tableHtml.toString();
        htmlTextArray[counter] = htmlTextTbl;
    }
}

/**
 * Makes sure the three per-table arrays can hold an entry at {@code index},
 * allocating or growing them (contents preserved) when necessary.
 */
private static void ensureTableCapacity(int index) {
    int current = (beginArray == null) ? 0 : beginArray.length;
    if (endArray != null && endArray.length < current) {
        current = endArray.length;
    }
    if (htmlTextArray == null) {
        current = 0;
    } else if (htmlTextArray.length < current) {
        current = htmlTextArray.length;
    }
    if (index < current) {
        return;
    }

    int grown = Math.max(index + 1, Math.max(16, current * 2));
    int[] newBegin = new int[grown];
    int[] newEnd = new int[grown];
    String[] newHtml = new String[grown];
    if (current > 0) {
        System.arraycopy(beginArray, 0, newBegin, 0, current);
        System.arraycopy(endArray, 0, newEnd, 0, current);
        System.arraycopy(htmlTextArray, 0, newHtml, 0, current);
    }
    beginArray = newBegin;
    endArray = newEnd;
    htmlTextArray = newHtml;
}

/**

* 读写文档中的图片

*

* @param pTable

* @param cr

* @throws Exception

*/

public static void readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {

// 提取图片

Picture pic = pTable.extractPicture(cr, false);

// 返回POI建议的图片文件名

String afileName = pic.suggestFullFileName();

OutputStream out = new FileOutputStream(new File("c://test" + File.separator + afileName));

pic.writeImageContent(out);
htmlText += "<img src="c://test//" + afileName + "" mce_src="c://test//" + afileName + ""/>";

}

/**
 * Tells whether two character runs share the same style.
 *
 * <p>The runs are considered equal in style when their bold flag, italic
 * flag, font name and font size all match.
 *
 * @param cr1 first character run
 * @param cr2 second character run
 * @return {@code true} if all four attributes match, {@code false} otherwise
 */
public static boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2) {
    if (cr1.isBold() != cr2.isBold()) {
        return false;
    }
    if (cr1.isItalic() != cr2.isItalic()) {
        return false;
    }
    if (!cr1.getFontName().equals(cr2.getFontName())) {
        return false;
    }
    return cr1.getFontSize() == cr2.getFontSize();
}

/**
 * Writes the given text to {@code c://abc.html}, replacing any existing file.
 *
 * <p>I/O failures are reported on standard error and not rethrown, so the
 * caller is not interrupted. The writer is created without an explicit
 * charset, so the platform default encoding is used.
 *
 * @param s text to write
 */
public static void writeFile(String s) {

    FileOutputStream fos = null;
    BufferedWriter bw = null;

    try {
        File file = new File("c://abc.html");
        fos = new FileOutputStream(file);
        bw = new BufferedWriter(new OutputStreamWriter(fos));
        bw.write(s);
    } catch (FileNotFoundException fnfe) {
        fnfe.printStackTrace();
    } catch (IOException ioe) {
        ioe.printStackTrace();
    } finally {
        // Close the writer first: closing it flushes buffered text, and a failure
        // there means data was lost, so it is reported instead of being ignored.
        try {
            if (bw != null) {
                bw.close();
            }
        } catch (IOException ie) {
            ie.printStackTrace();
        }
        // Closed separately so the stream is released even if the writer's close fails.
        try {
            if (fos != null) {
                fos.close();
            }
        } catch (IOException ie) {
            ie.printStackTrace();
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: