您的位置:首页 > Web前端 > JavaScript

POI读取第三方下载的Word文档

2019-12-20 11:04 337 查看

3 月,跳不动了?>>>

从第三方下载的 Word 文档,实际上可能是由其他格式(例如 HTML)另存或改扩展名得到的,此时用 POI 按 Word 格式读取可能会失败。这里以 HTML 为例。

依赖
<!-- parse Word documents -->
<!-- poi-ooxml: provides XWPFDocument, used below for .docx files -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.0.1</version>
</dependency>

<!-- poi-scratchpad: provides HWPFDocument, used below for legacy .doc files -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.0.1</version>
</dependency>

<!-- jsoup: HTML parser used as a fallback when the "Word" file is really HTML -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
代码片段
/**
 * Opens the file at {@code path} as a Word document, falling back to HTML parsing when POI
 * reports that the file is really an HTML document.
 *
 * <p>The extension decides which POI class is used: {@code HWPFDocument} for paths ending in
 * {@code FileType.DOC}, {@code XWPFDocument} for paths ending in {@code FileType.DOCX}. The
 * actual content extraction is left as a placeholder, so the method currently always returns
 * {@code null} when it completes normally.
 *
 * @param path location of the file to parse; must not be null or blank
 * @return currently always {@code null}
 * @throws ParseWordException if the path is empty, the file cannot be opened, the extension is
 *     not supported, or the content cannot be parsed or converted
 */
private String parseWord(String path) throws ParseWordException {
    // inspect
    if (isEmpty(path)) {
        throw new ParseWordException(Code.PARAM_EMPTY.getCode(), Code.PARAM_EMPTY.getMessage());
    }

    // reader
    File file = new File(path);
    FileInputStream opened;
    try {
        opened = new FileInputStream(file);
    } catch (FileNotFoundException e) {
        throw new ParseWordException(Code.READER_FILE_FAILURE.getCode(), Code.READER_FILE_FAILURE.getMessage());
    }

    // parse
    // try-with-resources closes the stream on every exit path, and closes it BEFORE the catch
    // blocks run, so the HTML fallback reopens the file without a dangling handle.
    try (FileInputStream fis = opened) {
        String upperPath = path.toUpperCase();
        if (upperPath.endsWith(FileType.DOC.toString())) {
            try (HWPFDocument wordDoc = new HWPFDocument(fis)) {
                // read the .doc content here (left to the caller's needs)
            }
        } else if (upperPath.endsWith(FileType.DOCX.toString())) {
            try (XWPFDocument wordDocx = new XWPFDocument(fis)) {
                // read the .docx content here (left to the caller's needs)
            }
        } else {
            // unsupported file type
            throw new ParseWordException(Code.FILE_TYPE_ILLEGAL.getCode(), Code.FILE_TYPE_ILLEGAL.getMessage());
        }
    } catch (IllegalArgumentException ie) {
        String reason = ie.getMessage();
        System.out.println(reason);
        if (isEmpty(reason)) {
            throw new ParseWordException(Code.PARAM_EMPTY.getCode(), Code.PARAM_EMPTY.getMessage());
        }
        if (!reason.contains("The document is really a HTML file")) {
            // Any other rejection used to be swallowed, making the method return null as if
            // parsing had succeeded; report it as a parse failure instead.
            throw new ParseWordException(Code.PARSE_FAILURE.getCode(), Code.PARSE_FAILURE.getMessage());
        }
        // format conversion: treat the file as HTML
        printHtmlTableText(file);
    } catch (IOException e) {
        throw new ParseWordException(Code.PARSE_FAILURE.getCode(), Code.PARSE_FAILURE.getMessage());
    }
    return null;
}

/**
 * HTML fallback: copies {@code file} to an .html file via {@code parseHtml}, parses it with
 * jsoup and prints every non-blank span text found inside table cells.
 *
 * @param file the file that POI rejected as being HTML
 * @throws ParseWordException with {@code FILE_CONVERT_FAILURE} if copying or parsing fails
 */
private void printHtmlTableText(File file) throws ParseWordException {
    try {
        String htmlPath = parseHtml(file);
        Document doc = Jsoup.parse(new File(htmlPath), "GBK"); // charset: adjust to the source
        // reads every tbody element; adjust the selector to the document at hand
        Elements elements = doc.select("table").select("tbody");
        elements.forEach(tbody -> {
            // reads all span elements inside td; images, if any, need separate handling
            tbody.select("td").select("span").eachText().stream()
                    .filter(d -> d != null && d.trim().length() > 0)
                    .forEach(System.out::println);
        });
    } catch (IOException e) {
        throw new ParseWordException(Code.FILE_CONVERT_FAILURE.getCode(), Code.FILE_CONVERT_FAILURE.getMessage());
    }
}

/**
 * Copies {@code readerFile} byte-for-byte to a fixed .html file and returns that file's path,
 * so the content can be handed to an HTML parser.
 *
 * @param readerFile the file whose bytes are copied
 * @return the path of the written .html copy
 * @throws IOException if the source cannot be read or the copy cannot be written
 */
private String parseHtml(File readerFile) throws IOException {
    String tempPath = "d:\\1.html"; // temporary file location; change it to suit your machine

    File outFile = new File(tempPath);
    if (outFile.exists()) {
        outFile.delete(); // remove a copy left over from an earlier run
    }

    // Both streams were previously never closed, leaking file handles on every call;
    // try-with-resources closes them even when the copy fails midway.
    try (FileInputStream fis = new FileInputStream(readerFile);
         FileOutputStream fileOutputStream = new FileOutputStream(outFile)) {
        byte[] buffer = new byte[1024];
        int len;
        while ((len = fis.read(buffer)) != -1) {
            fileOutputStream.write(buffer, 0, len);
        }
    }

    return tempPath;
}

/**
 * Demo entry point: parses the sample document at {@code D:\aaa.doc}.
 *
 * @param args unused
 * @throws IOException declared by the original signature
 * @throws ParseWordException if the sample document cannot be parsed
 */
public static void main(String[] args) throws IOException, ParseWordException {
    new ParseWorld().parseWord("D:\\aaa.doc");
}

/**
 * Tells whether a string has no content.
 *
 * @param str the string to check; may be null
 * @return {@code true} if {@code str} is null or is empty after trimming
 */
private boolean isEmpty(String str) {
    if (str == null) {
        return true;
    }
    return str.trim().isEmpty();
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  POI ParseHTML jsoup