您的位置:首页 > 编程语言 > Java开发

java实现爬虫技术,读取txt,word,excel,ppt,pdf,html等格式的文件

2016-10-25 17:55 1241 查看

最近跟我同事一起做的项目要求读取txt,word,excel,ppt,pdf,html中的内容,不多说,先把代码贴出来,之后有时间再来做详细的解读。

这是读取txt文件

/**
 * Reads a plain-text file and returns its content as one string.
 *
 * <p>The file is decoded with the platform default charset (that is what
 * {@link FileReader} uses), so a file saved in any other encoding comes out
 * garbled. Line terminators are dropped and the lines are concatenated with
 * no separator; the final result is trimmed.
 *
 * <p>NOTE(review): because no separator is inserted, the last word of one
 * line is fused with the first word of the next. Confirm callers want that.
 *
 * @param txtFile the text file to read
 * @return the concatenated, trimmed text; whatever was read before a failure
 *         (possibly the empty string) when the file is missing or unreadable
 */
public String GetTxtContent(File txtFile) {
    StringBuilder contents = new StringBuilder();
    // try-with-resources closes the reader exactly once on every path.
    try (BufferedReader reader = new BufferedReader(new FileReader(txtFile))) {
        String line;
        while ((line = reader.readLine()) != null) {
            contents.append(line);
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException. Best effort: report and return
        // what has been collected so far.
        e.printStackTrace();
    }
    return contents.toString().trim();
}
<h1>读取ppt</h1>
/**
 * Extracts the text of every slide in a binary PowerPoint (.ppt) file.
 *
 * <p>The text of each text run on each slide is appended in order with no
 * separator, and the result is trimmed.
 *
 * @param excleFile the .ppt file to read (the name is historical; this is a
 *                  PowerPoint file, not an Excel file)
 * @return the trimmed slide text, or the empty string when the file cannot
 *         be opened or parsed
 */
public String GetPPTContent(File excleFile) {
    StringBuilder contents = new StringBuilder();
    // The stream is closed on every path, including a failure while parsing.
    try (InputStream is = new FileInputStream(excleFile)) {
        SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
        for (Slide slide : ppt.getSlides()) {
            // The text runs carry the textual content of the slide.
            for (TextRun run : slide.getTextRuns()) {
                contents.append(run.getText());
            }
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException. Previously execution continued
        // with a null slide show and crashed with a NullPointerException.
        e.printStackTrace();
    }
    return contents.toString().trim();
}
<h1>读取excel</h1>
/**
 * Extracts text from an Excel 2007 (.xlsx) workbook.
 *
 * <p>A newline is written before every sheet and before every row. Within a
 * row, string cells contribute their text and date-formatted numeric cells
 * contribute the date as {@code yyyy-MM-dd HH:mm:ss}; values are concatenated
 * with no separator. Cells of any other kind are skipped. The result is
 * trimmed.
 *
 * @param exclexlsxFile the .xlsx file to read
 * @return the trimmed text, or the empty string when the file cannot be
 *         opened or read
 */
public String GetExclexlsxContent(File exclexlsxFile) {
    StringBuilder content = new StringBuilder();
    // One formatter per call; it is method-local, so it is never shared
    // between threads.
    SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    // The stream is closed on every path, including a failure while parsing.
    try (InputStream in = new FileInputStream(exclexlsxFile)) {
        XSSFWorkbook workbook = new XSSFWorkbook(in);
        for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
            XSSFSheet aSheet = workbook.getSheetAt(numSheets);
            content.append("\n");
            if (aSheet == null) {
                continue;
            }
            for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
                content.append("\n");
                XSSFRow aRow = aSheet.getRow(rowNum);
                if (aRow == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE (and -1
                // for a row without cells), hence the strict comparison.
                for (int cellNum = 0; cellNum < aRow.getLastCellNum(); cellNum++) {
                    XSSFCell aCell = aRow.getCell(cellNum);
                    if (aCell == null) {
                        continue;
                    }
                    if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                        content.append(aCell.getRichStringCellValue().getString());
                    } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC
                            && HSSFDateUtil.isCellDateFormatted(aCell)) {
                        content.append(df.format(aCell.getDateCellValue()));
                    }
                    // NOTE(review): plain numeric cells are skipped, exactly
                    // as before - confirm that this is intended.
                }
            }
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException. Previously execution continued
        // with a null workbook and crashed with a NullPointerException.
        e.printStackTrace();
    }
    return content.toString().trim();
}
/**
 * Extracts text from a binary Excel (.xls) workbook.
 *
 * <p>A newline is written before every sheet and before every row. Within a
 * row, string cells contribute their text and date-formatted numeric cells
 * contribute the date as {@code yyyy-MM-dd HH:mm:ss}; values are concatenated
 * with no separator. Cells of any other kind are skipped. The result is
 * trimmed.
 *
 * @param excleFile the .xls file to read
 * @return the trimmed text, or the empty string when the file cannot be
 *         opened or read
 */
public String GetExcleContent(File excleFile) {
    StringBuilder content = new StringBuilder();
    // One formatter per call; it is method-local, so it is never shared
    // between threads.
    SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    // The stream is closed on every path, including a failure while parsing.
    try (InputStream in = new FileInputStream(excleFile)) {
        HSSFWorkbook workbook = new HSSFWorkbook(in);
        for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
            HSSFSheet aSheet = workbook.getSheetAt(numSheets);
            content.append("\n");
            if (aSheet == null) {
                continue;
            }
            for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
                content.append("\n");
                HSSFRow aRow = aSheet.getRow(rowNum);
                if (aRow == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE (and -1
                // for a row without cells), hence the strict comparison.
                for (int cellNum = 0; cellNum < aRow.getLastCellNum(); cellNum++) {
                    HSSFCell aCell = aRow.getCell(cellNum);
                    if (aCell == null) {
                        continue;
                    }
                    if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                        content.append(aCell.getRichStringCellValue().getString());
                    } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC
                            && HSSFDateUtil.isCellDateFormatted(aCell)) {
                        content.append(df.format(aCell.getDateCellValue()));
                    }
                    // NOTE(review): plain numeric cells are skipped, exactly
                    // as before - confirm that this is intended.
                }
            }
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException. Previously execution continued
        // with a null workbook and crashed with a NullPointerException.
        e.printStackTrace();
    }
    return content.toString().trim();
}
<span style="font-size:48px;">读取word</span>
/**
 * Extracts the text of a binary Word (.doc) document.
 *
 * @param wordFile the .doc file to read
 * @return the trimmed document text, or the empty string when extraction
 *         fails (the failure is printed to standard error)
 */
@SuppressWarnings("resource")
public  String GetWordContent(File wordFile) {
    String extracted = "";
    // The input stream is released automatically when the block exits.
    try (FileInputStream docStream = new FileInputStream(wordFile)) {
        WordExtractor wordExtractor = new WordExtractor(docStream);
        extracted = wordExtractor.getText();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return extracted.trim();
}
/**
 * Extracts the text of a Word 2007 (.docx) document.
 *
 * @param wordDocxFile the .docx file to read
 * @return the trimmed document text, or the empty string when the package
 *         cannot be opened or parsed (the failure is printed to standard
 *         error)
 */
public String GetWordDocxContent(File wordDocxFile) {
    String text2007 = "";
    OPCPackage opcPackage = null;
    try {
        opcPackage = POIXMLDocument.openPackage(wordDocxFile.getPath());
        POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
        text2007 = extractor.getText();
    } catch (IOException | XmlException | OpenXML4JException e) {
        e.printStackTrace();
    } finally {
        if (opcPackage != null) {
            // Release the underlying file. revert() closes the package
            // WITHOUT saving; close() on a writable package would write the
            // document back to disk, which a read-only extraction must not do.
            opcPackage.revert();
        }
    }
    return text2007.trim();
}
<span style="font-size:48px;">读取pdf</span>
/**
 * Extracts the text content of a PDF file.
 *
 * @param pdfFile the PDF file to read
 * @return the trimmed text of the document, or the empty string when the
 *         file cannot be read or parsed (the failure is printed to standard
 *         error)
 */
public String GetPDFContent(File pdfFile) {
    String extracted = "";
    PDDocument document = null;
    // The input stream is released automatically when the block exits.
    try (FileInputStream pdfStream = new FileInputStream(pdfFile)) {
        PDFParser pdfParser = new PDFParser(pdfStream);
        pdfParser.parse();
        document = pdfParser.getPDDocument();
        extracted = new PDFTextStripper().getText(document);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // The parsed document holds its own resources and is closed separately.
        if (document != null) {
            try {
                document.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return extracted.trim();
}
<span style="font-size:48px;">读取html</span>
/**
 * Returns the plain text of a web page, intended to be stored as index
 * content.
 *
 * @param url the resource locator handed to the HTML parser
 * @return the trimmed text collected from the page
 * @throws ParserException if the parser cannot be created or the page cannot
 *                         be traversed
 */
public  String GetHTML(String url) throws ParserException{
    Parser pageParser = new Parser(url);
    StringBean textCollector = new StringBean();
    // Leave link information out of the collected text.
    textCollector.setLinks(false);
    // Replace non-breaking spaces with regular spaces.
    textCollector.setReplaceNonBreakingSpaces(true);
    // Collapse runs of whitespace into a single space.
    textCollector.setCollapse(true);
    pageParser.visitAllNodesWith(textCollector);
    String pageText = textCollector.getStrings();
    return pageText.trim();
}
/**
 * Collects the text content of the attachments described by a JSON array.
 *
 * <p>Each array element is read as an object with an {@code id} and a
 * {@code name}. For every element where both are non-empty, the matching
 * rows of {@code t_srffile} are queried, and the content of each file found
 * under {@code filePath} is appended to the result.
 *
 * @param param    JSON array text describing the attachments
 * @param service  service used to run the lookup query
 * @param filePath upload root directory that is prefixed to each stored
 *                 location
 * @return the concatenated content of all attachments, or the empty string
 *         when {@code param} cannot be parsed or contains nothing
 * @throws Exception if an entry cannot be read or the content of an
 *                   attachment cannot be extracted
 */
public String HandleFj(String param,IService service,String filePath) throws Exception{
    JSONArray json;
    try {
        // Parse the caller's argument. The string literal "param" used to be
        // passed here, so the actual argument was never looked at.
        json = JSONArray.fromObject(DataObject.getObjectValue(param));
    } catch (Exception e) {
        e.printStackTrace();
        return "";
    }
    if (StringHelper.isNullOrEmpty(json)) {
        return "";
    }

    StringBuffer fjcontenttotal = new StringBuffer();
    for (int i = 0; i < json.length(); i++) {
        String fileid = json.getJSONObject(i).getString("id");
        String name = json.getJSONObject(i).getString("name");
        if (StringHelper.isNullOrEmpty(fileid) || StringHelper.isNullOrEmpty(name)) {
            continue;
        }

        // Double any single quote so the id cannot terminate the SQL literal.
        // NOTE(review): this is a stopgap; switch to bound parameters if
        // selectRaw supports them (its second argument is unused here).
        String sql = "select t.localpath from t_srffile t where t.file_id='"
                + fileid.replace("'", "''") + "'";

        // Fresh per entry, so a failed query can neither crash the loop below
        // nor reuse the rows of the previous entry.
        ArrayList<IEntity> list = null;
        try {
            list = service.selectRaw(sql, null);
        } catch (Exception e) {
            e.printStackTrace();
        }
        if (list == null) {
            continue;
        }

        for (IEntity o : list) {
            // NOTE(review): the query selects "localpath" but the value is
            // read under "location" - confirm which name the entity exposes.
            String location = DataObject.getStringValue(o.get("location"));
            String fjtotalpath = filePath + location;
            fjcontenttotal.append(this.GetFileContent(fjtotalpath));
        }
    }
    // Returned after the loop so that every attachment is processed, not
    // only the first one.
    return fjcontenttotal.toString();
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: