java实现爬虫技术,读取txt,word,excel,ppt,pdf,html等格式的文件
2016-10-25 17:55
1241 查看
最近跟我同事一起做的项目要求读取txt,word,excel,ppt,pdf,html中的内容,不多说,先把代码贴出来,之后有时间再来做详细的解读。
这是读取txt文件
/**
 * Reads a plain-text file and returns its content as one string.
 *
 * The file is decoded with the platform default charset (that is what
 * {@link FileReader} uses), so files saved in another encoding may come back
 * garbled. Line terminators are dropped and the lines are joined without any
 * separator.
 *
 * @param txtFile the text file to read
 * @return the concatenated lines with leading/trailing whitespace trimmed;
 *         whatever was read so far (possibly an empty string) if an I/O error occurs
 */
public String GetTxtContent(File txtFile) {
    StringBuilder text = new StringBuilder();
    BufferedReader in = null;
    try {
        in = new BufferedReader(new FileReader(txtFile));
        for (String line = in.readLine(); line != null; line = in.readLine()) {
            text.append(line);
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException, which is a subclass of IOException.
        e.printStackTrace();
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return text.toString().trim();
}
<h1>读取ppt</h1>
/**
 * Extracts the text of a PowerPoint (.ppt) presentation.
 *
 * The text of every text run on every slide is appended in order, with no
 * separator between runs.
 *
 * @param excleFile the presentation file to read (the parameter name is
 *                  historical; it is a PPT file, not an Excel file)
 * @return the concatenated slide text with leading/trailing whitespace trimmed,
 *         or an empty string if the file cannot be opened or parsed
 */
public String GetPPTContent(File excleFile) {
    StringBuffer contents = new StringBuffer("");
    InputStream is = null;
    try {
        is = new FileInputStream(excleFile);
        SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
        Slide[] slides = ppt.getSlides();
        for (int i = 0; i < slides.length; i++) {
            // The text runs hold the textual content of a slide.
            TextRun[] runs = slides[i].getTextRuns();
            for (int j = 0; j < runs.length; j++) {
                contents.append(runs[j].getText());
            }
        }
    } catch (FileNotFoundException e1) {
        e1.printStackTrace();
    } catch (IOException e1) {
        e1.printStackTrace();
    } finally {
        // Close in finally so the stream is released even when extraction throws.
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return contents.toString().trim();
}
<h1>读取excel</h1>
/**
 * Extracts the text of an Excel 2007 (.xlsx) workbook.
 *
 * A newline is appended for every sheet and for every row. String cells
 * contribute their text and date-formatted numeric cells are rendered as
 * {@code yyyy-MM-dd HH:mm:ss}. All other cells, including plain numbers, are
 * skipped. NOTE(review): dropping plain numeric cells looks unintended for a
 * content extractor; confirm before changing it.
 *
 * @param exclexlsxFile the .xlsx file to read
 * @return the extracted text with leading/trailing whitespace trimmed, or an
 *         empty string if the file cannot be opened or parsed
 */
public String GetExclexlsxContent(File exclexlsxFile) {
    StringBuffer content = new StringBuffer();
    InputStream in = null;
    try {
        in = new FileInputStream(exclexlsxFile);
        XSSFWorkbook workbook = new XSSFWorkbook(in);
        // One formatter per call is enough; it is never shared across threads.
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
            XSSFSheet aSheet = workbook.getSheetAt(numSheets);
            content.append("\n");
            if (null == aSheet) {
                continue;
            }
            for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
                content.append("\n");
                XSSFRow aRow = aSheet.getRow(rowNum);
                if (null == aRow) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE, hence the strict '<'.
                for (int cellNum = 0; cellNum < aRow.getLastCellNum(); cellNum++) {
                    XSSFCell aCell = aRow.getCell(cellNum);
                    if (null == aCell) {
                        continue;
                    }
                    // The cell-type constants and date check are referenced through the
                    // HSSF classes exactly as the original code did.
                    if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                        content.append(aCell.getRichStringCellValue().getString());
                    } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                        if (HSSFDateUtil.isCellDateFormatted(aCell)) {
                            Date date = aCell.getDateCellValue();
                            content.append(df.format(date));
                        }
                    }
                }
            }
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close in finally so the stream is released even when extraction throws.
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return content.toString().trim();
}

/**
 * Extracts the text of an Excel 97-2003 (.xls) workbook.
 *
 * A newline is appended for every sheet and for every row. String cells
 * contribute their text and date-formatted numeric cells are rendered as
 * {@code yyyy-MM-dd HH:mm:ss}. All other cells, including plain numbers, are
 * skipped. NOTE(review): dropping plain numeric cells looks unintended for a
 * content extractor; confirm before changing it.
 *
 * @param excleFile the .xls file to read
 * @return the extracted text with leading/trailing whitespace trimmed, or an
 *         empty string if the file cannot be opened or parsed
 */
public String GetExcleContent(File excleFile) {
    StringBuffer content = new StringBuffer();
    InputStream in = null;
    try {
        in = new FileInputStream(excleFile);
        HSSFWorkbook workbook = new HSSFWorkbook(in);
        // One formatter per call is enough; it is never shared across threads.
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
            HSSFSheet aSheet = workbook.getSheetAt(numSheets);
            content.append("\n");
            if (null == aSheet) {
                continue;
            }
            for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
                content.append("\n");
                HSSFRow aRow = aSheet.getRow(rowNum);
                if (null == aRow) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE, hence the strict '<'.
                for (int cellNum = 0; cellNum < aRow.getLastCellNum(); cellNum++) {
                    HSSFCell aCell = aRow.getCell(cellNum);
                    if (null == aCell) {
                        continue;
                    }
                    if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                        content.append(aCell.getRichStringCellValue().getString());
                    } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                        if (HSSFDateUtil.isCellDateFormatted(aCell)) {
                            Date date = aCell.getDateCellValue();
                            content.append(df.format(date));
                        }
                    }
                }
            }
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close in finally so the stream is released even when extraction throws.
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return content.toString().trim();
}
<span style="font-size:48px;">读取word</span>
/**
 * Extracts the text of a Word 97-2003 (.doc) document.
 *
 * @param wordFile the .doc file to read
 * @return the document text with leading/trailing whitespace trimmed, or an
 *         empty string if the file cannot be opened or parsed
 */
@SuppressWarnings("resource")
public String GetWordContent(File wordFile) {
    String strContent = "";
    FileInputStream in = null;
    try {
        in = new FileInputStream(wordFile);
        WordExtractor text = new WordExtractor(in);
        strContent = text.getText();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // The extractor itself is not closed here; the stream it reads from is.
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return strContent.trim();
}

/**
 * Extracts the text of a Word 2007 (.docx) document.
 *
 * @param wordDocxFile the .docx file to read
 * @return the document text with leading/trailing whitespace trimmed, or an
 *         empty string if the file cannot be opened or parsed
 */
public String GetWordDocxContent(File wordDocxFile) {
    String text2007 = "";
    OPCPackage opcPackage = null;
    try {
        opcPackage = POIXMLDocument.openPackage(wordDocxFile.getPath());
        POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
        text2007 = extractor.getText();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (XmlException e) {
        e.printStackTrace();
    } catch (OpenXML4JException e) {
        e.printStackTrace();
    } finally {
        // Release the package (and the underlying file handle) without writing
        // anything back: revert() closes it and discards changes, which is the
        // right call for read-only text extraction.
        if (opcPackage != null) {
            opcPackage.revert();
        }
    }
    return text2007.trim();
}
<span style="font-size:48px;">读取pdf</span>
/**
 * Extracts the plain text of a PDF document.
 *
 * @param pdfFile the PDF file to read
 * @return the document text with leading/trailing whitespace trimmed, or an
 *         empty string if opening, parsing or text extraction throws
 */
public String GetPDFContent(File pdfFile) {
    String text = "";
    FileInputStream input = null;
    PDDocument document = null;
    try {
        input = new FileInputStream(pdfFile);
        PDFParser pdfParser = new PDFParser(input);
        pdfParser.parse();
        document = pdfParser.getPDDocument();
        text = new PDFTextStripper().getText(document);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the raw stream first, then the parsed document; a failure to
        // close one must not prevent closing the other.
        try {
            if (input != null) {
                input.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (document != null) {
                document.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return text.trim();
}
<span style="font-size:48px;">读取html</span>
/**
 * Fetches a web page and returns its plain-text content, intended for indexing.
 *
 * @param url the address of the page to parse
 * @return the page text with leading/trailing whitespace trimmed
 * @throws ParserException if the page cannot be fetched or parsed
 */
public String GetHTML(String url) throws ParserException {
    Parser parser = new Parser(url);
    StringBean sb = new StringBean();
    // Do not include the page's link information in the output.
    sb.setLinks(false);
    // Replace non-breaking spaces with regular spaces.
    sb.setReplaceNonBreakingSpaces(true);
    // Collapse runs of whitespace into a single space.
    sb.setCollapse(true);
    parser.visitAllNodesWith(sb);
    return sb.getStrings().trim();
}

/**
 * Extracts and concatenates the text content of every attachment described
 * by a JSON array.
 *
 * Each array element is expected to carry an {@code id} and a {@code name}.
 * For every element with both values present, the stored path of the file is
 * looked up in {@code t_srffile} and the file's content is appended to the result.
 *
 * @param param    the JSON array describing the attachments
 * @param service  the data-access service used to look up the stored file path
 * @param filePath the upload root directory that the stored path is appended to
 * @return the concatenated content of all attachments, or an empty string if
 *         {@code param} cannot be parsed or is empty
 * @throws Exception if reading an attachment's content fails
 */
public String HandleFj(String param, IService service, String filePath) throws Exception {
    JSONArray json = null;
    try {
        // Parse the caller's argument (the original passed the literal "param").
        json = JSONArray.fromObject(DataObject.getObjectValue(param));
    } catch (Exception e) {
        e.printStackTrace();
        return "";
    }
    if (StringHelper.isNullOrEmpty(json)) {
        return "";
    }
    StringBuffer fjcontenttotal = new StringBuffer();
    for (int i = 0; i < json.length(); i++) {
        String fileid = json.getJSONObject(i).getString("id");
        String name = json.getJSONObject(i).getString("name");
        if (StringHelper.isNullOrEmpty(fileid) || StringHelper.isNullOrEmpty(name)) {
            continue;
        }
        // fileid comes from caller-supplied JSON: double any single quote so it
        // cannot terminate the SQL string literal.
        // NOTE(review): prefer a bind parameter if selectRaw's second argument
        // supports one - confirm against IService.
        String sql = "select t.localpath from t_srffile t where t.file_id='"
                + fileid.replace("'", "''") + "'";
        // Reset per attachment so a failed query never reuses the previous rows.
        ArrayList<IEntity> list = null;
        try {
            list = service.selectRaw(sql, null);
        } catch (Exception e) {
            e.printStackTrace();
        }
        if (list == null) {
            continue;
        }
        for (IEntity o : list) {
            // NOTE(review): the query selects "localpath" but the value is read
            // from "location" - confirm which column name is correct.
            String location = DataObject.getStringValue(o.get("location"));
            String fjtotalpath = filePath + location;
            fjcontenttotal.append(this.GetFileContent(fjtotalpath));
        }
    }
    // Return only after every attachment has been processed.
    return fjcontenttotal.toString();
}
相关文章推荐
- java实现爬虫技术,读取txt,word,excel,ppt,pdf,html等格式的文件
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例 .
- JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
- asp.net导出excel-一行代码实现excel、xml、pdf、word、html、csv等7种格式文件导出功能而且美观-SNF快速开发平台
- lucent检索技术之创建索引:使用POI读取txt/word/excel/ppt/pdf内容
- Android实现在线预览office文档(Word,Pdf,excel,PPT.txt等格式)
- PDF文件转化成word,ppt,excel,图片(png,jpg...),tiff,rtf,txt,html,PDF组合,PDF编辑,PDF创建
- java读取word-excel-ppt文件
- lecene.net实现pdf,doc,xls,ppt,htm,html等格式文件的检索
- JAVA读取WORD,EXCEL,POWERPOINT,PDF文件的方法