java下载网页并读取内容
2008-06-09 19:38
597 查看
网页下载回来之后,还需要读取其中的内容,代码如下:
package com.core.crawl; import java.io.IOException; import com.util.file.Files; public class Crawl { /** * @param args * @throws IOException * @throws InterruptedException */ public static void main(String[] args) throws IOException, InterruptedException { long begin = System.currentTimeMillis(); //WebSpider spider2 = new WebSpider(); WebSpider spider1 = new WebSpider(); spider1.setWebAddress("http://www.w3c.org/robots.txt"); spider1.setDestFile(Files.getSysPath() + "/"+"robots."); //spider2.setWebAddress("http://blog.csdn.net/longronglin"); //spider2.setDestFile(Files.getSysPath() + "/"+"spider2."); Thread t1 = new Thread(spider1); //Thread t2 = new Thread(spider2); t1.start(); //t2.start(); t1.join(); //t2.join(); System.out.println("the end"); System.out.println(System.currentTimeMillis() - begin); } }
package com.core.crawl; import java.io.BufferedReader; import java.io.DataInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import com.core.http.Http; public class WebSpider implements Runnable{ private Http http = new Http(); private String webAddress = ""; private String destFile = ""; public void setWebAddress(String webAddress){ this.webAddress = webAddress; } public void setDestFile (String destFile){ this.destFile = destFile; } public boolean download() throws IOException, InterruptedException { HttpURLConnection httpConn = null; try { URL url = new URL(webAddress); httpConn = (HttpURLConnection) url.openConnection(); httpConn.setRequestMethod("GET"); httpConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14"); InputStream in = httpConn.getInputStream(); String fileType = http.fileType(httpConn.getContentType()); System.out.println(fileType); FileOutputStream out = new FileOutputStream(new File(destFile + fileType)); int chByte = in.read(); BufferedReader bf = new BufferedReader(new InputStreamReader(in)); String result = null; while ((result = bf.readLine()) != null) { System.out.println(result); } // while (chByte != -1) { // out.write(chByte); // // System.out.println(chByte); // chByte = in.read(); // } } catch (Exception ex) { System.out.println(ex.toString()); } finally { httpConn.disconnect(); } return true; } public void run() { try { System.out.println(Thread.currentThread().getName()); download(); } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } } }
package com.util.file; public class Files { /*** * 获取应用程序的根目录 * @return 应用程序根目录 */ public static String getSysPath(){ return System.getProperty("user.dir"); } }
运行结果:

Thread-0
html

# robots.txt for http://www.w3.org/
#
# $Id: robots.txt,v 1.50 2007/12/13 17:09:37 ted Exp $
#

# For use by search.w3.org
User-agent: W3C-gsa
Disallow: /Out-Of-Date

User-agent: W3T_SE
Disallow: /Out-Of-Date

User-agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT; MS Search 4.0 Robot)
Disallow: /

# W3C Link checker
User-agent: W3C-checklink
Disallow:

# exclude some access-controlled areas
User-agent: *
Disallow: /2004/ontaria/basic
Disallow: /Team
Disallow: /Project
Disallow: /Web
Disallow: /Systems
Disallow: /History
Disallow: /Out-Of-Date
Disallow: /2002/02/mid
Disallow: /mid/
Disallow: /2004/08/W3CTalks
Disallow: /2007/11/Talks/search
Disallow: /People/all/
Disallow: /RDF/Validator/ARPServlet
Disallow: /2003/03/Translations/byLanguage
Disallow: /2003/03/Translations/byTechnology
Disallow: /2005/11/Translations/Query
Disallow: /2003/glossary/subglossary/
#Disallow: /2005/06/blog/
#Disallow: /2001/07/pubrules-checker
#shouldnt get transparent proxies but will ml links of things like pubrules
Disallow: /2000/06/webdata/xslt
Disallow: /2000/09/webdata/xslt
Disallow: /2005/08/online_xslt/xslt
Disallow: /Bugs/
Disallow: /Search/Mail/Public/
Disallow: /2006/02/chartergen
the end
10485

如需抓取其他页面,可以把设置改为 spider1.setWebAddress("http://www.w3c.org/"); spider1.setDestFile(Files.getSysPath() + "/"+"w3c."); 然后自行测试。
相关文章推荐
- 实现Java读取网页内容并下载网页中出现的图片
- Java读取网页内容并下载图片的实例
- java下载读取网页内容方式
- Java读取网页内容并生成静态页面的简单实现
- Java用正则表达式如何读取网页内容
- java下载网页中的内容
- java下载html页面---把网页内容保存成本地html
- java读取网页内容,并保存
- 【Java】读取网页中的内容
- Java Web乱码-->读取网页中的数据(如输入框中的内容)写到数据库中乱码
- java下载html页面---把网页内容保存成本地html
- java 读取Excel处理并在网页下载处理后的新文件
- java的poi技术下载Excel模板上传Excel读取Excel中内容(SSM框架)
- java 使用URL来读取网页内容
- java读取网页图片路径并下载到本地
- java用正则表达式分析读取网页内容
- JAVA:获得网页下载文件内容
- java下载的excel显示发现不可读取的内容。是否恢复此工作薄的内容?如果信任此工作薄的来源,请单击‘是’。”
- 使用Java下载网页的下载链接的内容
- Java用正则表达式如何读取网页内容