您的位置:首页 > 编程语言 > Java开发

java下载网页并读取内容

2008-06-09 19:38 597 查看
下载回来怎么也得读取内容:
package com.core.crawl;

import java.io.IOException;

import com.util.file.Files;

/**
 * Command-line entry point that runs a single {@link WebSpider} on its own thread
 * and reports how long the run took.
 */
public class Crawl {

    /**
     * Configures one spider for the W3C robots.txt address, runs it on a worker
     * thread, waits for that thread to finish, then prints {@code "the end"}
     * followed by the elapsed wall-clock time in milliseconds.
     *
     * <p>To fetch a second page in parallel, create another {@code WebSpider},
     * wrap it in its own thread, start it, and join it before printing.
     *
     * @param args command-line arguments; not read by this method
     * @throws IOException declared in the signature; no statement visible in this
     *         method throws it
     * @throws InterruptedException if the calling thread is interrupted while
     *         waiting for the worker thread to finish
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        final long startedAt = System.currentTimeMillis();

        final WebSpider robotsSpider = new WebSpider();
        robotsSpider.setWebAddress("http://www.w3c.org/robots.txt");
        // The spider appends the detected file type after the trailing dot.
        robotsSpider.setDestFile(Files.getSysPath() + "/" + "robots.");

        final Thread worker = new Thread(robotsSpider);
        worker.start();
        worker.join();

        System.out.println("the end");
        final long elapsedMillis = System.currentTimeMillis() - startedAt;
        System.out.println(elapsedMillis);
    }
}
 
package com.core.crawl;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import com.core.http.Http;

/**
 * Runnable task that fetches one web address with an HTTP GET, saves the response
 * body to a destination file and echoes the body to standard output line by line.
 *
 * <p>Set the address with {@link #setWebAddress(String)} and the destination path
 * prefix with {@link #setDestFile(String)} before running the task.
 */
public class WebSpider implements Runnable {

    /** User-Agent header value sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14";

    // Project helper; its fileType(...) result is appended to destFile below.
    // NOTE(review): presumably maps a Content-Type to a file extension - confirm against Http.
    private Http http = new Http();

    private String webAddress = "";
    private String destFile = "";

    /**
     * Sets the address to fetch.
     *
     * @param webAddress the URL to download, as a string
     */
    public void setWebAddress(String webAddress) {
        this.webAddress = webAddress;
    }

    /**
     * Sets the destination path prefix.
     *
     * @param destFile path prefix of the output file; the value returned by
     *        {@code http.fileType(...)} is appended to it to form the file name
     */
    public void setDestFile(String destFile) {
        this.destFile = destFile;
    }

    /**
     * Downloads {@code webAddress}, writes the raw response bytes to
     * {@code destFile + fileType} and prints the file type followed by every line
     * of the body to standard output.
     *
     * <p>Any failure is caught, reported on standard output via
     * {@code ex.toString()}, and signalled by a {@code false} return value. The
     * streams and the connection are always released, including when the
     * connection could not be created at all.
     *
     * @return {@code true} if the whole body was read and saved, {@code false} if
     *         any step failed
     * @throws IOException declared for compatibility with existing callers; I/O
     *         failures are caught and reported inside this method
     * @throws InterruptedException declared for compatibility with existing
     *         callers; nothing in this method throws it
     */
    public boolean download() throws IOException, InterruptedException {
        HttpURLConnection httpConn = null;
        InputStream in = null;
        FileOutputStream out = null;
        boolean succeeded = false;

        try {
            URL url = new URL(webAddress);

            httpConn = (HttpURLConnection) url.openConnection();
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("User-Agent", USER_AGENT);
            in = httpConn.getInputStream();

            String fileType = http.fileType(httpConn.getContentType());
            System.out.println(fileType);
            out = new FileOutputStream(new File(destFile + fileType));

            // Read through a tee so each byte is saved to the file exactly once
            // while the same bytes are decoded into lines for printing. No byte is
            // consumed before the reader starts, so the first character is kept.
            BufferedReader bf = new BufferedReader(
                    new InputStreamReader(new TeeInputStream(in, out)));
            String result;
            while ((result = bf.readLine()) != null) {
                System.out.println(result);
            }
            out.flush();
            succeeded = true;
        } catch (Exception ex) {
            System.out.println(ex.toString());
        } finally {
            closeQuietly(in);
            closeQuietly(out);
            // httpConn is still null when the URL was malformed or could not be opened.
            if (httpConn != null) {
                httpConn.disconnect();
            }
        }
        return succeeded;
    }

    /**
     * Prints the current thread's name and then performs the download.
     */
    @Override
    public void run() {
        try {
            System.out.println(Thread.currentThread().getName());
            download();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can see it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /** Closes a resource if it was opened; a failure to close is ignored as best-effort cleanup. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close during cleanup.
        }
    }

    /** Input stream wrapper that copies every byte it reads into a file stream. */
    private static final class TeeInputStream extends InputStream {

        private final InputStream source;
        private final FileOutputStream sink;

        TeeInputStream(InputStream source, FileOutputStream sink) {
            this.source = source;
            this.sink = sink;
        }

        @Override
        public int read() throws IOException {
            int value = source.read();
            if (value != -1) {
                sink.write(value);
            }
            return value;
        }

        @Override
        public int read(byte[] buffer, int offset, int length) throws IOException {
            int count = source.read(buffer, offset, length);
            if (count > 0) {
                sink.write(buffer, offset, count);
            }
            return count;
        }
    }
}
 
package com.util.file;

/**
 * Small helper exposing the directory the application treats as its root.
 */
public class Files {

    /**
     * Returns the application's root directory, read from the {@code user.dir}
     * system property.
     *
     * @return the current value of the {@code user.dir} system property
     */
    public static String getSysPath() {
        final String userDir = System.getProperty("user.dir");
        return userDir;
    }

}
 
运行结果:

Thread-0
html

# robots.txt for http://www.w3.org/
#
# $Id: robots.txt,v 1.50 2007/12/13 17:09:37 ted Exp $
#

# For use by search.w3.org
User-agent: W3C-gsa
Disallow: /Out-Of-Date

User-agent: W3T_SE
Disallow: /Out-Of-Date

User-agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT; MS Search 4.0 Robot)
Disallow: /

# W3C Link checker
User-agent: W3C-checklink
Disallow:

# exclude some access-controlled areas
User-agent: *
Disallow: /2004/ontaria/basic
Disallow: /Team
Disallow: /Project
Disallow: /Web
Disallow: /Systems
Disallow: /History
Disallow: /Out-Of-Date
Disallow: /2002/02/mid
Disallow: /mid/
Disallow: /2004/08/W3CTalks
Disallow: /2007/11/Talks/search
Disallow: /People/all/
Disallow: /RDF/Validator/ARPServlet
Disallow: /2003/03/Translations/byLanguage
Disallow: /2003/03/Translations/byTechnology
Disallow: /2005/11/Translations/Query
Disallow: /2003/glossary/subglossary/
#Disallow: /2005/06/blog/
#Disallow: /2001/07/pubrules-checker
#shouldnt get transparent proxies but will ml links of things like pubrules
Disallow: /2000/06/webdata/xslt
Disallow: /2000/09/webdata/xslt
Disallow: /2005/08/online_xslt/xslt
Disallow: /Bugs/
Disallow: /Search/Mail/Public/
Disallow: /2006/02/chartergen
the end
10485

也可以改用下面的设置自己测试:
spider1.setWebAddress("http://www.w3c.org/");
spider1.setDestFile(Files.getSysPath() + "/"+"w3c.");
                                            
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息