您的位置:首页 > 编程语言 > Java开发

爬虫技术(2)--抓取网页java代码实现

2016-06-30 17:27 711 查看
package creeper.part1.capturepage;
import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
// Crawler technique (1) -- fetching a web page.
// Minimal example: issue an HTTP GET with Apache HttpClient 4.3+ and print the
// response body to stdout.
public class capturePage {
/**
 * Entry point: fetches http://www.baidu.com and prints the URI, then the
 * response body, to standard output.
 *
 * @param args unused
 * @throws Exception if the request fails or the server returns a non-2xx
 *                   status (propagated instead of being silently swallowed)
 */
public static void main(String[] args) throws Exception {
// Open an HttpClient, i.e. "open a browser". Since 4.3 the non-Closeable
// variants are deprecated; try-with-resources guarantees the client is
// closed on every path (success, exception, early return).
try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
// A GET method, i.e. "open a page". (Proxy configuration omitted.)
String url = "http://www.baidu.com";
HttpGet get = new HttpGet(url);
System.out.println("---------URI----------");
System.out.println(get.getURI());

// Response handler: convert a successful response to its body string,
// fail fast on any non-2xx status.
ResponseHandler<String> handler = new ResponseHandler<String>() {
@Override
public String handleResponse(HttpResponse response)
throws ClientProtocolException, IOException {
int status = response.getStatusLine().getStatusCode(); // HTTP status code
// Only 2xx counts as success; anything else is surfaced as an exception
// so the caller cannot mistake an error page for real content.
if (status >= 200 && status < 300) {
HttpEntity entity = response.getEntity(); // response payload, may be absent
return entity == null ? null : EntityUtils.toString(entity);
} else {
throw new ClientProtocolException("status:" + status);
}
}
};
// Send the request, i.e. "press Enter". Any failure now propagates to
// the caller instead of being swallowed by an empty catch block.
String responseBody = httpClient.execute(get, handler);
System.out.println("----------------responseBody-----------------");
System.out.println(responseBody);
System.out.println("----------------responseBody-----------------");
}
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: