您的位置:首页 > 理论基础 > 计算机网络

利用 HttpClient 开源工具抓取网页源码并在控制台打印出来的工具类的编写

2014-11-26 14:56 489 查看
利用 HttpClient 开源工具抓取网页源码并在控制台打印出来的工具类的编写

package com.pyc.search.crawler.node.tools;

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.params.BasicHttpParams;
import org.junit.Test;

/**
 * Utility that downloads the source of a web page with Apache HttpClient, sending a
 * caller-supplied cookie string along with the request.
 *
 * @update yangluan
 */
public class HttpClientContent {

    /** User-Agent header value; identifies the request as Chrome 31 on Windows NT 6.1. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36";

    /** Size in bytes of the chunk buffer used while draining the response stream. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Performs an HTTP GET on {@code link} and returns the response body decoded as UTF-8.
     *
     * <p>Despite its name, this method returns the response <em>body</em>, not the headers;
     * the name is kept for compatibility with existing callers.
     *
     * <p>Redirect handling is left at the client's default. The original author noted that
     * setting {@code http.protocol.handle-redirects} to {@code false} on the request makes a
     * 302 response visible so that its {@code Location} header can be read instead.
     *
     * @param link   URL of the page to fetch
     * @param cookie raw value for the {@code Cookie} request header; when {@code null} or
     *               empty no {@code Cookie} header is sent
     * @return the response body decoded as UTF-8, or an empty string when the response
     *         carries no entity
     * @throws Exception if the request cannot be built or executed, or the body cannot be read
     */
    public static String getHeadersByHttpClient(String link, String cookie) throws Exception {
        // Build the request before opening the client so an invalid URL cannot leak a client.
        HttpGet request = new HttpGet(link);
        request.setHeader("User-Agent", USER_AGENT);
        if (cookie != null && !cookie.isEmpty()) {
            request.setHeader("Cookie", cookie);
        }

        // try-with-resources releases the connection and the client even when reading fails.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(request)) {

            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // Some responses have no body at all; report that as empty content.
                return "";
            }

            try (InputStream in = entity.getContent()) {
                // Collect raw bytes first and decode once at the end: decoding each chunk
                // separately would corrupt multi-byte UTF-8 characters that straddle a
                // chunk boundary.
                ByteArrayOutputStream bytes = new ByteArrayOutputStream();
                byte[] buffer = new byte[BUFFER_SIZE];
                int read;
                while ((read = in.read(buffer)) != -1) {
                    bytes.write(buffer, 0, read);
                }
                return new String(bytes.toByteArray(), StandardCharsets.UTF_8);
            }
        }
    }

    /**
     * Demo entry point: fetches a page and prints its source to standard output.
     * The two arguments below are placeholders for the target URL and the cookie string.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        String html = getHeadersByHttpClient("抓取网站地址","Cookie信息");
        System.out.println(html);
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: