您的位置:首页 > 编程语言 > Java开发

爬虫记录(3)——模拟登录获取cookie,访问私信页面

2017-09-06 16:22 531 查看
继上一篇博文 爬虫记录(2)——简单爬取一个页面的图片并保存 ,今天我们通过httpclient模拟表单登录开源中国,获取cookie,然后通过cookie访问个人私信页面。

1、准备工作

模拟表单登录,首先需要知道登录的url,以及登录表单的字段。这里我们图一中故意输入一个错误的用户名和密码,然后通过查看图二中的network中,发现登录的url是https://www.oschina.net/action/user/hash_login?from=,字段是 账号为email , 密码是pwd

而且密码经过了相应的处理,我们先不管它,反正输入的是正确的密码,直接拿图中的字符串即可。

图一



图二



2、修改相应的爬虫工具类CrawlerUtils

增加post方法,用于插入消息体;同时修改相应的get方法,支持传入自定义消息头以携带cookie。

package com.dyw.crawler.util;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;

import java.io.IOException;
import java.io.InputStream;
import java.util.Map;

/**
* 爬虫工具类
* Created by dyw on 2017/9/1.
*/
/**
 * Crawler utility class: thin wrappers around Apache Commons HttpClient 3.x
 * for GET requests, a form-login POST that captures session cookies, and
 * stream-based file download.
 * Created by dyw on 2017/9/1.
 */
public class CrawlerUtils {

    /**
     * Apply only the shared default request headers to an HTTP method.
     *
     * @param httpMethod HTTP method to configure
     */
    private static void setHead(HttpMethod httpMethod) {
        setHead(httpMethod, null);
    }

    /**
     * Apply caller-supplied headers (if any) plus the shared default headers.
     * Note: the defaults below are set last, so they overwrite any custom
     * value supplied for User-Agent / Content-Type / Accept.
     *
     * @param httpMethod HTTP method to configure
     * @param map        custom header name/value pairs; may be null or empty
     */
    private static void setHead(HttpMethod httpMethod, Map<String, String> map) {
        // Caller-supplied headers first (e.g. "Cookie" for authenticated requests).
        if (null != map && !map.isEmpty()) {
            map.forEach(httpMethod::setRequestHeader);
        }
        // Shared default headers (values may need adjusting per target site).
        httpMethod.setRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36");
        httpMethod.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
        httpMethod.setRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    }

    /**
     * Set the login form fields as the POST request body.
     *
     * @param postMethod POST method to configure
     * @param loginInfo  form field name/value pairs (message body)
     */
    private static void setBody(PostMethod postMethod, NameValuePair[] loginInfo) {
        postMethod.setRequestBody(loginInfo);
    }

    /**
     * Fetch a page and return its HTML as a String (GET, default headers only).
     *
     * @param url URL to fetch
     * @return the whole page as a String, or null if the status is not 200 OK
     */
    public static String get(String url) throws Exception {
        return get(url, null);
    }

    /**
     * Fetch a page and return its HTML as a String (GET, with custom headers,
     * e.g. a "Cookie" header obtained from {@link #post}).
     *
     * @param url URL to fetch
     * @param map custom header name/value pairs; may be null
     * @return the whole page as a String, or null if the status is not 200 OK
     */
    public static String get(String url, Map<String, String> map) throws Exception {
        HttpClient httpClient = new HttpClient();
        GetMethod getMethod = new GetMethod(url);
        setHead(getMethod, map);
        try {
            int status = httpClient.executeMethod(getMethod);
            if (status == HttpStatus.SC_OK) {
                return getMethod.getResponseBodyAsString();
            }
            return null;
        } finally {
            // HttpClient 3.x requires releasing the connection once the
            // response body has been consumed; otherwise it leaks.
            getMethod.releaseConnection();
        }
    }

    /**
     * Log in by POSTing the form fields and return the resulting cookies
     * as a single "name=value;name=value;" String.
     *
     * @param url       login URL
     * @param loginInfo login form fields (e.g. email / pwd)
     * @return concatenated cookies captured after the login round-trip
     */
    public static String post(String url, NameValuePair[] loginInfo) throws Exception {
        HttpClient httpClient = new HttpClient();
        // Simulated login; the server expects a POST request.
        PostMethod postMethod = new PostMethod(url);
        setHead(postMethod);
        setBody(postMethod, loginInfo);
        // Lenient cookie policy so non-RFC-compliant Set-Cookie headers are accepted.
        httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
        try {
            httpClient.executeMethod(postMethod);
        } finally {
            // Release the connection; cookies live in the client state, not the method.
            postMethod.releaseConnection();
        }
        // Collect the session cookies recorded by the client after login.
        StringBuilder cookie = new StringBuilder();
        for (Cookie c : httpClient.getState().getCookies()) {
            cookie.append(c.toString()).append(';');
        }
        return cookie.toString();
    }

    /**
     * Open a download stream for the given URL (GET).
     * NOTE(review): the underlying connection cannot be released here because
     * the caller still needs the stream; the caller is responsible for fully
     * consuming and closing it.
     *
     * @param urlStr URL of the file to download
     * @return the response body stream, or null if the status is not 200 OK
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        // HttpClient is used here instead of raw URLConnection.
        //        HttpHost proxy=new HttpHost("116.226.217.54", 9999);
        HttpClient httpClient = new HttpClient();
        HttpMethod httpMethod = new GetMethod(urlStr);
        //        HostConfiguration hostConfiguration = new HostConfiguration();
        //        hostConfiguration.setHost("116.226.217.54", 9999);
        //        httpClient.setHostConfiguration(hostConfiguration);
        setHead(httpMethod);
        int status = httpClient.executeMethod(httpMethod);
        InputStream responseBodyAsStream = null;
        if (status == HttpStatus.SC_OK) {
            responseBodyAsStream = httpMethod.getResponseBodyAsStream();
        }
        return responseBodyAsStream;
    }
}


3、main主方法

package com.dyw.crawler.project;

import com.dyw.crawler.util.CrawlerUtils;
import org.apache.commons.httpclient.NameValuePair;

import java.util.HashMap;
import java.util.Map;

/**
* 模拟登陆
* Created by dyw on 2017/9/5.
*/
/**
 * Demo: log in to oschina.net with a simulated form POST, capture the
 * session cookie, then fetch a private-message page that requires login.
 * Created by dyw on 2017/9/5.
 */
public class Project2 {

    public static void main(String[] args) {
        // Login endpoint of oschina.net (form POST target).
        String loginUrl = "https://www.oschina.net/action/user/hash_login?from=";
        // Private-message inbox page — only reachable when logged in.
        String dataUrl = "https://my.oschina.net/u/3673710/admin/inbox";
        // Form fields expected by the login endpoint: "email" and "pwd".
        NameValuePair[] loginInfo = {
                new NameValuePair("email", "账号"),
                new NameValuePair("pwd", "密码")
        };
        try {
            // Log in, then replay the captured cookie on the follow-up GET.
            String cookie = CrawlerUtils.post(loginUrl, loginInfo);
            Map<String, String> headers = new HashMap<>();
            headers.put("Cookie", cookie);
            System.out.println(CrawlerUtils.get(dataUrl, headers));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}


这样我们就能获取到私信网站内容。

具体代码我上传在github上,需要完整代码的可以自己下载 https://github.com/dingyinwu81/crawler

如果有什么代码修改的建议,请给我留言呗! ☺☺☺
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  爬虫 java