您的位置:首页 > 编程语言 > Java开发

Java登录到新浪微博抓取数据

2015-11-07 17:36 375 查看
最近突然想做一个网络爬虫,爬取微博上的用户信息。这里先说一下第一步“登录”的思路,并奉上部分代码。




1.获得登录的参数信息
/**
 * Prepares a login attempt by fetching the pre-login parameters.
 *
 * <p>Computes {@code su} as the Base64 encoding of the URL-encoded user name,
 * requests the pre-login JSON from the Sina SSO endpoint, stores
 * {@code servertime}, {@code nonce}, {@code rsakv} and {@code pubkey} from the
 * response, and finally delegates to {@code encodePwd()}.
 *
 * @return the result of {@code encodePwd()} when every step succeeded;
 *         {@code false} if any step threw an exception
 */
public boolean preLogin() {
    boolean flag = false;
    try {
        // Pin the charset so the encoded user name does not depend on the platform default.
        String encodedName = URLEncoder.encode(username, "UTF-8");
        this.su = new String(Base64.encodeBase64(encodedName.getBytes("UTF-8")), "UTF-8");

        // Base64 output may contain '+', '/' and '=', so it must be URL-encoded
        // before being placed in the query string. The parameter needs "su=",
        // not just "su", otherwise the server never sees the value.
        String url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.5)&_="
                + getTimestamp()
                + "&su=" + URLEncoder.encode(this.su, "UTF-8");

        HttpGet httpGet = new HttpGet(url);
        httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        httpGet.addHeader("Accept-Encoding", "gzip, deflate, sdch");
        httpGet.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
        httpGet.addHeader("Cache-Control", "max-age=0");
        httpGet.addHeader("Host", "login.sina.com.cn");
        httpGet.addHeader("Proxy-Connection", "keep-alive");
        httpGet.addHeader("Upgrade-Insecure-Requests", "1");
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");

        String content = HttpTools.getRequest(this.httpClient, httpGet);

        // These values are sent back to the server by login().
        JSONObject json = new JSONObject(content);
        servertime = json.getLong("servertime");
        nonce = json.getString("nonce");
        rsakv = json.getString("rsakv");
        pubkey = json.getString("pubkey");

        flag = encodePwd();
    } catch (Exception e) {
        // Any failure (network, malformed JSON, encoding) leaves flag == false.
        e.printStackTrace();
    }
    return flag;
}


2.开始登录(POST请求)

/**
 * Performs the full login sequence: {@link #preLogin()}, the SSO POST, and the
 * follow-up {@link #ajaxLogin(String)} request.
 *
 * <p>The POST response is searched for a {@code location.replace("...")}
 * redirect. If that URL contains {@code reason=}, the URL-decoded text after
 * it is stored in {@code errInfo} and the login is treated as failed.
 * Otherwise the decoded URL is passed to {@code ajaxLogin}.
 *
 * @return {@code true} only if {@code preLogin()} succeeded, a redirect without
 *         a failure reason was found, and {@code ajaxLogin} returned
 *         {@code true}; {@code false} in every other case, including any
 *         exception
 */
public boolean login() {
    if (!preLogin()) {
        return false;
    }

    boolean flag = false;
    String loginUrl = "http://login.sina.com.cn:80/sso/login.php?client=ssologin.js(v1.4.18)";
    HttpPost httpPost = new HttpPost(loginUrl);

    // Request headers.
    httpPost.addHeader("Host", "login.sina.com.cn");
    httpPost.addHeader("Proxy-Connection", "keep-alive");
    httpPost.addHeader("Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
    httpPost.addHeader("Origin", "http://weibo.com");
    httpPost.addHeader("Upgrade-Insecure-Requests", "1");
    httpPost.addHeader("User-Agent",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");
    httpPost.addHeader("Content-Type", "application/x-www-form-urlencoded");
    httpPost.addHeader("Referer", "http://weibo.com/");
    httpPost.addHeader("Accept-Encoding", "gzip, deflate");
    httpPost.addHeader("Accept-Language", "zh-CN,zh;q=0.8");

    // Form parameters. su, servertime, nonce and rsakv are set by preLogin();
    // sp is presumably produced by encodePwd() (not visible here).
    List<NameValuePair> parms = new ArrayList<NameValuePair>();
    parms.add(new BasicNameValuePair("entry", "weibo"));
    // Was misspelled "geteway", so the server never received this parameter.
    parms.add(new BasicNameValuePair("gateway", "1"));
    parms.add(new BasicNameValuePair("from", ""));
    parms.add(new BasicNameValuePair("savestate", "0"));
    parms.add(new BasicNameValuePair("useticket", "1"));
    parms.add(new BasicNameValuePair("pagerefer", ""));
    parms.add(new BasicNameValuePair("vsnf", "1"));
    parms.add(new BasicNameValuePair("su", this.su));
    parms.add(new BasicNameValuePair("service", "miniblog"));
    parms.add(new BasicNameValuePair("servertime", this.servertime + ""));
    parms.add(new BasicNameValuePair("nonce", this.nonce));
    parms.add(new BasicNameValuePair("pwencode", "rsa2"));
    parms.add(new BasicNameValuePair("rsakv", this.rsakv));
    parms.add(new BasicNameValuePair("sp", this.sp));
    parms.add(new BasicNameValuePair("sr", "1366*768"));
    parms.add(new BasicNameValuePair("encoding", "UTF-8"));
    parms.add(new BasicNameValuePair("prelt", "83"));
    parms.add(new BasicNameValuePair("url",
            "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack"));
    parms.add(new BasicNameValuePair("returntype", "META"));

    try {
        String content = HttpTools.postRequest(this.httpClient, httpPost, parms);
        // Normalise quoting so a single regex matches both quote styles.
        content = content.replaceAll("'", "\"");

        // Parse the response to decide whether the login succeeded.
        Matcher m = Pattern.compile("location\\.replace\\(\"(.+?)\"\\);").matcher(content);
        if (m.find()) {
            location = m.group(1);
            if (location.contains("reason=")) {
                // Failure: keep the reason text for the caller to inspect.
                errInfo = location.substring(location.indexOf("reason=") + 7);
                errInfo = URLDecoder.decode(errInfo, "UTF-8");
            } else {
                location = URLDecoder.decode(location, "UTF-8");
                flag = ajaxLogin(location);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return flag;
}


/**
 * Follows the redirect URL returned by the SSO login and extracts the user's
 * identifiers from the callback response.
 *
 * <p>The response is expected to wrap a JSON object in parentheses. The text
 * between the first {@code (} and the last {@code )} is URL-decoded and parsed,
 * then {@code userinfo.uniqueid} and {@code userinfo.userdomain} are stored in
 * the corresponding fields.
 *
 * @param ajaxUrl the URL to request, as extracted by {@link #login()}
 * @return {@code true} if both identifiers were read; {@code false} if the
 *         request failed or the response did not have the expected shape
 */
public boolean ajaxLogin(String ajaxUrl) {
    boolean flag = false;
    try {
        HttpGet ajaxGet = new HttpGet(ajaxUrl);
        String content = HttpTools.getRequest(this.httpClient, ajaxGet);
        if (content == null) {
            return false;
        }

        int beginIndex = content.indexOf("(");
        int endIndex = content.lastIndexOf(")");
        // Without a "( ... )" wrapper, substring() would throw.
        if (beginIndex < 0 || endIndex <= beginIndex) {
            return false;
        }

        String payload = URLDecoder.decode(content.substring(beginIndex + 1, endIndex), "UTF-8");
        JSONObject userInfo = new JSONObject(payload).getJSONObject("userinfo");

        // Read both values before assigning so a partial failure does not
        // leave the fields half-updated.
        String parsedUniqueId = userInfo.getString("uniqueid");
        String parsedUserDomain = userInfo.getString("userdomain");
        this.uniqueid = parsedUniqueId;
        this.userdomain = parsedUserDomain;
        flag = true;
    } catch (Exception e) {
        // Matches preLogin()/login(): any failure is reported as false, so
        // JSON or argument errors no longer escape this public method.
        e.printStackTrace();
    }
    return flag;
}


其中对用户名和密码进行加密的部分(即上文调用的 encodePwd(),采用 rsa2 方式)未在此列出,具体实现百度一下你就知道。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  网络爬虫 微博 java