Java登录到新浪微博抓取数据
2015-11-07 17:36
375 查看
最近突然想做一个网络爬虫,用来爬取微博上的用户信息。这里先说一下第一步——登录的思路,并奉上部分代码。
1.获得登录的参数信息 /** * 初始登录信息 * 返回false说明初始失败 */ public boolean preLogin(){ boolean flag = false; try { this.su = new String(Base64.encodeBase64(URLEncoder.encode(username, "UTF-8").getBytes())); String url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.5)&_="+getTimestamp(); url = url +"&su"+this.su; String content; HttpGet httpGet = new HttpGet(url); httpGet.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); httpGet.addHeader("Accept-Encoding", "gzip, deflate, sdch"); httpGet.addHeader("Accept-Language", "zh-CN,zh;q=0.8"); httpGet.addHeader("Cache-Control", "max-age=0"); httpGet.addHeader("Host", "login.sina.com.cn"); httpGet.addHeader("Proxy-Connection", "keep-alive"); httpGet.addHeader("Upgrade-Insecure-Requests", "1"); httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); content = HttpTools.getRequest(this.httpClient,httpGet); JSONObject json = new JSONObject(content); servertime = json.getLong("servertime"); nonce = json.getString("nonce"); rsakv = json.getString("rsakv"); pubkey = json.getString("pubkey"); flag = encodePwd(); } catch (Exception e) { e.printStackTrace(); } return flag; }
2.开始登录 POST请求
public boolean login() { boolean flag = false; String loginUrl = "http://login.sina.com.cn:80/sso/login.php?client=ssologin.js(v1.4.18)"; if (preLogin()) { HttpPost httpPost = new HttpPost(loginUrl); // 设置请求头 httpPost.addHeader("Host", "login.sina.com.cn"); httpPost.addHeader("Proxy-Connection", "keep-alive"); httpPost.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); httpPost.addHeader("Origin", "http://weibo.com"); httpPost.addHeader("Upgrade-Insecure-Requests", "1"); httpPost.addHeader( "User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); // httpPost.addHeader("User-Agent", // "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko"); httpPost.addHeader("Content-Type", "application/x-www-form-urlencoded"); httpPost.addHeader("Referer", "http://weibo.com/"); httpPost.addHeader("Accept-Encoding", "gzip, deflate"); httpPost.addHeader("Accept-Language", "zh-CN,zh;q=0.8"); // 设置请求参数 List<NameValuePair> parms = new ArrayList<NameValuePair>(); parms.add(new BasicNameValuePair("entry", "weibo")); parms.add(new BasicNameValuePair("geteway", "1")); parms.add(new BasicNameValuePair("from", "")); parms.add(new BasicNameValuePair("savestate", "0")); parms.add(new BasicNameValuePair("useticket", "1")); parms.add(new BasicNameValuePair("pagerefer", "")); parms.add(new BasicNameValuePair("vsnf", "1")); parms.add(new BasicNameValuePair("su", this.su)); parms.add(new BasicNameValuePair("service", "miniblog")); parms.add(new BasicNameValuePair("servertime", this.servertime + "")); parms.add(new BasicNameValuePair("nonce", this.nonce)); parms.add(new BasicNameValuePair("pwencode", "rsa2")); parms.add(new BasicNameValuePair("rsakv", this.rsakv)); parms.add(new BasicNameValuePair("sp", this.sp)); parms.add(new BasicNameValuePair("sr", "1366*768")); parms.add(new BasicNameValuePair("encoding", "UTF-8")); parms.add(new BasicNameValuePair("prelt", 
"83")); parms.add(new BasicNameValuePair( "url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack")); parms.add(new BasicNameValuePair("returntype", "META")); // 执行 try { String content = HttpTools.postRequest(this.httpClient, httpPost, parms); content = content.replaceAll("'", "\""); // 解析返回的数据,判断是否登录成功 String regex = "location\\.replace\\(\"(.+?)\"\\);"; Pattern p = Pattern.compile(regex); Matcher m = p.matcher(content); if (m.find()) { location = m.group(1); if (location.contains("reason=")) { errInfo = location.substring(location .indexOf("reason=") + 7); errInfo = URLDecoder.decode(errInfo, "UTF-8"); } else { location = URLDecoder.decode(location, "UTF-8"); flag = ajaxLogin(location); } } } catch (Exception e) { e.printStackTrace(); } } return flag; }
public boolean ajaxLogin(String ajaxUrl){ boolean flag = false; HttpGet ajaxGet = new HttpGet(ajaxUrl); String content ; try { content = HttpTools.getRequest(this.httpClient, ajaxGet); int beginIndex = content.indexOf("("); int endIndex = content.lastIndexOf(")"); content = content.substring(beginIndex+1, endIndex); content = URLDecoder.decode(content, "UTF-8"); JSONObject jsonObject = new JSONObject(content); this.uniqueid = jsonObject.getJSONObject("userinfo").getString("uniqueid"); this.userdomain = jsonObject.getJSONObject("userinfo").getString("userdomain"); // String lastUrl = "http://weibo.com/u/"+uniqueid+userdomain; //HttpTools.getRequest(httpClient, lastUrl); flag = true; } catch (IOException e) { e.printStackTrace(); } return flag; }
其中对用户名和密码进行加密的部分(即上面代码中调用的 encodePwd())这里没有贴出,可以自行搜索相关资料。
相关文章推荐
- java对世界各个时区(TimeZone)的通用转换处理方法(转载)
- java-注解annotation
- java-模拟tomcat服务器
- java-用HttpURLConnection发送Http请求.
- java-WEB中的监听器Lisener
- Android IPC进程间通讯机制
- Android Native 绘图方法
- Android java 与 javascript互访(相互调用)的方法例子
- 【CF应用开发大赛】微博社交简历
- 介绍一款信息管理系统的开源框架---jeecg
- 微博回应“用户信息被出售”事件:已上报司法机关
- 聚类算法之kmeans算法java版本
- java实现 PageRank算法
- PropertyChangeListener简单理解
- 插入排序
- 冒泡排序
- 堆排序
- 快速排序