您的位置:首页 > Web前端 > HTML

HtmlUnit 爬虫简单案例——模拟登陆CSDN

2017-12-14 15:46 471 查看
最近要弄一个爬虫程序,想着先来个简单的模拟登陆。在 JxBrowser 和 HtmlUnit 两种技术之间权衡:JxBrowser 有界面呈现效果,但是对于某些 js 跳转之后的效果,获取起来比较繁琐。
随后考虑用 HtmlUnit,想着借用咱们 CSDN 的登陆练练手。谁知道 CSDN 的登陆页 js 加载时间超长,如果不把等待时间设置得长一点,js 还没生效,点击提交按钮根本没有效果。具体看代码注释吧。奉劝做爬虫的同志们,千万别用 CSDN 登陆练手,坑死我了。。。
maven配置如下:
<!-- Maven dependencies for the example: HtmlUnit (headless browser) and jsoup (HTML parser). -->
<!-- NOTE(review): jsoup is declared here but not referenced by the SimulateLogin code shown in this article. -->
<dependencies>
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.18</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
</dependencies>

代码如下:
/*
* Copyright (c) 2017 Create By Shijing All Rights Reserved.
*/
package com.test;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.SilentCssErrorHandler;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlButtonInput;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlPasswordInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
import com.gargoylesoftware.htmlunit.util.Cookie;

/**
 * Minimal HtmlUnit example that simulates a browser login against the CSDN passport page.
 *
 * <p>The program loads the login page, waits for its JavaScript to finish, fills in the
 * credential fields, clicks the login button, and then prints the resulting page together
 * with the cookies held by the client.
 */
public class SimulateLogin
{
    // Target URL to visit (CSDN login page).
    private static final String TARGET_URL = "https://passport.csdn.net/account/login?from=http://www.csdn.net";

    // How long to wait for the login page's background JavaScript, in milliseconds.
    // The CSDN login scripts load very slowly; with a short wait the button click has no effect.
    private static final long LOGIN_PAGE_JS_WAIT_MS = 10000 * 3;

    // How long to wait for JavaScript triggered by the login click, in milliseconds.
    private static final long AFTER_CLICK_JS_WAIT_MS = 1000;

    /**
     * Runs the simulated login and prints the landing page URL, its XML and all cookies.
     *
     * @param args unused
     * @throws FailingHttpStatusCodeException if the server answers with a failing status code
     * @throws MalformedURLException if {@code TARGET_URL} is not a valid URL
     * @throws IOException if loading the page or clicking the button fails at the I/O level
     * @throws IllegalStateException if the login form or the login button is not present in the page
     */
    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException
    {
        // try-with-resources guarantees the client is closed even when a step below throws.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            configure(webClient);

            // Open the target URL in the simulated browser.
            HtmlPage page = webClient.getPage(TARGET_URL);
            // Wait until the page's JavaScript has finished loading before touching the form.
            webClient.waitForBackgroundJavaScript(LOGIN_PAGE_JS_WAIT_MS);

            // Look the form up by its element id; page.getForms().get(0) would select it by index instead.
            HtmlForm form = (HtmlForm) page.getElementById("fm1");
            if (form == null) {
                throw new IllegalStateException("Login form with id 'fm1' not found at " + page.getUrl());
            }
            HtmlTextInput username = (HtmlTextInput) form.getInputByName("username");
            HtmlPasswordInput password = (HtmlPasswordInput) form.getInputByName("password");
            username.setValueAttribute("********"); // user name
            password.setValueAttribute("********"); // password

            // getFirstByXPath returns null instead of throwing when nothing matches.
            HtmlButtonInput button = page.getFirstByXPath("//input[contains(@class, 'logging')]");
            if (button == null) {
                throw new IllegalStateException("Login button with class 'logging' not found at " + page.getUrl());
            }
            HtmlPage retPage = button.click();
            // Give JavaScript started by the click time to finish updating the DOM.
            webClient.waitForBackgroundJavaScript(AFTER_CLICK_JS_WAIT_MS);

            // Print the address of the page we landed on.
            System.out.println(retPage.getUrl().toString());
            // Print the content of the page we landed on.
            System.out.println(retPage.asXml());

            // Print the cookies collected during the session, one per line.
            Set<Cookie> cookies = webClient.getCookieManager().getCookies();
            for (Cookie c : cookies) {
                System.out.println(c.getName() + ":" + c.getValue());
            }
        }
        System.out.println("Success!");
    }

    // Applies the browser settings this example relies on (JavaScript on, CSS off, lenient script errors).
    private static void configure(WebClient webClient)
    {
        // Silence CSS parse errors.
        webClient.setCssErrorHandler(new SilentCssErrorHandler());
        // Ajax controller.
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        // Enable JavaScript.
        webClient.getOptions().setJavaScriptEnabled(true);
        // Disable CSS processing.
        webClient.getOptions().setCssEnabled(false);
        // Timeout passed to the client options.
        webClient.getOptions().setTimeout(50000);
        // Do not throw when a page script raises an error.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // Follow redirects.
        webClient.getOptions().setRedirectEnabled(true);
        // Accept cookies.
        webClient.getCookieManager().setCookiesEnabled(true);
    }
}

另外,CSDN的JS总是莫名其妙的报一堆错,如果不想看,想忽略的话,在创建WebClient前加上如下代码
// Set the log level so that JavaScript errors from the original page are not printed.
LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");

java.util.logging.Logger.getLogger("com.gargoylesoftware.htmlunit")
.setLevel(Level.OFF);

// Logger namespace of the legacy Commons HttpClient 3.x; kept for older setups.
java.util.logging.Logger.getLogger("org.apache.commons.httpclient")
.setLevel(Level.OFF);

// Apache HttpClient 4.x logs under "org.apache.http", so silence that namespace as well.
java.util.logging.Logger.getLogger("org.apache.http")
.setLevel(Level.OFF);
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: