您的位置:首页 > Web前端 > HTML

HtmlUnit的使用

2015-10-11 14:40 585 查看
HtmlUnit的功能比Jsoup要强大一些:它是一个没有界面的浏览器,可以模拟登录。

还可以支持XPath。

下面是一个模拟百度搜索的例子:

package com.swpu;

import java.io.IOException;
import java.net.MalformedURLException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;

/**
 * Demo: submit a Baidu search through HtmlUnit and inspect the result pages
 * with XPath.
 *
 * <p>The program loads the Baidu home page, fills in the search form, submits
 * it, prints several XPath query results from the first result page, then
 * follows the first {@code a.n} pagination link and prints XPath query results
 * from the page it leads to.
 */
public class WorldBankCrawl {

    /** XPath of the container two levels above the anchor titled "万圣公主". */
    private static final String PRINCESS_BLOCK = "//a[@title='万圣公主']/../..";

    /** XPath of the pagination anchors on a result page. */
    private static final String PAGER_LINK = "//a[@class='n']";

    /**
     * Runs the search demo and prints the query results to standard output.
     *
     * @param args unused
     * @throws FailingHttpStatusCodeException if a request returns a failing HTTP status
     * @throws MalformedURLException if a URL cannot be parsed
     * @throws IOException if a network error occurs
     */
    public static void main(String[] args)
            throws FailingHttpStatusCodeException, MalformedURLException,
            IOException {
        // try-with-resources so the client's windows and connections are
        // released even when a lookup below throws.
        try (WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38)) {
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(false); // required (original author's note)

            final HtmlPage page = webClient
                    .getPage("https://www.baidu.com/?tn=96010190_dg");

            // Fill in and submit the search form.
            final HtmlForm form = page.getFormByName("f");
            final HtmlSubmitInput submitInput = form.getInputByValue("百度一下");
            final HtmlTextInput input = form.getInputByName("wd");
            input.setValueAttribute("西游记");

            final HtmlPage nextPage = submitInput.click();

            // Use '//' when the target nodes need not sit at one fixed
            // position below the context node.
            final java.util.List<?> images = nextPage
                    .getByXPath(PRINCESS_BLOCK + "//img");
            for (Object image : images) {
                System.out.println(image);
            }

            System.out.println("src:\n");
            System.out.println(nextPage.getByXPath(PRINCESS_BLOCK + "//@src"));

            // Evaluate the title query once and reuse it for both printouts.
            final java.util.List<?> titles = nextPage
                    .getByXPath(PRINCESS_BLOCK + "//@title");
            System.out.println(titles);
            System.out.println(titles.size());

            final java.util.List<?> table = nextPage
                    .getByXPath("//table[@class='c-table opr-toplist-table']");
            System.out.println("test table:");
            System.out.println(table);

            final java.util.List<?> link = nextPage
                    .getByXPath(PAGER_LINK + "//@href");
            System.out.println("link:" + link);

            // The result page may have no pagination link; stop cleanly
            // instead of failing with IndexOutOfBoundsException on get(0).
            final java.util.List<?> pagerAnchors = nextPage.getByXPath(PAGER_LINK);
            if (pagerAnchors.isEmpty()) {
                System.out.println("No next-page link found; stopping after the first result page.");
                return;
            }
            final HtmlAnchor next2Anchor = (HtmlAnchor) pagerAnchors.get(0);
            final HtmlPage next2Page = next2Anchor.click();

            System.out.println("NEXT PAGE:\n");
            System.out.println(next2Page.asText());
            System.out.println("测试XPath函数:");
            System.out.println(next2Page
                    .getByXPath("//span[@title='《西游记》人物']/text()"));
            System.out.println(next2Page.getByXPath("//span[@title='《西游记》人物']"));
            System.out.println(next2Page.getByXPath("//div[2]"));
            System.out.println("函数设置:");
            System.out.println(next2Page.getByXPath(PAGER_LINK));
        }
    }
}


下面是登录新浪的例子:

package com.swpu;

import java.io.IOException;
import java.net.MalformedURLException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Demo: log in to Sina through HtmlUnit, then fetch a Weibo page with the
 * same client.
 *
 * <p>The program opens the Sina SSO login page with JavaScript enabled, fills
 * in the username and password inputs, clicks the submit input, requests a
 * Weibo home page through the same {@link WebClient} and prints that page as
 * XML.
 */
public class ParseSinaBlog {

    /** XPath of the login form's submit input, relative to the page root. */
    private static final String SUBMIT_XPATH =
            ".//*[@id='vForm']/div[3]/ul/li[6]/div[2]/input";

    /**
     * Runs the login demo. Any failure is reported on standard error and the
     * program then returns normally.
     *
     * @param args unused
     * @throws FailingHttpStatusCodeException declared for interface
     *         compatibility; failures are caught and reported inside
     * @throws MalformedURLException declared for interface compatibility
     * @throws IOException declared for interface compatibility
     */
    public static void main(String[] args)
            throws FailingHttpStatusCodeException, MalformedURLException,
            IOException {
        // try-with-resources so the client is closed on success and on failure.
        try (WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38)) {
            webClient.getOptions().setJavaScriptEnabled(true); // must be true (original author's note)
            webClient.getOptions().setCssEnabled(false);
            webClient
                    .setAjaxController(new NicelyResynchronizingAjaxController());
            webClient.getOptions().setThrowExceptionOnScriptError(false);

            final HtmlPage page = webClient
                    .getPage("http://login.sina.com.cn/sso/login.php?"
                            + "client=ssologin.js(v1.3.16)");

            final HtmlInput ln = page.getHtmlElementById("username");
            final HtmlInput pwd = page.getHtmlElementById("password");
            final HtmlInput btn = page.getFirstByXPath(SUBMIT_XPATH);
            if (btn == null) {
                // getFirstByXPath yields null when nothing matches; fail with
                // a message that names the cause instead of a bare NPE.
                throw new IllegalStateException(
                        "Login submit input not found at XPath: " + SUBMIT_XPATH);
            }

            ln.setAttribute("value", "你的用户名");
            pwd.setAttribute("value", "你的密码");

            // Submit the login form; the returned page is not needed here.
            btn.click();

            // Login is done; any page can now be fetched with this client.
            final HtmlPage firstPage = webClient.getPage("http://weibo.com/"
                    + "u/2795493364/home?leftnav=1");
            System.out.println(firstPage.asXml());
        } catch (Exception ex) {
            // Top-level boundary of a demo program: report and exit normally.
            // Printing the exception itself keeps its type, which
            // getMessage() alone drops (and which is null for many NPEs).
            System.err.println("ParseSinaBlog failed: " + ex);
        }
    }
}


不会的就用一个例子试试。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: