htmlUnit的使用
2015-10-11 14:40
585 查看
HtmlUnit的功能比Jsoup要强大一些,它是一个没有界面的浏览器,可以模拟登录。
还可以支持XPath。
下面是一个模拟百度搜索的例子:
下面是登录新浪的例子:
不会的就用一个例子试试。
还可以支持XPath。
下面是一个模拟百度搜索的例子:
package com.swpu; import java.io.IOException; import java.net.MalformedURLException; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.HtmlAnchor; import com.gargoylesoftware.htmlunit.html.HtmlForm; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput; import com.gargoylesoftware.htmlunit.html.HtmlTextInput; public class WorldBankCrawl { public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException { final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setJavaScriptEnabled(false); // 必须加上 HtmlPage page = webClient .getPage("https://www.baidu.com/?tn=96010190_dg"); final HtmlForm form = page.getFormByName("f"); final HtmlSubmitInput submitInput = form.getInputByValue("百度一下"); final HtmlTextInput input = form.getInputByName("wd"); input.setValueAttribute("西游记"); final HtmlPage nextPage = (HtmlPage) submitInput.click(); // String nextString = nextPage.asText(); // System.out.println(nextString); // System.out.println(nextPage.asXml()); final java.util.List<?> images = nextPage .getByXPath("//a[@title='万圣公主']/../..//img"); for (Object image : images) { System.out.println(image); } System.out.println("src:\n"); System.out.println(nextPage .getByXPath("//a[@title='万圣公主']/../..//@src")); System.out.println(nextPage .getByXPath("//a[@title='万圣公主']/../..//@title")); System.out.println(nextPage.getByXPath( "//a[@title='万圣公主']/../..//@title").size()); final java.util.List<?> table = nextPage .getByXPath("//table[@class='c-table opr-toplist-table']"); System.out.println("test table:"); System.out.println(table); final java.util.List<?> link = nextPage .getByXPath("//a[@class='n']//@href"); System.out.println("link:" + link); 
HtmlAnchor next2Anchor = (HtmlAnchor) nextPage.getByXPath( "//a[@class='n']").get(0); HtmlPage next2Page = next2Anchor.click(); System.out.println("NEXT PAGE:\n"); System.out.println(next2Page.asText()); System.out.println("测试XPath函数:"); System.out.println(next2Page .getByXPath("//span[@title='《西游记》人物']/text()")); System.out.println(next2Page.getByXPath("//span[@title='《西游记》人物']")); System.out.println(next2Page.getByXPath("//div[2]")); System.out.println("函数设置:"); System.out.println(next2Page.getByXPath("//a[@class='n']")); // final HtmlTableHeader header = ((HtmlTable) table).getHeader(); // final List<HtmlTableRow> headerRows = header.getRows(); // final HtmlTableHeader header = ((HtmlTable) table).getHeader(); /* * final java.util.List<HtmlTableRow> headerRows = header.getRows(); * for(HtmlTableRow headerRow : headerRows) { * System.out.println(headerRow.asText()); } */ /* * final java.util.List<?> attributeList = * nextPage.getByXPath("//a[@title='万圣公主']/../..//@src;"); for(Object * attr: attributeList) { System.out.println(attr); } */ // System.out.println( // nextPage.getByXPath("//a[@title='万圣公主']/../..//img").get(0)); // System.out.println(nextPage.getByXPath("//a[@title='万圣公主']/../..//img")); // //不需要特定顺序必须使用// } }
下面是登录新浪的例子:
package com.swpu; import java.io.IOException; import java.net.MalformedURLException; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.HtmlInput; import com.gargoylesoftware.htmlunit.html.HtmlPage; public class ParseSinaBlog { public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException { // TODO Auto-generated method stub try { final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38); webClient.getOptions().setJavaScriptEnabled(true); // 必须设置为true webClient.getOptions().setCssEnabled(false); webClient .setAjaxController(new NicelyResynchronizingAjaxController()); webClient.getOptions().setThrowExceptionOnScriptError(false); final HtmlPage page = (HtmlPage) webClient .getPage("http://login.sina.com.cn/sso/login.php?" + "client=ssologin.js(v1.3.16)"); HtmlInput ln = page.getHtmlElementById("username"); HtmlInput pwd = page.getHtmlElementById("password"); HtmlInput btn = page.getFirstByXPath(".//*[@id='vForm']/" + "div[3]/ul/li[6]/div[2]/input"); ln.setAttribute("value", "你的用户名"); pwd.setAttribute("value", "你的密码"); HtmlPage page2 = btn.click(); // 登录完成,现在可以爬取任意你想要的页面了。 // System.out.println(page2.asText()); /* * HtmlAnchor anchor = (HtmlAnchor)page.getByXPath * ("//li/a[@href='http://weibo.com']"); */ /* * System.out.println("anchor:\n\n\n" + page.getByXPath * ("//li/a[@href='http://weibo.com']")); */ /* * HtmlPage weiboPage = anchor.click(); java.util.List<?> wbList = * weiboPage.getByXPath ("//div[@class]"); // \"WB_text W_f14\ * * for(Object object : wbList) { System.out.println(object); } */ /* * HtmlPage nextPage = webClient.getPage("http://weibo.com/friends?" 
* + "leftnav=1&wvr=6&isfriends=1&step=2"); * System.out.println(nextPage.asXml()); */ /* * HtmlPage page3 = webClient.getPage("http://weibo.com/" + * "friends?leftnav=1&wvr=5&isfriends=1&step=2"); */ // System.out.println(" : " + page3.asXml()); HtmlPage firstPage = webClient.getPage("http://weibo.com/" + "u/2795493364/home?leftnav=1"); System.out.println(firstPage.asXml()); /* * java.util.List<?> contents = firstPage.getByXPath * ("//div[@class='WB_text W_f14']"); * * System.out.println("print contents:"); for(Object content : * contents) { System.out.println(content); * * * } */ } catch (Exception ex) { System.out.println(ex.getMessage()); } } }
不会的就用一个例子试试。
相关文章推荐
- word转化html
- 使用etree.HTML的编码问题
- html之超链接
- html 图像处理 灰度图和浮雕图类PS
- 替换html元素
- 添加html元素
- HTML编码
- 改变 HTML 内容
- html弹窗半透明
- HTML 标签
- html中静态进度条的实现
- HTML head 头标签
- Html编码
- MVC 调试页面路径变成 Views/Controller/Action.cshtml问题
- html a标签
- 使用HtmlUnit获取博客园第一页的所有文章标题
- html基础-标题
- html的input输入框提示信息 点击隐藏
- 奖学金申请表--用html中的table实现
- html 中include另外一个页面