您的位置:首页 > 理论基础 > 计算机网络

Java之网络爬虫WebCollector+selenium+phantomjs(三)

2016-08-27 14:14 519 查看
经过前面两篇《Java之网络爬虫WebCollector+selenium+phantomjs(一)》和《Java之网络爬虫WebCollector+selenium+phantomjs(二)》的学习后,我们来做一个小例子。我们要做的是:爬取京东列表页面,在页面上抽取出商品信息(名称、价格、评价),然后打印出抽取到的商品信息。

贴出代码:

Goods.java
/*
* Copyright (C) 2015 zhao
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/
package com.zhao.crawler;

/**
 * Product information.
 *
 * <p>A plain mutable bean: every property is {@code null} until its setter is called.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-21
 */
public class Goods {

    /** Identifier of the platform the product belongs to. */
    private String platform;

    /** Link to the product. */
    private String url;

    /** Product name. */
    private String name;

    /** Product price. */
    private Float price;

    /** Value of the "commit" property (an {@code Integer}); may be {@code null}. */
    private Integer commit;

    /** Creates an instance with all properties unset. */
    public Goods() {
    }

    /** @return the platform identifier, or {@code null} if unset */
    public String getPlatform() {
        return this.platform;
    }

    /** @param platform the platform identifier to store */
    public void setPlatform(String platform) {
        this.platform = platform;
    }

    /** @return the product name, or {@code null} if unset */
    public String getName() {
        return this.name;
    }

    /** @param name the product name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the product link, or {@code null} if unset */
    public String getUrl() {
        return this.url;
    }

    /** @param url the product link to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the price, or {@code null} if unset */
    public Float getPrice() {
        return this.price;
    }

    /** @param price the price to store */
    public void setPrice(Float price) {
        this.price = price;
    }

    /** @return the "commit" value, or {@code null} if unset */
    public Integer getCommit() {
        return this.commit;
    }

    /** @param commit the "commit" value to store */
    public void setCommit(Integer commit) {
        this.commit = commit;
    }

    /**
     * Renders all properties as
     * <code>{platform=..,url=..,name=..,price=..,commit=..}</code>;
     * unset properties appear as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("{");
        text.append("platform=").append(platform);
        text.append(",url=").append(url);
        text.append(",name=").append(name);
        text.append(",price=").append(price);
        text.append(",commit=").append(commit);
        text.append("}");
        return text.toString();
    }

}


上面类为封装的商品信息。

ECCrawler.java
/*
* Copyright (C) 2015 zhao
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/
package com.zhao.crawler;

import java.util.concurrent.atomic.AtomicInteger;

import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import cn.edu.hfut.dmic.webcollector.util.RegexRule;

/**
 * Crawler base class for e-commerce platforms.
 *
 * <p>On {@link #start(int)} it asks the subclass how many listing pages exist
 * ({@link #getTotalPage(Page)}), registers one seed URL per page, and then runs
 * the inherited crawl. Each fetched page is handed to {@link #visit(Page, Links)}.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-20
 */
public abstract class ECCrawler extends DeepCrawler {

    /** Format string from which seed URLs are built (see {@link #getSeed(String, Object...)}). */
    private String seedFormat;

    /** Rules passed to link extraction in {@link #visitAndGetNextLinks(Page)}. */
    protected RegexRule regexRule;

    /*
     * Auto-incrementing id; the original note says it is used to generate
     * unique file names. It is not referenced anywhere else in this class.
     */
    AtomicInteger id = new AtomicInteger(0);

    /**
     * @param crawlPath  passed through to the {@code DeepCrawler} constructor
     * @param seedFormat format string for seed URLs
     */
    public ECCrawler(String crawlPath, String seedFormat) {
        super(crawlPath);
        this.regexRule = new RegexRule();
        this.seedFormat = seedFormat;
    }

    /** @return the rule set used for link extraction */
    public RegexRule getRegexRule() {
        return this.regexRule;
    }

    /** @param regexRule the rule set to use for link extraction */
    public void setRegexRule(RegexRule regexRule) {
        this.regexRule = regexRule;
    }

    /**
     * Adds one URL rule to the current rule set.
     *
     * @param urlRegex rule to add
     */
    public void addRegex(String urlRegex) {
        regexRule.addRule(urlRegex);
    }

    /**
     * Collects the links of an HTML page that match {@link #regexRule}, then
     * delegates to {@link #visit(Page, Links)}. Exceptions thrown by
     * {@code visit} are logged and do not propagate.
     *
     * @param page the fetched page
     * @return the links collected for the next crawl layer
     */
    @Override
    public Links visitAndGetNextLinks(Page page) {
        Links collected = new Links();

        String contentType = page.getResponse().getContentType();
        boolean html = contentType != null && contentType.contains("text/html");
        if (html && page.getDoc() != null) {
            collected.addAllFromDocument(page.getDoc(), regexRule);
        }

        try {
            visit(page, collected);
        } catch (Exception ex) {
            LOG.info("Exception", ex);
        }
        return collected;
    }

    /**
     * Registers the seed URLs and then starts the inherited crawl.
     *
     * @param depth passed through to {@code DeepCrawler.start}
     * @throws Exception if seeding or the crawl fails
     */
    @Override
    public void start(int depth) throws Exception {
        addSeed();
        super.start(depth);
    }

    /**
     * Fetches listing page 1 to learn the total page count, then adds a seed
     * for every page from 1 to that count.
     *
     * @throws Exception if the first page cannot be fetched
     */
    private void addSeed() throws Exception {
        String firstSeed = getSeed(seedFormat, 1);
        int totalPage = getTotalPage(getPage(firstSeed));

        int pageNo = 1;
        while (pageNo <= totalPage) {
            this.addSeed(getSeed(seedFormat, pageNo));
            pageNo++;
        }
    }

    /**
     * Fetches {@code url} and wraps the response in a {@link Page}.
     *
     * @param url address to request
     * @return a page carrying the url, the HTML text and the raw response
     * @throws Exception if the request fails
     */
    private Page getPage(String url) throws Exception {
        HttpResponse response = new HttpRequest(url).getResponse();

        Page result = new Page();
        result.setUrl(url);
        result.setHtml(response.getHtmlByCharsetDetect());
        result.setResponse(response);
        return result;
    }

    /**
     * Returns the total number of listing pages for the product query.
     *
     * @param page the first listing page
     * @return the number of pages to seed
     */
    public abstract int getTotalPage(Page page);

    /**
     * Builds a seed URL by applying {@code String.format}.
     *
     * @param seedFormat format string
     * @param page       format arguments (the callers in this class pass the page number)
     * @return the formatted URL
     */
    public String getSeed(String seedFormat, Object... page) {
        return String.format(seedFormat, page);
    }

    /**
     * Processes one fetched page.
     *
     * @param page  the fetched page
     * @param links links collected from the page; passed on as the next-layer links afterwards
     */
    public abstract void visit(Page page, Links links);
}


上面抽象类继承DeepCrawler,为爬取电商列表页的基类,爬取列表页html(包括js动态生成的html),并且可以抽取到列表页数,允许捕获所有页商品信息。

GoodsList.java
/*
* Copyright (C) 2015 zhao
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/
package com.zhao.crawler;

import java.util.ArrayList;

import cn.edu.hfut.dmic.webcollector.model.Page;

/**
 * A list of {@link Goods} with one additional operation,
 * {@link #addGoods(Page)}, that concrete subclasses must provide.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public abstract class GoodsList extends ArrayList<Goods> {

    /** Serialization version id ({@code ArrayList} is serializable). */
    private static final long serialVersionUID = -6935403464055289581L;

    /**
     * Adds goods for the given page. How the goods are obtained from the
     * page is defined entirely by the implementing subclass.
     *
     * @param page the page to take goods from
     */
    public abstract void addGoods(Page page);
}


上面抽象类为存储商品信息的容器,继承自ArrayList,并且添加addGoods方法,用来添加商品信息到容器中。

JDCrawler.java
/*
* Copyright (C) 2015 zhao
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/
package com.zhao.crawler.jd;

import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;

import com.zhao.crawler.ECCrawler;
import com.zhao.crawler.Goods;

/**
 * JD crawler: an {@link ECCrawler} that hands every visited page to a
 * {@link JDGoodsList} and can print what was collected.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-20
 */
public class JDCrawler extends ECCrawler {

    /** Receives every page passed to {@link #visit(Page, Links)}. */
    private JDGoodsList goodsList;

    /**
     * @param crawlPath  passed through to {@link ECCrawler}
     * @param seekFormat seed URL format string, passed through to {@link ECCrawler}
     */
    public JDCrawler(String crawlPath, String seekFormat) {
        super(crawlPath, seekFormat);
        this.goodsList = new JDGoodsList();
    }

    /**
     * Always reports a single listing page.
     *
     * @param page ignored
     * @return {@code 1}
     */
    @Override
    public int getTotalPage(Page page) {
        // A disabled earlier version read the count from the pager element:
        //   Element ele=page.getDoc().select("div#J_bottomPage").select("span.p-skip >em").first().select("b").first();
        //   return ele==null?0:Integer.parseInt(ele.text());
        return 1;
    }

    /**
     * Prints the page url and link count, then passes the page to the goods list.
     */
    @Override
    public void visit(Page page, Links links) {
        String summary = "url:" + page.getUrl() + "\tlinks size:" + links.size();
        System.out.println(summary);
        goodsList.addGoods(page);
    }

    /**
     * Crawls one JD listing and prints the collected goods.
     *
     * @param args unused
     * @throws Exception if the crawl fails
     */
    public static void main(String[] args) throws Exception {
        String crawlPath = "D:/test/crawler/jd/";
        String seedFormat = "http://list.jd.com/list.html?cat=1319,1523,7052&page=%s&go=0&JL=6_0_0";

        JDCrawler crawler = new JDCrawler(crawlPath, seedFormat);
        crawler.setThreads(100); // number of crawl threads
        crawler.start(1);        // crawl depth (layers)

        crawler.print();
    }

    /** Prints every collected item on its own line, in list order. */
    protected void print() {
        for (int i = 0; i < goodsList.size(); i++) {
            System.out.println(goodsList.get(i));
        }
    }
}


继承ECCrawler,实现京东平台专属爬取类。获取页码数利用浏览器审查元素,定位到页面信息即可,为了方便测试,这里只返回1。启动时我们直接爬取种子页面,所以设置为1即可,具体的抽取商品信息交给了下面JDGoodsList来处理。抓取结束后,执行一遍打印函数,打印出商品信息。

JDGoodsList.java
/*
* Copyright (C) 2015 zhao
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/
package com.zhao.crawler.jd;

import java.util.List;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;

import cn.edu.hfut.dmic.webcollector.model.Page;

import com.zhao.crawler.Goods;
import com.zhao.crawler.GoodsList;
import com.zhao.crawler.util.PageUtils;
import com.zhao.crawler.util.Platform;
import com.zhao.crawler.util.Tools;

/**
 * {@link GoodsList} that fills itself from the {@code li.gl-item} elements of
 * a page rendered through {@link PageUtils#getWebDriver(Page)}.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public class JDGoodsList extends GoodsList {

    private static final long serialVersionUID = -7487110223660262262L;

    /** Stored as price / commit when the text is empty or not a number. */
    private static final float UNKNOWN_PRICE = -1f;
    private static final int UNKNOWN_COMMIT = -1;

    /**
     * Renders {@code page} in a WebDriver and appends one {@link Goods} per
     * {@code li.gl-item} element found.
     *
     * <p>An item whose expected child elements are missing is skipped (its
     * stack trace is printed) instead of aborting the remaining items.
     * Any other failure is printed and leaves the list with whatever was
     * added so far. The driver is always quit.
     *
     * @param page page whose url is loaded in the driver
     */
    @Override
    public void addGoods(Page page) {
        WebDriver driver = null;
        try {
            driver = PageUtils.getWebDriver(page);
            List<WebElement> items = driver.findElements(By.cssSelector("li.gl-item"));
            if (items.isEmpty()) {
                System.out.println("else is empty");
                return; // the finally block below still quits the driver
            }
            for (WebElement item : items) {
                Goods goods;
                try {
                    goods = toGoods(item);
                } catch (RuntimeException e) {
                    // One malformed item must not discard the rest of the page.
                    e.printStackTrace();
                    continue;
                }
                // ArrayList is not thread-safe. JDCrawler is configured with 100
                // threads, so this method is presumably called concurrently
                // (TODO confirm against WebCollector's fetcher).
                synchronized (this) {
                    add(goods);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (driver != null) {
                driver.quit();
            }
        }
    }

    /**
     * Builds a {@link Goods} from one list item.
     *
     * @throws RuntimeException (e.g. Selenium's NoSuchElementException) if an
     *         expected child element is missing
     */
    private static Goods toGoods(WebElement item) {
        Goods goods = new Goods();
        goods.setPlatform(Platform.JD); // e-commerce platform

        // price
        String priceText = item.findElement(By.className("p-price"))
                .findElement(By.className("J_price"))
                .findElement(By.tagName("i"))
                .getText();
        goods.setPrice(parsePrice(priceText));

        // product name and link share the same container; look it up once
        WebElement nameBox = item.findElement(By.className("p-name"));
        goods.setName(nameBox.findElement(By.tagName("em")).getText());
        goods.setUrl(nameBox.findElement(By.tagName("a")).getAttribute("href"));

        // reviews
        String commitText = item.findElement(By.className("p-commit"))
                .findElement(By.tagName("a"))
                .getText();
        goods.setCommit(parseCommit(commitText));

        return goods;
    }

    /** Parses a price; empty or non-numeric text yields {@link #UNKNOWN_PRICE}. */
    private static float parsePrice(String text) {
        if (Tools.notEmpty(text)) {
            try {
                return Float.parseFloat(text.trim());
            } catch (NumberFormatException ignored) {
                // not a plain number: fall through to the sentinel
            }
        }
        return UNKNOWN_PRICE;
    }

    /** Parses a review count; empty or non-numeric text yields {@link #UNKNOWN_COMMIT}. */
    private static int parseCommit(String text) {
        if (Tools.notEmpty(text)) {
            try {
                return Integer.parseInt(text.trim());
            } catch (NumberFormatException ignored) {
                // not a plain integer: fall through to the sentinel
            }
        }
        return UNKNOWN_COMMIT;
    }
}

PageUtils.java

/*
* Copyright (C) 2015 zhao
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package com.zhao.crawler.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;

import com.gargoylesoftware.htmlunit.BrowserVersion;

import cn.edu.hfut.dmic.webcollector.model.Page;

/**
 * Factory helpers that load a {@link Page}'s url in various drivers.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-22
 */
public class PageUtils {

    /** PhantomJS executable (machine-specific path). */
    private static final String PHANTOMJS_BINARY =
            "D:/Program Files/phantomjs-2.0.0-windows/bin/phantomjs.exe";

    /** Script handed to PhantomJS by {@link #getPhantomJSDriver(Page)} (machine-specific path). */
    private static final String PARSER_SCRIPT =
            "D:/MyEclipseWorkSpace/WebCollectorDemo/src/main/resources/parser.js";

    /**
     * Returns an HtmlUnitDriver (default browser emulation) with JavaScript
     * enabled, after loading the page's url.
     *
     * @param page page whose url is loaded
     * @return the driver; the caller is responsible for quitting it
     */
    public static HtmlUnitDriver getDriver(Page page) {
        HtmlUnitDriver driver = new HtmlUnitDriver();
        driver.setJavascriptEnabled(true);
        driver.get(page.getUrl());
        return driver;
    }

    /**
     * Returns an HtmlUnitDriver emulating {@code browserVersion} with
     * JavaScript enabled, after loading the page's url.
     *
     * @param page           page whose url is loaded
     * @param browserVersion browser to emulate
     * @return the driver; the caller is responsible for quitting it
     */
    public static HtmlUnitDriver getDriver(Page page,
            BrowserVersion browserVersion) {
        HtmlUnitDriver driver = new HtmlUnitDriver(browserVersion);
        driver.setJavascriptEnabled(true);
        driver.get(page.getUrl());
        return driver;
    }

    /**
     * Returns a PhantomJSDriver (able to render JavaScript-generated HTML)
     * after loading the page's url.
     *
     * <p>If loading fails, the driver is quit before the exception is
     * rethrown, so the PhantomJS process is not left running.
     *
     * @param page page whose url is loaded
     * @return the driver; the caller is responsible for quitting it
     */
    public static WebDriver getWebDriver(Page page) {
        // Alternatives tried by the original author: new HtmlUnitDriver(true),
        // and ChromeDriver with the "webdriver.chrome.driver" property set.
        System.setProperty("phantomjs.binary.path", PHANTOMJS_BINARY);
        WebDriver driver = new PhantomJSDriver();
        try {
            driver.get(page.getUrl());
        } catch (RuntimeException e) {
            try {
                driver.quit();
            } catch (RuntimeException ignored) {
                // keep the original failure; cleanup is best-effort
            }
            throw e;
        }
        return driver;
    }

    /**
     * Runs native PhantomJS (without selenium) on the parser script and the
     * page's url, and returns everything the process wrote to standard output.
     *
     * <p>The command is passed as separate arguments: a single command string
     * would be split on whitespace and break the executable path, which
     * contains a space. Output lines are concatenated without line separators.
     *
     * @param page page whose (trimmed) url is passed to the script
     * @return the process output, or {@code null} if an I/O error occurred
     */
    public static String getPhantomJSDriver(Page page) {
        Process process = null;
        BufferedReader br = null;
        try {
            process = new ProcessBuilder(
                    PHANTOMJS_BINARY, PARSER_SCRIPT, page.getUrl().trim()).start();
            InputStream in = process.getInputStream();
            br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            StringBuilder output = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                output.append(line);
            }
            return output.toString();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // nothing useful to do if closing the pipe fails
                }
            }
            if (process != null) {
                process.destroy(); // releases the child process and its remaining streams
            }
        }
    }
}


获取WebDriver工具类,上篇有介绍。

Platform.java
/*
* Copyright (C) 2015 zhao
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/
package com.zhao.crawler.util;

/**
 * Identifiers of e-commerce platforms.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public interface Platform {

    /** JD (Jingdong). Interface fields are implicitly {@code public static final}. */
    String JD = "JD";
}

Tools.java
/*
* Copyright (C) 2015 zhao
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package com.zhao.crawler.util;

import org.apache.commons.lang3.StringUtils;

/**
 * String helper methods.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public class Tools {

    /**
     * Tells whether a string carries no usable value: it is blank according
     * to {@code StringUtils.isBlank}, or it is the literal text {@code "null"}.
     *
     * @param str string to check, may be {@code null}
     * @return {@code true} if the string is blank or equals {@code "null"}
     */
    public static boolean isEmpty(String str) {
        if (StringUtils.isBlank(str)) {
            return true;
        }
        return "null".equals(str);
    }

    /**
     * Negation of the emptiness check: the string is not blank according to
     * {@code StringUtils.isBlank} and is not the literal text {@code "null"}.
     *
     * @param str string to check, may be {@code null}
     * @return {@code true} if the string is neither blank nor {@code "null"}
     */
    public static boolean notEmpty(String str) {
        return !(StringUtils.isBlank(str) || "null".equals(str));
    }

}


运行程序,控制台输出结果为:



ok,成功抽取商品信息。

至此,本次学习结束。源码下载地址(免费下载):WebCollectorDemo
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: