您的位置:首页 > 其它

使用 WebMagic 0.6.0 抓取阿里云(aliyun)博客

2017-02-15 00:00 10 查看
抓取网站:https://yq.aliyun.com/articles



依赖包(lib)下载:https://github.com/code4craft/webmagic/releases/download/WebMagic-0.6.0/webmagic-0.6.0-all.tar.gz



代码

import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

/**
 * WebMagic {@link PageProcessor} that crawls the article listing at
 * {@link #URL_START}, follows pagination and article links, and extracts
 * a set of fields (title, author, tags, content, ...) from every article page.
 *
 * <p>Only article pages produce output; listing pages and any other page are
 * marked as skipped so the pipeline does not persist empty results for them.
 */
public class CrawlAliyun implements PageProcessor {

    /** Entry URL of the crawl; also used as a regex to recognise the first listing page. */
    public static final String URL_START = "https://yq.aliyun.com/articles/type_all";
    /** Regex for paginated listing pages. */
    public static final String URL_PAGE = "https://yq.aliyun.com/articles/type_all-order_createtime-page_[0-9]+";
    /** Regex for tag pages. */
    public static final String URL_TAGS = "https://yq.aliyun.com/tags/type_blog-tagid_[0-9]+";
    /** Regex for individual article pages. */
    public static final String URL_CONTENT = "https://yq.aliyun.com/articles/[0-9]+";

    /** Output directory used when none is given on the command line. */
    private static final String DEFAULT_OUTPUT_DIR = "F:\\webmagic\\";

    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";

    // Part one: crawl configuration (charset, delay between requests, retry count, user agent).
    // Configured once here instead of being mutated on every getSite() call.
    private final Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setCharset("utf-8")
            .setUserAgent(USER_AGENT);

    /**
     * Core extraction hook: decides what kind of page was downloaded and
     * either queues further links (listing pages) or extracts article fields.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Selectable url = page.getUrl();
        boolean listing = url.regex(URL_PAGE).match() || url.regex(URL_START).match();
        boolean article = url.regex(URL_CONTENT).match();

        if (listing) {
            processListing(page);
        }
        if (article) {
            processArticle(page);
        } else {
            // Nothing was extracted from this page; skip the pipelines so no
            // empty result file is written for it.
            page.setSkip(true);
        }
    }

    /** Queues pagination links and article links found in the listing section. */
    private void processListing(Page page) {
        System.out.println("=====pageurl=====" + page.getUrl());
        Selectable listSection = page.getHtml().xpath("//section[@class='yq-new-list yq-n-l-blog']");
        Selectable links = listSection.links();

        page.addTargetRequests(links.regex(URL_PAGE).all());
        page.addTargetRequests(links.regex(URL_CONTENT).all());
    }

    /** Queues the article's tag links and stores the article's fields on the page. */
    private void processArticle(Page page) {
        System.out.println("=====arturl=====" + page.getUrl());
        Selectable html = page.getHtml();

        // NOTE(review): tag pages are queued here, but process() has no branch
        // that extracts anything from them -- confirm whether they are needed.
        List<String> tagUrls = html.xpath("//p[@class='blog-tags']").links().regex(URL_TAGS).all();
        page.addTargetRequests(tagUrls);

        page.putField("url", page.getUrl().toString());
        page.putField("title", html.xpath("//h2[@class='blog-title']/text()").toString());
        page.putField("author", html.xpath("//a[@class='b-author']/text()").toString());
        page.putField("authorUrl", html.xpath("//a[@class='b-author']").$("a", "href").toString());
        page.putField("createtime", html.xpath("//span[@class='b-time']/text()").toString());

        // toString() yields null when the element is absent; guard before
        // stripping the label so a missing view counter does not throw.
        String watched = html.xpath("//span[@class='b-watch']/text()").toString();
        page.putField("watched", watched == null ? null : watched.replace("浏览", ""));

        page.putField("tags", html.xpath("//p[@class='blog-tags']/a/text()").all());
        page.putField("summary", html.xpath("//p[@class='blog-summary']/text()").toString());
        page.putField("content", html.xpath("//div[@class='content-detail']/html()").toString());
    }

    /**
     * @return the site configuration used for this crawl
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl.
     *
     * @param args optional; {@code args[0]} is the directory the JSON results
     *             are written to. Defaults to {@code F:\webmagic\} when omitted.
     */
    public static void main(String[] args) {
        String outputDir = args.length > 0 ? args[0] : DEFAULT_OUTPUT_DIR;

        Spider.create(new CrawlAliyun())
                // start crawling from the article listing entry page
                .addUrl(URL_START)
                .addPipeline(new JsonFilePipeline(outputDir))
                // crawl with 5 threads
                .thread(5)
                // run the spider (blocks until the crawl finishes)
                .run();
    }
}

结果

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  webmagic