您的位置:首页 > 其它

使用webmagic爬取搜狗上公众账号发布的文章信息

2015-05-05 01:29 465 查看
/**
 * Crawler that fetches metadata for articles published by WeChat public
 * accounts, using Sogou's WeChat search as the data source.
 */

public class ArticleCrawler {

Static ApplicationContext context = SpringFactory.getApplicationContext();
Static ArticleDao articleDao=(ArticleDao)context.getBean("articleDao");
Static AccountCrawlerDao accountCrawlerDao = (AccountCrawlerDao)context.getBean("accountCrawlerDao");

/**
* 爬取文章信息
* @param pageUrl
* @return
* @throws ParseException
*/
public ArrayList<ArticleBo> getPublicArticles(String pageUrl){
ArrayList<ArticleBo> boList = new ArrayList<>();
Spider spider = Spider.create(new ArticleCrawlerProcessor())
.thread(3).addPipeline(new ResultItemsCollectorPipeline());

ResultItems resultItems = spider.get(pageUrl);
spider.close();

boList=resultItems.get("articleList");
return boList;
}

/**
* 爬取+入库流程
*/
public void run(String openId){
// 得到公众号发布文章信息
ArrayList<ArticleBo> articleList = new ArrayList<ArticleBo>();

String Url = "http://weixin.sogou.com/gzhjs?openid="+openId;

// 爬取文章
articleList = getPublicArticles(Url);

if(null!=articleList&&articleList.size()>0){
// 文章入库
for(ArticleBo bo:articleList){
try {
articleDao.addArticle(bo);
} catch (Exception e){
// 入库失败
}
}
}
}

public static void main(String args[]){
ArticleCrawler crawler = new ArticleCrawler();

// 得到需要爬取发布文章的公众账号
List<AccountCrawlerBo> boList = accountCrawlerDao.getCrawlerList();
if (null!=boList&&boList.size()>0) {
for (AccountCrawlerBo bo:boList) {
crawler.run(bo.getPublicOpenId());
}
}
}


}

/**
 * webmagic {@code PageProcessor} that extracts article entries from a Sogou
 * WeChat search result page.
 */

public class ArticleCrawlerProcessor implements PageProcessor {

// Retry up to 3 times; sleep 60s between requests — presumably to stay under
// Sogou's anti-crawler rate limits (TODO confirm the intended pacing).
private Site site = Site.me().setRetryTimes(3).setSleepTime(60000);

/**
 * Parses every {@code <item>} element on the page into an {@link ArticleBo}
 * and publishes the list under the result key {@code "articleList"}.
 *
 * @param page the fetched page supplied by webmagic
 */
@Override
public void process(Page page) {

    // Each <item> node corresponds to one published article.
    List<String> resultList = page.getHtml().xpath("//item").all();

    List<ArticleBo> articleList = new ArrayList<>();
    if (resultList != null && !resultList.isEmpty()) {
        for (String item : resultList) {
            try {
                articleList.add(parseArticle(new Html(item)));
            } catch (Exception e) {
                // BUG fix: in the original, one malformed item (e.g. a
                // NumberFormatException from Integer.parseInt on "idx")
                // aborted the whole page and dropped every article. Skip
                // only the bad item and keep the rest.
                System.err.println("Skipping unparsable article item");
                e.printStackTrace();
            }
        }
    }

    page.putField("articleList", articleList);
}

/**
 * Extracts one article's fields (title, biz, mid, idx, sn, url, abstract,
 * picture, create time) from a single {@code <item>} fragment.
 *
 * @param html one {@code <item>} fragment wrapped as an {@link Html} document
 * @return the populated article bean
 */
private ArticleBo parseArticle(Html html) {
    // Title arrives CDATA-wrapped; the remaining fields are substrings of the
    // article URL's query parameters or whitespace/'<'-delimited node text.
    String title = StringTool.getMiddleStr("<![CDATA[", "]]>", html.xpath("title/text()").get());
    String url = html.xpath("url/text()").get();
    String biz = StringTool.getMiddleStr("__biz=", "&mid=", url);
    String mid = StringTool.getMiddleStr("&mid=", "&idx=", url);
    int idx = Integer.parseInt(StringTool.getMiddleStr("&idx=", "&sn=", url));
    String sn = StringTool.getMiddleStr("&sn=", "&3rd=", url);
    String publicOpenId = StringTool.getMiddleStr(" ", "<", html.xpath("openid/text()").get());
    String articleAbstract = StringTool.getMiddleStr(" ", "<", html.xpath("content168/text()").get());
    String articleAbstractPicture = StringTool.getMiddleStr(" ", "<", html.xpath("imglink/text()").get());
    String articleCreatetime = StringTool.getMiddleStr(" ", "<", html.xpath("lastmodified/text()").get());
    // lastmodified is a Unix timestamp; convert to the project's time format.
    articleCreatetime = StringTool.getTimeFromUnix(articleCreatetime);

    ArticleBo bo = new ArticleBo();
    bo.setArticleTitle(title);
    bo.setArticleUrl(url);
    bo.setArticleBiz(biz);
    bo.setArticleMid(mid);
    bo.setArticleIdx(idx);
    bo.setArticleSn(sn);
    bo.setPublicOpenId(publicOpenId);
    bo.setArticleAbstract(articleAbstract);
    bo.setArticleAbstractPicture(articleAbstractPicture);
    bo.setArticleCreatetime(articleCreatetime);
    return bo;
}

@Override
public Site getSite() {
    return site;
}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: