使用webmagic爬取搜狗上公众账号发布的文章信息
2015-05-05 01:29
465 查看
/**
 * Article crawling class (outline stub: no members are declared here).
 */
public class ArticleCrawler {
}
/**
 * Crawler page-processor class (outline stub: no members are declared here).
 */
public class ArticleCrawlerProcessor implements PageProcessor {
}
* 爬取文章类
*/
public class ArticleCrawler {
Static ApplicationContext context = SpringFactory.getApplicationContext(); Static ArticleDao articleDao=(ArticleDao)context.getBean("articleDao"); Static AccountCrawlerDao accountCrawlerDao = (AccountCrawlerDao)context.getBean("accountCrawlerDao"); /** * 爬取文章信息 * @param pageUrl * @return * @throws ParseException */ public ArrayList<ArticleBo> getPublicArticles(String pageUrl){ ArrayList<ArticleBo> boList = new ArrayList<>(); Spider spider = Spider.create(new ArticleCrawlerProcessor()) .thread(3).addPipeline(new ResultItemsCollectorPipeline()); ResultItems resultItems = spider.get(pageUrl); spider.close(); boList=resultItems.get("articleList"); return boList; } /** * 爬取+入库流程 */ public void run(String openId){ // 得到公众号发布文章信息 ArrayList<ArticleBo> articleList = new ArrayList<ArticleBo>(); String Url = "http://weixin.sogou.com/gzhjs?openid="+openId; // 爬取文章 articleList = getPublicArticles(Url); if(null!=articleList&&articleList.size()>0){ // 文章入库 for(ArticleBo bo:articleList){ try { articleDao.addArticle(bo); } catch (Exception e){ // 入库失败 } } } } public static void main(String args[]){ ArticleCrawler crawler = new ArticleCrawler(); // 得到需要爬取发布文章的公众账号 List<AccountCrawlerBo> boList = accountCrawlerDao.getCrawlerList(); if (null!=boList&&boList.size()>0) { for (AccountCrawlerBo bo:boList) { crawler.run(bo.getPublicOpenId()); } } }
}
/**
* 爬虫进程类
*/
public class ArticleCrawlerProcessor implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(60000); @Override public void process(Page page) { // 文章列表对应的审查元素 List<String> resultList = page.getHtml().xpath("//item").all(); List<ArticleBo> articleList = new ArrayList<>(); if((resultList!=null)&&(resultList.size()>0)) { // 解析文章信息:title,biz,mid,url,abstract,createTime等 for(int i=0; i<resultList.size(); i++){ Html html = new Html(resultList.get(i)); String title = StringTool.getMiddleStr("<![CDATA[","]]>",html.xpath("title/text()").get()); String url = html.xpath("url/text()").get(); String biz = StringTool.getMiddleStr("__biz=","&mid=",url); String mid = StringTool.getMiddleStr("&mid=","&idx=",url); int idx = Integer.parseInt(StringTool.getMiddleStr("&idx=", "&sn=", url)); String sn = StringTool.getMiddleStr("&sn=","&3rd=",url); String publicOpenId = StringTool.getMiddleStr(" ","<",html.xpath("openid/text()").get()); String articleAbstract = StringTool.getMiddleStr(" ","<", html.xpath("content168/text()").get()); String articleAbstractPicture = StringTool.getMiddleStr(" ","<", html.xpath("imglink/text()").get()); String articleCreatetime = StringTool.getMiddleStr(" ", "<", html.xpath("lastmodified/text()").get()); articleCreatetime = StringTool.getTimeFromUnix(articleCreatetime); ArticleBo bo = new ArticleBo(); bo.setArticleTitle(title); bo.setArticleUrl(url); bo.setArticleBiz(biz); bo.setArticleMid(mid); bo.setArticleIdx(idx); bo.setArticleSn(sn); bo.setPublicOpenId(publicOpenId); bo.setArticleAbstract(articleAbstract); bo.setArticleAbstractPicture(articleAbstractPicture); bo.setArticleCreatetime(articleCreatetime); articleList.add(bo); } } page.putField("articleList", articleList); } @Override public Site getSite() { return site; }
}
相关文章推荐
- 网信办发布《互联网用户公众账号信息服务管理规定》
- 网信办发布《互联网用户公众账号信息服务管理规定》
- 使用Adsutil.vbs脚本获取IIS配置信息及账号密码
- 【超详细教程】使用Windows Live Writer 2012和Office Word 2013 发布文章到博客园全面总结
- 使用 Microsoft Word 发布博客文章
- 使用Windows Live Writer 2012或Office Word 2013 发布文章到博客园
- 使用Cloud Foundry Explorer for Visual Studio查看Predix账号信息
- 使用ScribeFire,Windows Live Writer 2012和Office Word 2013 发布文章到博客园全面总结
- 如何使用live writer客户端来发布CSDN的博客文章
- 使用selenium+BeautifulSoup+正则表达式下载公众号我要whatyouneed文章里的音乐
- [解读小程序]公众号热门文章信息流
- 博客工具 - 使用word2007发布文章到wordpress
- nodejs爬虫-通过抓取搜狗微信网站获取微信文章信息
- 如何使用客户端来发布CSDN文章-window live writer
- 如何使用live writer客户端来发布CSDN的博客文章
- 开源中国微信公众账号使用说明
- (转)【超详细教程】使用Windows Live Writer 2012和Office Word 2013 发布文章到博客园全面总结
- 使用苹果企业级开发者账号发布应用程序
- 使用小书匠编辑器将文章快速发布到各大主要博客平台
- 使用Word2007发布博客文章