大众点评网爬虫实现源码
2016-06-04 00:00
92 查看
摘要: 大众点评网爬虫实现源码
源码如下,大家可以拷贝到神箭手云爬虫(http://www.shenjianshou.cn/)上直接运行:
/**
 * 神箭手云爬虫 (shenjianshou.cn) script: crawls dianping.com for every shop
 * whose name contains the keyword "黄焖鸡米饭" and extracts the shop name,
 * numeric id, creation time, and region/province names.
 *
 * NOTE(review): `Crawler`, `extract`, `extractList`, `getProvinceNameByRegion`
 * and the `site`/`page` callback arguments are globals supplied by the
 * shenjianshou runtime — this script is not runnable stand-alone.
 */

// Search keyword; also reused to filter shop names during extraction.
var keywords = "黄焖鸡米饭";

// Seed URLs: one keyword-search page per city id.
// Domestic city ids run from 1 to 2323 (so 2323 seed URLs in total);
// as a sample this is limited to city 1 (Shanghai) only.
var scanUrls = [];
// for (var i = 1; i <= 2323; i++) {
for (var i = 1; i <= 1; i++) {
  scanUrls.push("http://www.dianping.com/search/keyword/" + i + "/0_" + keywords);
}

var configs = {
  domains: ["dianping.com"],
  scanUrls: scanUrls,
  // Search-result pages are "helper" pages: they yield more URLs, not data.
  helperUrlRegexes: ["http://www.dianping.com/search/keyword/\\d+/0_.*"],
  // Shop "editmember" pages are the content pages fields are extracted from.
  contentUrlRegexes: ["http://www.dianping.com/shop/\\d+/editmember"],
  enableProxy: true,
  interval: 5000, // ms between requests, to avoid being throttled
  fields: [
    {
      name: "shop_name",
      selector: "//div[contains(@class,'shop-review-wrap')]/div/h3/a/text()"
    },
    {
      // Extracted as the shop URL; trimmed down to the trailing numeric id
      // in afterExtractField below.
      name: "id",
      selector: "//div[contains(@class,'shop-review-wrap')]/div/h3/a/@href"
    },
    {
      name: "create_time",
      selector: "//div[contains(@class,'block raw-block')]/ul/li[1]/span"
    },
    {
      name: "region_name",
      selector: "//div[@class='breadcrumb']/b[1]/a/span/text()",
      required: true
    },
    {
      // Same breadcrumb text as region_name; mapped to a province name
      // in afterExtractField below.
      name: "province_name",
      selector: "//div[@class='breadcrumb']/b[1]/a/span/text()"
    }
  ]
};

/**
 * Helper-page callback: queue every non-branch shop's editmember page as a
 * content URL, then schedule pagination (the "next" link plus the two pages
 * after it, computed by bumping the trailing page number in the URL).
 * @returns {boolean} false — helper pages themselves yield no data.
 */
configs.onProcessHelperUrl = function(url, content, site) {
  var urls = extractList(content,
    "//div[@class='tit']/a[not(contains(@class,'shop-branch'))]/@href");
  for (var i = 0; i < urls.length; i++) {
    site.addUrl(urls[i] + "/editmember");
  }

  var nextPage = extract(content, "//div[@class='page']/a[@class='next']/@href");
  if (nextPage) {
    site.addUrl(nextPage);
    var pageMatch = /\d+$/.exec(nextPage);
    if (pageMatch) {
      // Fix: the original computed this URL into an unused local (`lll`)
      // and then recomputed it; build the prefix/number once instead.
      var pageNum = parseInt(pageMatch[0], 10);
      var prefix = nextPage.substr(0, nextPage.length - pageMatch[0].length);
      site.addUrl(prefix + (pageNum + 1));
      site.addUrl(prefix + (pageNum + 2));
    }
  }
  return false;
};

/**
 * Post-process each extracted field:
 *  - id:              keep only the trailing numeric shop id from the href;
 *  - shop_name:       skip the whole page if the name lacks the keyword;
 *  - create_time:     normalize a trailing "YY-MM-DD" to "20YY-MM-DD";
 *  - region/province: trim the breadcrumb to the city/county name, strip
 *                     "餐厅", then map region -> province for province_name.
 * @returns {string} the possibly-rewritten field value.
 */
configs.afterExtractField = function(fieldName, data, page) {
  if (fieldName == "id") {
    var idMatch = /\d+$/.exec(data);
    if (idMatch) {
      data = idMatch[0];
    }
  } else if (fieldName == "shop_name") {
    // Consistency fix: reuse the `keywords` variable instead of repeating
    // the literal. Pages for unrelated shops are dropped entirely.
    if (data.indexOf(keywords) == -1) {
      page.skip();
    }
  } else if (fieldName == "create_time") {
    // Bug fix: the original dereferenced the match unconditionally and
    // threw a TypeError when the date pattern was absent; only rewrite
    // the value when the pattern actually matches.
    var timeMatch = /\d{2}-\d{2}-\d{2}$/.exec(data);
    if (timeMatch) {
      data = "20" + timeMatch[0];
    }
  } else if (fieldName == "province_name" || fieldName == "region_name") {
    // Cut the breadcrumb after the first "县" (county) or "市" (city)
    // marker, unless that marker is already the last character.
    var position = data.indexOf("县");
    if (position != -1 && position < data.length - 1) {
      data = data.substr(0, position + 1);
    }
    position = data.indexOf("市");
    if (position != -1 && position < data.length - 1) {
      data = data.substr(0, position + 1);
    }
    data = data.replace("餐厅", "");
    if (fieldName == "province_name") {
      // getProvinceNameByRegion is a shenjianshou platform helper.
      data = getProvinceNameByRegion(data);
    }
  }
  return data;
};

var crawler = new Crawler(configs);
crawler.start();
相关文章推荐
- 从源码安装Mysql/Percona 5.5
- Python3写爬虫(四)多线程实现数据爬取
- Scrapy的架构介绍
- 爬虫笔记
- dedecms采集过滤常用代码集合
- 浅析Ruby的源代码布局及其编程风格
- 基于C#实现网页爬虫
- Nodejs爬虫进阶教程之异步并发控制
- asp.net 抓取网页源码三种实现方法
- PHP+HTML+JavaScript+Css实现简单爬虫开发
- JS小游戏之仙剑翻牌源码详解
- JS小游戏之宇宙战机源码详解
- jQuery源码分析之jQuery中的循环技巧详解
- 本人自用的global.js库源码分享
- java中原码、反码与补码的问题分析
- ASP.NET使用HttpWebRequest读取远程网页源代码
- php采集速度探究总结(原创)
- phpphp图片采集后按原路径保存图片示例
- asp代理采集的核心函数代码
- perl 采集入库脚本分享