[爬虫源码]和大家分享一下瓜子二手车上的二手车信息爬虫源码
2016-12-09 16:53
627 查看
使用JavaScript编写的爬虫源码,用于爬取瓜子二手车上的二手车信息
源码如下:
// Entry URL for the crawl. The trailing //@input(...) annotation is kept verbatim.
// NOTE(review): it looks like a platform directive that exposes scanUrl as a
// user-supplied setting (its label and hint text included) - confirm before editing it.
var scanUrl = "http://www.guazi.com/hz/buy/";//@input(scanUrl, 入口url, 请输入一个需爬取城市的url,格式为:“http://www.guazi.com/城市名称/buy/”)

/**
 * Extracts the city segment from a URL shaped like
 * "http://www.guazi.com/<city>/buy/".
 *
 * Both marker positions are computed on the trimmed string, so leading or
 * trailing whitespace in the input no longer shifts the extracted range.
 *
 * @param {string} url - Entry URL; null/undefined are treated as an empty string.
 * @returns {string} The text between ".com/" and the following "/buy/", or ""
 *     when either marker is missing.
 */
var parseCityFromUrl = function (url) {
    var trimmed = (url === null || url === undefined) ? "" : String(url).trim();
    var hostMarker = ".com/";
    var hostAt = trimmed.indexOf(hostMarker);
    if (hostAt < 0) {
        return "";
    }
    var cityStart = hostAt + hostMarker.length;
    // Search for "/buy/" only after the host part so the range can never be inverted.
    var cityEnd = trimmed.indexOf("/buy/", cityStart);
    if (cityEnd < 0) {
        return "";
    }
    return trimmed.substring(cityStart, cityEnd);
};

// Always a string: "" (rather than undefined) when the URL cannot be parsed,
// so later string concatenation never embeds the literal text "undefined".
var city = parseCityFromUrl(scanUrl);
// Crawl configuration object. It reads the `scanUrl` and `city` variables
// declared earlier in the script; the meaning of each key is defined by the
// crawler platform that consumes this object (not visible in this file).
var configs = (function () {
    // Shared prefix of every URL pattern: the site root followed by the city segment.
    var cityUrlPrefix = "https?://www\\.guazi\\.com/" + city;

    // [field name, XPath selector] pairs, in output order.
    var fieldTable = [
        ["car_name", "//h1[contains(@class,'dt-titletype')]"],
        ["car_price", "//span[contains(@class,'fc-org pricestype')]"],
        ["car_license", "//li[contains(@class,'one')]/b"],
        ["car_mileage", "//ul[contains(@class,'assort')]/li[2]/b"],
        ["car_gearbox", "//ul[contains(@class,'assort')]/li[3]/b"],
        ["car_emission_standard", "//li[contains(@class,'em-sta detailHoverTips')]/b"],
        ["car_license_location", "//ul[contains(@class,'assort')]/li[5]/b"],
        ["car_owner", "//li[contains(@class,'owner')]/text()[2]"],
        ["car_description", "//*[@id='base']/p"]
    ];

    var fieldList = fieldTable.map(function (pair) {
        return { name: pair[0], selector: pair[1] };
    });

    return {
        domains: ["guazi.com"],
        scanUrls: [scanUrl],
        // Pattern for per-car pages: <city>/<word chars>.htm
        contentUrlRegexes: [cityUrlPrefix + "/\\w+\\.htm"],
        // Pattern for listing pages: <city>/buy/ with an optional o<digits>/ suffix
        helperUrlRegexes: [cityUrlPrefix + "/buy/(o\\d+/)?"],
        enableJS: false,
        interval: 10000, // NOTE(review): presumably a delay in milliseconds - confirm against platform docs
        fields: fieldList
    };
})();
/**
 * Cleans up the raw value of a single field after extraction.
 *
 * - car_price: joins the content of the <b> element (via extract) with the
 *   remainder of the markup (via exclude).
 * - car_owner: strips surrounding whitespace.
 * - car_description: removes every empty "<em></em>" pair.
 * - anything else, or a non-string value: returned unchanged.
 *
 * `extract` and `exclude` are not defined in this file.
 * NOTE(review): presumably helpers supplied by the crawler platform - confirm.
 *
 * @param {string} fieldName - Name of the field being processed.
 * @param {*} data - Raw extracted value for that field.
 * @param {*} page - Unused here; kept for the callback signature.
 * @returns {*} The cleaned value.
 */
configs.afterExtractField = function(fieldName, data, page) {
    // .trim()/.replace() below would throw on null or other non-strings.
    if (typeof data !== "string") {
        return data;
    }
    if (fieldName === "car_price") {
        // NOTE(review): the previous code replaced "¥" with "¥" (a no-op),
        // which looks like an entity-decoded "&yen;"; decoding the entity here
        // is an inference - confirm against the upstream sample.
        var price = extract(data, "//b").replace(/&yen;/g, "¥");
        var coinUnit = exclude(data, "//b");
        return price + coinUnit;
    }
    if (fieldName === "car_owner") {
        return data.trim();
    }
    if (fieldName === "car_description") {
        // Global regex: a plain-string pattern would only drop the first match.
        return data.replace(/<em><\/em>/g, "");
    }
    return data;
};
// Create the crawler from the `configs` object (including its
// afterExtractField callback) and start it.
// NOTE(review): `Crawler` is not defined in this file - presumably supplied by
// the hosting crawler platform; confirm.
var crawler = new Crawler(configs);
crawler.start();
代码运行方法及运行效果:
https://github.com/ShenJianShou/crawler_samples/blob/master/%E5%A6%82%E4%BD%95%E6%89%A7%E8%A1%8C%E6%A0%B7%E4%BE%8B%E4%BB%A3%E7%A0%81.txt
源码如下:
// Entry URL for the crawl. The trailing //@input(...) annotation is kept verbatim.
// NOTE(review): it looks like a platform directive that exposes scanUrl as a
// user-supplied setting (its label and hint text included) - confirm before editing it.
var scanUrl = "http://www.guazi.com/hz/buy/";//@input(scanUrl, 入口url, 请输入一个需爬取城市的url,格式为:“http://www.guazi.com/城市名称/buy/”)

/**
 * Extracts the city segment from a URL shaped like
 * "http://www.guazi.com/<city>/buy/".
 *
 * Both marker positions are computed on the trimmed string, so leading or
 * trailing whitespace in the input no longer shifts the extracted range.
 *
 * @param {string} url - Entry URL; null/undefined are treated as an empty string.
 * @returns {string} The text between ".com/" and the following "/buy/", or ""
 *     when either marker is missing.
 */
var parseCityFromUrl = function (url) {
    var trimmed = (url === null || url === undefined) ? "" : String(url).trim();
    var hostMarker = ".com/";
    var hostAt = trimmed.indexOf(hostMarker);
    if (hostAt < 0) {
        return "";
    }
    var cityStart = hostAt + hostMarker.length;
    // Search for "/buy/" only after the host part so the range can never be inverted.
    var cityEnd = trimmed.indexOf("/buy/", cityStart);
    if (cityEnd < 0) {
        return "";
    }
    return trimmed.substring(cityStart, cityEnd);
};

// Always a string: "" (rather than undefined) when the URL cannot be parsed,
// so later string concatenation never embeds the literal text "undefined".
var city = parseCityFromUrl(scanUrl);
// Crawl configuration object. It reads the `scanUrl` and `city` variables
// declared earlier in the script; the meaning of each key is defined by the
// crawler platform that consumes this object (not visible in this file).
var configs = (function () {
    // Shared prefix of every URL pattern: the site root followed by the city segment.
    var cityUrlPrefix = "https?://www\\.guazi\\.com/" + city;

    // [field name, XPath selector] pairs, in output order.
    var fieldTable = [
        ["car_name", "//h1[contains(@class,'dt-titletype')]"],
        ["car_price", "//span[contains(@class,'fc-org pricestype')]"],
        ["car_license", "//li[contains(@class,'one')]/b"],
        ["car_mileage", "//ul[contains(@class,'assort')]/li[2]/b"],
        ["car_gearbox", "//ul[contains(@class,'assort')]/li[3]/b"],
        ["car_emission_standard", "//li[contains(@class,'em-sta detailHoverTips')]/b"],
        ["car_license_location", "//ul[contains(@class,'assort')]/li[5]/b"],
        ["car_owner", "//li[contains(@class,'owner')]/text()[2]"],
        ["car_description", "//*[@id='base']/p"]
    ];

    var fieldList = fieldTable.map(function (pair) {
        return { name: pair[0], selector: pair[1] };
    });

    return {
        domains: ["guazi.com"],
        scanUrls: [scanUrl],
        // Pattern for per-car pages: <city>/<word chars>.htm
        contentUrlRegexes: [cityUrlPrefix + "/\\w+\\.htm"],
        // Pattern for listing pages: <city>/buy/ with an optional o<digits>/ suffix
        helperUrlRegexes: [cityUrlPrefix + "/buy/(o\\d+/)?"],
        enableJS: false,
        interval: 10000, // NOTE(review): presumably a delay in milliseconds - confirm against platform docs
        fields: fieldList
    };
})();
/**
 * Cleans up the raw value of a single field after extraction.
 *
 * - car_price: joins the content of the <b> element (via extract) with the
 *   remainder of the markup (via exclude).
 * - car_owner: strips surrounding whitespace.
 * - car_description: removes every empty "<em></em>" pair.
 * - anything else, or a non-string value: returned unchanged.
 *
 * `extract` and `exclude` are not defined in this file.
 * NOTE(review): presumably helpers supplied by the crawler platform - confirm.
 *
 * @param {string} fieldName - Name of the field being processed.
 * @param {*} data - Raw extracted value for that field.
 * @param {*} page - Unused here; kept for the callback signature.
 * @returns {*} The cleaned value.
 */
configs.afterExtractField = function(fieldName, data, page) {
    // .trim()/.replace() below would throw on null or other non-strings.
    if (typeof data !== "string") {
        return data;
    }
    if (fieldName === "car_price") {
        // NOTE(review): the previous code replaced "¥" with "¥" (a no-op),
        // which looks like an entity-decoded "&yen;"; decoding the entity here
        // is an inference - confirm against the upstream sample.
        var price = extract(data, "//b").replace(/&yen;/g, "¥");
        var coinUnit = exclude(data, "//b");
        return price + coinUnit;
    }
    if (fieldName === "car_owner") {
        return data.trim();
    }
    if (fieldName === "car_description") {
        // Global regex: a plain-string pattern would only drop the first match.
        return data.replace(/<em><\/em>/g, "");
    }
    return data;
};
// Create the crawler from the `configs` object (including its
// afterExtractField callback) and start it.
// NOTE(review): `Crawler` is not defined in this file - presumably supplied by
// the hosting crawler platform; confirm.
var crawler = new Crawler(configs);
crawler.start();
代码运行方法及运行效果:
https://github.com/ShenJianShou/crawler_samples/blob/master/%E5%A6%82%E4%BD%95%E6%89%A7%E8%A1%8C%E6%A0%B7%E4%BE%8B%E4%BB%A3%E7%A0%81.txt
相关文章推荐
- 一个不错的Silverlight展示网站反编译后的源码和大家分享一下(http://www.microsoft.com/taiwan/student/Good.htm),很简单的
- Python爬虫之requests+正则表达式抓取猫眼电影top100以及瓜子二手网二手车信息(四)
- struts2错误信息的几种显示 今天总结一下 分享给大家
- 我与大家分享一下手机内存卡照片恢复的相关信息
- 马上要过年了,老板突然想到要大家各自交一份“我的50个梦想”,自己用了40分钟左右给搞出来了,贴出来给大家分享一下
- 从网上找到一个清晰CSS视频教程和大家分享一下
- 整理了一下Asp.net源码常见问题(完善中...),欢迎大家补充修正(最后更新于06-01)!
- 一道机试题,拿出来和大家分享一下,稍候贴出我做的代码(C#)
- 机子中毒了,一点小经验和大家分享一下!
- 基于HTTP的QQ协议V1.1的不完整成果,拿出来与大家分享一下
- C#2005用XML来保存连接数据库的字符串(这样改变了服务器连接配制只需改一下这个文件就成了),琢磨了几天终于有解了,和大家分享一下,希望高手指点!
- 如何积累财富[转载] 我觉得挺经典 和大家一起分享一下!
- 关于网站的推广方面,自己的一些愚见和大家分享一下。
- 最近招了几个新员工,大家有什么好的管理经验,分享一下
- 太高兴了,我的书《大象-Thinking in UML》目前在互动网上是销售冠军!与大家分享一下!
- 蛙蛙推荐:最近开发一个小型的OA,把常用代码和大家分享一下先
- Asp.net无刷新中文验证码源码调试成功,特分享给大家
- 一个很震撼的桌面操作演示,拿出来和大家分享一下
- 自己写了一个Asp.net探针的源码,希望大家讨论一下
- 分享实用工具源码--实现Windows IDE中查看Linux下编译信息