Scraping JD.com data with a WebMagic crawler
2017-07-03 09:26
Scraping product information from JD.com stores with WebMagic; dependencies are managed with Maven.
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.2</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.2</version>
</dependency>
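Besides WebMagic itself, the item processor below loads its database services from a Spring XML context (ClassPathXmlApplicationContext) and parses JSON through a project JsonUtil helper, so Spring and a JSON library also need to be on the classpath. A minimal sketch; the exact artifacts and versions are assumptions, since the original post does not list them:

<!-- assumed dependencies, not listed in the original post -->
<dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-context</artifactId>
    <version>4.3.9.RELEASE</version>
</dependency>
<dependency>
    <!-- or whatever JSON library JsonUtil wraps -->
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.8.8</version>
</dependency>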
// Pass in the store URL:
flag = WebMagic.running(shopUrl);
// Some of a JD store's product information can be scraped directly from the page; the rest has to be pulled from the JS requests that carry the data.
// To find those requests, open the browser dev tools (F12) and refresh the page (F5).
// Then assemble the JS request URL from information found on the page:
import java.util.Date;
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;

public class WebMagic implements PageProcessor {

    static Integer flag;

    public static Integer running(String url) {
        Spider.create(new WebMagic())
              .addUrl(url)
              .addPipeline(new ConsolePipeline())
              .thread(5)
              .run();
        return flag;
    }

    public static final String URL_POST = "(http[s]{0,1})://\\w+\\.jd\\.com/view_search-\\S+\\.html"; // regex for store search/list pages
    public static final String URL_ZIYING = "https://list.jd.com/list.html?\\w"; // JD self-operated store list pages
    public static final String URL_ADVANCE = "(http[s]{0,1})://mall.jd\\.com/\\S"; // regex for mall.jd.com store pages

    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    public Site getSite() {
        return site;
    }

    public void process(Page page) {
        if (page.getUrl().regex(URL_POST).match()) {
            // Parse appId, categoryId, etc. out of the list-page URL (segments are separated by "-")
            String categoryId = "";
            String appId = "";
            String orderBy = "";
            String direction = "";
            String pageSize = "";
            String pageNo = "";
            String url = page.getUrl().toString();
            String[] sub_url_array = url.split("-");
            if (sub_url_array != null && sub_url_array.length > 0) {
                appId = sub_url_array[1];
                categoryId = sub_url_array[2];
                orderBy = sub_url_array[3];
                direction = sub_url_array[4];
                pageSize = sub_url_array[5];
                pageNo = sub_url_array[6].replaceAll(".html", "");
            }
            // Pull the module/template identifiers the JS request needs out of the page
            page.putField("pageInstanceId", page.getHtml().xpath("//*[@id='pageInstance_id']/@value").all());
            page.putField("venderId", page.getHtml().xpath("//*[@id='vender_id']/@value").all());
            page.putField("instanceid", page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure']/@m_render_instance_id").all());
            page.putField("prototypeid", page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure loading']/@m_render_prototype_id").all());
            page.putField("templateId", page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure loading']/@m_render_template_id").all());
            page.putField("shopId", page.getHtml().xpath("//*[@id='shop_id']/@value").all());

            List<String> pageInstanceIds = (List<String>) page.getResultItems().get("pageInstanceId");
            List<String> venderIds = (List<String>) page.getResultItems().get("venderId");
            List<String> instanceids = (List<String>) page.getResultItems().get("instanceid");
            List<String> prototypeids = (List<String>) page.getResultItems().get("prototypeid");
            List<String> templateIds = (List<String>) page.getResultItems().get("templateId");
            List<String> shopIds = (List<String>) page.getResultItems().get("shopId");
            String pageInstanceId = "";
            String venderId = "";
            String instanceid = "";
            String prototypeid = "";
            String templateId = "";
            String shopId = "";
            if (pageInstanceIds != null && pageInstanceIds.size() > 0) {
                pageInstanceId = pageInstanceIds.get(0);
                venderId = venderIds.get(0);
                instanceid = instanceids.get(0);
                prototypeid = prototypeids.get(0);
                templateId = templateIds.get(0);
                shopId = shopIds.get(0);
            }
            // Current timestamp, appended as the "_" cache-buster parameter
            String res;
            Date date = new Date();
            long ts = date.getTime();
            res = String.valueOf(ts);
            // Assemble the getModuleHtml JS request for the first few pages and hand each one to the item processor
            for (int i = 1; i < 5; i++) {
                String surl = "http://module-jshop.jd.com/module/getModuleHtml.html?appId=" + appId
                        + "&orderBy=" + orderBy + "&pageNo=" + i + "&direction=" + direction
                        + "&categoryId=" + categoryId + "&pageSize=" + pageSize
                        + "&pagePrototypeId=8&pageInstanceId=" + pageInstanceId
                        + "&moduleInstanceId=" + instanceid + "&prototypeId=" + prototypeid
                        + "&templateId=" + templateId + "&layoutInstanceId=" + instanceid
                        + "&origin=0&shopId=" + shopId + "&venderId=" + venderId
                        + "&callback=jshop_module_render_callback&_=" + res;
                JDItemJsonPreocessor.running(surl);
            }
            flag = 200;
        }
    }
}
// Extract the product data: SKU, price, name, image, etc.
public class JDItemJsonPreocessor implements PageProcessor {

    static Map<String, String> maps = new HashMap<String, String>();
    private PageInfo pages = new PageInfo();
    private ShopItem shopitem;
    private ShopInfo shopinfo;
    private List<ShopInfo> shopInfolist;   // store information
    private List<ShopItem> shopItemlist;   // store items

    public static Map<String, String> running(String url) {
        Spider.create(new JDItemJsonPreocessor()).addUrl(url).run();
        return maps;
    }

    // Load the persistence services from the Spring context
    private ApplicationContext a = new ClassPathXmlApplicationContext("spring/applicationContext-db.xml");
    ShopInfoService shopInfoService = (ShopInfoService) a.getBean("ShopInfoServiceImpl");
    ShopItemService shopItemService = (ShopItemService) a.getBean("ShopItemServiceImpl");

    public static final String URL_LIST = "(http[s]{0,1})://module-jshop\\.jd\\.com/module/getModuleHtml\\.html\\?[\\w-_/?&=#%:]*";
    public static final String URL_ADVANCE = "(http[s]{0,1})://mall.jd\\.com/\\S+\\.html"; // URL regex
    public static final String URL_ZIYING = "https://list.jd.com/list.html?\\w"; // URL regex
page.putField("id",page.getHtml().xpath("//div/div/div/div[2]/ul/li/div/div[3]/div[3]/div/span[2]/@jdprice").all());page.putField("name",page.getHtml().xpath("//div/div/div/div[2]/ul/li/div/div[3]/div[2]/a/text()").all());page.putField("img",page.getHtml().xpath("//div/div/div/ul/li/div/div[1]/a/img/@original").all()); List<String> ids = (List<String>) page.getResultItems().get("id"); List<String> name = (List<String>) page.getResultItems().get("name"); List<String> imgs=(List<String>) page.getResultItems().get("img"); String makerUrl = makerUrl(ids); Map<String, String> running = JDJsonPreocessor.running(makerUrl);//拼接价格js for (int i = 0; i < name.size(); i++) { String price = running.get("J_"+ids.get(i)); String ItemId=ids.get(i); String productname =name.get(i); String pImg=""; pImg="http:"+imgs.get(i).replaceAll("\\\\\"", "");
    // Build the p.3.cn price-query URL from the extracted SKU ids
    public String makerUrl(List<String> ids) {
        StringBuffer sb = new StringBuffer();
        for (String id : ids) {
            sb.append("J_" + id + ",");
        }
        String substring = sb.substring(0, sb.length() - 1); // drop the trailing comma
        // current timestamp used as the "_" parameter
        String res;
        Date date = new Date();
        long ts = date.getTime();
        res = String.valueOf(ts);
        return "http://p.3.cn/prices/mgets?callback=jQuery3944635&skuIds=" + substring + "&_=" + res;
    }
}
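For two extracted ids, makerUrl produces a price-query URL of roughly this shape (the ids and timestamp below are placeholders):

http://p.3.cn/prices/mgets?callback=jQuery3944635&skuIds=J_1234567,J_7654321&_=1499044000000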
// Fetch the price information
package com.huanovo.fxprice.service.impl;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.huanovo.fxprice.util.JsonUtil;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;

public class JDJsonPreocessor implements PageProcessor {

    static Map<String, String> maps = new HashMap<String, String>();

    public static Map<String, String> running(String url) {
        Spider.create(new JDJsonPreocessor()).addUrl(url).run();
        return maps;
    }

    private Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(100)
            .addHeader("Accept-Encoding", "/")
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36");

    public Site getSite() {
        return site;
    }

    public void process(Page page) {
        page.setSkip(true);
        // The response is a JSONP callback; cut out the JSON array between "[" and "]"
        String text = page.getRawText();
        int begin = text.indexOf("[");
        int end = text.indexOf("]");
        String substring = text.substring(begin, end + 1);
        // Wrap the array in an object so JsonUtil can parse it into a Map
        String jsonName = "result";
        String json = "{\"" + jsonName + "\":" + substring + "}";
        Map<String, Object> map = JsonUtil.jsonToMap(json);
        List<Map<String, Object>> list = (List<Map<String, Object>>) map.get(jsonName);
        for (Map<String, Object> map1 : list) {
            String key = map1.get("id").toString();   // e.g. "J_<sku>"
            String value = map1.get("p").toString();  // price
            maps.put(key, value);
        }
    }
}
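For reference, the raw response from p.3.cn is a JSONP callback wrapping a JSON array; judging from the keys the processor reads ("id" and "p"), it looks roughly like this, with the values here purely illustrative and other fields omitted:

jQuery3944635([{"id":"J_1234567","p":"59.00", ...},{"id":"J_7654321","p":"129.00", ...}]);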
To sum up: 1. take the store link and analyze the page data; 2. simulate requests to the assembled JS URLs; 3. extract the product information with XPath. Done.
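Putting the pieces together, a minimal driver looks roughly like this. CrawlDemo and the store URL are placeholders (the URL only illustrates the view_search-….html pattern that URL_POST matches); WebMagic, JDItemJsonPreocessor and JDJsonPreocessor are the classes above.

public class CrawlDemo {
    public static void main(String[] args) {
        // Placeholder store list URL; replace with a real JD store view_search page
        String shopUrl = "https://shop.jd.com/view_search-100-200-5-1-20-1.html";
        // WebMagic.process() parses the page, assembles the getModuleHtml JS URLs and hands each
        // one to JDItemJsonPreocessor, which in turn calls JDJsonPreocessor for prices,
        // so a single call drives the whole chain.
        Integer flag = WebMagic.running(shopUrl);
        System.out.println("crawl finished, flag = " + flag);
    }
}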