您的位置:首页 > 其它

webmagic爬虫抓取京东数据

2017-07-03 09:26 155 查看
抓取京东店铺商品信息。webmagic 使用 maven 管理依赖:
<!-- WebMagic core artifact, pinned to 0.7.2 -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.2</version>
</dependency>
<!-- WebMagic extension artifact; kept on the same 0.7.2 version as webmagic-core -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.2</version>
</dependency>
//传店铺url
flag= WebMagic.running(shopUrl);
//京东店铺商品信息有的在页面能直接抓取,有的需要分析有数据的js
//分析js:在浏览器中按 F12 打开开发者工具,再按 F5 刷新页面查看请求
//根据页面信息,拼接js
public class WebMagic implements PageProcessor{
static Integer flag ;
public static Integer running(String url) {

Spider.create(new WebMagic())
.addUrl(url)
.addPipeline(new ConsolePipeline())
.thread(5).run();
return flag;
}

public static final String URL_POST ="(http[s]{0,1})://\\w+\\.jd\\.com/view_search-\\S+\\.html";//正则匹配规则
public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// //自营店铺

public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S";//匹配正则url	 //

private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

public Site getSite() {
return site;
}

public void process(Page page) {
if(page.getUrl().regex(URL_POST).match()){
//获取商品类目categoryId和appId
String categoryId="";
String appId="";
String orderBy="";
String direction="";
String pageSize="";
String pageNo="";
String url=page.getUrl().toString();
String [] sub_url_array = url.split("-");  
if (sub_url_array != null && sub_url_array.length >0) {

appId =sub_url_array[1];
categoryId=sub_url_array[2];
orderBy=sub_url_array[3];
direction
4000
=sub_url_array[4];
pageSize =sub_url_array[5];
//pageNo=sub_url_array[6];
pageNo=sub_url_array[6].replaceAll(".html", "");

}
page.putField("pageInstanceId",page.getHtml().xpath("//[@id='pageInstance_id']/@value").all());
page.putField("venderId",page.getHtml().xpath("//[@id='vender_id']/@value").all());

page.putField("instanceid",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure']/@m_render_instance_id").all());
page.putField("prototypeid",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure loading']/@m_render_prototype_id").all());
page.putField("templateId",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure loading']/@m_render_template_id").all());

page.putField("shopId",page.getHtml().xpath("//[@id='shop_id']/@value").all());

List<String> pageInstanceIds = (List<String>) page.getResultItems().get("pageInstanceId");
List<String> venderIds = (List<String>) page.getResultItems().get("venderId");

List<String> instanceids=(List<String>) page.getResultItems().get("instanceid");
List<String> prototypeids=(List<String>) page.getResultItems().get("prototypeid");
List<String> templateIds=(List<String>) page.getResultItems().get("templateId");

List<String> shopIds=(List<String>) page.getResultItems().get("shopId");

String pageInstanceId="";
String venderId="";
String instanceid="";
String prototypeid="";
String templateId="";
String shopId="";

if (pageInstanceIds != null && pageInstanceIds.size()>0) {
pageInstanceId=pageInstanceIds.get(0);
venderId=venderIds.get(0);
instanceid=instanceids.get(0);
prototypeid=prototypeids.get(0);
templateId=templateIds.get(0);

shopId=shopIds.get(0);
}

//当前时间戳获取
String res;
Date date = new Date();
long ts = date.getTime();
res = String.valueOf(ts);

for (int i = 1; i <5; i++) {
String surl="";

surl="http://module-jshop.jd.com/module/getModuleHtml.html?appId="+appId+"&orderBy="+orderBy+"&pageNo="+i+"&direction="+direction+"&categoryId="+categoryId+"&pageSize="+pageSize+"&pagePrototypeId=8&pageInstanceId="+pageInstanceId+"&moduleInstanceId="+instanceid+"&prototypeId="+prototypeid+"&templateId="+templateId+"&layoutInstanceId="+instanceid+"&origin=0&shopId="+shopId+"&venderId="+venderId+"&callback=jshop_module_render_callback&_="+res;

JDItemJsonPreocessor.running(surl);
}
flag=200; 
}
}

}
//获取商品数据,价格sku,名称等信息
// Map returned by running(String); static, so it is shared by every instance of the class.
static Map<String,String> maps = new HashMap<String, String>();
// NOTE(review): none of the fields below is read or written in the visible excerpt;
// presumably the (truncated) processing code fills them - confirm.
private PageInfo pages = new PageInfo();
private ShopItem shopitem;
private ShopInfo shopinfo;
private List<ShopInfo> shopInfolist;//shop information
private List<ShopItem> shopItemlist;
   
   public static Map<String,String> running(String url) {
   
       Spider.create(new JDItemJsonPreocessor()).addUrl(url).run();
       return maps;
   }

  // Spring context loaded from spring/applicationContext-db.xml. This is an instance-field
  // initializer, so a context is built for every object of this class.
  // NOTE(review): running(String) creates a new JDItemJsonPreocessor per call - if these fields
  // belong to that class, the XML context is reloaded on every call; consider sharing it.
  private ApplicationContext a=new ClassPathXmlApplicationContext("spring/applicationContext-db.xml"); 
   // Service beans looked up by name from the context above (which must be initialised first).
   ShopInfoService shopInfoService=(ShopInfoService) a.getBean("ShopInfoServiceImpl");
   ShopItemService shopItemService=(ShopItemService) a.getBean("ShopItemServiceImpl");

   // Regex for module-jshop.jd.com getModuleHtml.html URLs, including their query string.
   public static final String URL_LIST = "(http[s]{0,1})://module-jshop\\.jd\\.com/module/getModuleHtml\\.html\\?[\\w-_/?&=#%:]*";
   //public static final String URL_ADVANCE ="(http[s]{0,1})://\\w+\\.jd\\.com/\\S+\\.html";// URL-matching regex (older, broader variant)
   public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S+\\.html";// URL-matching regex for mall.jd.com .html pages
   // NOTE(review): '.' and '?' are unescaped below, so they act as regex metacharacters - confirm intent.
   public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// URL-matching regex
// Map returned by running(String); static, so it is shared by every instance of the class.
static Map<String,String> maps = new HashMap<String, String>();
// NOTE(review): none of the fields below is read or written in the visible excerpt;
// presumably the (truncated) processing code fills them - confirm.
private PageInfo pages = new PageInfo();
private ShopItem shopitem;
private ShopInfo shopinfo;
private List<ShopInfo> shopInfolist;//shop information
private List<ShopItem> shopItemlist;
   
   public static Map<String,String> running(String url) {
   
       Spider.create(new JDItemJsonPreocessor()).addUrl(url).run();
       return maps;
   }

  // Spring context loaded from spring/applicationContext-db.xml. This is an instance-field
  // initializer, so a context is built for every object of this class.
  // NOTE(review): running(String) creates a new JDItemJsonPreocessor per call - if these fields
  // belong to that class, the XML context is reloaded on every call; consider sharing it.
  private ApplicationContext a=new ClassPathXmlApplicationContext("spring/applicationContext-db.xml"); 
   // Service beans looked up by name from the context above (which must be initialised first).
   ShopInfoService shopInfoService=(ShopInfoService) a.getBean("ShopInfoServiceImpl");
   ShopItemService shopItemService=(ShopItemService) a.getBean("ShopItemServiceImpl");

   // Regex for module-jshop.jd.com getModuleHtml.html URLs, including their query string.
   public static final String URL_LIST = "(http[s]{0,1})://module-jshop\\.jd\\.com/module/getModuleHtml\\.html\\?[\\w-_/?&=#%:]*";
   //public static final String URL_ADVANCE ="(http[s]{0,1})://\\w+\\.jd\\.com/\\S+\\.html";// URL-matching regex (older, broader variant)
   public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S+\\.html";// URL-matching regex for mall.jd.com .html pages
   // NOTE(review): '.' and '?' are unescaped below, so they act as regex metacharacters - confirm intent.
   public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// URL-matching regex
page.putField("id",page.getHtml().xpath("//div/div/div/div[2]/ul/li/div/div[3]/div[3]/div/span[2]/@jdprice").all());page.putField("name",page.getHtml().xpath("//div/div/div/div[2]/ul/li/div/div[3]/div[2]/a/text()").all());page.putField("img",page.getHtml().xpath("//div/div/div/ul/li/div/div[1]/a/img/@original").all()); List<String> ids = (List<String>) page.getResultItems().get("id");     List<String> name = (List<String>) page.getResultItems().get("name");     List<String> imgs=(List<String>) page.getResultItems().get("img");      	      String makerUrl = makerUrl(ids);     Map<String, String> running = JDJsonPreocessor.running(makerUrl);//拼接价格js     for (int i = 0; i < name.size(); i++) {      String price = running.get("J_"+ids.get(i));   	  String ItemId=ids.get(i);   	  String productname =name.get(i);   	  String pImg="";   	  pImg="http:"+imgs.get(i).replaceAll("\\\\\"", "");
 public String makerUrl(List<String> ids){   	  StringBuffer sb = new StringBuffer();   	  for (String id : ids) {   	   sb.append("J_"+id+",");   	  }   	  String substring = sb.substring(0, sb.length()-1);   	  //获取时间戳    	   String res;          Date date = new Date();          long ts = date.getTime();          res = String.valueOf(ts);   	  return "http://p.3.cn/prices/mgets?callback=jQuery3944635&skuIds="+substring+"&_="+res;   	 }
//获取价格信息
package com.huanovo.fxprice.service.impl;import java.util.HashMap;import java.util.HashSet;import java.util.List;import java.util.Map;import java.util.Set;import com.huanovo.fxprice.util.JsonUtil;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.pipeline.FilePipeline;import us.codecraft.webmagic.processor.PageProcessor;public class JDJsonPreocessor implements PageProcessor{static Map<String,String> maps = new HashMap<String, String>();         public static Map<String,String> running(String url) {       Spider.create(new JDJsonPreocessor()).addUrl(url).run();       return maps;   }   private Site site = Site.me()   .setRetryTimes(3)   .setSleepTime(100)   .addHeader("Accept-Encoding", "/")   .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36");   public Site getSite() {       return site;   }   public void process(Page page) {       page.setSkip(true);       String text = page.getRawText();       int begin = text.indexOf("[");       int end = text.indexOf("]");       String substring = text.substring(begin, end + 1);       String jsonName = "result";       String json = "{\"" + jsonName + "\":" + substring + "}";       Map<String, Object> map = JsonUtil.jsonToMap(json);       List<Map<String, Object>> list = (List<Map<String, Object>>) map.get(jsonName);       for (Map<String, Object> map1 : list) {           String key = map1.get("id").toString();           String value = map1.get("p").toString();           maps.put(key, value);       }   }}
主要就是:1. 拿到链接,分析页面数据;2. 模拟链接访问;3. 用 xpath 提取页面信息,over。

                                            
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: