webmagic实战使用
2017-01-16 21:00
162 查看
一.引入依赖包
二.代码
上述代码采集百度医生数据,采集线路进入医院列表-->医院详情-->科室列表-->科室详情-->医生列表-->医生详情
每个 else if 分支匹配一类页面地址,即上面所说采集链路上的一个采集节点。
采集相应数据时,会将网站上的原始关联关系一并采集过来;在构建本地存储对象时,从采集链接的参数中获取关联 id,如医院 id、医生 id。
// Parse the query parameters of `url` and read the "doctorId" parameter.
MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String doctorId = deptResultMap.getString("doctorId");
// Create the doctor record and use the id taken from the URL as its source id.
BdDoctorRpc bdDoctorRpc = new BdDoctorRpc(); bdDoctorRpc.setSourceId(doctorId);
// Extract the experience text; when the first XPath yields an empty result, retry with the
// class variant that additionally carries "doctor-info-total", then store and print the text.
String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString(); if(StringUtils.isEmpty(experience)){ experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller
ys-util-margin-t10 doctor-info-total']/text()").toString(); } bdDoctorRpc.setIntro(experience); System.out.println("experience:"+experience);
解析Ajax json结果
if(CollectionUtils.isNotEmpty(doctorList)){ //收集医生信息 List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>(); //收集医生与疾病关系信息 List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>(); for(String o:doctorList){ JSONObject
doctorJo = JSON.parseObject(o);
针对特征相同的元素集合(如 li 列表、table 表格),需要依次获取其中的内容。
<!-- Maven dependencies: the webmagic-core and webmagic-extension artifacts, both at version 0.4.3. -->
<dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> <version>0.4.3</version> </dependency> <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-extension</artifactId> <version>0.4.3</version> </dependency>
二.代码
package com.pz998.quartz.spider; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang.StringUtils; import org.eclipse.jetty.util.MultiMap; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import com.pz998.rpc.model.entity.BdDepartmentDiseaseRelaRpc; import com.pz998.rpc.model.entity.BdDepartmentRpc; import com.pz998.rpc.model.entity.BdDiseaseDoctorRelaRpc; import com.pz998.rpc.model.entity.BdDoctorRpc; import com.pz998.rpc.model.entity.BdHospitalRpc; import net.minidev.json.JSONArray; import net.minidev.json.JSONObject; import net.minidev.json.parser.JSONParser; import net.minidev.json.parser.ParseException; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import us.codecraft.webmagic.selector.JsonPathSelector; import us.codecraft.xsoup.Xsoup; public class YiBaiduProcessor implements PageProcessor{ private static final String START_URL = "https://yi.baidu.com/pc/hospital/list?cityId=371&pageSize=10&page=1"; private static final String HOSPITAL_DETAIL_URL = "https://yi\\.baidu\\.com/pc/hospital/index\\?zt=pcpinzhuan&zt_ext=&pvid=\\d+&key=\\S+"; private static final String HOSPITAL_LIST_URL = "https://yi\\.baidu\\.com/pc/hospital/list\\?cityId=\\d++&pageSize=10&page=\\d++"; private static final String HOSPITAL_INFO_URL ="https://yi\\.baidu\\.com/pc/hospital/info\\?key=\\S+"; private static final String DEPT_INFO_URL = "https://yi\\.baidu\\.com/pc/admindepartment/detail\\?zt=\\w+&zt_ext=&pvid=\\d+&hosId=\\d+&adminDepartId=\\d+"; private static final String HOSPITAL_DEPT_URL ="https://yi\\.baidu\\.com/pc/hospital/alldep\\?key=\\S+"; private static final String DOCTOR_LIST_URL = 
"https://yi\\.baidu\\.com/pc/admindepartment/doctorlist\\?diseaseId=0&medTitle=0&serviceType=0&page=\\d+&pageSize=8&provId=0&cityId=0®ionId=0&adminDepartId=\\d+&hosId=\\d+"; private static final String DOCTOR_INFO_URL = "https://yi\\.baidu\\.com/pc/doctor/detailpage\\?zt=\\w+&zt_ext=&pvid=0&doctorId=\\d+"; //https://yi.baidu.com/pc/hospital/info?key=%E6%AD%A6%E6%B1%89%E5%B8%82%E5%A6%87%E5%A5%B3%E5%84%BF%E7%AB%A5%E5%8C%BB%E7%96%97%E4%BF%9D%E5%81%A5%E4%B8%AD%E5%BF%83 //https://yi.baidu.com/pc/hospital/alldep?key= private Site site = Site.me(); public static final String STATE_SUCCESS = "0"; public static final Map<String,String> CITY_MAP = new HashMap<String,String>(); static{ CITY_MAP.put("371","武汉"); CITY_MAP.put("1", "北京"); CITY_MAP.put("2", "上海"); CITY_MAP.put("84","广州"); } @Override public void process(Page page) { String url=page.getUrl().toString(); if(page.getUrl().regex(HOSPITAL_LIST_URL).match()){ try{ String state = new JsonPathSelector("$.status").select(page.getRawText()); if(STATE_SUCCESS.equals(state)){ List hospitalList = new JsonPathSelector("$.data.hospitalList[*]").selectList(page.getRawText()); MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String cityId = resultMap.getString("cityId"); if(CollectionUtils.isNotEmpty(hospitalList)){ List<BdHospitalRpc> bdHospitalList = new ArrayList<BdHospitalRpc>(); for(Object obj:hospitalList){ JSONObject jsonObj = (JSONObject)obj; String name = (String)jsonObj.get("name"); System.out.println("name:"+name); String address = (String)jsonObj.get("address"); String level = (String)jsonObj.get("level"); Integer insurance = (Integer)jsonObj.get("insurance"); String phone = (String)jsonObj.get("phone"); String grade = (String)jsonObj.get("grade"); Integer doctorNum = (Integer)jsonObj.get("doctorNum"); String imageUrl = (String)jsonObj.get("logo"); Integer serveNum = (Integer)jsonObj.get("serveNum"); Integer commentNum = (Integer)jsonObj.get("commentNum"); String routeLink = 
(String)jsonObj.get("routeLink"); MultiMap<String> routeLinkMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(routeLink); String location = routeLinkMap.getString("location"); String latitude = ""; String longitude = ""; if(StringUtils.isNotEmpty(location)){ String[] locationArray = location.split(","); latitude = locationArray.length>0?locationArray[0]:""; longitude = locationArray.length>1?locationArray[1]:""; } BdHospitalRpc bdHospitalRpc = new BdHospitalRpc(); bdHospitalRpc.setSourceId(name); bdHospitalRpc.setName(name); bdHospitalRpc.setAddress(address); bdHospitalRpc.setLevel(level); bdHospitalRpc.setPhone(phone); bdHospitalRpc.setImageUrl(imageUrl); bdHospitalRpc.setLatitude(latitude); bdHospitalRpc.setLongitude(longitude); bdHospitalRpc.setScore(grade); String city = CITY_MAP.get(cityId); bdHospitalRpc.setCity(city); String insuranceStr = insurance==null?"":insurance.toString(); bdHospitalRpc.setIsMedicalInsurance(insuranceStr); String doctorNumStr = doctorNum==null?"":doctorNum.toString(); bdHospitalRpc.setHighQualityDoctorNum(doctorNumStr); String serveNumStr = serveNum==null?"":serveNum.toString(); bdHospitalRpc.setFinishedServiceNum(serveNumStr); String commentNumStr=commentNum==null?"":commentNum.toString(); bdHospitalRpc.setPatientCommentNum(commentNumStr); bdHospitalList.add(bdHospitalRpc); String infoUrl = "https://yi.baidu.com/pc/hospital/info?key="+name; String allDeptUrl = "https://yi.baidu.com/pc/hospital/alldep?key="+name; page.addTargetRequest(infoUrl); page.addTargetRequest(allDeptUrl); } page.putField("bdHospitalList", bdHospitalList); } } }catch(Exception e){ e.printStackTrace(); } }else if(page.getUrl().regex(HOSPITAL_INFO_URL).match()){ try{ MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String hosName = resultMap.getString("key"); BdHospitalRpc bdHospitalRpc = new BdHospitalRpc(); List<String> contextList = 
page.getHtml().xpath("ul[@class='container-list-info']/li[@class='ys-util-margin-b35']/p[@class='ys-util-text-smaller ys-util-margin-t9 ys-util-margin-b30']/text()").all(); if(CollectionUtils.isNotEmpty(contextList)){ String context1 = contextList.size()>=1?contextList.get(0):""; String context2 = contextList.size()>=2?contextList.get(1):""; String context3 = contextList.size()>=3?contextList.get(2):""; String context4 = contextList.size()>=4?contextList.get(3):""; String context5 = contextList.size()>=5?contextList.get(4):""; bdHospitalRpc.setContent(context1); bdHospitalRpc.setHistory(context2); bdHospitalRpc.setCharacteristicDept(context3); bdHospitalRpc.setTeam(context4); bdHospitalRpc.setHonor(context5); // System.out.println("医院概况:"+context1); // System.out.println("历史沿革:"+context2); // System.out.println("特色科室:"+context3); // System.out.println("医护团队:"+context4); // System.out.println("医院荣誉:"+context5); } bdHospitalRpc.setSourceId(hosName); page.putField("bdHospitalRpc", bdHospitalRpc); }catch(Exception e){ e.printStackTrace(); } }else if(page.getUrl().regex(HOSPITAL_DEPT_URL).match()){ try{ MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String hosName = resultMap.getString("key"); String topDepts = ""; List<String> tableHtml = page.getHtml().xpath("div[@class='container-common-office']/table[@class='ys-util-margin-b15 list-office ys-util-border-big']").all(); List<BdDepartmentRpc> departmentList = new ArrayList<BdDepartmentRpc>(); for(String html:tableHtml){ Document document = Jsoup.parse(html); String platDept = Xsoup.select(document, "td[@class='primary-office']/h4/text()").get(); List<String> hospitalDepts = Xsoup.select(document, "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']").list(); List<String> hospitalDeptNames = Xsoup.select(document, "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']/text()").list(); //重点科室信息 if(StringUtils.isEmpty(platDept)){ 
topDepts = com.pz998.quartz.utils.StringUtils.listToString(hospitalDeptNames); //医院科室信息 }else{ for(String d:hospitalDepts){ Document deptDocument = Jsoup.parse(d); String deptName = Xsoup.select(deptDocument, "a/text()").get(); String deptHref = Xsoup.select(deptDocument, "a/@href").get(); MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(deptHref); String deptId = deptResultMap.getString("adminDepartId"); String hosId = deptResultMap.getString("hosId"); BdDepartmentRpc bdDepart = new BdDepartmentRpc(); bdDepart.setSourceId(deptId); bdDepart.setName(deptName); bdDepart.setParentSource(platDept); bdDepart.setHospitalSource(hosName); departmentList.add(bdDepart); //将科室详情地址放入目标采集队列 page.addTargetRequest(deptHref); //将科室下医生列表链接放入队列 for(int i=1;i<6;i++){ String doctorUrl = "https://yi.baidu.com/pc/admindepartment/doctorlist?diseaseId=0&medTitle=0&serviceType=0&page="+i+"&pageSize=8&provId=0&cityId=0®ionId=0&adminDepartId="+deptId+"&hosId="+hosId; page.addTargetRequest(doctorUrl); } } } } BdHospitalRpc bdHospitalRpc = new BdHospitalRpc(); bdHospitalRpc.setSourceId(hosName); System.out.println("重点科室:"+topDepts); bdHospitalRpc.setCharacteristicFaculty(topDepts); page.putField("hosTopDept", bdHospitalRpc); page.putField("departmentList", departmentList); // System.out.println(page.getHtml().toString()); }catch(Exception e){ e.printStackTrace(); } //采集科室信息 } else if(page.getUrl().regex(DEPT_INFO_URL).match()){ String deptPhone = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-normal-height']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString(); String deptAddress = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t8 ys-util-text-normal']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString(); String content = page.getHtml().xpath("div[@class='office-info']/p[@class='ys-util-text-smaller ys-util-margin-t15 
office-info-total']/text()").toString(); String titleDescr = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-min-height']/h3[@class='ys-util-text-min ys-util-margin-r12']/text()").toString(); MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String deptId = deptResultMap.getString("adminDepartId"); String hosId = deptResultMap.getString("hosId"); BdDepartmentRpc bdDepartmentRpc = new BdDepartmentRpc(); bdDepartmentRpc.setAddress(deptAddress); bdDepartmentRpc.setPhone(deptPhone); bdDepartmentRpc.setContent(content); bdDepartmentRpc.setSourceId(deptId); bdDepartmentRpc.setTitleDescr(titleDescr); page.putField("bdDepartmentRpc", bdDepartmentRpc); }else if(page.getUrl().regex(DOCTOR_LIST_URL).match()){ String status = new JsonPathSelector("$.status").select(page.getRawText()); if(STATE_SUCCESS.equals(status)){ String data = new JsonPathSelector("$.data[*]").select(page.getRawText()); if(data!=null){ MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String deptId = deptResultMap.getString("adminDepartId"); String hosId = deptResultMap.getString("hosId"); String pageNum = deptResultMap.getString("page"); List<BdDepartmentDiseaseRelaRpc> BdDepartmentDiseaseRelaRpcList = new ArrayList<BdDepartmentDiseaseRelaRpc>(); JSONParser jsonParser = new JSONParser(); JSONObject dataJo = null; try { dataJo = (JSONObject)jsonParser.parse(data); } catch (ParseException e) { e.printStackTrace(); } if("1".equals(pageNum)){ JSONArray diseaseArray= dataJo==null?null:(JSONArray)dataJo.get("selectorList"); if(CollectionUtils.isNotEmpty(diseaseArray)){ JSONObject obj = (JSONObject)diseaseArray.get(0); JSONArray diseaseList = (JSONArray)obj.get("list"); if(CollectionUtils.isNotEmpty(diseaseList)){ for(Object disease:diseaseList){ JSONObject diseaseJo=(JSONObject)disease; String itemName = (String)diseaseJo.get("itemName"); if("全部".equals(itemName)){ continue; } 
BdDepartmentDiseaseRelaRpc bdDepartmentDiseaseRelaRpc = new BdDepartmentDiseaseRelaRpc(); bdDepartmentDiseaseRelaRpc.setHospitalSourceId(hosId); bdDepartmentDiseaseRelaRpc.setDepartmentSourceId(deptId); bdDepartmentDiseaseRelaRpc.setDiseaseSource(itemName); BdDepartmentDiseaseRelaRpcList.add(bdDepartmentDiseaseRelaRpc); } } } } page.putField("bdDepartmentDiseaseRelaRpcList", BdDepartmentDiseaseRelaRpcList); if(dataJo.containsKey("doctorList")){ List doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(page.getRawText()); if(CollectionUtils.isNotEmpty(doctorList)){ //收集医生信息 List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>(); //收集医生与疾病关系信息 List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>(); for(Object o:doctorList){ JSONObject doctorJo = (JSONObject)o; //医生认证信息 String identifyMarkStr = ""; if(doctorJo.containsKey("doctorIdentify")){ List<String> identifyMarkList = new JsonPathSelector("$.doctorIdentify[*].identifyMark").selectList(doctorJo.toJSONString()); identifyMarkStr = com.pz998.quartz.utils.StringUtils.listToString(identifyMarkList); } String doctorName = (String)doctorJo.get("doctorName"); String doctorTitle= (String)doctorJo.get("doctorTitle"); Object commentScore = doctorJo.get("commentScore"); String doctorSkill = (String)doctorJo.get("doctorSkill"); String allTimeHref = (String)doctorJo.get("allTimeHref"); String doctorPhoto = (String)doctorJo.get("doctorPhoto"); //医生详情页加入目标采集 page.addTargetRequest(allTimeHref); MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(allTimeHref); String doctorId = resultMap.getString("doctorId"); BdDoctorRpc bdDoctorRpc = new BdDoctorRpc(); bdDoctorRpc.setHospitalSourceId(hosId); bdDoctorRpc.setDepartmentSourceId(deptId); bdDoctorRpc.setSourceId(doctorId); bdDoctorRpc.setName(doctorName); bdDoctorRpc.setPracticeTitle(doctorTitle); String commentScoreStr = commentScore==null?"":commentScore.toString(); 
bdDoctorRpc.setRecommendScore(commentScoreStr); bdDoctorRpc.setDiseaseTag(doctorSkill); bdDoctorRpc.setImageUrl(doctorPhoto); bdDoctorRpc.setIdentifyMark(identifyMarkStr); bdDoctorList.add(bdDoctorRpc); JSONArray treatPatientArray = (JSONArray)doctorJo.get("treatPatient"); if(CollectionUtils.isNotEmpty(treatPatientArray)){ for(Object treatPatient:treatPatientArray){ JSONObject treatPatientJo = (JSONObject)treatPatient; String diseaseName = (String)treatPatientJo.get("diseaseName"); BdDiseaseDoctorRelaRpc bdDiseaseDoctorRelaRpc = new BdDiseaseDoctorRelaRpc(); bdDiseaseDoctorRelaRpc.setDiseaseSourceId(diseaseName); bdDiseaseDoctorRelaRpc.setDoctorSourceId(doctorId); bdDiseaseDoctorRelaList.add(bdDiseaseDoctorRelaRpc); } } } page.putField("bdDiseaseDoctorRelaList", bdDiseaseDoctorRelaList); page.putField("bdDoctorList", bdDoctorList); } } } } }else if(page.getUrl().regex(DOCTOR_INFO_URL).match()){ MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String doctorId = deptResultMap.getString("doctorId"); BdDoctorRpc bdDoctorRpc = new BdDoctorRpc(); bdDoctorRpc.setSourceId(doctorId); String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString(); bdDoctorRpc.setIntro(experience); List<String> commentList = page.getHtml().xpath("ul[@class='summary-comment']/li/p[@class='ys-util-text-default ys-util-text-smaller']/i[@class='comment-score ys-util-text-primary ys-util-text-big']/text()").all(); if(CollectionUtils.isNotEmpty(commentList)){ String recommendScore = commentList.size()>=1?commentList.get(0):""; String treatmentEffectScore = commentList.size()>=2?commentList.get(1):""; String attitudeScore = commentList.size()>=3?commentList.get(2):""; bdDoctorRpc.setRecommendScore(recommendScore); bdDoctorRpc.setTreatmentEffectScore(treatmentEffectScore); bdDoctorRpc.setAttitudeScore(attitudeScore); } page.putField("bdDoctorRpc", bdDoctorRpc); } } @Override 
public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new YiBaiduProcessor()).addUrl(START_URL).thread(10).run(); } }
上述代码采集百度医生数据,采集线路进入医院列表-->医院详情-->科室列表-->科室详情-->医生列表-->医生详情
每个 else if 分支匹配一类页面地址,即上面所说采集链路上的一个采集节点。
采集相应数据时,会将网站上的原始关联关系一并采集过来;在构建本地存储对象时,从采集链接的参数中获取关联 id,如医院 id、医生 id。
如下代码: }else if(page.getUrl().regex(DOCTOR_INFO_URL).match()){
// Parse the query parameters of `url` and read the "doctorId" parameter.
MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String doctorId = deptResultMap.getString("doctorId");
// Create the doctor record and use the id taken from the URL as its source id.
BdDoctorRpc bdDoctorRpc = new BdDoctorRpc(); bdDoctorRpc.setSourceId(doctorId);
// Extract the experience text; when the first XPath yields an empty result, retry with the
// class variant that additionally carries "doctor-info-total", then store and print the text.
String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString(); if(StringUtils.isEmpty(experience)){ experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller
ys-util-margin-t10 doctor-info-total']/text()").toString(); } bdDoctorRpc.setIntro(experience); System.out.println("experience:"+experience);
解析Ajax json结果
List<String> doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(page.getRawText());
if(CollectionUtils.isNotEmpty(doctorList)){ //收集医生信息 List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>(); //收集医生与疾病关系信息 List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>(); for(String o:doctorList){ JSONObject
doctorJo = JSON.parseObject(o);
针对特征相同的元素集合(如 li 列表、table 表格),需要依次获取其中的内容。