Node.js 访问网站并操作 XPath
2016-05-11 15:54
453 查看
// Fetches a Google search results page using a mobile user agent and prints,
// for each ad found via XPath, its display URL, headline, landing URL and
// description text.
var xpath = require('xpath'); // XPath query package
var dom = require('xmldom-silent').DOMParser; // DOM parser from the xmldom-silent package
var request = require('request');
var fs = require('fs');
var urlencode = require('urlencode'); // URL encode/decode package

var headers = {
  // Mobile (iPhone) user agent
  'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A403 Safari/8536.25'
};

request(
  {
    url: "https://www.google.co.jp/search?hl=ja&newwindow=1&site=&source=hp&q=hotel&oq=",
    headers: headers
  },
  function (error, response, body) {
    // On a transport failure `body` is not usable; report and stop instead of
    // handing undefined to the parser.
    if (error) {
      console.error(error);
      return;
    }
    findXpath(body);
    //fileWrite(body);
    console.log("ok");
  }
);

/**
 * Returns the nodeValue of nodes[index], or '' when that position does not
 * exist or carries no value.
 *
 * @param {Array} nodes - Result list from xpath.select.
 * @param {number} index - Position to read.
 * @returns {string} The node value, or an empty string.
 */
function nodeValueAt(nodes, index) {
  var node = nodes[index];
  return node && node.nodeValue != null ? node.nodeValue : '';
}

/**
 * Extracts and URL-decodes the target carried in the `adurl=` parameter of an
 * ad link.
 *
 * @param {string} href - Raw href value of the ad anchor.
 * @returns {string} The decoded target URL, or '' when href has no
 *   `adurl=http...` parameter.
 */
function extractAdUrl(href) {
  // The capture group already excludes the "adurl=" prefix, so no further
  // stripping is needed.
  var match = /adurl=(http[\S]*$)/.exec(href);
  return match ? urlencode.decode(match[1]) : '';
}

/**
 * Parses the given markup and logs one record per node matched by XPATH_CITE:
 * the cite text, the headline text, the decoded ad URL and the ad description.
 * The four XPath result lists are paired by index; positions missing from a
 * shorter list are printed as empty strings.
 *
 * @param {string} xml - Markup to parse (the HTTP response body).
 */
function findXpath(xml) {
  // Nothing to parse (e.g. empty response body).
  if (typeof xml !== 'string' || xml.length === 0) {
    return;
  }

  var doc = new dom().parseFromString(xml);

  var XPATH_CITE = "//div[@id='mbEnd']//ol/li//cite/text()|//div[@id='tads']//ol/li//cite/text()|//div[@id='tadsb']//div[@class='ads-ad']//h3/text()";
  var XPATH_H3 = "//div[@class='ads-ad']//h3//text()";
  var XPATH_ADURL = "//div[@class='ads-ad']/h3/a/@href|//div[@id='tadsb']/ol/li/h3/a/@href";
  var XPATH_INFO = "//div[@id='mbEnd']//ol/li//div[@class='ac ads-creative']//text()|//div[@id='taw']//ol/li//div[contains(@class,'ads-creative')]//text()|//div[@class='ads-ad']//div[@class='ads-creative']//text()";

  var citeNodes = xpath.select(XPATH_CITE, doc);
  var h3Nodes = xpath.select(XPATH_H3, doc);
  var adInfoNodes = xpath.select(XPATH_INFO, doc);
  var adUrlNodes = xpath.select(XPATH_ADURL, doc);

  console.log("---------------------Node--------------Info-----------------------");
  for (var i = 0; i < citeNodes.length; i++) {
    // The lists come from independent queries and may differ in length, so
    // every lookup other than citeNodes[i] is bounds-safe.
    var citeTxt = citeNodes[i].nodeValue;
    var h3Txt = nodeValueAt(h3Nodes, i);
    var adUrl = extractAdUrl(nodeValueAt(adUrlNodes, i));
    var adInfoTxt = nodeValueAt(adInfoNodes, i);

    console.log(citeTxt);
    console.log(h3Txt);
    console.log(adUrl);
    console.log(adInfoTxt);
  }
}

/**
 * Writes the given content to the file '233.html' and logs a confirmation
 * once the write completes.
 *
 * @param {string} body - Content to save.
 */
function fileWrite(body) {
  fs.writeFile('233.html', body, function (err) {
    if (err) throw err;
    console.log('Saved successfully'); // file has been saved
  });
}
相关文章推荐
- Lambda架构与推荐在电商网站实践
- 小米网技术架构变迁实践
- 浅谈千万级的PV/IP规模高性能高并发网站架构
- keepalived-lvs-nat-主备模型实现高可用负载均衡
- 译见|深度剖析「微服务架构」的九大特征
- 从Google与eBay的系统架构学到的经验
- 大型网站架构不得不考虑的10个问题
- 支付宝钱包架构
- nice架构演进
- 快的打车架构
- 关于“幽灵架构”的补充说明2:Struct以及Copy-on-Write
- 大型网站架构演变和知识体系
- Kafka 0.9+Zookeeper3.4.6集群搭建、配置,新Client API的使用要点,高可用性测试,以及各种坑
- hadoop HA 高可用集群部署搭建
- Haproxy+keepalived实现高可用负载均衡
- 大型网站架构技术一览
- 怎么做网站的SEO——浅谈个人认知
- 每秒处理10万订单乐视集团支付架构
- 三层架构和MVC比较的个人理解
- 在首席架构师手里,应用架构如此设计