您的位置:首页 > 运维架构 > 网站架构

Node.js 访问网站并使用 XPath 提取内容

2016-05-11 15:54 453 查看
var xpath = require('xpath'); //引用xpath包
var dom = require('xmldom-silent').DOMParser;//引用xmldom包
var request=require('request');
var fs=require('fs');
var urlencode = require('urlencode');//引用url解码和编码包
// Request headers: present a mobile (iPhone Safari) User-Agent to the server.
var headers = {
  'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A403 Safari/8536.25'
};

// Fetch the search-result page and hand the returned markup to findXpath.
request(
  {
    url: "https://www.google.co.jp/search?hl=ja&newwindow=1&site=&source=hp&q=hotel&oq=",
    headers: headers
  },
  function (error, response, body) {
    // On a transport failure `body` is undefined; parsing it would only
    // crash further down, so report the error and stop here.
    if (error) {
      console.error("request failed:", error);
      return;
    }
    // An error page does not contain the markup findXpath looks for.
    if (!response || response.statusCode < 200 || response.statusCode >= 300) {
      console.error("unexpected HTTP status:", response ? response.statusCode : "(no response)");
      return;
    }
    findXpath(body);
    // To keep a copy of the raw page for debugging, call fileWrite(body) here.
    console.log("ok");
  }
);

/**
 * Parse a page and print, for every ad display URL found, four lines:
 * the display URL (cite), the headline (h3), the decoded landing URL and
 * the ad description text.
 *
 * The four node lists are queried independently and paired up by position.
 * NOTE(review): the lists are not guaranteed to have equal length (e.g. a
 * headline may consist of several text nodes), so a missing entry is printed
 * as an empty string instead of throwing.
 *
 * @param {string} xml - Markup of the page to inspect.
 * @returns {void} Results are written to the console.
 */
function findXpath(xml) {
  // Nothing to parse (e.g. the HTTP request failed and produced no body).
  if (typeof xml !== "string" || xml.length === 0) {
    console.error("findXpath: no markup to parse");
    return;
  }

  var doc = new dom().parseFromString(xml);
  var XPATH_CITE = "//div[@id='mbEnd']//ol/li//cite/text()|//div[@id='tads']//ol/li//cite/text()|//div[@id='tadsb']//div[@class='ads-ad']//h3/text()";
  var XPATH_H3 = "//div[@class='ads-ad']//h3//text()";
  var XPATH_ADURL = "//div[@class='ads-ad']/h3/a/@href|//div[@id='tadsb']/ol/li/h3/a/@href";
  var XPATH_INFO = "//div[@id='mbEnd']//ol/li//div[@class='ac ads-creative']//text()|//div[@id='taw']//ol/li//div[contains(@class,'ads-creative')]//text()|//div[@class='ads-ad']//div[@class='ads-creative']//text()";
  var citeNodes = xpath.select(XPATH_CITE, doc);
  var h3Nodes = xpath.select(XPATH_H3, doc);
  var adInfoNodes = xpath.select(XPATH_INFO, doc);
  var adUrlNodes = xpath.select(XPATH_ADURL, doc);

  console.log("---------------------Node--------------Info-----------------------");

  for (var i = 0; i < citeNodes.length; i++) {
    console.log(adNodeText(citeNodes, i));
    console.log(adNodeText(h3Nodes, i));
    console.log(extractAdUrl(adNodeText(adUrlNodes, i)));
    console.log(adNodeText(adInfoNodes, i));
  }
}

// Return the nodeValue of nodes[index], or "" when that entry does not exist.
function adNodeText(nodes, index) {
  var node = nodes && nodes[index];
  if (!node || node.nodeValue == null) {
    return "";
  }
  return node.nodeValue;
}

// Pull the URL-decoded `adurl=` target out of an ad href; "" when absent.
function extractAdUrl(href) {
  var found = /adurl=(http[\S]*$)/.exec(href);
  if (!found) {
    return "";
  }
  // The replace mirrors the original post-processing of the captured value.
  return urlencode.decode(found[1].replace("adurl=", ""));
}
/**
 * Save the given content to the relative path '233.html'.
 *
 * The write is asynchronous: once it finishes, a confirmation is logged;
 * if it fails, the error is rethrown from the completion callback.
 *
 * @param {string} body - Content to write to the file.
 */
function fileWrite(body) {
  var onWritten = function (writeError) {
    if (writeError) {
      throw writeError;
    }
    console.log('Saved successfully'); // the file has been saved
  };

  fs.writeFile('233.html', body, onWritten);
}

  

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: