您的位置:首页 > Web前端 > Node.js

基于 Node.js 的网页抓取

2016-10-13 15:27 288 查看
从无到有耗时2天,蓝馊!香菇!

// Download a GBK-encoded page, clean up its markup, extract the element with
// id "content_zjlxtable" and save that element's inner HTML to c.html.
var tmp = '';
var count = 0;
var url = 'http://data.eastmoney.com/zjlx/600050.html';
var http = require('http');
var iconv = require('iconv-lite');
var req = http.request(url, function(res){
    // Raw response chunks are kept as Buffers and decoded once at the end.
    // A multi-byte GBK character can be split across two chunks, so decoding
    // each chunk on its own would corrupt the characters at chunk boundaries.
    var chunks = [];
    res.on('data', function(data){
        count++;
        chunks.push(data);
        console.log('Count\t'+count);
    });
    res.on('end', function() {
        tmp = iconv.decode(Buffer.concat(chunks), 'GBK');
        if(tmp){
            // Strip line breaks, comments and padding so the markup parses cleanly.
            tmp = replace(tmp);
            var cheerio = require('cheerio'),
                $ = cheerio.load(tmp);
            tmp = $('#content_zjlxtable').html();
            if(tmp === null){
                // .html() yields null when the selector matches nothing;
                // do not pass that on to the file writer.
                console.log('Table #content_zjlxtable not found');
                return;
            }
            writeFile('c.html',tmp);
            // Optional follow-up: TableToCsv(tmp) turns the table markup into CSV text.
        }else{
            console.log('Empty');
        }
    });
});
req.on('error', function(e) {
    console.log('problem with request: ' + e.message);
});
req.end();
// Write `str` to the path `file` asynchronously; a failure is only logged.
function writeFile(file,str){
    var reportFailure = function(err){
        if(err){
            console.log("fail " + err);
        }
    };
    require('fs').writeFile(file, str, reportFailure);
}
/**
 * Normalise raw HTML text before it is parsed.
 *
 * Steps, in order:
 *   1. turn every single quote into a double quote;
 *   2. remove all line breaks (Windows "\r\n", Unix "\n" and bare "\r");
 *   3. remove HTML comments (<!-- ... -->);
 *   4. remove C-style block comments (slash-star ... star-slash);
 *   5. remove runs of spaces that directly precede a "<".
 *
 * @param {string} source - raw page text
 * @returns {string} the cleaned text
 */
function replace(source){
    return source
        .replace(/'/g, '"')
        // Match every line-break form, not only CRLF, so pages served with
        // LF-only or CR-only line endings are flattened as well.
        .replace(/\r\n|\r|\n/g, '')
        // [\s\S] (rather than ".") lets a comment span any character.
        .replace(/<!--[\s\S]*?-->/g, '')
        .replace(/\/\*[\s\S]*?\*\//g, '')
        .replace(/[ ]+</g, '<');
}
/**
 * Convert HTML table markup into CSV-like text.
 *
 * All whitespace is removed first, then every closing </th> and </td> becomes
 * a comma, every closing </tr> becomes "\r\n", the remaining tags are dropped
 * and "&nbsp;" entities are removed. Because each cell contributes a comma,
 * every row ends with a trailing comma before its line break. Cell values are
 * not quoted or escaped.
 *
 * @param {string} source - table markup
 * @returns {string} comma-separated rows terminated by "\r\n"
 */
function TableToCsv(source){
    return source
        .replace(/\s/g, '')
        .replace(/<\/th>/g, ',')
        .replace(/<\/td>/g, ',')
        .replace(/<\/tr>/g, '\r\n')
        .replace(/<[^>]+>/g, '')
        // The original final step replaced a literal space, which can never
        // match after the \s pass above. Presumably the intended target was
        // the "&nbsp;" entity - NOTE(review): confirm against real table markup.
        .replace(/&nbsp;/g, '');
}


内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  node.js