您的位置:首页 > Web前端 > Node.js

随手写个node爬虫

2017-10-27 16:34 239 查看
以下案例使用 Node.js 爬取百度传课,获取免费视频课程信息,并下载课程的展示图片。

const fs = require('fs');
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const URL = require('url');
// Entry page of the crawl.
const url = 'https://chuanke.baidu.com/course/72351176577777664__cost_asc___2.html';

// Scheme (e.g. "https:") and host of the entry page; the other functions in
// this file prepend them to the links and image sources they scrape.
const { protocol, host } = URL.parse(url);

// Text file that collects the scraped course lines.
const fsName = 'java_course.txt';
// Directory prefix used when saving downloaded images.
const imgPath = './images/';
// Number of the page currently being crawled (incremented by itemSpider).
let page = 1;

startSpider();

/**
 * Starts the crawl: prepares the output locations, (re)creates the result
 * file with its heading line, and kicks off the first page.
 */
function startSpider() {
  console.log('爬虫开始...');
  // Image downloads stream straight into imgPath; without the directory the
  // write stream fails with ENOENT, so make sure it exists up front.
  if (!fs.existsSync(imgPath)) {
    fs.mkdirSync(imgPath);
  }
  // writeFileSync truncates any previous result file before the heading.
  fs.writeFileSync(fsName, '百度传课' + '\n');
  itemSpider(url);
}

/**
 * Crawls a single listing page: downloads it, records the page marker in the
 * result file, hands the parsed document to queryData, and schedules the next
 * page after a one-second pause while the page counter is below 50.
 *
 * Runs in the background; any failure is logged rather than thrown.
 *
 * @param {string} url - Address of the page to crawl.
 */
function itemSpider(url) {
  const crawlPage = async () => {
    console.log('当前页面', url);
    const response = await fetch(url);
    const html = await response.text();

    fs.appendFileSync(fsName, '这是第' + page + '页');
    page += 1;

    const $ = cheerio.load(html);
    queryData($); // extract and store the courses on this page

    if (page >= 50) {
      return;
    }
    // Wait a second before moving on to the next page.
    setTimeout(() => {
      spiderNext($);
    }, 1000);
  };

  crawlPage().catch((exception) => {
    console.log('出错了:', exception);
  });
}

/**
 * Extracts every course panel from a parsed listing page, appends one
 * "title (price) link" line per course to the result file, and starts the
 * download of each course's cover image.
 *
 * @param {Function} $ - Cheerio document returned by cheerio.load for the page.
 */
function queryData($) {
  try {
    // each(), not map(): the callback is run only for its side effects.
    $('.item-panel').each((index, item) => {
      const panel = $(item);
      const link = panel.find('.item-title a');
      const tittle = link.text();
      const price = panel.find('.price span').text();
      const linkHref = link.attr('href');
      const imgSrc = panel.find('img').attr('src');

      // attr() yields undefined for a missing attribute; avoid emitting
      // "https:undefined" into the file or requesting it as an image.
      const href = linkHref ? protocol + linkHref : '';
      const text = tittle + ' (' + price + ') ' + href + '\n';

      if (imgSrc) {
        downImg(tittle, protocol + imgSrc);
      }

      fs.appendFile(fsName, text, (err) => {
        if (err) {
          console.log('写入失败:', tittle, err);
          return;
        }
        console.log(tittle);
      });
    });
  } catch (exception) {
    console.log(exception);
  }
}

/**
 * Continues the crawl with the page referenced by the `.ck-page .next`
 * element of the current document, if there is one.
 *
 * @param {Function} $ - Cheerio document of the page that was just crawled.
 */
function spiderNext($) {
  const nextHref = $('.ck-page .next').attr('href');
  // No "next" link (e.g. last page): stop instead of requesting
  // "<protocol>//<host>undefined".
  if (!nextHref) {
    console.log('没有下一页,爬虫结束');
    return;
  }
  itemSpider(protocol + '//' + host + nextHref);
}

/**
 * Downloads an image and saves it as `<imgPath><title>.jpg`.
 *
 * The download runs in the background; failures (network error, non-2xx
 * response, stream or file-system error) are logged and never thrown.
 *
 * @param {string} tittle - Course title, used as the image file name.
 * @param {string} src - Absolute URL of the image.
 */
function downImg(tittle, src) {
  // Titles come from page text; path separators and other characters that
  // are reserved in file names would otherwise break the target path.
  const safeName = String(tittle).replace(/[\\/:*?"<>|]/g, '_');
  const target = imgPath + safeName + '.jpg';
  const reportFailure = (err) => {
    console.log('下载图片失败:', src, err);
  };

  // A surrounding try/catch cannot observe promise rejections, so failures
  // are handled with .catch() and stream 'error' listeners instead.
  fetch(src)
    .then((res) => {
      if (!res.ok) {
        throw new Error('HTTP ' + res.status);
      }
      const file = fs.createWriteStream(target);
      file.on('error', reportFailure);
      res.body.on('error', reportFailure);
      res.body.pipe(file);
    })
    .catch(reportFailure);
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: