您的位置:首页 > Web前端 > Node.js

Node.js 小爬虫

2016-04-11 22:59 375 查看
 Node.js 小爬虫
'use strict';

// Small crawler: downloads several imooc.com course pages in parallel,
// extracts the chapter/video outline from each page with cheerio, and prints
// the courses ordered by their learner count (highest first).

var http = require('http');
var cheerio = require('cheerio');
var Promise = require('bluebird');

// Not referenced by the code below; kept from the original script.
var url = "http://www.imooc.com/learn/348";
var baseurl = "http://www.imooc.com/learn/";
var vediosId = [637, 348, 259, 197, 134, 75];

/**
 * Extracts the course outline from the HTML of a course page.
 *
 * @param {string} html - Markup of one course page.
 * @returns {{couseTitle: string, number: number, videos: Array}} The course
 *   title, the number parsed from the 4th `.meta-value strong` element (NaN
 *   when that element is missing or not numeric), and one entry per
 *   `.chapter` element of the form `{chapterTitle, videos: [{title, id, adress}]}`.
 */
function filterChapters(html) {
  var $ = cheerio.load(html);
  var title = $('.course-infos .path span').text();
  var number = parseInt($($('.meta-value strong')[3]).text().trim(), 10);
  var courseData = {
    couseTitle: title,
    number: number,
    videos: []
  };

  $('.chapter').each(function (chapterIndex, chapterNode) {
    var chapter = $(chapterNode);
    var chapterData = {
      'chapterTitle': chapter.find('strong').text(),
      'videos': []
    };

    chapter.find('li').each(function (videoIndex, videoNode) {
      var video = $(videoNode).find('.J-media-item');
      var adress = video.attr('href');

      // An <li> without a media link has no href; skip it instead of
      // throwing on `undefined.split(...)`.
      if (!adress) {
        return;
      }

      chapterData.videos.push({
        'title': video.text(),
        'id': adress.split('video/')[1],
        'adress': adress
      });
    });

    courseData.videos.push(chapterData);
  });

  return courseData;
}

/**
 * Prints every course with its chapters and videos to stdout.
 *
 * @param {Array} coursesData - Objects as returned by filterChapters.
 */
function printCourseInfo(coursesData) {
  coursesData.forEach(function (courseData) {
    var chapterTitle = courseData.couseTitle;
    console.log('@@'+chapterTitle + ':' + courseData.number+'人学过!'+ '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n');

    courseData.videos.forEach(function (item) {
      console.log('###'+item.chapterTitle);
      item.videos.forEach(function (t) {
        console.log('   '+t.title+' '+ t.id+' '+ t.adress+'\n');
      });
    });
  });
}

/**
 * Downloads a page over HTTP.
 *
 * @param {string} url - Address to fetch; it is also logged before the request.
 * @returns {Promise<string>} Resolves with the response body once the
 *   response ends; rejects with the request error.
 */
function getPageasync(url) {
  return new Promise(function (resolve, reject) {
    console.log(url);
    http.get(url, function (res) {
      var html = '';
      // Decode as a UTF-8 stream so a multi-byte character that straddles two
      // chunks is not corrupted by per-chunk Buffer-to-string conversion.
      res.setEncoding('utf8');
      res.on('data', function (data) {
        html += data;
      });
      res.on('end', function () {
        resolve(html);
      });
    }).on('error', function (e) {
      reject(e);
      console.log("失败");
    });
  });
}

var fecthCourseArray = vediosId.map(function (id) {
  return getPageasync(baseurl + id);
});

Promise.all(fecthCourseArray).then(function (pages) {
  var cousesData = pages.map(function (html) {
    return filterChapters(html);
  });

  // Descending by learner count. A comparator must return a number, not a
  // boolean; an unparsable count (NaN) is ranked as 0 so ordering stays
  // consistent.
  cousesData.sort(function (a, b) {
    return (b.number || 0) - (a.number || 0);
  });

  printCourseInfo(cousesData);
}).catch(function (err) {
  // Surface download/parse failures instead of leaving the rejection unhandled.
  console.error('Crawl failed:', err);
});
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  nodejs