python 网络爬虫爬取落网 期刊内容 下载图片 音乐
2018-01-05 11:21
746 查看
1.实现方案
1.1 采用多进程创建多个爬虫对象 爬虫对象主要由获取网页内容,分析网页内容,下载图片,下载音乐 这四个线程组成
1.2 进程及线程数根据网络情况设置
1.3 图片保存在img下 音乐放在music下以期刊命名 内容存放在result.txt中
2.代码
1.1 采用多进程创建多个爬虫对象 爬虫对象主要由获取网页内容,分析网页内容,下载图片,下载音乐 这四个线程组成
1.2 进程及线程数根据网络情况设置
1.3 图片保存在img下 音乐放在music下以期刊命名 内容存放在result.txt中
2.代码
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
"""Crawler for luoo.net journal (vol) pages.

Spawns worker processes; each process runs one `spider` instance whose
pipeline is four threads: fetch page -> parse page -> download cover
image -> download tracks.  Cover images go to ./img, music to
./music/<vol>/, and the parsed text to result<N>.txt.
"""
import re
import string
import sys
import os
import threading
import requests
import queue
import time
import multiprocessing


class spider:
    """One crawl pipeline: URL queue -> page queue -> image/music queues."""

    def __init__(self, path):
        # queue of URLs still to fetch
        self.queUrl = queue.Queue()
        # queue of [url, html] pairs awaiting parsing
        self.quePageInfo = queue.Queue()
        # crawl results are appended here; closed by Quit()
        # BUGFIX: open with an explicit encoding so the Chinese page text
        # is written correctly regardless of the platform default.
        self.f = open(path, "w+", encoding="utf-8")
        # worker threads created by the Create*Thread methods
        self.threads = []
        # guards result-file writes and per-volume directory creation
        self.mu = threading.Lock()
        # queue of [img_url, file_name] pairs to download
        self.queImg = queue.Queue()
        # queue of [vol, track_name, mp3_url] triples to download
        self.queMusic = queue.Queue()
        self.DownLoadPath()

    def GetUrl(self, page):
        """Extract every href link from `page` and enqueue it for fetching."""
        pattern = re.compile("href=\"([^\"]*)")
        for link in pattern.findall(page):
            self.queUrl.put(link)

    def SetCapUrlQueue(self, url):
        """Enqueue a single URL for fetching."""
        self.queUrl.put(url)

    def DownLoadPath(self):
        """Initialise the download directories and the music CDN base URL."""
        self.pathImg = "./img"
        self.pathMusic = "./music"
        # base URL of the mp3 CDN; volume number and track are appended later
        self.musicUrl = "http://mp3-cdn2.luoo.net/low/luoo/radio"
        for directory in (self.pathImg, self.pathMusic):
            if not os.path.exists(directory):
                try:
                    os.makedirs(directory)
                except Exception:
                    print("create", directory, "err")

    def GetPage(self):
        """Fetch every queued URL and push [url, html] onto the page queue.

        Exits once the URL queue is drained (all URLs are enqueued before
        the threads start).
        """
        while not self.queUrl.empty():
            url = self.queUrl.get()
            # BUGFIX: a single failed request used to raise and kill this
            # thread, losing every remaining URL; skip the bad URL instead.
            try:
                r = requests.get(url)
            except Exception:
                print("get err")
                continue
            self.quePageInfo.put([url, r.text])

    def DownLoadImg(self):
        """Forever: drain the image queue, saving each file under ./img."""
        while True:
            while not self.queImg.empty():
                img = self.queImg.get()
                path = self.pathImg + "/" + img[1]
                # skip files we already have
                if not os.path.exists(path):
                    try:
                        r = requests.get(img[0])
                    except Exception:
                        print("get err")
                        continue
                    if r.status_code == 200:
                        # BUGFIX: use `with` so the file handle is closed
                        with open(path, 'wb') as out:
                            out.write(r.content)
                    else:
                        # on a non-200 answer retry up to five times,
                        # sleeping 2 s between attempts
                        for _ in range(0, 5):
                            r = requests.get(img[0])
                            if r.status_code == 200:
                                with open(path, 'wb') as out:
                                    out.write(r.content)
                                break
                            time.sleep(2)
            time.sleep(2)

    def DownLoadMusic(self):
        """Forever: drain the music queue, saving tracks under ./music/<vol>/."""
        while True:
            while not self.queMusic.empty():
                music = self.queMusic.get()
                path = self.pathMusic + "/" + music[0] + "/"
                # '/' is illegal inside a file name; squash it out of the title
                if '/' in music[1]:
                    tmp = music[1].split('/')
                    filePath = path + tmp[0] + tmp[1] + ".mp3"
                else:
                    filePath = path + music[1] + ".mp3"
                # create the per-volume directory under the lock so two
                # threads cannot race makedirs
                if self.mu.acquire(True):
                    if not os.path.exists(path):
                        os.makedirs(path)
                    self.mu.release()
                # only download tracks we do not already have
                if not os.path.exists(filePath):
                    try:
                        r = requests.get(music[2])
                    except Exception:
                        print("get err")
                        continue
                    if r.status_code == 200:
                        print("downLoad", music[2])
                        # BUGFIX: use `with` so the file handle is closed
                        with open(filePath, 'wb') as out:
                            out.write(r.content)
                    else:
                        # some volumes drop the leading zero in the CDN path;
                        # retry with '/0' collapsed to '/'
                        for _ in range(0, 5):
                            newUrl = music[2].replace('/0', '/')
                            print("redownLoad", newUrl)
                            r = requests.get(newUrl)
                            if r.status_code == 200:
                                with open(filePath, 'wb') as out:
                                    out.write(r.content)
                                break
                            time.sleep(2)
            time.sleep(2)

    def _CreateThreads(self, target, num):
        # helper: append `num` not-yet-started threads running `target`
        for _ in range(0, num):
            self.threads.append(threading.Thread(target=target, args=()))

    def CreateDownLoadMusicThread(self, num):
        """Create `num` music-download threads."""
        self._CreateThreads(self.DownLoadMusic, num)

    def CreateGetPageThread(self, num):
        """Create `num` page-fetch threads."""
        self._CreateThreads(self.GetPage, num)

    def CreatePsrPageThread(self, num):
        """Create `num` page-parse threads."""
        self._CreateThreads(self.PsrPage, num)

    def CreateDownLoadImgThread(self, num):
        """Create `num` image-download threads."""
        self._CreateThreads(self.DownLoadImg, num)

    def Run(self):
        """Start every created thread, then wait on them.

        BUGFIX: the original started AND joined each thread inside one
        loop, so the join on the first infinite worker (PsrPage) blocked
        forever and the download threads were never even started.  Start
        everything first, then join, so the whole pipeline runs
        concurrently.  `setDaemon` is deprecated; assign `daemon` instead.
        """
        for t in self.threads:
            t.daemon = True
            t.start()
        for t in self.threads:
            t.join()

    def PsrPage(self):
        """Forever: parse fetched pages, emitting text plus image/music jobs."""
        while True:
            while not self.quePageInfo.empty():
                a = self.quePageInfo.get()
                # one alternation per field; findall yields 13-group tuples:
                # 1 img url, 4 description, 6 keywords, 8 vol number,
                # 10 vol title, 12 track name
                regular = "(\<img src=\"(http:\/\/img-cdn2.luoo.net\/pics\/vol\/([^\!]*)![^\"]*))|"
                regular += "(<meta name=\"description\" content=\"([^\"]*))|"
                regular += "(<meta name=\"keywords\" content=\"([^\"]*))|"
                regular += "(vol-number rounded\"\>([^\<]*))|"
                regular += "(vol-title\"\>([^\<]*))|"
                regular += "(trackname btn-play\"\>([^\<]*))"
                pattern = re.compile(regular)
                result = pattern.findall(a[1])
                # too few matches means this was not a journal page
                if len(result) < 10:
                    continue
                i = 0
                first = 0
                content = a[0] + '\n'
                imgName = ""
                music = ""
                for tmp in result:
                    if (i == 0):
                        # page description
                        content += tmp[4] + '\n'
                    elif (i == 1):
                        # theme keywords
                        content += "@mark " + tmp[6] + '\n'
                    elif (i == 2):
                        # volume number; also names the cover image file
                        music = str(int(tmp[8]))
                        content += "@vol " + tmp[8] + '\n'
                        imgName = tmp[8] + ".jpg"
                    elif (i == 3):
                        # volume title
                        content += "@tip " + tmp[10] + '\n'
                    elif (tmp[0] != ''):
                        first = first + 1
                        # only the first matched image is the cover
                        if (first == 1):
                            self.queImg.put([tmp[1], imgName])
                            content += "@img " + imgName + '\n'
                            content += "@music\n"
                    else:
                        # track name; queue its mp3 for download
                        content += " " + tmp[12] + '\n'
                        s = tmp[12].split('.')
                        path = self.musicUrl + music + "/" + s[0] + ".mp3"
                        self.queMusic.put([music, tmp[12], path])
                    i = i + 1
                # serialise writes to the shared result file
                if self.mu.acquire(True):
                    self.f.write(content)
                    self.mu.release()
            time.sleep(2)

    def Quit(self):
        """Close the result file."""
        self.f.close()


def worker(num):
    """Process entry point: crawl the 250-volume slice owned by process `num`."""
    Luo = spider('result' + str(num) + '.txt')
    avg = 250
    start = num * avg
    for i in range(start, start + avg):
        # volume numbers are zero-padded to three digits in the URL;
        # zfill replaces the original manual if/elif padding and also
        # stays correct for volumes >= 1000
        Luo.SetCapUrlQueue("http://www.luoo.net/music/" + str(i).zfill(3))
    Luo.CreateGetPageThread(1)
    Luo.CreatePsrPageThread(1)
    Luo.CreateDownLoadImgThread(1)
    Luo.CreateDownLoadMusicThread(1)
    Luo.Run()


def RunSpider(num):
    """Fork `num` worker processes, each crawling its own volume slice."""
    for i in range(0, num):
        p = multiprocessing.Process(target=worker, args=(i,))
        p.start()


if __name__ == '__main__':
    RunSpider(1)
相关文章推荐
- python3.6 urllib.request库实现简单的网络爬虫、下载图片
- python 3.x网络爬虫 下载图片
- python 网页爬虫,下载网络图片
- Python3网络爬虫:Scrapy入门之使用ImagesPipline下载图片
- python中使用网络爬虫下载图片
- 每天一篇python:简单爬虫下载图片篇
- Python爬虫_自动下载图片
- Python3 爬虫下载指定页面图片
- Python3网络爬虫:requests爬取动态网页内容
- Python---对html文件内容进行搜索取出特定URL地址字符串,保存成列表,并使用每个url下载图片,并保存到硬盘上,使用bs4,beautifulsoup模块
- mac os平台使用python爬虫自动下载巨潮网络文件
- python 使用 urllib.urlretrieve()下载网络图片,在本地打开提示文件损坏无法打开
- python实现爬虫下载美女图片
- 使用简易Python爬虫下载百度贴吧图片
- Python爬虫(02)从网站下载图片
- python简单爬虫(下载知乎图片示例)
- python爬虫入门(2)如何爬微博内容,及图片
- Python3网络爬虫应用:爱奇艺等主流视频网站的VIP视频破解(在线观看+视频下载)
- java下载网页内容和网络图片
- 07精通Python网络爬虫——爬取京东手机图片