
A Python web crawler for luoo.net (落网): scraping journal issues and downloading their images and music

2018-01-05 11:21
1. Design

   1.1 Multiple processes are used, each creating one spider object; a spider is made up of four threads: fetching pages, parsing page content, downloading images, and downloading music (see the sketch after this list)

   1.2 Set the number of processes and threads according to your network conditions

   1.3 Images are saved under img, music under music in a folder named after the journal issue, and the scraped text in result.txt
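
The layout in 1.1 boils down to a producer/consumer pipeline: one process running worker threads connected by queue.Queue objects. Below is a minimal runnable sketch of just that skeleton; the stage names fetch/parse and the placeholder page body are illustrative only, not part of the spider in section 2, which fetches with requests and runs four thread types.

#!/usr/bin/python3
# Minimal sketch of the process/thread pipeline from section 1.1.
import multiprocessing
import queue
import threading

def fetch(urls, pages):
    # Producer: drain the URL queue and emit (url, body) pairs.
    while not urls.empty():
        url = urls.get()
        body = "<html>...</html>"  # the real spider fetches this with requests
        pages.put((url, body))

def parse(pages, done):
    # Consumer: process pages until the None sentinel arrives.
    while True:
        item = pages.get()
        if item is None:
            break
        url, body = item
        done.append(url)

def worker(start, count):
    urls, pages, done = queue.Queue(), queue.Queue(), []
    for i in range(start, start + count):
        urls.put("http://www.luoo.net/music/%03d" % i)
    f = threading.Thread(target=fetch, args=(urls, pages))
    p = threading.Thread(target=parse, args=(pages, done))
    f.start()
    p.start()
    f.join()          # wait for the fetcher to drain the URL queue
    pages.put(None)   # then tell the parser to stop
    p.join()
    print(len(done), "pages handled")

if __name__ == '__main__':
    p = multiprocessing.Process(target=worker, args=(1, 5))
    p.start()
    p.join()
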
2. Code
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import re
import os
import threading
import requests
import queue
import time
import multiprocessing

class spider:

    def __init__(self, path):
        # queue of URLs to crawl
        self.queUrl = queue.Queue()
        # queue of fetched page content
        self.quePageInfo = queue.Queue()
        # file that stores the scraped results
        self.f = open(path, "w+")
        # worker threads
        self.threads = []
        # lock protecting file writes
        self.mu = threading.Lock()
        # queue of image download jobs
        self.queImg = queue.Queue()
        # queue of music download jobs
        self.queMusic = queue.Queue()
        # set up the download directories
        self.DownLoadPath()

    # extract the links contained in a page
    def GetUrl(self, page):
        regular = "href=\"([^\"]*)"
        pattern = re.compile(regular)
        result = pattern.findall(page)
        for i in result:
            self.queUrl.put(i)

    # put a given URL on the crawl queue
    def SetCapUrlQueue(self, url):
        self.queUrl.put(url)

    # create the image and music download directories
    def DownLoadPath(self):
        # image download directory
        self.pathImg = "./img"
        # music download directory
        self.pathMusic = "./music"
        # base URL for music downloads
        self.musicUrl = "http://mp3-cdn2.luoo.net/low/luoo/radio"
        # create each directory if it does not exist yet
        for path in (self.pathImg, self.pathMusic):
            if not os.path.exists(path):
                try:
                    os.makedirs(path)
                except Exception:
                    print("create", path, "err")

    # fetch pages and put them on the content queue
    def GetPage(self):
        while not self.queUrl.empty():
            url = self.queUrl.get()
            try:
                r = requests.get(url)
            except Exception:
                print("get", url, "err")
                continue
            self.quePageInfo.put([url, r.text])

    # download images
    def DownLoadImg(self):
        while True:
            while not self.queImg.empty():
                img = self.queImg.get()
                # skip files that already exist
                path = self.pathImg + "/" + img[1]
                if not os.path.exists(path):
                    try:
                        r = requests.get(img[0])
                    except Exception:
                        print("get err")
                        continue
                    if r.status_code == 200:
                        with open(path, 'wb') as f:
                            f.write(r.content)
                    else:
                        # if the GET failed, retry up to five times
                        for i in range(0, 5):
                            try:
                                r = requests.get(img[0])
                            except Exception:
                                time.sleep(2)
                                continue
                            if r.status_code == 200:
                                with open(path, 'wb') as f:
                                    f.write(r.content)
                                break
                            time.sleep(2)
            time.sleep(2)

    # download music
    def DownLoadMusic(self):
        while True:
            while not self.queMusic.empty():
                music = self.queMusic.get()
                path = self.pathMusic + "/" + music[0] + "/"
                # '/' is not allowed in file names, so strip it from track names
                if '/' in music[1]:
                    tmp = music[1].split('/')
                    filePath = path + tmp[0] + tmp[1] + ".mp3"
                else:
                    filePath = path + music[1] + ".mp3"

                # create one folder per journal issue
                if self.mu.acquire(True):
                    if not os.path.exists(path):
                        os.makedirs(path)
                    self.mu.release()

                # download the track only if it is not already on disk
                if not os.path.exists(filePath):
                    try:
                        r = requests.get(music[2])
                    except Exception:
                        print("get err")
                        continue
                    if r.status_code == 200:
                        print("downLoad", music[2])
                        with open(filePath, 'wb') as f:
                            f.write(r.content)
                    else:
                        # if the GET failed, retry up to five times with the
                        # leading zero stripped from the track number
                        for i in range(0, 5):
                            newUrl = music[2].replace('/0', '/')
                            print("redownLoad", newUrl)
                            try:
                                r = requests.get(newUrl)
                            except Exception:
                                time.sleep(2)
                                continue
                            if r.status_code == 200:
                                with open(filePath, 'wb') as f:
                                    f.write(r.content)
                                break
                            time.sleep(2)
            time.sleep(2)

    # create the music download threads
    def CreateDownLoadMusicThread(self, num):
        for i in range(0, num):
            t = threading.Thread(target=self.DownLoadMusic, args=())
            self.threads.append(t)

    # create the page fetch threads
    def CreateGetPageThread(self, num):
        for i in range(0, num):
            t = threading.Thread(target=self.GetPage, args=())
            self.threads.append(t)

    # create the page parsing threads
    def CreatePsrPageThread(self, num):
        for i in range(0, num):
            t = threading.Thread(target=self.PsrPage, args=())
            self.threads.append(t)

    # create the image download threads
    def CreateDownLoadImgThread(self, num):
        for i in range(0, num):
            t = threading.Thread(target=self.DownLoadImg, args=())
            self.threads.append(t)

    # start all threads, then wait on them
    def Run(self):
        for t in self.threads:
            t.daemon = True
            t.start()
        for t in self.threads:
            t.join()

    # take pages off the content queue and parse them
    def PsrPage(self):
        while True:
            while not self.quePageInfo.empty():
                a = self.quePageInfo.get()
                # extract image links
                regular  = "(\<img src=\"(http:\/\/img-cdn2.luoo.net\/pics\/vol\/([^\!]*)![^\"]*))|"
                # extract the description
                regular += "(<meta name=\"description\" content=\"([^\"]*))|"
                # extract the music theme
                regular += "(<meta name=\"keywords\" content=\"([^\"]*))|"
                # extract the issue number
                regular += "(vol-number rounded\"\>([^\<]*))|"
                # extract the issue title
                regular += "(vol-title\"\>([^\<]*))|"
                # extract the track names
                regular += "(trackname btn-play\"\>([^\<]*))"
                pattern = re.compile(regular)
                result = pattern.findall(a[1])
                if len(result) < 10:
                    continue

                i = 0
                first = 0
                content = a[0] + '\n'
                imgName = ""
                music = ""

                for tmp in result:
                    if i == 0:
                        # description
                        content += tmp[4] + '\n'
                    elif i == 1:
                        # music theme
                        content += "@mark " + tmp[6] + '\n'
                    elif i == 2:
                        # issue number
                        music = str(int(tmp[8]))
                        content += "@vol  " + tmp[8] + '\n'
                        imgName = tmp[8] + ".jpg"
                    elif i == 3:
                        # issue title
                        content += "@tip  " + tmp[10] + '\n'
                    elif tmp[0] != '':
                        first = first + 1
                        # the first image is the cover
                        if first == 1:
                            # queue the image link and file name
                            img = [tmp[1], imgName]
                            self.queImg.put(img)
                            content += "@img  " + imgName + '\n'
                            content += "@music\n"
                    else:
                        # track name
                        content += "      " + tmp[12] + '\n'
                        # build and queue the music download link
                        s = tmp[12].split('.')
                        path = self.musicUrl + music + "/" + s[0] + ".mp3"
                        info = [music, tmp[12], path]
                        self.queMusic.put(info)
                    i = i + 1

                # take the lock and write the result file
                if self.mu.acquire(True):
                    self.f.write(content)
                    self.mu.release()
            time.sleep(2)

    # close the result file on exit
    def Quit(self):
        self.f.close()

def worker(num):
    path = 'result' + str(num) + '.txt'
    Luo = spider(path)
    # each process handles a block of 250 issues
    avg = 250
    num = num * avg + avg
    for i in range(num - avg, num):
        # issue numbers in the URL are zero-padded to three digits
        url = "http://www.luoo.net/music/" + str(i).zfill(3)
        Luo.SetCapUrlQueue(url)
    Luo.CreateGetPageThread(1)
    Luo.CreatePsrPageThread(1)
    Luo.CreateDownLoadImgThread(1)
    Luo.CreateDownLoadMusicThread(1)
    Luo.Run()

# create the worker processes
def RunSpider(num):
    for i in range(0, num):
        p = multiprocessing.Process(target=worker, args=(i,))
        p.start()

if __name__ == '__main__':
    RunSpider(1)

