您的位置:首页 > 编程语言 > Python开发

Python-爬取校花网视频(单线程和多线程版本)

2018-11-25 20:30 459 查看

一、参考文章

    python爬虫爬取校花网视频,单线程爬取

    爬虫----爬取校花网视频,包含多线程版本

    上述两篇文章都是对校花网视频的爬取,由于时间相隔很久了,校花网上的一些视频已经不存在了,因此上述文章中的代码在运行时会出现一些异常。本篇文章主要是对上述文章中的代码进行了优化和异常处理,在此做笔记记录,方便以后查阅,修改如下:

1、添加了异常处理,见下文代码中 save 函数里的 try/except 部分(原文中以红色标出)

二、单线程版本

#-*- coding=utf-8 -*-
import re
import requests
import hashlib
import time
import os

# HTTP headers passed as ``headers=`` to every requests.get call made by
# get_index, get_detail and save in the single-threaded script: a desktop
# browser User-Agent and a Referer pointing at the site root.
# NOTE(review): presumably needed so the site serves the pages/videos to a
# non-browser client -- confirm against the live site.
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
'Referer':'http://www.xiaohuar.com'
}

def get_index(url):
    """Download a listing page and return its HTML.

    Args:
        url: Absolute URL of the listing page.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request itself fails
        (connection error, timeout, ...).
    """
    try:
        # The timeout keeps one unresponsive server from hanging the crawl.
        response = requests.get(url, headers=header, timeout=30)
    except requests.exceptions.RequestException as e:
        # Same reporting style as save(): print the error and carry on.
        print(repr(e))
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_index(res):
    """Extract the detail-page links from a listing page.

    Args:
        res: HTML text of a listing page, or ``None``/empty string when the
            page could not be downloaded.

    Returns:
        A list holding the ``href`` value that follows each
        ``class="items"`` occurrence, in document order. Empty when ``res``
        is falsy or contains no such link.
    """
    if not res:
        # get_index yields None for non-200 responses; re.findall would raise
        # TypeError on None, so treat a missing page as "no links".
        return []
    # re.S makes "." match newlines too, so one match may span several lines.
    return re.findall(r'class="items".*?href="(.*?)"', res, re.S)

def get_detail(urls):
    """Visit each detail page and download the video it embeds.

    Args:
        urls: Iterable of detail-page URLs. Entries that do not start with
            ``http`` are treated as site-relative paths and prefixed with
            ``http://www.xiaohuar.com``.

    Returns:
        None. For every page that answers with HTTP 200 and contains an
        ``id="media"`` element with a ``src`` attribute, the first such
        ``src`` is handed to ``save``.
    """
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        try:
            result = requests.get(url, headers=header, timeout=30)
        except requests.exceptions.RequestException as e:
            # One unreachable detail page must not abort the remaining ones.
            print(repr(e))
            continue
        if result.status_code != 200:
            continue
        mp4_url_list = re.findall(r'id="media".*?src="(.*?)"', result.text, re.S)
        if mp4_url_list:
            save(mp4_url_list[0])

# Directory (with trailing slash) that save() prefixes to every file name:
# a "video" folder under the current working directory.
path = f'{os.getcwd()}/video/'

def save(url):
    """Download one video and write it under ``path`` with a hashed name.

    Args:
        url: Absolute URL of the video file.

    Returns:
        None. On a request failure the exception is printed; on a non-200
        response a "video no longer exists" message is printed. In both
        cases nothing is written.
    """
    try:
        # The video may have been removed or the host may be unreachable.
        video = requests.get(url, headers=header, timeout=30)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return

    if video.status_code != 200:
        print(f'视频不存在了:{url}')
        return

    # File name = md5(url + current timestamp), so repeated downloads of the
    # same URL do not overwrite each other.
    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    filename = r'%s.mp4' % m.hexdigest()

    # Nothing else in the single-threaded script creates the output
    # directory, so open() would fail with FileNotFoundError on a fresh run.
    os.makedirs(path, exist_ok=True)
    filepath = path + filename
    print(filepath)
    with open(filepath, 'wb') as f:
        f.write(video.content)

def main():
    """Crawl listing pages ``list-3-0`` .. ``list-3-4`` and download their videos."""
    for i in range(5):
        # Fetch the i-th listing page.
        page_html = get_index('http://www.xiaohuar.com/list-3-%s.html' % i)
        if page_html is None:
            # get_index returns None when the page did not answer with
            # HTTP 200; skip it instead of passing None to the regex.
            continue
        # Collect every detail-page URL found on that listing page ...
        detail_urls = parse_index(page_html)
        # ... and download the video behind each of them.
        get_detail(detail_urls)


if __name__ == '__main__':
    main()

三、多线程版本

#-*- coding=utf-8 -*-
# 异步,多线程优化下载速度

import requests
import re
import os
import hashlib,time
from concurrent.futures import ThreadPoolExecutor

# Module-level thread pool shared by main() and parse_index() to schedule the
# page downloads and the per-page video downloads; 30 is the max_workers
# argument of ThreadPoolExecutor.
p = ThreadPoolExecutor(30)

def get_index(url):
    """Download a listing page and return its HTML.

    Args:
        url: Absolute URL of the listing page.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request itself fails
        (connection error, timeout, ...).
    """
    try:
        # The timeout keeps a stalled server from tying up a pool worker.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        # Same reporting style as save(): print the error and carry on.
        print(repr(e))
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_index(res):
    """Extract detail-page links from a finished listing download and queue them.

    Args:
        res: A completed future whose result is the listing page HTML text,
            or ``None`` when the page could not be fetched.

    Returns:
        None. When links are found, a single ``get_detail(urls)`` task is
        submitted to the shared pool ``p``.
    """
    html = res.result()
    if not html:
        # get_index yields None for non-200 responses; re.findall would raise
        # TypeError on None, so there is simply nothing to schedule.
        return
    # re.S makes "." match newlines too, so one match may span several lines.
    urls = re.findall(r'class="items".*?href="(.*?)"', html, re.S)
    if urls:
        p.submit(get_detail, urls)

def get_detail(urls):
    """Visit each detail page and download the video it embeds.

    Args:
        urls: Iterable of detail-page URLs. Entries that do not start with
            ``http`` are treated as site-relative paths and prefixed with
            ``http://www.xiaohuar.com``.

    Returns:
        None. For every page that answers with HTTP 200 and contains an
        ``id="media"`` element with a ``src`` attribute, the first such
        ``src`` is handed to ``save``.
    """
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        try:
            r1 = requests.get(url, timeout=30)
        except requests.exceptions.RequestException as e:
            # One unreachable detail page must not abort the remaining ones.
            print(repr(e))
            continue
        if r1.status_code != 200:
            continue
        url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
        if url_list:
            save(url_list[0])

# Directory (with trailing slash) that save() prefixes to every file name in
# the multi-threaded script; it is created at import time.
path = os.getcwd() + '/video_mutil/'
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

def save(url):
    """Download one video and write it under ``path`` with a hashed name.

    Args:
        url: Absolute URL of the video file.

    Returns:
        None. On a request failure the exception is printed; on a non-200
        response a "video no longer exists" message is printed. In both
        cases nothing is written.
    """
    try:
        # The video may have been removed or the host may be unreachable; the
        # timeout keeps a stalled transfer from tying up a pool worker.
        r2 = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return

    if r2.status_code != 200:
        print(f'视频不存在了:{url}')
        return

    # File name = md5(url + current timestamp), so repeated downloads of the
    # same URL do not overwrite each other.
    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    filename = '%s.mp4' % m.hexdigest()
    file_path = path + filename
    with open(file_path, 'wb') as f:
        f.write(r2.content)
    # Reported only after the file has been written and closed.
    print('视频下载完成:%s' % file_path)

def main():
    """Download listing pages ``list-3-0`` .. ``list-3-4`` concurrently and
    queue the videos found on each of them.

    The listing downloads run in the shared pool ``p``. Each finished page is
    passed to ``parse_index`` from this thread, which in turn submits the
    per-page ``get_detail`` work to the pool. The pool is shut down (waiting
    for all queued work) before returning.
    """
    futures = [
        p.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i)
        for i in range(5)
    ]
    # parse_index is called here rather than via add_done_callback: a
    # callback that fires after main() has returned would call p.submit()
    # while the interpreter is already shutting down, which ThreadPoolExecutor
    # rejects with RuntimeError, so no video would ever be scheduled.
    for future in futures:
        error = future.exception()  # blocks until this download has finished
        if error is not None:
            print(repr(error))
            continue
        if future.result() is None:
            # Page did not answer with HTTP 200; nothing to parse.
            continue
        parse_index(future)
    # Wait for every queued get_detail task before the script exits.
    p.shutdown(wait=True)


if __name__ == '__main__':
    main()

四、资源下载

    资源下载地址:Python爬取校花网视频-单线程和多线程版本

 

转载声明:本站文章无特别说明,皆为原创,版权所有,转载请注明:朝十晚八

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: