
Python Multithreading, Multiprocessing, and Coroutines, plus a Redis-Based Distributed Crawler: A Speed Comparison


Before learning the Scrapy framework, it helps to understand how these different ways of speeding up code relate to one another.

All three approaches use 5 workers (threads, processes, or coroutines) to request 50 web pages, so their speeds can be compared.

First, multithreading:

The approach is a queue plus multiple threads, which is also a common building block of distributed crawlers.

This example uses the threading module; other options such as _thread also exist.

[code]import time
import requests
import threading
import queue as Qe

threads = []
link_list = []
threadList = ['Thread-1', 'Thread-2', 'Thread-3', 'Thread-4', 'Thread-5']

# read the target URLs: each line of alexa.txt is "rank<TAB>domain"
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

class MyThread(threading.Thread):
    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print('Starting ' + self.name)
        while True:
            try:
                crawler(self.name, self.q)
            except Qe.Empty:
                # the queue stayed empty for 2 seconds, so this thread is done
                break
        print('Exiting ' + self.name)

def crawler(threadName, q):
    url = q.get(timeout=2)
    try:
        r = requests.get(url, timeout=2)
        print(q.qsize(), threadName, r.status_code, url)
    except Exception as e:
        print(q.qsize(), threadName, url, 'Error:', e)

workQueue = Qe.Queue(50)

# create the worker threads
for tName in threadList:
    thread = MyThread(tName, workQueue)
    thread.start()
    threads.append(thread)

# fill the queue
for url in link_list:
    workQueue.put(url)

# wait for all threads to finish
for t in threads:
    t.join()

end = time.time()
print('Total time:', end - start)
print('Exiting Main Thread')
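For reference, the same 5-thread / 50-page test can also be written with the standard library's concurrent.futures module, which manages its own internal task queue. This is only an illustrative sketch (it was not part of the timing test) and assumes the same alexa.txt format as above:

[code]# Minimal alternative sketch using concurrent.futures; assumes the same alexa.txt as above.
import time
import requests
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    try:
        r = requests.get(url, timeout=2)
        return url, r.status_code
    except Exception as e:
        return url, 'Error: ' + str(e)

link_list = []
with open('alexa.txt', 'r') as file:
    for eachone in file.readlines():
        link_list.append(eachone.split('\t')[1].replace('\n', ''))

start = time.time()
# the executor manages its own internal task queue, so no explicit queue.Queue is needed
with ThreadPoolExecutor(max_workers=5) as executor:
    for url, status in executor.map(fetch, link_list):
        print(status, url)
print('Total time:', time.time() - start)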

Multiprocessing:

This version uses non-blocking calls with the Pool class; there are alternatives such as Process that you can explore on your own (a small sketch of the Process variant follows the code below).

[code]import time
import requests
from multiprocessing import Pool, Manager

link_list = []

# read the target URLs: each line of alexa.txt is "rank<TAB>domain"
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

def crawler(q, index):
    Process_id = 'Process-' + str(index)
    while not q.empty():
        url = q.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(Process_id, q.qsize(), r.status_code, url)
        except Exception as e:
            print(Process_id, q.qsize(), url, 'Error:', e)

if __name__ == '__main__':
    manager = Manager()
    workQueue = manager.Queue(50)

    # fill the queue
    for url in link_list:
        workQueue.put(url)

    po = Pool(processes=5)  # a pool of 5 worker processes
    for i in range(5):
        po.apply_async(crawler, args=(workQueue, i))

    print('Started processes')
    po.close()
    po.join()

    end = time.time()
    print('Total time:', end - start)
    print('Main process Ended')
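As for the Process class mentioned above, here is a minimal sketch of what that variant could look like (again assuming the same alexa.txt, and not part of the timing test):

[code]# Minimal sketch of the Process alternative mentioned above; same queue-draining idea as the Pool version.
import requests
from multiprocessing import Process, Manager

def crawler(q, index):
    pid = 'Process-' + str(index)
    while not q.empty():
        url = q.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(pid, q.qsize(), r.status_code, url)
        except Exception as e:
            print(pid, q.qsize(), url, 'Error:', e)

if __name__ == '__main__':
    manager = Manager()
    workQueue = manager.Queue(50)
    with open('alexa.txt', 'r') as file:
        for eachone in file.readlines():
            workQueue.put(eachone.split('\t')[1].replace('\n', ''))

    processes = [Process(target=crawler, args=(workQueue, i)) for i in range(5)]
    for p in processes:
        p.start()   # one explicit worker process each
    for p in processes:
        p.join()    # wait for all workers to drain the queue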

 

Coroutines:

This coroutine example uses the widely used gevent library.

[code]import time

from gevent import monkey
monkey.patch_all()  # patch blocking standard-library I/O so it yields to other coroutines; do this before importing requests

import gevent
import requests
from gevent.queue import Queue

jobs = []
link_list = []

# read the target URLs: each line of alexa.txt is "rank<TAB>domain"
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

def crawler(index):
    Process_id = 'Process-' + str(index)
    while not workQueue.empty():
        url = workQueue.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(Process_id, workQueue.qsize(), r.status_code, url)
        except Exception as e:
            print(Process_id, workQueue.qsize(), url, 'Error:', e)

def boss():
    for url in link_list:
        workQueue.put_nowait(url)

if __name__ == '__main__':
    workQueue = Queue(50)

    # fill the queue first, then spawn 5 crawler coroutines
    gevent.spawn(boss).join()
    for i in range(5):
        jobs.append(gevent.spawn(crawler, i))
    gevent.joinall(jobs)

    end = time.time()
    print('Total time:', end - start)
    print('Main Ended')
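gevent also ships a gevent.pool.Pool class that caps how many greenlets run at once, which makes the explicit work queue unnecessary. A rough sketch of that variant, under the same assumptions as above:

[code]# Rough alternative sketch using gevent.pool.Pool to cap concurrency at 5 greenlets (no explicit work queue).
from gevent import monkey
monkey.patch_all()  # must run before requests is imported

import time
import requests
from gevent.pool import Pool

def fetch(url):
    try:
        r = requests.get(url, timeout=2)
        print(r.status_code, url)
    except Exception as e:
        print(url, 'Error:', e)

link_list = []
with open('alexa.txt', 'r') as file:
    for eachone in file.readlines():
        link_list.append(eachone.split('\t')[1].replace('\n', ''))

start = time.time()
pool = Pool(5)              # at most 5 greenlets run at the same time
pool.map(fetch, link_list)  # blocks until every URL has been fetched
print('Total time:', time.time() - start)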

Running the code above gives the following times:

Multithreading: 11.943 s

Multiprocessing: 9.652 s

Coroutines: 5.673 s

Coroutines come out clearly ahead here. Better still, the three approaches are not mutually exclusive: they can be combined to make a crawler even faster (these figures reflect only this particular test run, and your results may differ).
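As one illustration of combining approaches (not something measured in this test), the sketch below starts 5 processes and lets each one spread its share of the URLs over 2 threads; the 5 x 2 split is an arbitrary choice:

[code]# Illustrative sketch only: combining multiprocessing with threads (5 processes, 2 threads each).
import requests
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    try:
        return url, requests.get(url, timeout=2).status_code
    except Exception as e:
        return url, 'Error: ' + str(e)

def crawl_chunk(urls):
    # each process spreads its share of URLs over a small thread pool
    with ThreadPoolExecutor(max_workers=2) as executor:
        return list(executor.map(fetch, urls))

if __name__ == '__main__':
    link_list = []
    with open('alexa.txt', 'r') as file:
        for eachone in file.readlines():
            link_list.append(eachone.split('\t')[1].replace('\n', ''))

    chunks = [link_list[i::5] for i in range(5)]  # split the 50 URLs into 5 slices
    with Pool(processes=5) as po:
        for results in po.map(crawl_chunk, chunks):
            for url, status in results:
                print(status, url)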

Below is the code for a distributed crawler that uses the Redis in-memory database to download the images from the 50 pages.
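The key idea is that a Redis list acts as the task queue shared by every machine: the master pushes image URLs onto the list with LPUSH and each slave pops them off with LPOP. A tiny sketch of just that hand-off is below; the host 192.0.2.10 is only a placeholder for the master's Redis address, since the full code further down uses 127.0.0.1 and therefore assumes everything runs on one machine:

[code]# Minimal sketch of the shared Redis queue; '192.0.2.10' is a placeholder for the master's Redis address.
from redis import Redis

r = Redis(host='192.0.2.10', port=6379)

# master side: push a task onto the shared list
r.lpush('img_url', 'http://example.com/a.png')

# slave side: pop a task (lpop returns None when the list is empty)
task = r.lpop('img_url')
if task is not None:
    print('got task:', task.decode('utf-8'))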

 

1. master (central coordinator)

[code]import time
import requests
from redis import Redis
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 '
                         'Safari/537.36'}

def push_redis_list():
    # master: crawl each page, extract <img> URLs and push them onto the shared Redis list
    r = Redis(host='127.0.0.1', port=6379)

    link_list = []
    with open('alexa.txt', 'r') as file:
        file_list = file.readlines()
        for eachone in file_list:
            link = eachone.split('\t')[1].replace('\n', '')
            link_list.append(link)

    for url in link_list:
        try:
            response = requests.get(url, headers=headers, timeout=2)
            soup = BeautifulSoup(response.text, 'html.parser')
            img_list = soup.find_all('img')
            for img in img_list:
                img_url = img['src']
                if img_url != '':
                    print('Pushed image url:', img_url)
                    r.lpush('img_url', img_url)
        except Exception as e:
            print(url, 'Error:', e)
        print('Image urls queued so far:', r.llen('img_url'))

def get_img():
    # slave: pop image URLs off the Redis list and save the files locally
    r = Redis(host='127.0.0.1', port=6379)
    while True:
        try:
            url = r.lpop('img_url')
            url = url.decode('ascii')
            if url[:2] == '//':
                url = 'http:' + url
            try:
                response = requests.get(url, headers=headers, timeout=2)
                name = int(time.time())
                with open('E:\\截图库\\' + str(name) + url[-4:], 'wb') as f:
                    f.write(response.content)
                print('Saved image', url)
            except Exception as e:
                print('Problem while downloading the image:', e)
                time.sleep(3)
        except Exception as e:
            # lpop returned None (empty list) or the URL could not be decoded
            print('Error:', e)
            time.sleep(10)
            break

if __name__ == '__main__':
    this_machine = 'master'
    print('Starting the distributed crawler')
    if this_machine == 'master':
        push_redis_list()
    else:
        get_img()

2. slave (crawler worker)

[code]import time
import requests
from redis import Redis
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 '
                         'Safari/537.36'}

def push_redis_list():
    # master: crawl each page, extract <img> URLs and push them onto the shared Redis list
    r = Redis(host='127.0.0.1', port=6379)

    link_list = []
    with open('alexa.txt', 'r') as file:
        file_list = file.readlines()
        for eachone in file_list:
            link = eachone.split('\t')[1].replace('\n', '')
            link_list.append(link)

    for url in link_list:
        try:
            response = requests.get(url, headers=headers, timeout=2)
            soup = BeautifulSoup(response.text, 'html.parser')
            img_list = soup.find_all('img')
            for img in img_list:
                img_url = img['src']
                if img_url != '':
                    print('Pushed image url:', img_url)
                    r.lpush('img_url', img_url)
        except Exception as e:
            print(url, 'Error:', e)
        print('Image urls queued so far:', r.llen('img_url'))

def get_img():
    # slave: pop image URLs off the Redis list and save the files locally
    r = Redis(host='127.0.0.1', port=6379)
    while True:
        try:
            url = r.lpop('img_url')
            url = url.decode('ascii')
            if url[:2] == '//':
                url = 'http:' + url
            try:
                response = requests.get(url, headers=headers, timeout=2)
                name = int(time.time())
                with open('E:\\截图库\\' + str(name) + url[-4:], 'wb') as f:
                    f.write(response.content)
                print('Saved image', url)
            except Exception as e:
                print('Problem while downloading the image:', e)
                time.sleep(3)
        except Exception as e:
            # lpop returned None (empty list) or the URL could not be decoded
            print('Error:', e)
            time.sleep(10)
            break

if __name__ == '__main__':
    this_machine = 'slave'
    print('Starting the distributed crawler')
    if this_machine == 'master':
        push_redis_list()
    else:
        get_img()

There is only one master, but there can be many slaves. They run independently without interfering with each other, which greatly speeds up the crawl.
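One caveat about the slave loop above: lpop returns None the moment the list happens to be empty, so a slave sleeps for 10 seconds and then exits even if the master is still pushing URLs. If the slaves should keep waiting for new work, a blocking pop is one option; a small sketch of that variant:

[code]# Sketch of a more patient slave loop using a blocking pop (BLPOP) instead of lpop plus sleep.
from redis import Redis

r = Redis(host='127.0.0.1', port=6379)

while True:
    item = r.blpop('img_url', timeout=10)  # blocks for up to 10 s waiting for a new URL
    if item is None:
        print('No new image urls for 10 s, stopping this slave')
        break
    url = item[1].decode('ascii')  # blpop returns a (key, value) pair
    if url[:2] == '//':
        url = 'http:' + url
    print('would download:', url)  # the download logic from get_img() goes here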
