Python Multithreading, Multiprocessing, and Coroutines, with a Redis Distributed-Crawling Speed Test
2018-08-20 21:06
Before learning the Scrapy framework, it helps to understand how these different ways of speeding up code relate to one another.
Each of the three approaches below uses 5 workers (threads, processes, or coroutines) to request 50 web pages, so their speeds can be compared.
First, multithreading:
This uses a queue plus worker threads, a common building block of distributed crawlers.
The example uses the threading module; alternatives such as _thread also exist.
[code]
import time
import requests
import threading
import queue as Qe

threads = []
link_list = []
threadList = ['Thread-1', 'Thread-2', 'Thread-3', 'Thread-4', 'Thread-5']

# Read the target URLs from alexa.txt (each line: rank \t url)
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

class MyThread(threading.Thread):
    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print('Starting ' + self.name)
        while True:
            try:
                crawler(self.name, self.q)
            except Qe.Empty:
                # The queue stayed empty for 2 seconds: assume all work is done
                break
        print('Exiting ' + self.name)

def crawler(threadName, q):
    url = q.get(timeout=2)
    try:
        r = requests.get(url, timeout=2)
        print(q.qsize(), threadName, r.status_code, url)
    except Exception as e:
        print(q.qsize(), threadName, url, 'Error:', e)

workQueue = Qe.Queue(50)

# Create the worker threads
for tName in threadList:
    thread = MyThread(tName, workQueue)
    thread.start()
    threads.append(thread)

# Fill the queue
for url in link_list:
    workQueue.put(url)

# Wait for all threads to finish
for t in threads:
    t.join()

end = time.time()
print('Total time:', end - start)
print('Exiting Main Thread')
[/code]
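As a point of comparison (a minimal sketch, not part of the original test), the same 5-thread / 50-URL benchmark can be written with the standard library's concurrent.futures.ThreadPoolExecutor, which manages the queue and the joins for you; the alexa.txt format is assumed to be the same as above.
[code]
import time
import requests
from concurrent.futures import ThreadPoolExecutor

with open('alexa.txt', 'r') as file:
    link_list = [line.split('\t')[1].strip() for line in file]

def fetch(url):
    try:
        r = requests.get(url, timeout=2)
        return url, r.status_code
    except Exception as e:
        return url, str(e)

start = time.time()
# 5 worker threads, same as the threadList version above
with ThreadPoolExecutor(max_workers=5) as pool:
    for url, result in pool.map(fetch, link_list):
        print(url, result)
print('Total time:', time.time() - start)
[/code]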
Multiprocessing:
This uses non-blocking calls (apply_async) with the Pool class; alternatives such as Process are worth exploring on your own.
[code]
import time
import requests
from multiprocessing import Pool, Manager

link_list = []
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

def crawler(q, index):
    Process_id = 'Process-' + str(index)
    while not q.empty():
        url = q.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(Process_id, q.qsize(), r.status_code, url)
        except Exception as e:
            print(Process_id, q.qsize(), url, 'Error:', e)

if __name__ == '__main__':
    manager = Manager()
    workQueue = manager.Queue(50)

    # Fill the queue
    for url in link_list:
        workQueue.put(url)

    # A pool of 5 worker processes; apply_async is non-blocking
    po = Pool(processes=5)
    for i in range(5):
        po.apply_async(crawler, args=(workQueue, i))
    print('Started processes')
    po.close()
    po.join()

    end = time.time()
    print('Total time:', end - start)
    print('Main process Ended')
[/code]
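If each worker does not need to pull from a shared queue, a simpler variant (a sketch under that assumption, not the original code) lets Pool.map split the URL list across the 5 processes directly, with no Manager.Queue at all:
[code]
import time
import requests
from multiprocessing import Pool

def fetch(url):
    try:
        r = requests.get(url, timeout=2)
        return url, r.status_code
    except Exception as e:
        return url, str(e)

if __name__ == '__main__':
    with open('alexa.txt', 'r') as file:
        link_list = [line.split('\t')[1].strip() for line in file]
    start = time.time()
    # Pool.map blocks until all 50 URLs have been processed
    with Pool(processes=5) as po:
        for url, result in po.map(fetch, link_list):
            print(url, result)
    print('Total time:', time.time() - start)
[/code]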
Coroutines:
This example uses the popular gevent framework.
[code]
import time
import gevent
from gevent import monkey
# Patch the standard library's blocking I/O (sockets, etc.) so it yields to
# other coroutines; this must run before requests is imported
monkey.patch_all()

import requests
from gevent.queue import Queue

jobs = []
link_list = []
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

def crawler(index):
    Process_id = 'Process-' + str(index)
    while not workQueue.empty():
        url = workQueue.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(Process_id, workQueue.qsize(), r.status_code, url)
        except Exception as e:
            print(Process_id, workQueue.qsize(), url, 'Error:', e)

def boss():
    for url in link_list:
        workQueue.put_nowait(url)

if __name__ == '__main__':
    workQueue = Queue(50)
    # Fill the queue first, then spawn 5 crawler coroutines
    gevent.spawn(boss).join()
    for i in range(5):
        jobs.append(gevent.spawn(crawler, i))
    gevent.joinall(jobs)
    end = time.time()
    print('Total time:', end - start)
    print('Main Ended')
[/code]
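As an aside (a sketch not in the original post), gevent also ships gevent.pool.Pool, which caps concurrency at 5 without an explicit work queue:
[code]
from gevent import monkey
monkey.patch_all()  # must run before requests is imported

import time
import gevent.pool
import requests

with open('alexa.txt', 'r') as file:
    link_list = [line.split('\t')[1].strip() for line in file]

def fetch(url):
    try:
        return url, requests.get(url, timeout=2).status_code
    except Exception as e:
        return url, str(e)

start = time.time()
pool = gevent.pool.Pool(5)  # at most 5 coroutines in flight
for url, result in pool.imap_unordered(fetch, link_list):
    print(url, result)
print('Total time:', time.time() - start)
[/code]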
Run times from the three original versions above:
Multithreading: 11.943 s
Multiprocessing: 9.652 s
Coroutines: 5.673 s
Coroutines come out clearly ahead here. Better still, the three techniques are not mutually exclusive: they can be combined to make a crawler faster still (this reflects only the author's test run; your numbers may differ).
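To make "combined" concrete, here is a minimal sketch (not from the original post; the 5-process / 10-thread split is an arbitrary choice for illustration) that runs a thread pool inside each of several processes:
[code]
import requests
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    try:
        return url, requests.get(url, timeout=2).status_code
    except Exception as e:
        return url, str(e)

def crawl_chunk(urls):
    # Each process fans its slice of URLs out to 10 threads
    with ThreadPoolExecutor(max_workers=10) as pool:
        return list(pool.map(fetch, urls))

if __name__ == '__main__':
    with open('alexa.txt', 'r') as file:
        link_list = [line.split('\t')[1].strip() for line in file]
    # Split the list into 5 roughly equal chunks, one per process
    chunks = [link_list[i::5] for i in range(5)]
    with Pool(processes=5) as po:
        for chunk_result in po.map(crawl_chunk, chunks):
            for url, result in chunk_result:
                print(url, result)
[/code]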
Next, the code for a distributed crawler that uses Redis (an in-memory database) to fetch the images from the 50 pages.
1. master (central coordinator)
[code]
import time
import requests
from redis import Redis
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/52.0.2743.116 Safari/537.36'}

def push_redis_list():
    """Master: crawl the seed pages and push every image URL into Redis."""
    r = Redis(host='127.0.0.1', port=6379)
    link_list = []
    with open('alexa.txt', 'r') as file:
        file_list = file.readlines()
        for eachone in file_list:
            link = eachone.split('\t')[1].replace('\n', '')
            link_list.append(link)
    for url in link_list:
        try:
            response = requests.get(url, headers=headers, timeout=2)
            soup = BeautifulSoup(response.text, 'html.parser')
            img_list = soup.find_all('img')
            for img in img_list:
                img_url = img.get('src', '')
                if img_url != '':
                    print('Pushed image url:', img_url)
                    r.lpush('img_url', img_url)
        except Exception as e:
            print(url, 'Error:', e)
    print('Image links now in the queue:', r.llen('img_url'))

def get_img():
    """Slave: pop image URLs from Redis and download the files."""
    r = Redis(host='127.0.0.1', port=6379)
    while True:
        try:
            url = r.lpop('img_url')
            # lpop returns None when the list is empty, which makes
            # decode() raise and drops us into the outer except
            url = url.decode('ascii')
            if url[:2] == '//':  # protocol-relative URL
                url = 'http:' + url
            try:
                response = requests.get(url, headers=headers, timeout=2)
                name = int(time.time())
                with open('E:\\截图库\\' + str(name) + url[-4:], 'wb') as f:
                    f.write(response.content)
                print('Downloaded image', url)
            except Exception as e:
                print('Problem while downloading image:', e)
            time.sleep(3)
        except Exception as e:
            print('Error:', e)
            time.sleep(10)
            break

if __name__ == '__main__':
    this_machine = 'master'
    print('Starting distributed crawler')
    if this_machine == 'master':
        push_redis_list()
    else:
        get_img()
[/code]
2. slave (crawler worker)
[code]
# The slave script is identical to the master script above (same headers,
# push_redis_list, and get_img); the only difference is the role flag in
# __main__, which routes execution to get_img():
if __name__ == '__main__':
    this_machine = 'slave'
    print('Starting distributed crawler')
    if this_machine == 'master':
        push_redis_list()
    else:
        get_img()
[/code]
There is only one master, but there can be many slaves; they run independently without interfering with one another, which greatly speeds up the crawl.
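One possible refinement of the slave loop (a sketch, not the original code): redis-py's brpop blocks until an item arrives or a timeout expires, so slaves do not need the sleep-and-retry workaround above.
[code]
from redis import Redis

def get_img_blocking():
    """Sketch: a slave loop that blocks on Redis instead of polling."""
    r = Redis(host='127.0.0.1', port=6379)
    while True:
        # Block for up to 10 seconds waiting for the master to push a URL
        item = r.brpop('img_url', timeout=10)
        if item is None:  # nothing arrived: assume the master is done
            break
        url = item[1].decode('utf-8')  # brpop returns a (key, value) pair
        if url[:2] == '//':
            url = 'http:' + url
        print('Got image url:', url)
        # ...then download exactly as in get_img() above
[/code]
As a side benefit, lpush on the master combined with brpop on the slaves pops from the opposite end of the list, so URLs are processed first-in, first-out.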