使用多进程,多线程和协程实现爬虫,使用queue完成数据通信
2019-02-26 17:00
399 查看
一直在看python的多进程,多线程和协程,动手试了试写个爬虫练习一下。
多线程版本
[code]
# Multi-threaded crawler: one worker thread per listing page; results flow
# back to the main thread through a thread-safe queue.Queue.
import ssl
import threading
import time
import urllib.request
from queue import Queue

from lxml import etree

# One unverified SSL context shared by all threads, instead of re-assigning
# the private ssl._create_default_https_context hook on every request (the
# original did that per call, mutating process-global state from workers).
# NOTE: certificate verification is deliberately disabled because the target
# site's certificate chain does not validate -- do not reuse this elsewhere.
_SSL_CONTEXT = ssl._create_unverified_context()


class myThread(threading.Thread):
    """Fetch one listing page and push 'name<TAB>src' records into *q*."""

    def __init__(self, url, q):
        super(myThread, self).__init__()
        self.q = q  # shared, thread-safe result queue
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
        }
        self.url = url

    def run(self):
        self.parse_html()

    def send_request(self, url):
        """Download *url* and return the decoded HTML text."""
        request = urllib.request.Request(url=url, headers=self.headers)
        # Pass the context explicitly rather than monkey-patching ssl.
        response = urllib.request.urlopen(request, context=_SSL_CONTEXT)
        html_response = response.read().decode('utf-8')
        return html_response

    def parse_html(self):
        """Extract image name and lazy-load URL from every card on the page."""
        resp = self.send_request(self.url)
        html = etree.HTML(resp)
        div_list = html.xpath("//div[@class='index_only']//div[@id='container']/div")
        for div in div_list:
            name = div.xpath(".//img/@alt")[0]
            # The site lazy-loads images: the real URL lives in @src2.
            src = div.xpath(".//img/@src2")[0]
            self.q.put(name + '\t' + src)


def main():
    q = Queue()
    base_url = 'http://sc.chinaz.com/tupian/renwutupian_'
    url_list = [base_url + str(num) + '.html' for num in range(2, 100)]
    print(url_list)
    thread_list = []
    # One thread per page (~98 threads). Acceptable for I/O-bound work at
    # this scale; switch to a pool if the page range grows.
    for url in url_list:
        t = myThread(url, q)
        t.start()
        thread_list.append(t)
    # Wait for every worker before draining the queue.
    for t in thread_list:
        t.join()
    while not q.empty():
        print(q.get())


if __name__ == "__main__":
    start = time.time()
    main()
    print('总共耗时:%s' % (time.time() - start))
多进程版本
[code]
# Multi-process crawler.  Fixes two defects in the original version:
#   1. queue.Queue is NOT shared between processes -- each child worked on
#      its own copy, so the parent's queue stayed empty and nothing was
#      printed.  multiprocessing.Queue is the IPC-capable replacement.
#   2. Calling join() on a process before draining its queue can deadlock
#      once the underlying pipe buffer fills (documented multiprocessing
#      pitfall).  Each worker now posts a None sentinel when done and the
#      parent drains until every sentinel has arrived, THEN joins.
import multiprocessing
import ssl
import time
import urllib.request

from lxml import etree

# Unverified context: the target site's certificate chain does not validate.
_SSL_CONTEXT = ssl._create_unverified_context()


class ZhanzProcess(multiprocessing.Process):
    """Fetch one listing page and push 'name<TAB>src' records into *q*."""

    def __init__(self, url, q):
        super(ZhanzProcess, self).__init__()
        self.q = q  # multiprocessing.Queue shared with the parent
        self.url = url
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
        }

    def run(self):
        try:
            self.parse_html()
        finally:
            # Sentinel: tells the parent this worker has produced all items,
            # even if the fetch/parse raised.
            self.q.put(None)

    def send_request(self, url):
        """Download *url* and return the decoded HTML text."""
        request = urllib.request.Request(url=url, headers=self.headers)
        response = urllib.request.urlopen(request, context=_SSL_CONTEXT)
        html_response = response.read().decode('utf-8')
        return html_response

    def parse_html(self):
        """Extract image name and lazy-load URL from every card on the page."""
        res = self.send_request(self.url)
        html = etree.HTML(res)
        div_list = html.xpath("//div[@class='index_only']//div[@id='container']/div")
        for div in div_list:
            name = div.xpath(".//img/@alt")[0]
            # The real image URL is in the lazy-load attribute @src2.
            src = div.xpath(".//img/@src2")[0]
            self.q.put(name + '\t' + src)


def main():
    # Must be a multiprocessing.Queue: a plain queue.Queue is copied into
    # each child and never reaches the parent.
    q = multiprocessing.Queue()
    base_url = 'http://sc.chinaz.com/tupian/renwutupian_'
    url_list = [base_url + str(num) + '.html' for num in range(2, 100)]
    print(url_list)
    process_list = []
    for url in url_list:
        p = ZhanzProcess(url, q)
        p.start()
        process_list.append(p)
    # Drain until every worker's sentinel has been seen, then join.
    # (Draining first avoids the join-before-drain pipe deadlock.)
    remaining = len(process_list)
    while remaining:
        item = q.get()
        if item is None:
            remaining -= 1
        else:
            print(item)
    for p in process_list:
        p.join()


if __name__ == "__main__":
    start = time.time()
    main()
    print('总共耗时:%s' % (time.time() - start))
协程版本
[code]
# Coroutine (gevent) crawler.  The monkey patch MUST run before the modules
# it patches (ssl, and socket via urllib.request) are imported; the original
# patched after importing them, so the already-bound blocking implementations
# were used and the requests did not actually overlap.  Patch first.
from gevent import monkey
monkey.patch_all()

import ssl
import time
import urllib.request
from queue import Queue

import gevent
from lxml import etree

# Unverified context: the target site's certificate chain does not validate.
_SSL_CONTEXT = ssl._create_unverified_context()


class Zzspider(object):
    """Crawl listing pages concurrently with gevent greenlets."""

    def __init__(self):
        self.q = Queue()  # results; queue.Queue is fine within one thread
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
        }

    def run(self, url):
        self.parse_html(url)

    def send_request(self, url):
        """Download *url* and return the decoded HTML text."""
        request = urllib.request.Request(url=url, headers=self.headers)
        # Explicit context instead of monkey-patching a private ssl hook.
        response = urllib.request.urlopen(request, context=_SSL_CONTEXT)
        html_response = response.read().decode('utf-8')
        return html_response

    def parse_html(self, url):
        """Extract image name and lazy-load URL from every card on the page."""
        res = self.send_request(url)
        html = etree.HTML(res)
        div_list = html.xpath("//div[@class='index_only']//div[@id='container']/div")
        for div in div_list:
            name = div.xpath(".//img/@alt")[0]
            # The real image URL is in the lazy-load attribute @src2.
            src = div.xpath(".//img/@src2")[0]
            self.q.put(name + '\t' + src)

    def main(self):
        base_url = 'http://sc.chinaz.com/tupian/renwutupian_'
        url_list = [base_url + str(num) + '.html' for num in range(2, 100)]
        job_list = [gevent.spawn(self.run, url) for url in url_list]
        # Block until every greenlet has finished before draining results.
        gevent.joinall(job_list)
        while not self.q.empty():
            print(self.q.get())


if __name__ == '__main__':
    start = time.time()
    zz = Zzspider()
    zz.main()
    print('总共耗时:%s' % (time.time() - start))
相关文章推荐
- 爬虫--多线程-多进程-协程以及池的使用
- 使用threading,queue,fake_useragent,requests ,lxml,多线程爬取嗅事百科13页文字数据,爬虫案例
- 【使用JSOUP实现网络爬虫】修改数据-设置属性的值
- 使用Socket通信实现Silverlight客户端实时数据的获取(模拟GPS数据,地图实时位置)
- 数据结构探索,使用C++实现最简单的数据结构代码(二) ——队列(Queue)
- 使用FDO封装XML&ADO实现与服务端数据通信
- 【使用JSOUP实现网络爬虫】修改数据-设置元素的文本内容
- 【使用JSOUP实现网络爬虫】修改数据-设置元素的文本内容
- 使用net模块实现基于TCP的数据通信
- 简单的使用Socket实现数据通信
- python多进程、多线程、协程向mysql插入10000条数据
- 使用Python的BeautifulSoup库实现一个可以爬取1000条百度百科数据的爬虫
- 使用Socket通信实现Silverlight客户端实时数据的获取(模拟GPS数据,地图实时位置)
- Android中sqlite基本使用2,(使用事务实现数据完成或回滚)
- 使用NSOperation与NSOperationQueue实现多线程
- 使用Java语言完成数据报之间的通信即使用udp数据传输
- 多线程间通信和多进程之间通信有什么不同,分别怎么实现?
- android端和pc端使用usb进行socket通信,其中android是服务器端,pc是客户端。如何实现安卓端输入的数据通过按钮发送到pc端?
- python 多线程、多进程、协程性能对比(以爬虫为例)
- java使用TCP编程实现文件的传输,在文件传输结束后,client端需要额外传输一句话,表示文件传输完成,服务器端接收到这句话之后,停止接收client端发送的数据