您的位置:首页 > 编程语言 > Python开发

python 多线程、多进程、协程性能对比(以爬虫为例)

2017-10-09 16:34 579 查看

  基本配置:阿里云服务器低配,单核2G内存

  首先是看协程的效果:

  
import requests
import lxml.html as HTML
import sys
import time
import gevent
from gevent import monkey
monkey.patch_all()

# create url
urls = []
for i in range(int(sys.argv[1]),int(sys.argv[2])):
url = 'http://grri94kmi4.app.tianmaying.com/songs?page='+str(i)
urls.append(url)

def get_data(url):
t1 = time.time()
res = requests.get(url)
if res.status_code == 200:
print(url+' : '+'url open success'+'  time use: '+ str(time.time()-t1))
html = HTML.fromstring(res.content)
trs = html.xpath('//tbody/tr')
data = []
for tr in trs:
s = {}
s['name'] = tr.xpath('./td/a/text()')[0]
s['url'] = tr.xpath('./td/a/@href')[0]
s['id'] = s['url'][30:]
s['comment'] = tr.xpath('./td[last()]/text()')[0]
data.append(s)

if __name__ == '__main__':
total = time.time()
task = []
for url in urls:
task.append(gevent.spawn(get_data,url))
gevent.joinall(task)
print('total time use :', time.time()-total)
View Code

  在爬取20个链接的情况下,用时为4s:

  total time use : 4.873192071914673

 

  线程和进程差不多 ,用时6s左右

  

import requests
import lxml.html as HTML
import sys
import time
from multiprocessing import Pool as ThreadPool
# create url
urls = []
for i in range(int(sys.argv[1]),int(sys.argv[2])):
url = 'http://grri94kmi4.app.tianmaying.com/songs?page='+str(i)
urls.append(url)

def get_data(url):
t1 = time.time()
res = requests.get(url)
if res.status_code == 200:
print(url+' : '+'url open success'+'  time use: '+ str(time.time()-t1))
html = HTML.fromstring(res.content)
trs = html.xpath('//tbody/tr')
data = []
for tr in trs:
s = {}
s['name'] = tr.xpath('./td/a/text()')[0]
s['url'] = tr.xpath('./td/a/@href')[0]
s['id'] = s['url'][30:]
s['comment'] = tr.xpath('./td[last()]/text()')[0]
data.append(s)

if __name__ == '__main__':
total = time.time()
pool = ThreadPool()
results = pool.map(get_data,urls)
pool.close()
pool.join()
print('total time use :', time.time()-total)

 

  

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: