您的位置:首页 > 编程语言 > Python开发

python爬虫爬取糗百成人图片多线程版本

2017-11-09 15:43 309 查看
来个简单的多线程版本：用多个线程并发爬取，速度比单线程要快很多。下面上代码：

#encoding:utf-8
#多线程爬取

import requests
import os
from bs4 import BeautifulSoup
import threading
import urllib.request

# URL template for the listing pages; `{}` is replaced by the page number.
FIRST_PAGE_URL = 'http://www.qiubaichengren.com/{}.html'
# Work queue of listing-page URLs for pages 1..99 (range end is exclusive).
# Built with a comprehension so no loop variables leak into module scope.
PAGE_URL_LIST = [FIRST_PAGE_URL.format(page_no) for page_no in range(1, 100)]
IMG_URL_LIST = []  # all image links collected so far
NAME_LIST = []  # target file names, appended/popped in step with IMG_URL_LIST
# Single lock guarding every access to the three shared lists above.
gLock = threading.Lock()

def get_page():
    """Producer worker: parse listing pages and queue the images they contain.

    Repeatedly pops one URL from ``PAGE_URL_LIST`` (under ``gLock``), fetches
    and parses it, and for every ``div.mala-text`` element that contains an
    ``<img>`` appends the image URL to ``IMG_URL_LIST`` and a file name of the
    form ``<alt text>.<extension>`` to ``NAME_LIST``. Returns when the page
    list is empty.

    A page that cannot be fetched is reported and skipped rather than killing
    the worker thread.
    """
    while True:
        # `with` guarantees the lock is released on every exit path,
        # including the `break` below.
        with gLock:
            if not PAGE_URL_LIST:
                break
            page_url = PAGE_URL_LIST.pop()

        try:
            # A timeout keeps one stalled server from hanging this worker.
            response = requests.get(page_url, timeout=10)
        except requests.RequestException as exc:
            print('failed to fetch {}: {}'.format(page_url, exc))
            continue

        soup = BeautifulSoup(response.content, 'lxml')
        for container in soup.find_all('div', class_='mala-text'):
            img = container.find('img')
            if img is None:
                # Container without an image: nothing to download.
                continue
            link = img.get('src')
            if not link:
                continue

            # Last path segment of the URL, e.g. "abc.jpg"; its text after
            # the final dot is used as the file extension.
            basename = link.split('/').pop()
            suffix = basename.split('.').pop()
            title = img.get('alt')
            if title:
                # The alt text comes from the remote page; strip path
                # separators so it cannot point outside the download folder.
                safe_title = title.replace('/', '_').replace('\\', '_')
                filename = safe_title + '.' + suffix  # name plus extension
            else:
                # No alt text: fall back to the name used in the URL.
                filename = basename

            # Append both entries under one lock hold so the two lists stay
            # aligned for the consumer, which pops from both together.
            with gLock:
                NAME_LIST.append(filename)
                IMG_URL_LIST.append(link)

def download_image():
    """Consumer worker: download queued images into the ``images`` directory.

    Loops forever: pops a URL from ``IMG_URL_LIST`` and the matching file
    name from ``NAME_LIST`` (both under ``gLock``) and saves the image to
    ``images/<filename>``. When the queue is empty it sleeps briefly and
    polls again instead of spinning on the lock.

    A failed download is reported and skipped so one bad URL does not kill
    the worker thread.

    NOTE: as in the original, this function never returns; there is no
    signal here that tells it the producers have finished.
    """
    # Local import so this block does not depend on the file's import list.
    import time

    # urlretrieve does not create missing directories.
    os.makedirs('images', exist_ok=True)

    while True:
        with gLock:
            if IMG_URL_LIST:
                url = IMG_URL_LIST.pop()
                filename = NAME_LIST.pop()
            else:
                url = None

        if url is None:
            # Queue is empty: back off outside the lock so the producer
            # threads are not starved by a tight polling loop.
            time.sleep(0.1)
            continue

        path = os.path.join('images', filename)
        try:
            urllib.request.urlretrieve(url, filename=path)
        except (OSError, ValueError) as exc:
            # OSError covers URLError/HTTPError and file-system failures;
            # ValueError is raised for malformed or relative URLs.
            print('failed to download {}: {}'.format(url, exc))

def main():
    """Start the worker threads: 4 page parsers, then 5 image downloaders."""
    # (worker function, number of threads), started in this order.
    plan = ((get_page, 4), (download_image, 5))
    for worker, count in plan:
        for _ in range(count):
            threading.Thread(target=worker).start()


if __name__ == "__main__":
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python 爬虫 多线程