python爬虫爬取糗百成人图片多线程版本
2017-11-09 15:43
309 查看
来个简单的多线程版本,爬取速度比单线程要快很多,下面上代码:
#encoding:utf-8
#多线程爬取
import requests
import os
from bs4 import BeautifulSoup
import threading
import urllib.request
FIRST_PAGE_URL = 'http://www.qiubaichengren.com/{}.html'

# Shared work queues; every access from the worker threads is guarded by gLock.
PAGE_URL_LIST = [FIRST_PAGE_URL.format(x) for x in range(1, 100)]  # pages 1..99
IMG_URL_LIST = []   # all discovered image URLs (producer -> consumer)
NAME_LIST = []      # filename for each entry of IMG_URL_LIST, same order
gLock = threading.Lock()
def get_page():
    """Producer worker: pop page URLs off PAGE_URL_LIST, parse each page,
    and push (image URL, filename) pairs onto the shared lists.

    Exits when PAGE_URL_LIST is empty. All shared-list access is guarded
    by gLock.
    """
    while True:
        gLock.acquire()
        if len(PAGE_URL_LIST) == 0:
            gLock.release()
            break  # no pages left; this worker is done
        page_url = PAGE_URL_LIST.pop()
        gLock.release()
        try:
            response = requests.get(page_url)
            soup = BeautifulSoup(response.content, 'lxml')
        except requests.RequestException:
            continue  # network failure: skip this page, keep the worker alive
        for div in soup.find_all('div', class_='mala-text'):
            img = div.find('img')
            if img is None:
                continue  # container without an image
            link = img.get('src')
            title = img.get('alt')
            if not link or not title:
                continue  # cannot build a filename without both attributes
            # filename = <alt text> + extension taken from the image URL
            suffix = link.split('/')[-1].split('.')[-1]
            filename = title + '.' + suffix
            gLock.acquire()
            NAME_LIST.append(filename)
            IMG_URL_LIST.append(link)
            gLock.release()
def download_image():
    """Consumer worker: pop image URL / filename pairs off the shared
    lists and save the images into the ./images directory.

    NOTE(review): like the original, this loop busy-waits forever once the
    producers finish — a queue.Queue with a sentinel value would allow a
    clean shutdown.
    """
    os.makedirs('images', exist_ok=True)  # original crashed if ./images was missing
    while True:
        gLock.acquire()
        if len(IMG_URL_LIST) == 0:
            gLock.release()
            continue  # nothing queued yet; keep polling
        url = IMG_URL_LIST.pop()
        filename = NAME_LIST.pop()  # popped under the same lock, so pairs stay matched
        gLock.release()
        path = os.path.join('images', filename)
        try:
            urllib.request.urlretrieve(url, filename=path)
        except OSError:
            continue  # skip images that fail to download instead of killing the thread
def main():
    """Spawn 4 page-parsing (producer) and 5 download (consumer) threads."""
    for _ in range(4):
        threading.Thread(target=get_page).start()
    for _ in range(5):
        threading.Thread(target=download_image).start()


if __name__ == "__main__":
    main()
#encoding:utf-8
#多线程爬取
import requests
import os
from bs4 import BeautifulSoup
import threading
import urllib.request
FIRST_PAGE_URL = 'http://www.qiubaichengren.com/{}.html'

# Shared work queues; every access from the worker threads is guarded by gLock.
PAGE_URL_LIST = [FIRST_PAGE_URL.format(x) for x in range(1, 100)]  # pages 1..99
IMG_URL_LIST = []   # all discovered image URLs (producer -> consumer)
NAME_LIST = []      # filename for each entry of IMG_URL_LIST, same order
gLock = threading.Lock()
def get_page():
    """Producer worker: pop page URLs off PAGE_URL_LIST, parse each page,
    and push (image URL, filename) pairs onto the shared lists.

    Exits when PAGE_URL_LIST is empty. All shared-list access is guarded
    by gLock.
    """
    while True:
        gLock.acquire()
        if len(PAGE_URL_LIST) == 0:
            gLock.release()
            break  # no pages left; this worker is done
        page_url = PAGE_URL_LIST.pop()
        gLock.release()
        try:
            response = requests.get(page_url)
            soup = BeautifulSoup(response.content, 'lxml')
        except requests.RequestException:
            continue  # network failure: skip this page, keep the worker alive
        for div in soup.find_all('div', class_='mala-text'):
            img = div.find('img')
            if img is None:
                continue  # container without an image
            link = img.get('src')
            title = img.get('alt')
            if not link or not title:
                continue  # cannot build a filename without both attributes
            # filename = <alt text> + extension taken from the image URL
            suffix = link.split('/')[-1].split('.')[-1]
            filename = title + '.' + suffix
            gLock.acquire()
            NAME_LIST.append(filename)
            IMG_URL_LIST.append(link)
            gLock.release()
def download_image():
    """Consumer worker: pop image URL / filename pairs off the shared
    lists and save the images into the ./images directory.

    NOTE(review): like the original, this loop busy-waits forever once the
    producers finish — a queue.Queue with a sentinel value would allow a
    clean shutdown.
    """
    os.makedirs('images', exist_ok=True)  # original crashed if ./images was missing
    while True:
        gLock.acquire()
        if len(IMG_URL_LIST) == 0:
            gLock.release()
            continue  # nothing queued yet; keep polling
        url = IMG_URL_LIST.pop()
        filename = NAME_LIST.pop()  # popped under the same lock, so pairs stay matched
        gLock.release()
        path = os.path.join('images', filename)
        try:
            urllib.request.urlretrieve(url, filename=path)
        except OSError:
            continue  # skip images that fail to download instead of killing the thread
def main():
    """Spawn 4 page-parsing (producer) and 5 download (consumer) threads."""
    for _ in range(4):
        threading.Thread(target=get_page).start()
    for _ in range(5):
        threading.Thread(target=download_image).start()


if __name__ == "__main__":
    main()
相关文章推荐
- python爬虫爬取糗百成人图片单线程版本
- [python爬虫]爬取贴吧某页美女图片+爬取糗百美女图片
- python3.0 图片爬虫(增加多线程)
- python爬虫入门教程之糗百图片爬虫代码分享
- python下多线程爬虫爬取斗图网的所有最新图片
- python爬虫入门教程之糗百图片爬虫代码分享
- 爬虫案例---Python2X版本抓取京东手机页面的图片
- Python爬虫抓取糗百的图片,并存储在本地文件夹
- python多线程图片爬虫
- python多线程图片爬虫
- Python爬虫之多线程下载豆瓣Top250电影图片
- Python 爬虫多线程爬取美女图片保存到本地
- 一个简单的python爬虫爬取糗百图片
- Python之多线程爬虫抓取网页图片的示例代码
- 07精通Python网络爬虫——爬取京东手机图片
- 通过qq缓存图片,找到QQ号码,python版本
- python 3.x网络爬虫 下载图片
- Python爬虫__爬取贴吧图片和文本
- Python爬虫获取图片并下载保存至本地
- python爬虫设计刷博客访问量(刷访问量,赞,爬取图片)