您的位置:首页 > 编程语言 > Python开发

python脚本多线程爬虫爬电脑壁纸

2016-08-09 14:58 465 查看
安装 Python 3.4.2

安装相关的库:

pip install beautifulsoup4

pip install threadpool


#!/usr/bin/python
#fileencoding:utf-8
'''
Multithreaded crawler that downloads desktop wallpapers from wallpaperswide.com.
Created on 2016-08-09
'''
import threadpool
from bs4 import BeautifulSoup
import os
# Directory that contains this script.
BASE_DIR = os.path.dirname(os.path.realpath(__file__))

# First and last listing pages to crawl (the end page is included).
begin_page, endindex = 1, 5

# Wallpaper resolution to look for on each detail page.
scale = "2560x1600"

# Site root, and the prefix of the paginated listing URLs.
base_url = "http://wallpaperswide.com"
base_page_url = base_url + "/page/"
import urllib.request
import socket
#测试进行一页
def _fetch_soup(url):
    """Download *url* and return its HTML parsed with BeautifulSoup."""
    # The response is closed as soon as the body has been read so that the
    # worker threads do not accumulate open sockets.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8', "ignore")
    return BeautifulSoup(html, "html.parser")


def main(page):
    """Download every wallpaper of resolution ``scale`` found on one listing page.

    The listing page ``base_page_url + str(page)`` is scanned for links inside
    ``ul.wallpapers``; each linked detail page is then scanned for a
    ``div.wallpaper-resolutions`` anchor whose text equals ``scale``.  Every
    matching image is saved into ``<BASE_DIR>/<scale>/``.

    Args:
        page: Listing page number (anything ``str()`` can render into the URL).

    Returns:
        None.  Progress is reported on stdout.

    Raises:
        urllib.error.URLError, socket.timeout, OSError: network and file
            errors are not caught here and propagate to the caller.
    """
    # Process-wide default; applies to every urlopen() call below.
    socket.setdefaulttimeout(20)

    # e.g. http://wallpaperswide.com/page/2
    listing = _fetch_soup(base_page_url + str(page))
    detail_paths = [
        anchor.get('href')
        for ul in listing.select("ul.wallpapers")
        for anchor in ul.select("a")
        if anchor.get('href')  # anchors without href cannot be followed
    ]

    list_target = []
    for item in detail_paths:
        detail = _fetch_soup(base_url + item)
        for div in detail.select("div.wallpaper-resolutions"):
            for target_a in div.select("a"):
                href = target_a.get('href')
                if target_a.string == scale and href:
                    list_target.append(base_url + href)

    if not list_target:
        return

    # Built once: the headers are identical for every image request.  Adjacent
    # literals are used instead of backslash continuation inside the string so
    # the User-Agent value does not depend on source indentation.
    header = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/35.0.1916.114 Safari/537.36'),
        'Cookie': 'AspxAutoDetectCookieSupport=1'
    }

    # The folder is named after the configured resolution.  exist_ok avoids
    # the check-then-create race when several pool threads get here together.
    target_path = os.path.join(BASE_DIR, scale)
    os.makedirs(target_path, exist_ok=True)

    for img_url in list_target:
        img_name = img_url.rsplit("/", 1)[-1]
        image_file = os.path.join(target_path, img_name)
        request = urllib.request.Request(img_url, None, header)
        with urllib.request.urlopen(request) as response:
            with open(image_file, "wb") as f:
                f.write(response.read())
        print("page %s" % page)
        print("dowload %s ok" % img_url)
def print_now(request, n):
    """Print one ``<requestID> - <n>`` line for a work request.

    Handed to ``threadpool.makeRequests`` as its callback argument.

    Args:
        request: Object exposing a ``requestID`` attribute.
        n: Second value supplied to the callback; printed as-is.
           NOTE(review): presumably the return value of the worker callable
           (``main`` returns None) -- confirm against the threadpool docs.
    """
    print('%s - %s' % (request.requestID, n))
if __name__ == "__main__":
    # Script entry point: crawl pages begin_page..endindex (inclusive), one
    # thread-pool work request per listing page.
    bounds_ok = (
        isinstance(begin_page, int)
        and isinstance(endindex, int)
        and begin_page < endindex
    )
    if not bounds_ok:
        # Message: start/end pages must be integers and start must be < end.
        print("开始和结束页必须为整数,且开始页必须小于结束页")
    else:
        pages = range(begin_page, endindex + 1)
        # Never start more workers than there are pages to fetch (cap of 50).
        pool = threadpool.ThreadPool(min(50, len(pages)))
        work_requests = threadpool.makeRequests(main, pages, print_now)
        for req in work_requests:
            pool.putRequest(req)
        # Block until every queued page has been processed.
        pool.wait()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: