python脚本多线程爬虫爬电脑壁纸
2016-08-09 14:58
465 查看
安装python 3.4.2
安装相关的库:
pip install beautifulsoup4
pip install threadpool
#!/usr/bin/python
# fileencoding: utf-8
"""Multithreaded wallpaper crawler for wallpaperswide.com.

For each listing page, collects wallpaper detail-page links, then follows
each one to find the download link for the configured resolution and saves
the image under BASE_DIR/<scale>/.

Created on 2016-08-09.
"""
import os
import socket
import urllib.request

import threadpool
from bs4 import BeautifulSoup

# Directory containing this script; downloads are stored beneath it.
BASE_DIR = os.path.split(os.path.realpath(__file__))[0]

# First and last listing page to crawl (inclusive).
begin_page = 1
endindex = 5

# Desired resolution; must match the site's link text exactly.
scale = "2560x1600"

base_page_url = "http://wallpaperswide.com/page/"
base_url = "http://wallpaperswide.com"

# Browser-like headers: the site may reject urllib's default User-Agent.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/35.0.1916.114 Safari/537.36',
    'Cookie': 'AspxAutoDetectCookieSupport=1'
}


def main(page):
    """Download every wallpaper at resolution ``scale`` from listing page *page*.

    :param page: 1-based listing page number appended to ``base_page_url``.
    """
    socket.setdefaulttimeout(20)

    # Step 1: collect wallpaper detail-page links from the listing page,
    # e.g. http://wallpaperswide.com/page/2
    response = urllib.request.urlopen(base_page_url + str(page))
    html = response.read().decode('utf-8', "ignore")
    soup = BeautifulSoup(html, "html.parser")
    url_list = [a.get('href')
                for ul in soup.select("ul.wallpapers")
                for a in ul.select("a")]

    # Step 2: on each detail page, keep only the download link whose text
    # equals the target resolution.
    list_target = []
    for item in url_list:
        response = urllib.request.urlopen(base_url + item)
        html = response.read().decode('utf-8', "ignore")
        detail = BeautifulSoup(html, "html.parser")
        for div in detail.select("div.wallpaper-resolutions"):
            for target_a in div.select("a"):
                if target_a.string == scale:
                    list_target.append(base_url + target_a.get('href'))

    # Step 3: download each image. Create the target directory once, named
    # after the configured resolution (was hard-coded to "2560x1600").
    target_dir = os.path.join(BASE_DIR, scale)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    for img_url in list_target:
        img_name = img_url[img_url.rindex("/") + 1:]
        request = urllib.request.Request(img_url, None, _HEADERS)
        response = urllib.request.urlopen(request)
        with open(os.path.join(target_dir, img_name), "wb") as f:
            f.write(response.read())
        print("page %s" % page)
        print("download %s ok" % img_url)


def print_now(request, n):
    """Threadpool completion callback: print the request id and its result."""
    print('%s - %s' % (request.requestID, n))


if __name__ == "__main__":
    # Page bounds must be integers, with the start page strictly below the end.
    if (not isinstance(begin_page, int) or not isinstance(endindex, int)
            or not begin_page < endindex):
        print("开始和结束页必须为整数,且开始页必须小于结束页")
    else:
        pool = threadpool.ThreadPool(50)
        requests = threadpool.makeRequests(
            main, range(begin_page, endindex + 1), print_now)
        for req in requests:
            pool.putRequest(req)
        pool.wait()
相关文章推荐
- 在ubuntu中cosmos动态壁纸的python实现脚本
- python自动下载太平洋电脑网上的壁纸
- 多线程网页爬虫 python 实现
- 【原创】编写多线程Python爬虫来过滤八戒网上的发布任务
- python爬虫实战,多线程爬取京东jd html页面:无需登录的网站的爬虫实战 推荐
- 07-爬虫的多线程调度 | 01.数据抓取 | Python
- rsync多线程备份脚本 --python
- python编写网页爬虫脚本并实现APScheduler调度
- python编写网页爬虫脚本并实现APScheduler调度
- python多线程运维脚本
- 利用python脚本抓取AC的代码[爬虫+HTMLParser+handle_entityref+正则表达式+模拟登陆+文件操作]
- [Python脚本]——网页爬虫开始
- [python脚本]一个简单的web爬虫(1)
- Python多线程自动刷票脚本
- python多线程图片爬虫
- [python脚本]一个简单的web爬虫(1)
- python多线程图片爬虫
- python 多线程+gzip压缩 爬虫
- 多线程网页爬虫 python 实现(二)
- Python小爬虫,(多线程)