python3.x爬虫实战:爬今日头条的图集
2017-03-27 19:14
447 查看
爬今日头条的图集
# Crawler for Toutiao (今日头条) image galleries: searches by keyword via the
# site's JSON search endpoint, walks each result's gallery page, and downloads
# every image it finds into the current working directory.
#
# Fixes over the original:
#   * dropped the `from _md5 import md5` import of a private CPython module,
#     which was immediately shadowed by the hashlib import anyway;
#   * request helpers now return None on failure instead of the truthy string
#     "有误", which callers fed straight into json.loads and crashed;
#   * narrowed bare `except:` to requests.RequestException;
#   * parser_page_detail no longer raises NameError/IndexError when the page
#     has no <title> or no `var gallery = ...;` blob.
import os
import json
import re
from hashlib import md5
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup

KEYWORD = '美女,清纯'  # search keyword(s)
GROUP_START = 0  # first result page; the request offset is page * 20
GROUP_END = 20   # last result page (inclusive)


def get_page_index(url, offset, keyword, code='utf-8'):
    """Fetch one page of keyword-search results.

    :param url: search endpoint URL
    :param offset: result offset (multiples of 20, one page per 20 items)
    :param keyword: search keyword string
    :param code: response encoding to assume
    :return: response body text (JSON), or None on any request error
    """
    params = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    try:
        r = requests.get(url, params=params)
        print(r.status_code)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # None (not an error string) lets callers skip this page safely.
        return None


def parser_page_index(html):
    """Parse the search-result JSON and yield each gallery's article URL."""
    data = json.loads(html)
    # Guard: the endpoint may return an empty body or omit the 'data' key.
    if data and 'data' in data:
        for item in data['data']:
            yield item.get('article_url')


def get_page_detail(url, code='utf-8'):
    """Fetch one gallery page; return its HTML text, or None on error."""
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        return None


def parser_page_detail(html, url):
    """Extract the title and image URLs from a gallery page and download them.

    :param html: gallery page HTML
    :param url: the gallery page's own URL (echoed back in the result)
    :return: dict with 'title', 'url', 'images', or None when the page does
             not embed a ``var gallery = {...};`` JSON blob.
    """
    soup = BeautifulSoup(html, 'html.parser')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    print(title)
    # re.S lets '.' match newlines inside the inlined JSON blob.
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    print(result)
    if not result:
        return None
    data = json.loads(result.group(1))
    if not data or 'sub_images' not in data:
        return None
    images = [item.get('url') for item in data['sub_images']]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def download_image(url):
    """Download one image and hand its bytes to save_image; swallow errors."""
    print('Downloading', url)
    try:
        r = requests.get(url)
        r.raise_for_status()
        save_image(r.content)
    except requests.RequestException:
        # Best-effort: a failed image download should not stop the crawl.
        return None


def save_image(content):
    """Write image bytes to <cwd>/<md5>.jpg.

    Naming the file after the MD5 of its content deduplicates images that
    appear in more than one gallery.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Process one search-result page: fetch the index, then crawl each gallery."""
    index_url = 'http://www.toutiao.com/search_content/?'
    html = get_page_index(index_url, offset, KEYWORD)
    if not html:
        # Network error on the index page; nothing to iterate.
        return
    for article_url in parser_page_index(html):
        detail_html = get_page_detail(article_url)
        if detail_html:
            parser_page_detail(detail_html, article_url)


if __name__ == '__main__':
    # Fan the result pages out over a process pool, one page per task.
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
相关文章推荐
- [python爬虫小实战2]根据用户输入关键词爬取今日头条图集,并批量下载图片
- Python3.X 爬虫实战(并发爬取)
- python3.x爬虫实战:阿里巴巴网站定向信息抓取
- Python3.X 爬虫实战(缓存与持久化)
- Python3.X 爬虫实战缓存与持久化
- Python爬虫实战入门五:获取JS动态内容—爬取今日头条
- Python3.X 爬虫实战(先爬起来嗨)
- Python3.X 爬虫实战(动态页面爬取解析)
- Python爬虫实战入门五:获取JS动态内容—爬取今日头条
- Python3.X 爬虫实战(并发爬取)
- Python3.X 爬虫实战(静态下载器与解析器)
- Python3.X 爬虫实战(先爬起来嗨)
- Python爬虫实战02:分析Ajax请求并抓取今日头条街拍
- python爬虫实战,多线程爬取京东jd html页面:无需登录的网站的爬虫实战 推荐
- Python爬虫实战一之爬取糗事百科段子
- Python爬虫框架Scrapy实战之定向批量获取职位招聘信息
- Python爬虫实战二之爬取百度贴吧帖子
- Python爬虫实战三之计算大学本学期绩点
- Python爬虫框架Scrapy 学习笔记 10.3 -------【实战】 抓取天猫某网店所有宝贝详情
- Python3.x爬虫教程:爬网页、爬图片、自动登录