您的位置:首页 > 编程语言 > Python开发

python3.x爬虫实战:爬今日头条的图集

2017-03-27 19:14 447 查看
爬今日头条的图集

import os
from _md5 import md5
from hashlib import md5
from multiprocessing import Pool
import requests
import json
from bs4 import BeautifulSoup
import re
KEYWORD='美女,清纯'  # search keyword(s) sent to the Toutiao search API
GROUP_START=0  # first result-page index (offset = index * 20)
GROUP_END=20  # last result-page index, inclusive

#关键词搜索,获取关键词列表
# Search by keyword and fetch one page of the result list.
def get_page_index(url, offset, keyword, code='utf-8'):
    """Fetch one page of Toutiao search results.

    Args:
        url: base search endpoint (query string appended via params).
        offset: pagination offset; the API pages in steps of 20.
        keyword: search term(s).
        code: encoding to decode the response with (default 'utf-8').

    Returns:
        The response body text, or None when the request fails.
    """
    params = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    try:
        r = requests.get(url, params=params, timeout=10)
        print(r.status_code)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Catch only request-related errors (the original bare `except:`
        # swallowed everything) and return falsy None instead of the truthy
        # marker string "有误", which downstream code tried to parse as JSON.
        return None
#解析返回关键词查询结果的json数据,获取每一个链接url
# Parse the search-result JSON and extract every gallery link.
def parser_page_index(html):
    """Yield each gallery's 'article_url' from the search-result JSON.

    Yields nothing when *html* is falsy, is not valid JSON, or the payload
    has no 'data' key — the original crashed in json.loads() when the
    fetcher returned its error marker instead of JSON.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except (json.JSONDecodeError, TypeError):
        return
    # Only iterate when the payload actually carries a 'data' list.
    if data and 'data' in data:
        for item in data['data']:
            # Generator: stream the URLs instead of building a list.
            yield item.get('article_url')

#获取图片集详情信息
# Fetch one gallery detail page.
def get_page_detail(url, code='utf-8'):
    """Download a gallery detail page.

    Args:
        url: the gallery's article URL.
        code: encoding to decode the response with (default 'utf-8').

    Returns:
        The page HTML text, or None when the request fails.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # None is falsy, so main()'s `if html:` guard skips failed pages.
        # The original returned the truthy string "有误", which was then
        # handed to the HTML parser as if it were a real page.
        return None
#解析图片集中的每一张图片的信息:url,title,
# Parse one gallery page: title plus every image URL in the gallery blob.
def parser_page_detail(html, url):
    """Extract the page title and gallery image URLs, downloading each image.

    Returns a {'title', 'url', 'images'} dict, or None when the page
    carries no `var gallery = ...` JSON blob (or it lacks 'sub_images').
    """
    page = BeautifulSoup(html, 'html.parser')
    title = page.select('title')[0].get_text()
    print(title)
    # re.S lets '.' span newlines, since the inline JSON may wrap lines.
    gallery_re = re.compile('var gallery = (.*?);', re.S)
    match = re.search(gallery_re, html)
    print(match)
    if not match:
        return
    gallery = json.loads(match.group(1))
    if not (gallery and 'sub_images' in gallery.keys()):
        return
    entries = gallery.get('sub_images')
    images = [entry.get('url') for entry in entries]
    for img_url in images:
        download_image(img_url)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
#下载
# Download one image and persist it; best-effort, failures are ignored.
def download_image(url):
    """Fetch the image at *url* and hand its bytes to save_image().

    Always returns None; request failures are silently skipped so one bad
    image does not abort the whole gallery.
    """
    print('Downloading', url)
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        save_image(r.content)
    except requests.RequestException:
        # Bug fix: the original caught the *builtin* ConnectionError, which
        # requests.exceptions.ConnectionError does NOT subclass, so network
        # failures escaped the handler. RequestException also covers the
        # HTTPError raised by raise_for_status().
        pass
    return None

def save_image(content):
    """Write image bytes to <cwd>/<md5(content)>.jpg.

    Naming by the content's MD5 hash deduplicates identical images: an
    already-saved image is detected by the exists() check and skipped.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        # `with` closes the file on exit; the original's extra f.close()
        # inside the with-block was redundant and has been removed.
        with open(file_path, 'wb') as f:
            f.write(content)

def main(offset):
    """Process one search-result page: fetch the index at *offset*, then
    download every gallery it links to."""
    search_url = 'http://www.toutiao.com/search_content/?'
    index_html = get_page_index(search_url, offset, KEYWORD)
    for article_url in parser_page_index(index_html):
        detail_html = get_page_detail(article_url)
        if detail_html:
            parser_page_detail(detail_html, article_url)

if __name__ == '__main__':
    # Fan the page offsets (0, 20, 40, ...) out across a process pool.
    offsets = list(range(GROUP_START * 20, (GROUP_END + 1) * 20, 20))
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python爬虫 实战 python