您的位置:首页 > 编程语言 > Python开发

Python 抓取今日头条街拍图片并下载到本地

2017-10-10 17:50 579 查看
环境:Python 3.6.2,macOS。废话不多说,直接上代码:

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from urllib.parse import urlencode
import urllib.request
import chardet
import urllib
import json
import re
from bs4 import BeautifulSoup
import sys
import requests
import os

# Module-level counter used to number the downloaded images (1.jpg, 2.jpg, ...).
imageIndex = 0


def modifyConstant():
    """Increment the module-level ``imageIndex`` counter by one.

    ``getImg`` calls this before each download and then reads
    ``imageIndex`` to build the target file name.
    """
    global imageIndex
    imageIndex += 1

# chardet guesses the charset of the downloaded page, which avoids errors such
# as "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc9 ..." on pages
# that are not UTF-8 encoded.
def getHtml(url):
    """Fetch *url* and return its body re-encoded as UTF-8 bytes.

    Args:
        url: Address passed straight to ``urllib.request.urlopen``.

    Returns:
        bytes: The response body, decoded with the charset chardet detected
        and encoded again as UTF-8. Bytes that cannot be decoded with the
        detected charset are replaced rather than raising, because the
        charset is only a guess.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as page:
        raw = page.read()

    # chardet reports ``encoding: None`` when it cannot make a guess
    # (for instance on an empty body); fall back to UTF-8 in that case.
    detected = chardet.detect(raw).get("encoding") or "utf-8"
    try:
        text = raw.decode(detected, errors="replace")
    except LookupError:
        # chardet named a codec this Python build does not provide.
        text = raw.decode("utf-8", errors="replace")

    # Always hand back UTF-8: parse_page_detail decodes the result as 'utf8',
    # so encoding with a locale-dependent charset could break that round trip.
    return text.encode("utf-8")

# Extract the image list embedded in an article page and download each image.
def parse_page_detail(html):
    """Download every image listed in the ``sub_images`` section of a page.

    The page text is searched for the span between ``sub_images`` and
    ``max_img_width``; that span is parsed as a JSON array and the ``url``
    of each entry is passed to ``getImg``.

    Args:
        html: Page source, either as UTF-8 encoded bytes (what ``getHtml``
            returns) or as an already decoded ``str``.

    Returns:
        None. Pages without a ``sub_images`` section, or whose section is not
        valid JSON, are skipped.
    """
    if isinstance(html, bytes):
        html = html.decode('utf8')

    match = re.search('sub_images(.*?)max_img_width', html, re.S)
    if not match:
        return

    # Drop the first 12 and last 15 characters of the match; presumably these
    # are the `sub_images":` prefix and the `,"max_img_width` suffix, leaving
    # only the JSON array in between (verify against a live page).
    payload = match.group(0)[12:-15]
    try:
        entries = json.loads(payload)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; one malformed page
        # should not abort the whole crawl.
        print('【错误】无法解析页面中的图片列表')
        return

    for entry in entries:
        img_url = entry.get('url')
        if img_url:
            getImg(img_url)

# Download one image to local disk using the requests library (this replaced
# an earlier attempt based on urllib.request.urlretrieve).
def getImg(imgUrl, path="/Users/luoxiaohui/Desktop/test/"):
    """Download *imgUrl* and save it as ``<imageIndex>.jpg`` inside *path*.

    The global image counter is advanced first (via ``modifyConstant``), so a
    failed download leaves a gap in the numbering instead of reusing a number.

    Args:
        imgUrl: Address of the image to download.
        path: Directory the image is written to; created when missing.
            Defaults to the directory the script originally hard-coded.

    Returns:
        None. On any request failure (connection error, timeout, HTTP error
        status) an error message is printed and nothing is written.
    """
    modifyConstant()
    os.makedirs(path, exist_ok=True)

    try:
        pic = requests.get(imgUrl, timeout=10)
        # Treat 4xx/5xx answers as failures so error pages are not saved as .jpg.
        pic.raise_for_status()
    except requests.exceptions.RequestException:
        # RequestException also covers read timeouts, which ConnectionError
        # alone does not.
        print('【错误】当前图片无法下载')
        return

    target = os.path.join(path, str(imageIndex) + '.jpg')
    with open(target, 'wb') as fp:
        fp.write(pic.content)
    # Report success only after the file has actually been written.
    print('第' + str(imageIndex) + '张图片下载完成-->' + imgUrl)

# Fetch one page of the search-result index.
def get_page_index(offset, keyword, count=20):
    """Request one page of Toutiao search results and return the raw response.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.
        count: Value sent as the ``count`` query parameter; defaults to the
            20 the script originally hard-coded.

    Returns:
        Whatever ``getHtml`` returns for the constructed search URL.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': count,
        # NOTE(review): presumably selects the gallery/image tab - confirm.
        'cur_tab': 3,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    return getHtml(url)

# Extract the article URLs from a search-result index response.
def parse_page_index(html):
    """Yield the ``article_url`` of every item in a search-result response.

    Args:
        html: JSON document (``str`` or bytes) expected to be an object with
            a ``data`` list of items.

    Yields:
        str: Each item's non-empty ``article_url``. Items without one are
        skipped, and a response whose ``data`` is missing or null yields
        nothing.

    Raises:
        ValueError: If *html* is not valid JSON (raised by ``json.loads``).
    """
    data = json.loads(html)
    if not isinstance(data, dict):
        return
    # ``data`` may be absent or null in the response; treat both as "no items".
    for item in data.get('data') or []:
        article_url = item.get('article_url')
        if article_url:
            yield article_url

# Script body: fetch the first page of search results for '街拍' and download
# the images of every article it lists.
index_html = get_page_index(0, '街拍')
for url in parse_page_index(index_html):
    if not url:
        # Index items without an article URL cannot be fetched.
        continue
    detail_html = getHtml(url)
    if detail_html:
        parse_page_detail(detail_html)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: