您的位置:首页 > 编程语言 > Python开发

Python3用requests,multiprocessing多进程爬取今日头条图片

2020-02-07 07:12 330 查看

仅供交流学习

#coding=utf-8

import json
import requests
import re
import os
from multiprocessing import Pool
from urllib.parse import urlencode
from fake_useragent import UserAgent
from hashlib import md5
from bs4 import BeautifulSoup

# Shared fake_useragent instance; get_img and get_content read ua.chrome
# to fill the User-Agent header of every request.
ua=UserAgent()

# Search term sent as the 'keyword' query parameter by get_page.
keyword="街拍"

def get_page(offset):
    """Fetch one page of search results and return the article URLs in it.

    Args:
        offset: Value sent as the ``offset`` query parameter of the search
            request.

    Returns:
        A list with the ``article_url`` of every entry in the response's
        ``data`` list that has one, or ``None`` when the request failed,
        the body was not valid JSON, or there was no ``data`` list.
    """
    param = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
    }
    base = "https://www.toutiao.com/api/search/content/?"
    url = base + urlencode(param)

    content = get_content(url)
    # get_content returns None on failure; json.loads(None) would raise.
    if content is None:
        return None

    try:
        data = json.loads(content)
    except ValueError:
        # Body was not valid JSON: treat it like an empty response.
        return None
    if not isinstance(data, dict):
        return None

    article_list = data.get('data')
    # The "data" key may be present but null; iterating it would raise.
    if not isinstance(article_list, list):
        return None

    # Skip entries that carry no article URL so callers never receive None.
    return [
        item['article_url']
        for item in article_list
        if isinstance(item, dict) and item.get('article_url')
    ]

# Save a result to the output file
def write_to_file(content):
    """Append ``content`` to ``res.txt`` (relative path), encoded as UTF-8."""
    with open("res.txt", mode="a", encoding="utf-8") as results_file:
        results_file.write(content)
# Parse an article page and fetch the images of its gallery
def parse_page_image(url):
    """Download an article page, record its gallery images and fetch them.

    For every ``gallery: JSON.parse("...")`` payload found in the page that
    contains ``sub_images``, one JSON line with the page title, the image
    URLs and the article URL is appended via ``write_to_file``, and each
    image URL is passed to ``get_img``.

    Args:
        url: Address of the article page.

    Returns:
        None. Nothing is done when the page could not be fetched.
    """
    content = get_content(url)
    if content is None:
        return

    # Page title; fall back to an empty string when there is no <title> tag
    # instead of raising IndexError.
    soup = BeautifulSoup(content, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''

    pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    for raw in pattern.findall(content):
        # SECURITY: this text comes straight from a remote page. It used to
        # be unescaped with eval(), which would execute whatever the page
        # contained. The captured text is the inside of a double-quoted
        # string literal, so decode it as a JSON string first, then parse
        # the JSON document it holds.
        try:
            data = json.loads(json.loads('"{}"'.format(raw)))
        except ValueError:
            # Payload is not decodable; skip it rather than abort the page.
            continue
        if not isinstance(data, dict) or "sub_images" not in data:
            continue

        image_urls = [
            image.get("url")
            for image in data.get('sub_images') or []
            if isinstance(image, dict) and image.get("url")
        ]
        record = {
            'title': title,
            'imgList': image_urls,
            'url': url,
        }
        write_to_file(json.dumps(record, ensure_ascii=False) + "\n")
        # A distinct loop name keeps the `url` parameter intact, so later
        # records still report the article URL.
        for image_url in image_urls:
            get_img(image_url)

# Save an image to disk
def save_img(content):
    """Write image bytes to ``<cwd>/img/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of ``content``, so identical bytes
    always map to the same file. The target path is printed before writing.

    Args:
        content: Raw image bytes.
    """
    img_dir = os.path.join(os.getcwd(), "img")
    # Create the directory on demand; without it every open() below fails
    # with FileNotFoundError.
    os.makedirs(img_dir, exist_ok=True)
    file_name = "{0}.{1}".format(md5(content).hexdigest(), "jpg")
    path_file = os.path.join(img_dir, file_name)
    print(path_file)
    with open(path_file, "wb") as f:
        f.write(content)
# Fetch a remote image
def get_img(url, timeout=10):
    """Download ``url`` and pass the response body to ``save_img``.

    The download is best-effort: a failure is reported on stdout and the
    function returns normally. Responses with a status other than 200 are
    ignored.

    Args:
        url: Address of the image.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the caller forever.
    """
    try:
        headers = {'User-Agent': ua.chrome}
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            save_img(response.content)
    except Exception as exc:
        # Still best-effort, but no longer a bare except: KeyboardInterrupt
        # and SystemExit propagate, and the failure is made visible.
        print("get_img failed for {0!r}: {1!r}".format(url, exc))
# Fetch the text content of a URL
def get_content(url, timeout=10):
    """Return the body of ``url`` as text, or ``None`` on any failure.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the caller forever.

    Returns:
        ``response.text`` when the status code is 200; ``None`` for any
        other status or when the request raised.
    """
    try:
        headers = {'User-Agent': ua.chrome}
        response = requests.get(url, headers=headers, timeout=timeout)
    except Exception as exc:
        # Callers rely on None meaning "no content". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        print("get_content failed for {0!r}: {1!r}".format(url, exc))
        return None
    if response.status_code == 200:
        return response.text
    return None

def main(offset):
    """Process one results page: parse every article URL from get_page(offset)."""
    article_urls = get_page(offset)
    if article_urls is None:
        # get_page signals "nothing to do" with None.
        return
    for article_url in article_urls:
        parse_page_image(article_url)

if __name__ == '__main__':
    # Step between offsets. This should equal the 'count' that get_page
    # requests per call (20). The step used to be 10, so, assuming offset
    # counts individual results, consecutive pages overlapped by half.
    page_size = 20
    page_total = 10
    offsets = [page * page_size for page in range(page_total)]

    # The context manager releases the worker processes even if map()
    # raises; map() itself blocks until every offset has been processed.
    with Pool() as pool:
        pool.map(main, offsets)

原文: https://rumenz.com/rumenbiji/python-requests-multiprocessing.html

  • 点赞
  • 收藏
  • 分享
  • 文章举报
入门小站 发布了3 篇原创文章 · 获赞 0 · 访问量 50 私信 关注
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: