
Scraping Maoyan Movies, Fangwang, the Guba forum, Baidu Translate, Youdao Translate, Amap Weather, China AMC, Shanbay vocabulary, and Qiushibaike with Python (Qiushibaike)

2019-05-26 22:21
'''
3. Qiushibaike: xpath
http://www.qiushibaike.com/8hr/page/1
From the list page, grab each post's image, the user's nickname, the joke
text, the like count, and the comment count.
Optional: pagination.
Write the results to a JSON file.
'''
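Before the full spider, a minimal sketch of the core XPath idea: select each list item once, then query its fields with relative paths. This is an illustrative alternative to the absolute paths used in the listing below; the class names (recommend-article, recmd-content, etc.) are taken from that listing and reflect the 2019-era page, so they may no longer match the live site.

import requests
from lxml import etree

url = 'https://www.qiushibaike.com/8hr/page/1/'
headers = {'User-Agent': 'Mozilla/5.0'}
html = etree.HTML(requests.get(url, headers=headers).text)

# One query for the list items, then relative XPath inside each item
for li in html.xpath("//div[@class='recommend-article']/ul/li"):
    title = li.xpath(".//a[@class='recmd-content']/text()")
    img = li.xpath("./a/img/@src")
    print(title[0] if title else 'N/A', img[0] if img else 'N/A')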

import requests, os, json
from lxml import etree


class Qiubai:
    def __call__(self, pages):
        # Take the page count as an argument instead of reading the
        # module-level `pages` name, as the original did.
        self.get_xml(pages)

    def get_xml(self, pages):
        for page in range(1, pages + 1):
            base_url = 'https://www.qiushibaike.com/8hr/page/' + str(page) + '/'
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
            }
            # Parse the response HTML into an lxml element tree
            xml = etree.HTML(requests.get(base_url, headers=headers).text)

            self.get_data(xml, page)

            # Termination signal: the text of the "next" button in the pagination bar
            end_signal = xml.xpath("/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/ul[@class='pagination']/li/a/span[@class='next']/text()")
            print('————————————————————', base_url)
            print('————————————————————', page, end_signal)
            # Stop when the button reads "更多" (last page), when it is
            # missing, or when the requested page count is reached.
            if not end_signal or end_signal[0].strip() == '更多' or page == pages:
                print('##################################下载完成##################################')
                print(f'##################################共爬取前{page}页##################################')
                break

    def pages(self, xml):
        '''
        Unused helper kept from the original draft; the intent was to detect
        the last page automatically via the "next" button:
        /html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/ul[@class='pagination']/li[8]/a/span[@class='next']
        '''
        end_signal_get = xml.xpath("/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/ul[@class='pagination']/li[9]/a/span[@class='next']")
        return end_signal_get

    def get_data(self, xml, page):
        path = './糗百/' + str(page) + '/'
        if not os.path.exists(path):
            os.makedirs(path)
        print(f'————————————————————————————————————开始爬取第{page}页————————————————————————————————————')

        # Dictionary that collects every item scraped from this page
        page_info_dict = {
            f'第{page}页': {},
        }

        # Select each list item ("small box") inside the recommendation list ("big box")
        small_divs = xml.xpath("/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/div[@class='recommend-article']/ul/li")
        for index, small_div in enumerate(small_divs):
            # Title of the recommended post. The fallbacks below are wrapped
            # in a list so the later [0] indexing stays valid (the original
            # assigned a bare string, so [0] returned its first character).
            div_title = small_div.xpath("/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/div[@class='recommend-article']/ul/li[position()={}]/div[@class='recmd-right']/a[@class='recmd-content']/text()".format(str(index + 1)))
            if not div_title:
                div_title = ['未获取到数据']

            # Thumbnail image URL
            div_imgs_path = small_div.xpath("/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/div[@class='recommend-article']/ul/li[position()={}]/a/img/@src".format(str(index + 1)))
            if not div_imgs_path:
                div_imgs_path = ['未获取到数据']

            # User nickname
            div_username = small_div.xpath("/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/div[@class='recommend-article']/ul/li[position()={}]/div[@class='recmd-right']/div[@class='recmd-detail clearfix']/a[@class='recmd-user']/span[@class='recmd-name']/text()".format(str(index + 1)))
            if not div_username:
                div_username = ['未获取到数据']

            # Like count: span[1] holds the number, span[2] the "好笑" label
            div_likes = small_div.xpath("/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/div[@class='recommend-article']/ul/li[position()={}]/div[@class='recmd-right']/div[@class='recmd-detail clearfix']/div[@class='recmd-num']/span[1]/text()".format(str(index + 1)))
            if not div_likes:
                div_likes = ['未获取到数据']

            div_likes_title = small_div.xpath("/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/div[@class='recommend-article']/ul/li[position()={}]/div[@class='recmd-right']/div[@class='recmd-detail clearfix']/div[@class='recmd-num']/span[2]/text()".format(str(index + 1)))

            # Comment count: span[4] holds the number, span[5] the "评论" label
            div_comment = small_div.xpath("/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/div[@class='recommend-article']/ul/li[position()={}]/div[@class='recmd-right']/div[@class='recmd-detail clearfix']/div[@class='recmd-num']/span[4]/text()".format(str(index + 1)))
            if not div_comment:
                div_comment = ['未获取到数据']

            small_div_infos = {
                '标题': div_title[0],
                '首图地址': div_imgs_path[0],
                '用户名': div_username[0],
                '好笑': div_likes[0],
                '评论数量': div_comment[0],
            }
            # Dynamic key-value pair for this item
            page_info_dict[f"第{page}页"][f'第{index + 1}条'] = small_div_infos

        # ensure_ascii=False keeps the Chinese text readable in the output file
        json_data = json.dumps(page_info_dict, ensure_ascii=False, indent=4)
        file_name = path + '第' + str(page) + '页' + '.json'
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(json_data)
        print(json_data)

if __name__ == '__main__':
    pages = int(input('请输入需要爬取的页数'))
    qiubai = Qiubai()
    qiubai(pages)
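
To spot-check the output, here is a small sketch that reads back one of the files the spider writes, assuming the ./糗百/<page>/第<page>页.json layout used above:

import json

page = 1
file_name = './糗百/' + str(page) + '/第' + str(page) + '页.json'
with open(file_name, encoding='utf-8') as f:
    data = json.load(f)
# Each page key maps item keys (第1条, 第2条, ...) to the scraped fields
for item_key, info in data[f'第{page}页'].items():
    print(item_key, info['标题'], info['好笑'], info['评论数量'])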