使用python爬取猫眼电影、房王、股吧论坛、百度翻译、有道翻译、高德天气、华夏基金、扇贝单词、糗事百科
2019-05-26 22:21
169 查看
"""Scrape the qiushibaike.com "8hr" recommendation listing with XPath.

Task 3 — 糗事百科 (http://www.qiushibaike.com/8hr/page/1):
for every post on each listing page, collect the first image URL, the
user nickname, the post title, the "funny" count and the comment count.
Optional extension implemented: pagination, with results written to one
JSON file per page under ./糗百/<page>/.
"""
import json
import os

import requests
from lxml import etree


class Qiubai:
    """Crawler for https://www.qiushibaike.com/8hr/page/<n>/."""

    def __call__(self, pages=None, *args, **kwargs):
        """Run the crawl for the first `pages` listing pages.

        BUG FIX: the original ignored its arguments and read a
        module-level global `pages`, so the instance was only callable
        from this script's __main__ block. The argument is now used
        (defaulting to None keeps the call signature backward-compatible).
        """
        self.get_xml(pages)

    def get_xml(self, pages):
        """Fetch and parse listing pages 1..pages.

        Stops early when the paginator no longer shows a "next page"
        marker, or when the requested page count is reached.
        """
        # Loop-invariant request headers hoisted out of the loop.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        }
        for page in range(1, pages + 1):
            base_url = 'https://www.qiushibaike.com/8hr/page/' + str(page) + '/'
            # Parse the HTML response body into an lxml element tree.
            xml = etree.HTML(requests.get(base_url, headers=headers).text)
            self.get_data(xml, page)
            # Termination signal: text of the paginator's "next" span.
            end_signal = xml.xpath(
                "/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/ul[@class='pagination']/li/a/span[@class='next']/text()")
            print('————————————————————', base_url)
            print('————————————————————', page, end_signal)
            # BUG FIX: guard against an empty xpath result before
            # indexing end_signal[0] (the original raised IndexError
            # whenever the paginator was missing from the page).
            if (end_signal and end_signal[0] == '\n更多\n') or int(page) == int(pages):
                print('##################################下载完成##################################')
                print(f'##################################共爬取前{page}页##################################')
                break

    def pages(self, xml):
        """Unused/incomplete helper kept for interface compatibility.

        Intended to auto-detect the last page via the paginator node:
        /html/body/div[@id='content']/.../ul[@class='pagination']/li[8]/a/span[@class='next']
        NOTE(review): the original had a typo'd parameter name (`selfm`)
        and discarded its result; it is never called by this script.
        """
        end_signal_get = xml.xpath(
            "/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/ul[@class='pagination']/li[9]/a/span[@class='next']")
        return end_signal_get

    def get_data(self, xml, page):
        """Extract every post on one listing page and dump it to JSON.

        Writes ./糗百/<page>/第<page>页 containing a dict of the form
        {"第<page>页": {"第1条": {...}, "第2条": {...}, ...}}.
        """
        path = './糗百/' + str(page) + '/'
        if not os.path.exists(path):
            os.makedirs(path)
        print(f'————————————————————————————————————开始爬取第{page}页————————————————————————————————————')
        # Accumulates one sub-dict per post, keyed "第N条".
        page_info_dict = {
            f'第{page}页': {},
        }
        # One <li> per recommended post.
        small_divs = xml.xpath(
            "/html/body/div[@id='content']/div[@class='content-block clearfix']/div[@class='col1 new-style-col1']/div[@class='recommend-article']/ul/li")
        # Common prefix of the per-post absolute xpaths; position() selects
        # the index+1-th <li>.
        li_xpath = ("/html/body/div[@id='content']/div[@class='content-block clearfix']"
                    "/div[@class='col1 new-style-col1']/div[@class='recommend-article']"
                    "/ul/li[position()={}]")
        # BUG FIX: missing-data fallbacks are one-element LISTS. The
        # original assigned the bare string '未获取到数据' and then indexed
        # [0], which stored only the first character ('未').
        missing = ['未获取到数据']
        for index, small_div in enumerate(small_divs):
            pos = li_xpath.format(str(index + 1))
            # Post title.
            div_title = small_div.xpath(
                pos + "/div[@class='recmd-right']/a[@class='recmd-content']/text()") or missing
            # First image URL.
            div_imgs_path = small_div.xpath(pos + "/a/img/@src") or missing
            # User nickname.
            div_username = small_div.xpath(
                pos + "/div[@class='recmd-right']/div[@class='recmd-detail clearfix']/a[@class='recmd-user']/span[@class='recmd-name']/text()") or missing
            # "Funny" count (span[1]; span[2] would be its label).
            div_likes = small_div.xpath(
                pos + "/div[@class='recmd-right']/div[@class='recmd-detail clearfix']/div[@class='recmd-num']/span[1]/text()") or missing
            # Comment count (span[4]; span[5] would be its label).
            div_comment = small_div.xpath(
                pos + "/div[@class='recmd-right']/div[@class='recmd-detail clearfix']/div[@class='recmd-num']/span[4]/text()") or missing
            small_div_infos = {
                '标题': div_title[0],
                '首图地址': div_imgs_path[0],
                '用户名': div_username[0],
                '好笑': div_likes[0],
                '评论数量': div_comment[0],
            }
            # Dynamic key: "第N条" for the N-th post on this page.
            page_info_dict[f"第{page}页"][f'第{index + 1}条'] = small_div_infos
        # BUG FIX: ensure_ascii=False keeps the Chinese text readable in
        # the output file instead of \uXXXX escapes; indent for humans.
        json_data = json.dumps(page_info_dict, ensure_ascii=False, indent=4)
        file_name = path + '第' + str(page) + '页'
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(json_data)
        print(json_data)


if __name__ == '__main__':
    pages = int(input('请输入需要爬取的页数'))
    qiubai = Qiubai()
    qiubai(pages)
相关文章推荐
- python使用百度翻译进行中翻英示例
- Python开发的单词频率统计工具wordsworth使用方法
- 使用python写糗事百科的爬虫
- Python爬取扇贝“【无老师】7天搞定TOEFL单词”
- 【扇贝批量添加单词到词库】利用python调用扇贝API (oauth2)
- python3爬取猫眼电影(电影名称和图片)存到本地使用进程池
- python.从一个文本文件中选出使用频率最多的若干个单词实例
- 【扇贝批量添加单词到词库】利用python调用扇贝API (oauth2)
- 使用Python从有道词典网页获取单词翻译
- python3使用requests爬取糗事百科入mongodb库
- 使用Python统计字符串中单词数量
- 爬虫:爬取扇贝上python常用单词,减少登陆和贝壳的繁琐
- 使用python的requests库爬取糗事百科并用xpath解析数据后存入MongoDB
- Python开发的单词频率统计工具wordsworth使用方法
- 自己手写使用python爬取糗事百科段子
- [python]使用Counter统计文章中出现频率最高的单词
- 树莓派使用Python3获取天气信息
- python 爬虫004-使用urllib2与正则表达式扒取糗事百科新鲜页首页帖子
- 使用python对文件中的单词进行提取
- python里使用capwords()函数来把字符里每一个英语单词首字母变大写