Python crawler: scraping Douban Books with proxies and XPath
2019-04-10 12:22
Copyright notice: reposts must credit the source https://blog.csdn.net/weixin_44024393/article/details/89179373
The script automatically creates a folder for every Douban Books tag and uses proxies to avoid being blocked. My proxies appear to be dead, though, so I still got blocked... Crawling relies on a cookie set in the request headers, but after a certain number of pages Douban demands a captcha that has to be solved manually in the browser... All in all, the code turned out rather convoluted.
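Before the full script, here is a minimal sketch of the request setup it relies on: a Cookie header copied from a logged-in browser session plus one proxy picked at random from a pool. The proxy address, the cookie value, and the example tag URL below are placeholders for illustration, not the script's actual configuration.

```python
import random

import requests
from fake_useragent import UserAgent

# placeholder proxy pool; the real script fills this from proxies.get_proxy
proxy_pool = ['http://127.0.0.1:8888']

headers = {
    'user-agent': UserAgent().Chrome,          # random Chrome user-agent string
    'Cookie': 'paste your own Douban cookie',  # copied from a logged-in browser session
}

# requests maps a url scheme to a proxy url; pick one proxy at random per request
proxies = {'https': random.choice(proxy_pool)}

resp = requests.get('https://book.douban.com/tag/小说',
                    headers=headers, proxies=proxies, timeout=3)
print(resp.status_code)  # 200 means the page came back; 403 usually means blocked
```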
```python
import requests
from lxml import etree
from fake_useragent import UserAgent
import threading
import queue
import os
from urllib.parse import urljoin
import json
import csv
# import the proxy pool
from proxies.get_proxy import get_ip_port
import random

# fetch the proxy ip:port list and drop empty entries
temp = list(map(lambda x: x[1], get_ip_port()))
temp = [x for x in temp if x]

# build the request headers
ua = UserAgent()
headers = {
    'user-agent': ua.Chrome,
    'Cookie': 'paste your own cookie here'
}

# lock shared by all threads
LOCK = threading.Lock()

# global queues for urls and output paths
q_url = queue.Queue()
q_path = queue.Queue()


# worker thread
class MyThread(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        # list that collects one dict per book
        self.data_list = []
        LOCK.acquire()
        self.path = q_path.get()
        self.url = q_url.get()
        self.flag = False
        LOCK.release()

    def run(self) -> None:
        self.parse_url(self.url, random.choice(temp))
        self.save_file()

    # request a tag page and extract the book data
    def parse_url(self, temp_url, proxy):
        if not self.flag:
            # keep picking a fresh proxy until one request succeeds
            proxies = {
                'https': random.choice(temp)
            }
        else:
            proxies = proxy
        try:
            url = temp_url
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=3)
            if resp.status_code == 200:
                self.flag = True
                html = etree.HTML(resp.content)
                lis = html.xpath('//ul[@class="subject-list"]/li')
                if lis:
                    for li in lis:
                        # dict holding one book
                        data_dict = {}
                        # title
                        title = li.xpath('.//h2/a/text()')[0]
                        title = title.strip()
                        data_dict['title'] = title
                        # author / publisher / date / price
                        all_info = li.xpath('.//div[@class="pub"]/text()')[0]
                        all_info = all_info.strip().split('/')
                        # 4 fields means one author, 5 fields means two authors
                        if len(all_info) == 4:
                            author = all_info[0]
                            pub_house = all_info[1]
                            datetime = all_info[2]
                            price = all_info[3]
                            data_dict['author'] = author
                            data_dict['pub_house'] = pub_house.strip()
                            data_dict['datetime'] = datetime.strip()
                            data_dict['price'] = price.strip()
                        elif len(all_info) == 5:
                            author = all_info[0] + '/' + all_info[1]
                            pub_house = all_info[2]
                            datetime = all_info[3]
                            price = all_info[4]
                            data_dict['author'] = author
                            data_dict['pub_house'] = pub_house.strip()
                            data_dict['datetime'] = datetime.strip()
                            data_dict['price'] = price.strip()
                        # rating
                        rating_nums = li.xpath('.//span[@class="rating_nums"]/text()')
                        if rating_nums:
                            rating_nums = rating_nums[0].strip()
                        # number of ratings
                        comment_count = li.xpath('.//span[@class="pl"]/text()')[0]
                        comment_count = comment_count.strip().replace('(', '').replace('人评价)', '')
                        # summary
                        text = li.xpath('.//p/text()')
                        if text:
                            text = text[0].strip().replace('\r', '').replace('\n', '')
                        else:
                            text = None
                        data_dict['rating_nums'] = rating_nums
                        data_dict['comment_count'] = comment_count
                        data_dict['text'] = text
                        print(data_dict)
                        self.data_list.append(data_dict)
                # follow the "next page" link if there is one
                next_url = html.xpath('//span[@class="next"]/a/@href')
                if next_url:
                    next_url = urljoin(url, next_url[0])
                    print(next_url)
                    return self.parse_url(next_url, proxies)
            else:
                self.flag = False
                return self.parse_url(url, proxies)
        except Exception as e:
            # on failure, put the task back so another thread can retry it
            LOCK.acquire()
            q_url.put(self.url)
            q_path.put(self.path)
            LOCK.release()

    # write the collected data to a json file and a csv file
    def save_file(self):
        if self.data_list:
            if not os.path.exists(self.path + '.json'):
                with open(self.path + '.json', 'a+', encoding='utf-8') as f:
                    json.dump(self.data_list, f, ensure_ascii=False, indent=4)
                    print('json file written')
            if not os.path.exists(self.path + '.csv'):
                with open(self.path + '.csv', 'w', encoding='utf-8', newline='') as f:
                    # header row taken from the first record
                    title = self.data_list[0].keys()
                    writer = csv.writer(f)
                    writer.writerow(title)
                    # write the rows
                    for row in self.data_list:
                        writer.writerow(row.values())
                    print('csv file written')


# fetch the tag names and queue one url/path pair per small tag
def get_tag():
    # base directory for the output files
    path = r'E:\PycharmCode\dbds_spiders'
    # pick a proxy
    proxies = {
        'https': random.choice(temp)
    }
    # list of the big tag names
    big_tag_list = []
    # page that lists all the tags
    tag_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
    try:
        resp = requests.get(tag_url, headers=headers, proxies=proxies, timeout=3)
        if resp.status_code == 200:
            # parse the response body with xpath
            html = etree.HTML(resp.content)
            # big tag sections
            big_tag = html.xpath('//div[@class="article"]/div[2]/div')
            for tag in big_tag:
                # the raw name looks like "文学 · · · · · ·", keep only the name
                tag_name = tag.xpath('.//h2/text()')[0]
                big_tag_name = tag_name.split('·')[0].strip()
                big_tag_list.append(big_tag_name)
                # small tags under this big tag
                tds = tag.xpath('.//td')
                for td in tds:
                    min_tag_name = td.xpath('./a/text()')[0]
                    min_tag_url = td.xpath('./a/@href')[0]
                    min_tag_url = urljoin(tag_url, min_tag_url)
                    # build the output path for this small tag
                    file_path = path + '\\' + big_tag_name + '\\' + min_tag_name
                    # put the url and the path into the matching queues
                    q_url.put(min_tag_url)
                    q_path.put(file_path)
            return big_tag_list
        else:
            return get_tag()
    except Exception as e:
        return get_tag()


# create one folder per big tag
def make_file(big_tag_list):
    # base directory for the output files
    path = r'E:\PycharmCode\dbds_spiders'
    for big_temp in big_tag_list:
        big_path = path + '\\' + big_temp
        # create the folder if it does not exist yet
        if not os.path.exists(big_path):
            os.makedirs(big_path)
    print('folders created')


def main():
    # queue up the small-tag urls and paths, then create the folders
    big_tag_list = get_tag()
    make_file(big_tag_list)
    while not q_url.empty() and not q_path.empty():
        ths = []
        # start 3 threads per batch
        for i in range(3):
            t = MyThread()
            ths.append(t)
        for t in ths:
            t.start()
        for t in ths:
            t.join()


if __name__ == '__main__':
    main()
```