
Python web scraping: crawling Douban Books (豆瓣读书) with proxies and XPath

2019-04-10 12:22
Copyright notice: please credit the source when reposting: https://blog.csdn.net/weixin_44024393/article/details/89179373

The script walks every tag on Douban Books, automatically creates a folder for each top-level tag, and uses proxies to avoid being blocked by the anti-crawler measures. My proxies seem to be fake, though, so I still got blocked… Crawling is done with a cookie set in the request headers, but after a certain number of pages Douban asks for a captcha that has to be entered manually in the browser… All in all, the code turned out rather messy.
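Since the proxies from the pool may be dead or fake (as noted above), one option is to test each one before putting it into rotation. Below is a minimal sketch of such a check, assuming the pool entries are plain proxy address strings like those returned by the get_ip_port() helper used further down; httpbin.org is used here only as a neutral test endpoint. The full script follows.

import requests

def is_proxy_alive(proxy, timeout=5):
    # Returns True if the proxy can complete a simple HTTPS request
    proxies = {'http': proxy, 'https': proxy}
    try:
        # httpbin echoes the caller's IP, so a 200 here means the proxy answered
        resp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False

# Hypothetical usage: keep only the proxies that actually respond
# temp = [p for p in temp if is_proxy_alive(p)]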

import requests
from lxml import etree
from fake_useragent import UserAgent
import threading
import queue
import os
from urllib.parse import urljoin
import json
import csv
# Import the proxy pool
from proxies.get_proxy import get_ip_port
import random

# Fetch the proxy IPs and drop any empty entries
temp = list(map(lambda x: x[1], get_ip_port()))
temp = [x for x in temp if x]
# Build the request headers
ua = UserAgent()
headers = {
    'user-agent': ua.Chrome,
    'Cookie': 'set your own cookie here'  # fill in a logged-in Douban cookie
}
# Create a lock
LOCK = threading.Lock()

# Create the global queues
q_url = queue.Queue()
q_path = queue.Queue()

# Worker thread: each thread takes one tag task from the queues and crawls it
class MyThread(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        # List that collects one dict per book
        self.data_list = []
        LOCK.acquire()
        self.path = q_path.get()
        self.url = q_url.get()
        self.flag = False
        LOCK.release()

    def run(self) -> None:
        self.parse_url(self.url, random.choice(temp))
        self.save_file()

    # Parse the url and scrape the data
    def parse_url(self, temp_url, proxy):
        if not self.flag:
            # No working proxy yet: pick a random one from the pool
            proxies = {
                'https': random.choice(temp)
            }
        else:
            # Keep the proxy that already worked
            proxies = proxy

        try:
            url = temp_url
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=3)
            if resp.status_code == 200:

                self.flag = True

                html = etree.HTML(resp.content)
                lis = html.xpath('//ul[@class="subject-list"]/li')
                if lis:
                    for li in lis:
                        # Dict holding one book's fields
                        data_dict = {}
                        # Title
                        title = li.xpath('.//h2/a/text()')[0]
                        # Clean the data
                        title = title.strip()
                        data_dict['title'] = title
                        # Author / publisher / date / price
                        all_info = li.xpath('.//div[@class="pub"]/text()')[0]
                        # Clean the data
                        all_info = all_info.strip().split('/')
                        # With 4 parts there is one author; with 5 parts there are two authors
                        if len(all_info) == 4:
                            # Author
                            author = all_info[0]
                            # Publisher
                            pub_house = all_info[1]
                            # Date
                            datetime = all_info[2]
                            price = all_info[3]

                            data_dict['author'] = author
                            data_dict['pub_house'] = pub_house.strip()
                            data_dict['datetime'] = datetime.strip()
                            data_dict['price'] = price.strip()
                        elif len(all_info) == 5:
                            # Authors
                            author = all_info[0] + '/' + all_info[1]
                            # Publisher
                            pub_house = all_info[2]
                            # Date
                            datetime = all_info[3]
                            price = all_info[4]

                            data_dict['author'] = author
                            data_dict['pub_house'] = pub_house.strip()
                            data_dict['datetime'] = datetime.strip()
                            data_dict['price'] = price.strip()
                        # Rating
                        rating_nums = li.xpath('.//span[@class="rating_nums"]/text()')
                        if rating_nums:
                            rating_nums = rating_nums[0].strip()
                        # Number of ratings
                        comment_count = li.xpath('.//span[@class="pl"]/text()')[0]
                        # Clean the data
                        comment_count = comment_count.strip().replace('(', '').replace('人评价)', '')
                        # Summary
                        text = li.xpath('.//p/text()')
                        if text:
                            # Clean the data
                            text = text[0].strip().replace('\r', '').replace('\n', '')
                        else:
                            text = None

                        data_dict['rating_nums'] = rating_nums
                        data_dict['comment_count'] = comment_count
                        data_dict['text'] = text

                        print(data_dict)
                        self.data_list.append(data_dict)

                    # Link to the next page (relative url); empty on the last page
                    next_url = html.xpath('//span[@class="next"]/a/@href')
                    if next_url:
                        next_url = urljoin(url, next_url[0])
                        print(next_url)
                        return self.parse_url(next_url, proxies)
            else:
                # Bad status code: drop this proxy and retry the same url with a new one
                self.flag = False

                return self.parse_url(url, proxies)

        except Exception as e:
            # The request failed: put the task back into the queues so it can be retried
            LOCK.acquire()
            q_url.put(self.url)
            q_path.put(self.path)
            LOCK.release()

    # Save the data to a json file and a csv file
    def save_file(self):

        if self.data_list:
            if not os.path.exists(self.path + '.json'):
                with open(self.path + '.json', 'a+', encoding='utf-8') as f:
                    json.dump(self.data_list, f, ensure_ascii=False, indent=4)
                    print('json file written')
            if not os.path.exists(self.path + '.csv'):
                with open(self.path + '.csv', 'w', encoding='utf-8', newline='') as f:
                    # Header row
                    title = self.data_list[0].keys()
                    # Create the writer object
                    writer = csv.writer(f)
                    # Write the header
                    writer.writerow(title)
                    # Write every row
                    for row in self.data_list:
                        writer.writerow(row.values())
                    print('csv file written')

# Get the tag names and queue up one crawl task per sub-tag
def get_tag():
    # Base path for the output files
    path = r'E:\PycharmCode\dbds_spiders'
    # Set up a proxy
    proxies = {
        'https': random.choice(temp)
    }
    # List of top-level tag names
    big_tag_list = []
    # Page that lists all the book tags
    tag_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
    try:
        # Request the url
        resp = requests.get(tag_url, headers=headers, proxies=proxies, timeout=3)
        if resp.status_code == 200:
            # Parse the response body so it can be queried with xpath
            html = etree.HTML(resp.content)
            # Top-level tag blocks
            big_tag = html.xpath('//div[@class="article"]/div[2]/div')
            # Walk through the top-level tags
            for tag in big_tag:
                # The raw tag name looks like "文学 · · · · · ·"
                tag_name = tag.xpath('.//h2/text()')[0]
                # Clean it up
                big_tag_name = tag_name.split('·')[0].strip()
                big_tag_list.append(big_tag_name)

                # Sub-tags under this top-level tag
                tds = tag.xpath('.//td')
                # Name and url of every sub-tag
                for td in tds:
                    min_tag_name = td.xpath('./a/text()')[0]
                    min_tag_url = td.xpath('./a/@href')[0]
                    min_tag_url = urljoin(tag_url, min_tag_url)
                    # Build the output path for this sub-tag
                    file_path = path + '\\' + big_tag_name + '\\' + min_tag_name
                    # Put the url and the path into the matching queues
                    q_url.put(min_tag_url)
                    q_path.put(file_path)

            return big_tag_list
        else:
            return get_tag()
    except Exception as e:
        return get_tag()

# Create the folders
def make_file(big_tag_list):
    # Base path for the output files
    path = r'E:\PycharmCode\dbds_spiders'

    # Create a folder for every top-level tag if it does not exist yet
    for big_temp in big_tag_list:
        big_path = path + '\\' + big_temp
        # Create the folder
        if not os.path.exists(big_path):
            os.makedirs(big_path)

    print('folders created')

def main():

    # Extract the sub-tag urls and paths first and put them into the queues
    big_tag_list = get_tag()
    make_file(big_tag_list)

    while not q_url.empty() and not q_path.empty():
        ths = []
        # Start 3 threads per round
        for i in range(3):
            t = MyThread()
            ths.append(t)
        for t in ths:
            t.start()
        for t in ths:
            t.join()

if __name__ == '__main__':

    main()
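A note on two requests details used above: the keys of the proxies dict are the plain scheme names 'http' and 'https', and a cookie can also be passed through the cookies parameter instead of a hand-built 'Cookie' header. A minimal sketch with placeholder values (the proxy address and cookie below are made up):

import requests

# Scheme names only as keys; both can point at the same proxy address (placeholder)
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}

# Cookies passed as a dict instead of a raw header (placeholder value)
cookies = {'bid': 'your_cookie_value'}

resp = requests.get('https://book.douban.com/tag/?view=type&icn=index-sorttags-all',
                    headers={'user-agent': 'Mozilla/5.0'},
                    proxies=proxies, cookies=cookies, timeout=3)
print(resp.status_code)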