您的位置：首页 > 编程语言 > Python开发

python爬虫——使用xpath爬取搜狗微信文章

2019-04-07 23:25 225 查看

缺点：爬取一定数量的页面之后，搜狗会弹出验证码，导致无法继续爬取。此时需要更换 IP，或者先在浏览器中手动输入验证码，再把新的 cookie 复制到请求头中，才能继续爬取。

import requests
from fake_useragent import UserAgent
from urllib.parse import urlencode
from lxml import etree
import re
import json
import csv
import queue
import threading

# User-Agent generator from fake_useragent.
ua = UserAgent()

# Headers sent with every search request. Per the original author's note,
# a cookie is needed to crawl more than 10 result pages; the value below is
# a placeholder ("set it yourself") to be replaced with a real cookie.
headers = {}
headers['user-agent'] = ua.Chrome
headers['cookie'] = '自己设置'

# Collects one dict per scraped article; the worker threads add to it.
data_list = []

# Worker thread: each instance fetches and parses a single search-result page.
class MyThread(threading.Thread):
    """Take one URL from a shared queue, fetch it and extract its articles.

    Each thread handles at most one URL. Parsed articles are appended to
    the module-level ``data_list``. A URL whose download or page-level
    parsing fails is put back on the queue, up to ``MAX_ATTEMPTS`` total
    attempts, so a permanently failing URL cannot keep the crawl running
    forever.
    """

    # Seconds to wait on the server before treating the request as failed.
    REQUEST_TIMEOUT = 10
    # Total number of times one URL may be tried before it is dropped.
    MAX_ATTEMPTS = 3
    # Failure count per URL, shared by every worker thread.
    _attempts = {}
    _attempts_lock = threading.Lock()

    def __init__(self, q):
        """Store the queue of URLs this worker pulls from."""
        threading.Thread.__init__(self)
        self.q = q

    def run(self) -> None:
        """Thread entry point: process one URL from the queue."""
        self.get_index()

    @staticmethod
    def _parse_item(li):
        """Return the article dict for one ``<li>`` node, or None if incomplete."""
        try:
            href = li.xpath('.//h3/a/@data-share')[0]
            author = li.xpath('.//div[@class="s-p"]/a/text()')[0]
            # The script text looks like
            # "document.write(timeConvert('1526012173'))"; the first run of
            # digits in it is taken as the timestamp.
            time_convert = li.xpath('.//div[@class="s-p"]/span/script/text()')[0]
            timestamp = re.findall(r'\d+', time_convert)[0]
        except IndexError:
            # A field is missing from this entry: skip it instead of failing
            # (and endlessly retrying) the whole page.
            return None
        title = ''.join(li.xpath('.//h3//text()')).strip()
        # Key order is relied on by the CSV export (header = first dict's keys).
        return {
            'title': title,
            'author': author,
            'datetime': timestamp,
            'href': href,
        }

    def get_index(self):
        """Fetch one queued URL and append its articles to ``data_list``.

        Returns immediately when the queue is empty, so a surplus worker
        never blocks and ``join()`` on it always completes.
        """
        try:
            url = self.q.get_nowait()
        except queue.Empty:
            return

        try:
            resp = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            resp.raise_for_status()
            # Parse the raw bytes with lxml and select every result entry.
            html = etree.HTML(resp.content)
            lis = html.xpath('//ul[@class="news-list"]/li')
            articles = [
                article
                for article in map(self._parse_item, lis)
                if article is not None
            ]
        except Exception as e:
            # Thread boundary: report the error and requeue the URL so the
            # page is not lost, but only a bounded number of times.
            print(e)
            with self._attempts_lock:
                attempts = self._attempts.get(url, 0) + 1
                self._attempts[url] = attempts
            if attempts < self.MAX_ATTEMPTS:
                self.q.put(url)
            return

        # Publish results only after the whole page parsed, so a retried page
        # can never contribute duplicate entries.
        for article in articles:
            print(article)
        data_list.extend(articles)


def main(query='python爬虫', pages=range(1, 10), workers=3):
    """Queue the search-result URLs and crawl them with worker threads.

    Args:
        query: Search keyword sent to Sogou WeChat search.
        pages: Iterable of page numbers to request (default: pages 1-9).
        workers: Maximum number of threads started per round.

    Each ``MyThread`` consumes one URL, so threads are started in rounds
    until the queue is empty. URLs that a worker puts back on the queue
    after a failure are picked up by a later round.
    """
    # At least one worker per round, otherwise the loop below cannot progress.
    workers = max(1, workers)

    q = queue.Queue()
    for page in pages:
        params = {
            'query': query,
            # NOTE(review): presumably selects article search - confirm.
            'type': '2',
            'page': page,
        }
        # URL-encode the parameters and append them to the search endpoint.
        q.put('https://weixin.sogou.com/weixin?' + urlencode(params))

    while not q.empty():
        # Never start more threads than there are queued URLs: a surplus
        # thread would wait forever for an item and join() would hang.
        # No worker is running at this point, so qsize() is exact.
        batch = min(workers, q.qsize())
        ts = [MyThread(q) for _ in range(batch)]
        for t in ts:
            t.start()
        for t in ts:
            t.join()


if __name__ == '__main__':
    # Run the crawl, then export whatever was collected in data_list.
    main()
    if data_list:
        # Overwrite rather than append: appending a second json.dump() to an
        # existing file would leave two top-level documents in it, which is
        # not valid JSON.
        with open('weixin_data_json.json', 'w', encoding='utf-8') as f:
            json.dump(data_list, f, ensure_ascii=False, indent=4)
        print('写入json文件完成')

        # Write the same in-memory records as CSV; the header is the key
        # order of the first record.
        fieldnames = list(data_list[0])
        with open('weixin_data_json.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            # DictWriter places each value under its own key's column, so a
            # row can never be misaligned with the header.
            writer.writerows(data_list)
        print('写入csv文件完成')

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航