您的位置：首页 > 其它

使用Xpath+多进程爬取诗词名句网的史书典籍类所有文章。update~

2019-08-09 13:24 106 查看

上次写了爬取这个网站的程序，但有一些地方不完善，而且爬取速度较慢。今天完善了一下并开启多进程爬取，速度就像坐火箭一样。

# 需要的库
from lxml import etree
import requests
from multiprocessing import Pool
# HTTP request headers: a desktop Chrome User-Agent string for the crawler's requests.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
# Directory prefix that the downloaded .txt files are written under.
# NOTE(review): this is a raw string, so the trailing '\\' is two literal
# backslashes; the directory presumably has to exist already - confirm.
pathname=r'E:\爬虫\诗词名句网\\'
def get_book(url):
    """Fetch the book listing page and crawl every listed book in parallel.

    Each ``<li>`` under ``div.bookmark-list`` is expected to hold an
    ``<h2><a href=...>`` link to a book's catalogue page.  The absolute
    catalogue URLs are collected and handed to ``get_index`` through the
    module-level process pool ``pool`` (created in the ``__main__`` block).

    Args:
        url: URL of the page listing all books.

    Returns:
        None.  Any error is reported as ``get_book failed`` on stdout.
    """
    try:
        # Pass the headers by keyword: the second positional parameter of
        # requests.get() is ``params``, so ``requests.get(url, headers)``
        # would send the User-Agent as query-string arguments instead of
        # as an HTTP header.  The timeout keeps a stalled connection from
        # blocking the crawl forever.
        response = requests.get(url, headers=headers, timeout=30)
        etrees = etree.HTML(response.text)
        url_infos = etrees.xpath('//div[@class="bookmark-list"]/ul/li')
        urls = []
        for i in url_infos:
            url_info = i.xpath('./h2/a/@href')
            book_names = i.xpath('./h2/a/text()')
            if not url_info or not book_names:
                # Skip entries without a link rather than aborting the
                # whole crawl with an IndexError.
                continue
            print('开始下载.' + book_names[0])
            urls.append('http://www.shicimingju.com' + url_info[0])
        # Fan the catalogue pages out over the worker processes; map()
        # blocks until every book has been processed.
        pool.map(get_index, urls)
    except Exception:
        print('get_book failed')
def get_index(url):
    """Fetch one book's catalogue page and download each chapter in order.

    Every ``<li>`` under ``div.book-mulu`` is expected to contain an
    ``<a href=...>`` pointing at a chapter page; each chapter URL is passed
    to ``get_content``.

    Args:
        url: Absolute URL of a book's catalogue page.

    Returns:
        None.  Any exception is caught and printed to stdout.
    """
    try:
        # ``headers`` must be a keyword argument; passed positionally it is
        # treated as ``params`` (query string) and the User-Agent is never
        # sent as a header.  The timeout stops a hung request from stalling
        # this worker process.
        response = requests.get(url, headers=headers, timeout=30)
        etrees = etree.HTML(response.text)
        url_infos = etrees.xpath('//div[@class="book-mulu"]/ul/li')
        for i in url_infos:
            url_info = i.xpath('./a/@href')
            if not url_info:
                # A list item without a link would otherwise raise
                # IndexError and drop the remaining chapters of the book.
                continue
            get_content('http://www.shicimingju.com' + url_info[0])
    except Exception as e:
        print(e)
def get_content(url):
    """Download one chapter page and append it to the book's .txt file.

    The chapter title, body text and book name are extracted with XPath.
    The body is read from the ``<p>`` children of ``div.chapter_content``;
    when there are none, the div's own text nodes are used instead.  The
    chapter is appended to ``pathname + book_name + '.txt'`` as UTF-8.

    Args:
        url: Absolute URL of a chapter page.

    Returns:
        None.  Any error is reported as ``get_content failed`` on stdout.
    """
    try:
        # ``headers`` must be a keyword argument; passed positionally it is
        # treated as ``params`` (query string) and the User-Agent is never
        # sent as a header.  The timeout stops a hung request from stalling
        # this worker process.
        response = requests.get(url, headers=headers, timeout=30)
        etrees = etree.HTML(response.text)
        title = etrees.xpath('//div[@class="www-main-container www-shadow-card "]/h1/text()')[0]
        content = etrees.xpath('//div[@class="chapter_content"]/p/text()')
        if not content:
            # Some chapters keep their text directly inside the div rather
            # than in <p> children.
            content = etrees.xpath('//div[@class="chapter_content"]/text()')
        content = ''.join(content)
        book_name = etrees.xpath('//div[@class="nav-top"]/a[3]/text()')[0]
        # Append mode: all chapters of a book accumulate in one file.
        with open(pathname + book_name + '.txt', 'a+', encoding='utf-8') as f:
            f.write(title + '\n\n' + content + '\n\n\n')
        print(title + '..下载完成')
    except Exception:
        print('get_content failed')
# Program entry point
if __name__ == '__main__':
    url = 'http://www.shicimingju.com/book/'
    # ``pool`` is deliberately a module-level name: get_book() looks it up
    # as a global to distribute the per-book work.
    pool = Pool()
    try:
        # Start the crawl from the book listing page.
        get_book(url)
    finally:
        # Shut the workers down cleanly so no processes are left behind,
        # even if the crawl raises.
        pool.close()
        pool.join()

控制台输出：
查看文件夹，可以发现多个文件在同时下载。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航