您的位置:首页 > 其它

Scrapy 爬取全职高手小说

BradyCC 2019-05-29 22:58 78 查看 https://blog.csdn.net/bradycc/

Scrapy 爬取全职高手小说

应用 Scrapy 框架,爬取《全职高手》小说数据,存于本地 JSON 文件。

# items 配置抓取数据字段
import scrapy

class NovelItem(scrapy.Item):
    """Fields for one scraped novel chapter.

    NOTE(review): the blog paste dropped the class header — these Field
    declarations only take effect inside a ``scrapy.Item`` subclass, and
    the spider does ``from novel.items import NovelItem`` and
    ``NovelItem()``, so the class must exist here.
    """
    bookName = scrapy.Field()        # book title from the breadcrumb
    bookTitle = scrapy.Field()       # declared but never filled by the spider
    chapterNum = scrapy.Field()      # e.g. '第123章'; '' when title has no space
    chapterName = scrapy.Field()     # chapter title text
    chapterUrl = scrapy.Field()      # URL the chapter was scraped from
    chapterContent = scrapy.Field()  # joined text of the content div
# spider 抓取数据

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from novel.items import NovelItem

class QuanzhigaoshouSpider(CrawlSpider):
    """Crawl chapter pages under qu.la/book/32/ and yield one NovelItem each."""
    name = 'quanzhigaoshou'
    # allowed_domains = ['qu.la']
    start_urls = ['https://www.qu.la/book/32/']

    def parse_start_url(self, response):
        # The index page itself only provides links; chapter pages are
        # reached through `rules` below.
        print(response.url)

    rules = (
        # FIX: escape the dot — the original r'/book/32/\d+.html' let '.'
        # match any character before 'html'.
        Rule(LinkExtractor(allow=r'/book/32/\d+\.html'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract one chapter into a NovelItem."""
        print(response.url)
        item = NovelItem()
        item['bookName'] = response.xpath(
            '//div[@class="con_top"]/a[2]/text()').get()
        # FIX: .get() returns None when the xpath matches nothing, which
        # made the original .split() raise AttributeError. Fall back to ''.
        raw_title = response.xpath(
            '//div[@class="bookname"]/h1/text()').get() or ''
        # Title looks like '第N章 name'; split once on the first space.
        title = raw_title.split(' ', 1)
        if len(title) == 1:
            item['chapterNum'] = ''
            item['chapterName'] = title[0]
        else:  # split(' ', 1) yields at most two parts
            item['chapterNum'] = title[0]
            item['chapterName'] = title[1]
        item['chapterUrl'] = response.url
        item['chapterContent'] = ''.join(
            response.xpath('//div[contains(@id, "content")]/text()').extract()
        ).strip()
        yield item
# pipeline 处理数据

import json
import codecs
import os

class NovelPipeline(object):
    """Stream scraped items into quanzhigaoshou.json as one JSON array."""

    def __init__(self):
        # File is opened at pipeline construction; '[' starts the array.
        self.file = codecs.open('quanzhigaoshou.json', 'w', 'utf-8')
        self.file.write('[')
        # FIX: write the ',\n' separator *before* every item except the
        # first, instead of writing a trailing comma after every item and
        # trimming it with seek(-2, SEEK_END)+truncate() at close. The
        # original approach crashed (invalid negative seek) and corrupted
        # the file when zero items were scraped, and byte-offset seeking
        # on a codecs stream writer is fragile.
        self._first = True

    def open_spider(self, spider):
        print('This spider is starting!')

    def process_item(self, item, spider):
        """Serialize items from the 'quanzhigaoshou' spider; pass item on."""
        if spider.name == 'quanzhigaoshou':
            if not self._first:
                self.file.write(',\n')
            self._first = False
            self.file.write(json.dumps(dict(item), ensure_ascii=False))
        return item

    def close_spider(self, spider):
        print('This spider is end!')
        self.file.write(']')  # valid JSON even when no items were written
        self.file.close()
标签: