您的位置:首页 > 其它

Scrapy 爬取盗墓笔记小说

2019-05-26 15:11 295 查看

Scrapy 爬取盗墓笔记小说

使用 Scrapy 框架爬取《盗墓笔记》小说的章节数据,并将结果存入 MongoDB 数据库。

# settings: MongoDB connection configuration read by the item pipeline.
# Host and port are passed to pymongo.MongoClient.
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
# Name of the database the scraped items are written to.
MONGODB_DBNAME = 'MySpider'
# Despite the "DOCNAME" spelling, the pipeline uses this value as the
# collection name (db[MONGODB_DOCNAME]).
MONGODB_DOCNAME = 'daomubiji'
# items 配置抓取数据字段
import scrapy

class NovelItem(scrapy.Item):
    """Container for one chapter entry scraped from a book listing page.

    All fields are filled by the spider's ``parse_item`` callback.
    """

    # Text of the page's <h1 class="focusbox-title"> before the first ':'.
    bookName = scrapy.Field()
    # First space-separated part of the chapter link text.
    bookTitle = scrapy.Field()
    # Second space-separated part of the chapter link text.
    chapterNum = scrapy.Field()
    # Third space-separated part of the chapter link text.
    chapterName = scrapy.Field()
    # The href attribute of the chapter link.
    chapterUrl = scrapy.Field()
# spider 抓取数据
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from novel.items import NovelItem

class DaomubijiSpider(CrawlSpider):
    """Crawl daomubiji.com and emit one ``NovelItem`` per chapter link.

    Starting from the site's front page, the spider follows every link found
    inside ``<article class="article-content">`` and hands each fetched page
    to :meth:`parse_item`.
    """

    name = 'daomubiji'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://www.daomubiji.com/']

    rules = (
        # follow=True: the same link extractor is applied again to every
        # page that was reached through this rule.
        Rule(
            LinkExtractor(restrict_xpaths='//article[@class="article-content"]//a'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_start_url(self, response, **kwargs):
        """Yield nothing for the start page itself.

        The start page is only used as a source of links for ``rules``; an
        explicit empty list is returned instead of an implicit ``None``.
        """
        return []

    def parse_item(self, response):
        """Extract chapter entries from a book listing page.

        Args:
            response: The downloaded page.

        Yields:
            A fresh ``NovelItem`` for each ``<article>`` under
            ``<div class="excerpts">`` whose link text consists of at least
            three space-separated parts (book title, chapter number,
            chapter name).

        Pages without the ``focusbox-title`` heading, and chapter links whose
        text does not have the expected shape, are skipped and logged rather
        than raising and aborting the whole callback.
        """
        for page in response.xpath('//body'):
            heading = page.xpath('.//h1[@class="focusbox-title"]/text()').get()
            if heading is None:
                # Pages reached through follow=True need not be listing pages.
                self.logger.debug('No book heading on %s, skipping', response.url)
                continue
            book_name = heading.split(':')[0]

            for entry in page.xpath('.//div[@class="excerpts"]//article'):
                # Read the link text once instead of re-running the XPath
                # for every field derived from it.
                link_text = entry.xpath('.//a/text()').get()
                parts = link_text.split(' ') if link_text else []
                if len(parts) < 3:
                    self.logger.warning(
                        'Unexpected chapter link text %r on %s, skipping',
                        link_text, response.url,
                    )
                    continue

                # Build a new item per chapter: re-yielding one shared,
                # mutated instance makes all yielded items the same object.
                item = NovelItem()
                item['bookName'] = book_name
                item['bookTitle'] = parts[0]
                item['chapterNum'] = parts[1]
                item['chapterName'] = parts[2]
                item['chapterUrl'] = entry.xpath('.//a/@href').get()
                yield item
# pipeline 处理数据
from scrapy.conf import settings
import pymongo

class NovelPipeline(object):
    """Item pipeline that stores every scraped item as a MongoDB document.

    Connection parameters are read from the ``MONGODB_*`` keys of the
    module-level ``settings`` object.

    NOTE(review): ``scrapy.conf.settings`` is the legacy global settings
    object; newer Scrapy releases appear to expect pipelines to obtain
    settings through a ``from_crawler`` classmethod instead -- confirm
    against the Scrapy version in use before upgrading.
    """

    def __init__(self):
        """Open the MongoDB connection and select the target collection."""
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        # Keep a reference to the client so close_spider() can release it.
        self.client = pymongo.MongoClient(host=host, port=port)
        database = self.client[db_name]
        self.post = database[settings['MONGODB_DOCNAME']]

    def open_spider(self, spider):
        """Announce that the spider has started."""
        print('This spider is starting!')

    def process_item(self, item, spider):
        """Insert ``item`` into the collection and pass it on unchanged.

        Args:
            item: The scraped item; it is converted to a plain ``dict``
                before being written.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can still process it.
        """
        book_info = dict(item)
        # insert_one() replaces Collection.insert(), which is deprecated in
        # PyMongo 3 and no longer available in PyMongo 4.
        self.post.insert_one(book_info)
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection and announce that the spider ended."""
        self.client.close()
        print('This spider is end!')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: