
Crawling the Dingdian novel site (顶点小说) with Scrapy

2017-08-15 19:46
The Scrapy spider

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from scrapy import Request

from ding.items import DingItem


# spider for the Dingdian novel site
class DingdianSpider(scrapy.Spider):

    name = 'dingdian'                       # spider name used by the Scrapy framework
    start_urls = ['http://www.x23us.com/']  # the Dingdian novel site

    # parse the homepage: collect the links of all novel categories
    def parse(self, response):
        title = response.xpath('/html/body/div[2]/ul/li/a/@href').extract()[1:12]  # all categories
        for i in title:  # iterate over the links and hand each one to the next callback
            yield Request(url=parse.urljoin(response.url, i), callback=self.parse_get)

    # read the maximum page number of a category, then request every page
    def parse_get(self, response):
        name = response.xpath('//*[@id="pagelink"]/a[14]/text()').extract()[0]
        for i in range(1, int(name) + 1):  # + 1 so the last page is included
            if response.url != 'http://www.x23us.com/quanben/1':
                # splice the page number into the URL, e.g. '..._1.html' -> '..._' + i + '.html'
                left_url = response.url[:-6]
                right_url = response.url[-5:]
                yield Request(left_url + str(i) + right_url, callback=self.get_parse)  # link of each page
            else:
                # the quanben (complete novels) category uses a different URL pattern from
                # the other categories, so its page links are built separately here
                yield Request(parse.urljoin(response.url, str(i)), callback=self.get_parse)

    # extract each novel's chapter-list URL and its author
    def get_parse(self, response):
        try:
            article_url = response.xpath('//*[@id="content"]/dd/table/tr/td/a/@href').extract()[1]
            author = response.xpath('//*[@id="content"]/dd/table/tr/td/text()').extract()[0]
            # pass the author along to the next callback via meta
            yield Request(article_url, callback=self.page_list, meta={'author': author})
        except IndexError:  # rows that do not match the XPath are skipped
            pass

    # extract the book title and the URL of its chapter list
    def page_list(self, response):
        title = response.xpath('//*[@id="a_main"]/div/dl/dd/h1/text()').extract()[0]
        content_url = response.xpath('//*[@id="at"]/tr/td/a/@href').extract()[0]
        # pass the book title and author along to the next callback
        yield Request(parse.urljoin(response.url, content_url), callback=self.content_html,
                      meta={'title': title, 'author': response.meta['author']})

    # parse each chapter: extract and clean the data you want, then fill the item
    def content_html(self, response):
        item = DingItem()
        title1 = response.xpath('//*[@id="amain"]/dl/dd[1]/h1/text()').extract()[0]
        item['book'] = response.meta['title']
        item['article_title'] = title1
        item['author'] = response.meta['author']
        content_con = response.xpath('//*[@id="contents"]/text()').extract()
        content_all = ''
        for i in content_con:
            content_all += i.replace('\xa0', '')  # strip non-breaking spaces, accumulating the text
        item['content'] = content_all
        yield item
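
The XPath expressions above are tied to the site's layout at the time of writing, so if the spider suddenly yields nothing they are the first thing to check. Scrapy's interactive shell makes that easy; a quick sketch, reusing the category XPath from parse above:

scrapy shell 'http://www.x23us.com/'
>>> response.xpath('/html/body/div[2]/ul/li/a/@href').extract()[1:12]
>>> # an empty list here means the layout changed and the XPath needs updating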

The Scrapy item

# define the fields to be scraped
import scrapy


class DingItem(scrapy.Item):
    author = scrapy.Field()
    article_title = scrapy.Field()
    content = scrapy.Field()
    book = scrapy.Field()
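
A scrapy.Item behaves like a dict, which is why the spider fills it with subscript syntax; the Field() declarations simply whitelist the allowed keys. A minimal illustration (the value is made up):

from ding.items import DingItem

item = DingItem()
item['book'] = 'example'   # fine: 'book' is declared as a Field above
# item['publisher'] = 'x'  # would raise KeyError: 'publisher' is not a declared field
print(dict(item))          # {'book': 'example'}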

The Scrapy pipeline

# store the scraped items in MongoDB via pymongo
import pymongo


class DingPipeline(object):

    def __init__(self):  # note the double underscores; a plain "init" method is never called
        client = pymongo.MongoClient()            # connects to localhost:27017 by default
        self.db = client['Dingdian']['xiaoshuo']  # database 'Dingdian', collection 'xiaoshuo'

    def process_item(self, item, spider):
        self.db.insert_one(dict(item))  # insert() is deprecated in current pymongo
        return item
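
The pipeline only runs if it is enabled in the project's settings.py, a step the post does not show. A sketch, assuming the default layout generated by scrapy startproject ding:

# settings.py
ITEM_PIPELINES = {
    'ding.pipelines.DingPipeline': 300,  # the number (0-1000) orders multiple pipelines
}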

Running the spider

from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'dingdian'])
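
Saved as, say, main.py in the project root (next to scrapy.cfg), this script is equivalent to running scrapy crawl dingdian on the command line, but it lets the spider be started directly from an IDE or debugger.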