
Python + Scrapy: Crawling Qidian Fiction Data into MySQL and MongoDB

2018-02-21 17:04
Start by capturing and inspecting the page traffic to find where the data for the whole page lives; it can be seen under "lang".

[Screenshot: the captured page response, with the list data under "lang"]
Continue the analysis and locate where each individual book's data sits.

[Screenshot: the HTML block for a single book entry]
With the analysis done, we can start writing the crawler code.
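Before writing the spider, it is worth confirming that the selector found above actually matches. Below is a minimal standalone sketch using requests plus Scrapy's Selector; the "book-mid-info" class is what the analysis above turned up for each book entry and may change if Qidian updates its markup.

# quick_check.py -- standalone sanity check of the XPath used by the spider below (a sketch)
import requests
from scrapy.selector import Selector

url = 'https://www.qidian.com/rank/yuepiao?style=1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}

resp = requests.get(url, headers=headers)
selector = Selector(text=resp.text)
books = selector.xpath('//div[@class="book-mid-info"]')
print(len(books))                                          # number of book entries found on the page
if books:
    print(books[0].xpath('h4/a/text()').extract_first())   # title of the first book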

1. Write the Item

import scrapy


class BookItem(scrapy.Item):
    # novel title
    book_name = scrapy.Field()
    # author
    author = scrapy.Field()
    # genre
    book_type = scrapy.Field()
    # serialization status (ongoing / completed)
    book_state = scrapy.Field()
    # title of the latest chapter
    book_update = scrapy.Field()
    # time of the latest update
    book_time = scrapy.Field()
    # URL of the latest chapter
    new_href = scrapy.Field()
    # synopsis
    book_intro = scrapy.Field()


2. Write the Spider

from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector

from Mycrawl.items import BookItem


class BookSpider(Spider):
    # spider name -- important, this is what "scrapy crawl" refers to
    name = 'book'
    # anti-crawling countermeasure: send a normal browser User-Agent
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}
    # url = "https://www.qidian.com/rank/yuepiao?style=1"
    # start_urls = ['qidian.com']

    def start_requests(self):
        url = "https://www.qidian.com/rank/yuepiao?style=1"
        yield Request(url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        selector = Selector(response)
        books = selector.xpath('//div[@class="book-mid-info"]')
        for book in books:
            # create a fresh item per book (reusing one instance across yields is a common bug)
            item = BookItem()
            name = book.xpath('h4/a/text()').extract()
            author = book.xpath('p[@class="author"]/a[@class="name"]/text()').extract()
            book_type = book.xpath('p[@class="author"]/a[@data-eid="qd_C42"]/text()').extract()
            state = book.xpath('p[@class="author"]/span/text()').extract()
            intro = book.xpath('p[@class="intro"]/text()').extract()
            update = book.xpath('p[@class="update"]/a[@target="_blank"]/text()').extract()
            href = book.xpath('p[@class="update"]/a/@href').extract()
            time = book.xpath('p[@class="update"]/span/text()').extract()

            item['book_name'] = name[0]
            item['author'] = author[0]
            item['book_type'] = book_type[0]
            item['book_state'] = state[0]
            item['book_update'] = update[0]
            item['book_time'] = time[0]
            item['new_href'] = 'https:' + href[0]
            item['book_intro'] = ''.join(intro).replace(' ', '').replace('\n', '')
            yield item


3. Write the pipelines that connect to the databases

MySQL

import pymysql


class BookPipeline(object):
    def __init__(self):
        # connect to the database
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='1likePython',
                                    db='TESTDB', charset='utf8')
        # create a cursor object
        self.cursor = self.conn.cursor()
        # empty the table before every crawl
        self.cursor.execute('truncate table Book')
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("insert into Book (book_name,author,book_type,book_state,book_update,book_time,new_href,book_intro) "
                                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                                (item['book_name'], item['author'], item['book_type'],
                                 item['book_state'], item['book_update'], item['book_time'],
                                 item['new_href'], item['book_intro']))
            self.conn.commit()
        except pymysql.Error:
            print("Error %s,%s,%s,%s,%s,%s,%s,%s" % (item['book_name'], item['author'], item['book_type'],
                                                     item['book_state'], item['book_update'], item['book_time'],
                                                     item['new_href'], item['book_intro']))
        return item
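
The pipeline above assumes a Book table already exists in TESTDB. Below is a minimal one-off setup sketch; only the column names come from the INSERT statement above, while the column types are assumptions (the original article does not give the table definition), so adjust them as needed.

# create_table.py -- one-off setup for the Book table used by the MySQL pipeline (a sketch;
# column types are assumed, only the column names come from the INSERT above)
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='1likePython',
                       db='TESTDB', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS Book (
        id          INT AUTO_INCREMENT PRIMARY KEY,
        book_name   VARCHAR(255),
        author      VARCHAR(255),
        book_type   VARCHAR(64),
        book_state  VARCHAR(64),
        book_update VARCHAR(255),
        book_time   VARCHAR(64),
        new_href    VARCHAR(512),
        book_intro  TEXT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()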


MongoDB

import pymongo


class BookPipeline(object):
    def __init__(self):
        # connect to the database
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.test = self.client['TESTDB']
        self.post = self.test['movie']

    def process_item(self, item, spider):
        data = dict(item)
        # insert() is deprecated in newer pymongo versions; insert_one() does the same job here
        self.post.insert_one(data)
        return item
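
To check what was written, the collection can be queried directly with pymongo. A minimal sketch, using the same database and collection names as the pipeline above:

# query_check.py -- peek at the documents written by the MongoDB pipeline above (a sketch)
import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
collection = client['TESTDB']['movie']      # same names as in the pipeline
print(collection.count_documents({}))       # how many books were stored
for doc in collection.find().limit(3):      # show the first few documents
    print(doc['book_name'], doc['author'])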


4. Write the settings

BOT_NAME = 'Mycrawl'

SPIDER_MODULES = ['Mycrawl.spiders']
NEWSPIDER_MODULE = 'Mycrawl.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mycrawl (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    # 'Mycrawl.pipelines.MoviePipeline': 100,
    'Mycrawl.pipelines.BookPipeline': 300,
}
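
With the settings in place, the crawler can be started from the project directory with scrapy crawl book. Alternatively, it can be launched from a small driver script; below is a minimal sketch using Scrapy's CrawlerProcess (the file name run.py is just an example).

# run.py -- start the 'book' spider from a script instead of the scrapy command line (a sketch)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads settings.py, including ITEM_PIPELINES
process.crawl('book')                             # the spider name defined as BookSpider.name
process.start()                                   # blocks until the crawl finishes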


5. Results

MySQL

[Screenshot: rows stored in the MySQL Book table]

MongoDB

[Screenshot: documents stored in the MongoDB collection]

This article only crawls the first page; you can try crawling all the pages yourself.
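
As a rough sketch of what that might look like, parse in BookSpider could also yield a request for the next page. The page URL parameter below is an assumption based on typical paging query strings and should be checked against the site's real pagination links.

# A possible extension of BookSpider.parse to follow more pages (a sketch; the
# '&page=N' parameter is an assumption -- verify it against the real pagination links)
    def parse(self, response):
        # ... extract and yield items exactly as above ...
        current = int(response.url.split('page=')[-1]) if 'page=' in response.url else 1
        if current < 5:  # for example, stop after 5 pages
            next_url = 'https://www.qidian.com/rank/yuepiao?style=1&page=%d' % (current + 1)
            yield Request(next_url, headers=self.headers, callback=self.parse)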