
Scraping Data with Scrapy and Storing It in MongoDB


Introduction

This time we use scrapy to do a simple crawl of some paginated movie-list data, save it to CSV and JSON files, and finally store all of it in MongoDB. Topics covered include pipelines, yield, middleware, XPath, and the use of items.

Writing the spider entry file

  • File name:
    douban_spider.py
  • Implementation:
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem

class DoubanSpiderSpider(scrapy.Spider):
    # Name of the spider
    name = 'douban_spider'
    # Allowed domains
    allowed_domains = ['movie.douban.com']
    # Entry URL
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        movie_list = response.xpath('//div[@class="article"]//ol[@class="grid_view"]/li')
        for it in movie_list:
            douban_item = DoubanItem()
            douban_item['serial_number'] = it.xpath(".//div[@class='item']//em/text()").extract_first()
            douban_item['movie_name'] = it.xpath('.//div[@class="hd"]//a/span[1]/text()').extract_first()
            # The introduction spans several text nodes; strip the whitespace
            # from each one and join them, instead of overwriting the field
            # on every iteration as the naive loop would
            content = it.xpath('.//div[@class="bd"]//p[1]/text()').extract()
            douban_item['introduce'] = ";".join("".join(c.split()) for c in content)
            douban_item['star'] = it.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()').extract_first()
            douban_item['evaluate'] = it.xpath('.//div[@class="star"]/span[4]/text()').extract_first()
            douban_item['describe'] = it.xpath('.//p[@class="quote"]/span/text()').extract_first()
            yield douban_item
        # Follow the "next page" link until the last page is reached
        next_link = response.xpath('//span[@class="next"]/a/@href').extract()
        if next_link:
            yield scrapy.Request("https://movie.douban.com/top250" + next_link[0], callback=self.parse)
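With the spider saved, it can be run from the project root with Scrapy's crawl command, using the name defined above:

scrapy crawl douban_spider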

Defining the item fields

import scrapy

class DoubanItem(scrapy.Item):
    # Ranking number on the list
    serial_number = scrapy.Field()
    # Movie title
    movie_name = scrapy.Field()
    # Short introduction (director, cast, year, genre)
    introduce = scrapy.Field()
    # Rating
    star = scrapy.Field()
    # Number of reviews
    evaluate = scrapy.Field()
    # One-line description (quote)
    describe = scrapy.Field()
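A scrapy.Item behaves like a dictionary, which is what makes the dict(item) conversion in the pipeline further down possible. A quick illustration (the movie name is just sample data):

from douban.items import DoubanItem

item = DoubanItem()
item['movie_name'] = '肖申克的救赎'
print(dict(item))  # {'movie_name': '肖申克的救赎'}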

Adding a random User-Agent via middleware

import random

class my_user_agent(object):

    def process_request(self, request, spider):
        # Pool of user agent strings to rotate through
        USER_AGENT_LIST = [
            'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
            'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
            'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
            'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
            'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
            'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
            'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
            'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
        ]
        # Pick a random user agent for this request; note the header key
        # is 'User-Agent' with a hyphen, not 'User_Agent'
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)
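The middleware only takes effect once it is registered in settings.py. A minimal sketch, assuming the class above lives in douban/middlewares.py (the priority value 543 is just a common choice):

DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.my_user_agent': 543,
}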

Defining a pipeline to store the data in MongoDB

import pymongo
from douban.settings import mongo_host, mongo_port, mongo_db_name, mongo_db_collecttion

class DoubanPipeline(object):

    def __init__(self):
        # Connect to MongoDB and keep a handle to the target collection
        client = pymongo.MongoClient(host=mongo_host, port=mongo_port)
        mydb = client[mongo_db_name]
        self.post = mydb[mongo_db_collecttion]

    def process_item(self, item, spider):
        # Convert the item to a plain dict and insert it as one document
        # (insert_one replaces the deprecated Collection.insert)
        data = dict(item)
        self.post.insert_one(data)
        return item
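The connection parameters imported above must be defined in settings.py, and the pipeline must be enabled there as well. A minimal sketch (host, port, and names are placeholder values; mongo_db_collecttion is spelled this way to match the import):

mongo_host = '127.0.0.1'
mongo_port = 27017
mongo_db_name = 'douban'
mongo_db_collecttion = 'douban_movie'

ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}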

Other notes
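The CSV and JSON files mentioned in the introduction can be produced without any extra code through Scrapy's built-in feed exports; the output file names below are placeholders:

scrapy crawl douban_spider -o movies.csv
scrapy crawl douban_spider -o movies.json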
