
MongoPipeline, ImagePipeline, CsvPipeline, JsonPipeline, XmlWritePipeline

Improved MongoPipeline

MongoPipeline

ImagePipeline

CsvPipeline

JsonPipeline

XmlWritePipeline

Improved MongoPipeline

2017/11/5

import pymongo
# scrapy.conf is deprecated; read the project settings explicitly instead
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

# Configure MONGO_URI, MONGO_DATABASE and MONGO_COLLECTION in settings.py.
# If they are missing, the defaults are localhost, the project name (BOT_NAME)
# and the spider name, respectively.
class MongoPipeline(object):
    def __init__(self):
        self.mongo_uri = settings.get('MONGO_URI', 'localhost')
        self.mongo_db = settings.get('MONGO_DATABASE', settings['BOT_NAME'])
        self.mongo_collection = settings.get('MONGO_COLLECTION')
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Fall back to the spider name when no collection is configured
        if not self.mongo_collection:
            self.mongo_collection = spider.name
        self.db[self.mongo_collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
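
For reference, a minimal settings.py sketch that wires this pipeline up; myproject stands in for the actual project module and the values are placeholders:

# settings.py (sketch; 'myproject' is a placeholder for your project module)
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'scrapy_db'
MONGO_COLLECTION = 'scrapy_items'   # optional; falls back to the spider name

ITEM_PIPELINES = {
    'myproject.pipelines.MongoPipeline': 300,
}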


MongoPipeline

From the official documentation; MONGO_URI and MONGO_DATABASE must be defined in settings.py.

import pymongo

class MongoPipeline(object):

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item
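
Compared with the module-level settings import in the improved version above, this pipeline receives its configuration through from_crawler, which Scrapy calls with the running Crawler object. That avoids global state and keeps the pipeline easy to test; opening the MongoDB connection in open_spider and closing it in close_spider also scopes the client to the lifetime of the crawl.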


ImagePipeline

A custom image downloader. The image URLs are stored as a list in item['imgs'], item['name'] is used as the folder name, and each image is named by its list index + 1.

The storage path must be specified in settings.py:

IMAGES_STORE = 'F:/images'

from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request

class ImagePipeline(ImagesPipeline):
    # Custom image downloader
    def get_media_requests(self, item, info):
        '''Issue a download request per image. item['imgs'] is our custom
        URL field, stored as a list; iterate over it and request each image.'''
        for i, image_url in enumerate(item['imgs']):
            yield Request(image_url, meta={'item': item, 'index': i + 1})

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        name = item['name']
        index = request.meta['index']
        # e.g. full/<name>/<name><index>.<extension>
        image_guid = name + str(index) + '.' + request.url.split('.')[-1]
        imagepath = 'full/{}/{}'.format(name, image_guid)
        return imagepath
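
A minimal item sketch matching the fields this pipeline expects; the field names imgs and name come from the code above, while ImageItem itself is hypothetical:

import scrapy

class ImageItem(scrapy.Item):
    name = scrapy.Field()   # folder name under IMAGES_STORE
    imgs = scrapy.Field()   # list of image URLs to download

With name = 'cats' and two .jpg URLs, the files would be saved as F:/images/full/cats/cats1.jpg and F:/images/full/cats/cats2.jpg.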


CsvPipeline

import csv

class CsvPipeline(object):
    def __init__(self):
        # newline='' prevents the csv module from writing blank rows on Windows
        self.csvfp = open('pipeline.csv', 'w', encoding='utf8', newline='')
        fieldnames = ['tea_hd', 'name', 'title', 'img_url', 'content']
        self.writer = csv.DictWriter(self.csvfp, fieldnames=fieldnames)
        self.writer.writeheader()

    def process_item(self, item, spider):
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.csvfp.close()
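
One detail worth knowing about csv.DictWriter: if an item carries a field that is not listed in fieldnames, writerow raises ValueError by default. Passing extrasaction='ignore' silently drops unknown fields instead; a minimal standalone demo:

import csv
import io

# Demo: extrasaction='ignore' makes DictWriter skip fields that are
# not in fieldnames instead of raising ValueError.
buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=['name', 'title'], extrasaction='ignore')
writer.writeheader()
writer.writerow({'name': 'a', 'title': 'b', 'extra': 'dropped'})
print(buf.getvalue())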


JsonPipeline

import json

class JsonPipeline(object):
    def open_spider(self, spider):
        self.fp = open('itcast.json', 'w', encoding='utf8')
        self.fp.write('[')
        self.first_item = True

    def process_item(self, item, spider):
        # Write the separator before every item except the first, so the
        # file never ends with a trailing comma (seeking back over the last
        # separator in text mode is platform-dependent and fragile).
        if not self.first_item:
            self.fp.write(',\n')
        self.first_item = False
        self.fp.write(json.dumps(dict(item), ensure_ascii=False))
        return item

    def close_spider(self, spider):
        self.fp.write(']')
        self.fp.close()
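
A design note: the bracket-and-comma bookkeeping disappears entirely if you write JSON Lines instead (one JSON object per line), which is also the format produced by Scrapy's built-in JsonLinesItemExporter. A minimal sketch, with itcast.jl as a placeholder filename:

import json

class JsonLinesPipeline(object):
    def open_spider(self, spider):
        self.fp = open('itcast.jl', 'w', encoding='utf8')

    def process_item(self, item, spider):
        # One self-contained JSON object per line; no separators to manage
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()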


XmlWritePipeline

from scrapy import signals
# scrapy.contrib.exporter is deprecated; the exporter lives in scrapy.exporters
from scrapy.exporters import XmlItemExporter

class XmlWritePipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('bbsData.xml', 'wb')
        self.exporter = XmlItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        # process the crawled data, define and call dataProcess function
        # dataProcess('bbsData.xml', 'text.txt')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
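
By default XmlItemExporter wraps the output in an <items> root element with one <item> child per exported item; both names can be changed via the exporter's root_element and item_element constructor arguments.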