MongoPipeline, ImagePipeline, CsvPipeline, JsonPipeline, XmlWritePipeline
2017-11-11 16:00
Contents:
- Improved MongoPipeline
- MongoPipeline
- ImagePipeline
- CsvPipeline
- JsonPipeline
- XmlWritePipeline
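To enable any of these pipelines, register them under ITEM_PIPELINES in settings.py. A minimal sketch; the module path `myproject.pipelines` and the priority numbers are placeholders (lower numbers run earlier):

```python
# settings.py -- 'myproject' is a placeholder for the actual project name
ITEM_PIPELINES = {
    'myproject.pipelines.ImagePipeline': 100,
    'myproject.pipelines.MongoPipeline': 300,
    'myproject.pipelines.CsvPipeline': 400,
    'myproject.pipelines.JsonPipeline': 500,
    'myproject.pipelines.XmlWritePipeline': 600,
}
```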
Improved MongoPipeline
2017/11/5
```python
import pymongo
# Note: scrapy.conf is deprecated in newer Scrapy releases; the official
# version below shows the recommended from_crawler() approach instead.
from scrapy.conf import settings

# Configure MONGO_URI, MONGO_DATABASE and MONGO_COLLECTION in settings.py.
# If unset, they default to localhost, the project name (BOT_NAME) and the
# spider name, respectively.
class MongoPipeline(object):
    def __init__(self):
        self.mongo_uri = settings.get('MONGO_URI', 'localhost')
        self.mongo_db = settings.get('MONGO_DATABASE', settings['BOT_NAME'])
        self.mongo_collection = settings.get('MONGO_COLLECTION')
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Fall back to the spider name when no collection is configured.
        if not self.mongo_collection:
            self.mongo_collection = spider.name
        self.db[self.mongo_collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
```
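The matching settings might look like this (all values are placeholders):

```python
# settings.py -- placeholder values
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'scrapy_data'
MONGO_COLLECTION = 'articles'  # optional; the pipeline falls back to spider.name
```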
MongoPipeline
From the official Scrapy documentation; MONGO_URI and MONGO_DATABASE must be defined in settings.py.

```python
import pymongo

class MongoPipeline(object):
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull configuration from the crawler's settings.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item
```
ImagePipeline
A custom image downloader: the image URLs are stored as a list on the item, the item's name field is used as the sub-folder name, and each image is numbered by its 1-based index. The storage root must be specified in settings.py:

IMAGES_STORE = 'F:/images'
```python
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request

class ImagePipeline(ImagesPipeline):
    # Custom image downloader.
    def get_media_requests(self, item, info):
        """Issue a download request for every URL in item['imgs'], our
        custom field that stores the image URLs as a list. The item and
        the 1-based index are passed along in the request meta."""
        for i, image_url in enumerate(item['imgs']):
            yield Request(image_url, meta={'item': item, 'index': i + 1})

    def file_path(self, request, response=None, info=None):
        # Save as full/<name>/<name><index>.<ext>.
        item = request.meta['item']
        name = item['name']
        index = request.meta['index']
        image_guid = name + str(index) + '.' + request.url.split('.')[-1]
        return 'full/{}/{}'.format(name, image_guid)
```
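For reference, a minimal item carrying the fields this pipeline reads; GalleryItem and its layout are a hypothetical sketch, not part of the original project:

```python
import scrapy

class GalleryItem(scrapy.Item):
    name = scrapy.Field()    # used as the sub-folder name in file_path()
    imgs = scrapy.Field()    # list of image URLs fetched by get_media_requests()
    images = scrapy.Field()  # the stock ImagesPipeline stores download results here
```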
CsvPipeline
```python
import csv

class CsvPipeline(object):
    def __init__(self):
        # newline='' prevents the extra blank lines the csv module
        # otherwise produces on Windows.
        self.csvfp = open('pipeline.csv', 'w', encoding='utf8', newline='')
        fieldnames = ['tea_hd', 'name', 'title', 'img_url', 'content']
        self.writer = csv.DictWriter(self.csvfp, fieldnames=fieldnames)
        self.writer.writeheader()

    def process_item(self, item, spider):
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.csvfp.close()
```
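A quick way to verify the output after a crawl is to read the file back with the standard csv module:

```python
import csv

# Read the exported file back and print a couple of columns as a sanity check.
with open('pipeline.csv', encoding='utf8') as fp:
    for row in csv.DictReader(fp):
        print(row['name'], row['title'])
```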
JsonPipeline
```python
import json

class JsonPipeline(object):
    def open_spider(self, spider):
        self.fp = open('itcast.json', 'w', encoding='utf8')
        self.fp.write('[')
        self.first_item = True

    def process_item(self, item, spider):
        # Prefix every item except the first with ',\n' so the file never
        # ends in a trailing comma (seeking backwards to erase one is
        # unreliable on text-mode files across platforms).
        if not self.first_item:
            self.fp.write(',\n')
        self.first_item = False
        self.fp.write(json.dumps(dict(item), ensure_ascii=False))
        return item

    def close_spider(self, spider):
        self.fp.write(']')
        self.fp.close()
```
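If a single JSON array is not required, the JSON Lines format (one object per line) avoids the bracket and comma bookkeeping entirely. A hypothetical variant:

```python
import json

class JsonLinesPipeline(object):
    """Write one JSON object per line (.jl / JSON Lines format)."""
    def open_spider(self, spider):
        self.fp = open('itcast.jl', 'w', encoding='utf8')

    def process_item(self, item, spider):
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()
```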
XmlWritePipeline
```python
from scrapy import signals
from scrapy.exporters import XmlItemExporter  # scrapy.contrib.exporter before Scrapy 1.0

class XmlWritePipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Hook spider_opened/spider_closed so the exporter's lifetime
        # matches the spider's.
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('bbsData.xml', 'wb')
        self.exporter = XmlItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
```
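By default, XmlItemExporter wraps the output in an <items> root element with one <item> child per exported item, so the export can be sanity-checked after a crawl with the standard library:

```python
import xml.etree.ElementTree as ET

# Parse the export, report how many items it holds, and show the first few.
root = ET.parse('bbsData.xml').getroot()
print(root.tag, len(root))  # 'items', item count
for item in list(root)[:3]:
    print({child.tag: child.text for child in item})
```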