
Common Scrapy persistence methods (MongoDB, MySQL, ImagesPipeline)

Copyright notice: This is an original article by the author, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting. Original article: https://blog.csdn.net/weixin_44220464/article/details/98975283

Common MongoDB approach:
Create the connection and insert directly inside the pipeline class; this is the more concise option.

import pymongo

class WynewsPipeline(object):
    # connection is created once, when the class is defined
    conn = pymongo.MongoClient('localhost', 27017)
    db = conn.wynews
    table = db.newsinfo

    def process_item(self, item, spider):
        # insert_one replaces the deprecated insert()
        self.table.insert_one(dict(item))
        return item
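For this pipeline to run, it also has to be enabled in settings.py. A minimal sketch, assuming the Scrapy project is named wynews (the project name is not stated in the original):

# settings.py (project name assumed)
ITEM_PIPELINES = {
    'wynews.pipelines.WynewsPipeline': 300,
}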
# The standard method, recommended in the official docs (I have not spotted a concrete difference between the two approaches myself):
open_spider(self, spider): called when the spider is opened
close_spider(self, spider): called when the spider is closed
from_crawler(cls, crawler): class method, marked with @classmethod, used to read configuration from settings
process_item(self, item, spider): stores the data in the database; this method must be implemented
import pymongo

class MongoPipeline(object):
    collection = 'newsinfo'   # target collection name (assumed; the original left it undefined)

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection info from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # open the connection once when the spider starts
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db[self.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # close the connection when the spider finishes
        self.client.close()
# The database connection must be specified in settings.py:
MONGO_URI = 'localhost'
MONGO_DB = 'wynews'
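After a crawl you can check what was stored straight from pymongo. A minimal verification sketch, assuming the database and collection names used above (wynews / newsinfo):

import pymongo

client = pymongo.MongoClient('localhost', 27017)
collection = client['wynews']['newsinfo']
print(collection.count_documents({}))   # number of stored items
print(collection.find_one())            # peek at one stored document
client.close()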
# MySQL interaction:
import pymysql

class MysqlPipeline(object):
    table = 'newsinfo'   # target table name (assumed; the original left it undefined)

    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection info from settings.py
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT')
        )

    def open_spider(self, spider):
        # note: the MySQL charset name is 'utf8mb4', not 'utf-8'
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8mb4', port=self.port)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # build the INSERT statement dynamically from the item's fields
        data = dict(item)
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (self.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
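The keys read in from_crawler must be defined in settings.py, and the target table has to exist in MySQL beforehand. A minimal sketch of the settings with placeholder credentials (none are given in the original):

# settings.py (values are placeholders)
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'wynews'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'your_password'
MYSQL_PORT = 3306

ITEM_PIPELINES = {
    'wynews.pipelines.MysqlPipeline': 300,
}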

ImagesPipeline: storing large files (images)

Under the hood it uses the PIL (Pillow) library, which makes it more stable than an ordinary hand-rolled download.

# Pipeline class used for file downloads
# Spider code:
import scrapy
from ..items import XhxhItem

class XhSpider(scrapy.Spider):
    name = 'xh'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.521609.com/qingchunmeinv/']

    def parse(self, response):
        li_list = response.xpath('//div[@class="index_img list_center"]/ul/li')
        for li in li_list:
            item = XhxhItem()
            # the src attribute is a relative URL, so join it with the site root
            link = li.xpath('./a[1]/img/@src').extract_first()
            item['img_link'] = 'http://www.521609.com' + link
            print(item)
            yield item
# Items code:
import scrapy

class XhxhItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    img_link = scrapy.Field()
# Pipeline code:
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class XhxhPipeline(object):
    def process_item(self, item, spider):
        return item

class ImgPipeLine(ImagesPipeline):

    def get_media_requests(self, item, info):
        # issue a download request for every image URL in the item
        yield scrapy.Request(url=item['img_link'])

    def file_path(self, request, response=None, info=None):
        # name the file after the last segment of its URL
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        # pass the item on to any later pipelines
        return item
# Settings code:
ITEM_PIPELINES = {
    'xhxh.pipelines.XhxhPipeline': 300,
    'xhxh.pipelines.ImgPipeLine': 301,
}
IMAGES_STORE = './mvs'
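Note that ImagesPipeline needs the Pillow library installed to work. With the file_path override above, downloaded images are saved under IMAGES_STORE with the name taken from the URL's last segment, e.g. ./mvs/<file_name>.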