您的位置：首页 > 编程语言 > Python开发

python爬虫网站图片

2017-08-28 18:08 387 查看

1.创建斗鱼工程

[root@namenode02 mySpider]# scrapy startproject douyu

New Scrapy project 'douyu', using template directory '/usr/lib64/python2.7/site-packages/scrapy/templates/project', created in:

/root/mySpider/douyu

You can start your first spider with:

cd douyu

scrapy genspider example example.com

[root@namenode02 mySpider]# cd douyu/
[root@namenode02 douyu]# cd douyu/

2.编辑items文件

[root@namenode02 douyu]# vi items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items

#

# See documentation in:

# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class DouyuItem(scrapy.Item):
    """Item carrying one entry scraped from the Douyu getVerticalRoom API."""

    # Streamer nickname, copied from the API entry's "nickname" key.
    nickname = scrapy.Field()

    # URL of the cover image, copied from the API entry's "vertical_src" key.
    imagelink = scrapy.Field()

    # Local path of the downloaded image; filled in by the image pipeline.
    imagePath = scrapy.Field()

[root@namenode02 spiders]# pwd

/root/mySpider/douyu/douyu/spiders

[root@namenode02 spiders]# scrapy genspider douyumeinv "capi.douyucdn.cn"

Created spider 'douyumeinv' using template 'basic' in module:

douyu.spiders.douyumeinv

[root@namenode02 douyu]# cat settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for douyu project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Name of this Scrapy project's bot.
BOT_NAME = "douyu"

# Packages searched for spiders, and the package that receives spiders
# generated by `scrapy genspider`.
SPIDER_MODULES = ["douyu.spiders"]
NEWSPIDER_MODULE = "douyu.spiders"

# Crawl responsibly by identifying yourself (and your website) on the
# user-agent.
#USER_AGENT = 'douyu (+http://www.yourdomain.com)'

# Respect robots.txt rules.
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# Default headers sent with requests.
# NOTE(review): the User-Agent looks like an Apple CFNetwork client string
# (presumably the Douyu mobile app) -- confirm it is still accepted by the API.
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "DYZB/1 CFNetwork/808.2.16 Darwin/16.3.0",
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

# Enable or disable spider middlewares

# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {

# 'douyu.middlewares.DouyuSpiderMiddleware': 543,

#}

# Enable or disable downloader middlewares

# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {

# 'douyu.middlewares.MyCustomDownloaderMiddleware': 543,

#}

# Enable or disable extensions

# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {

# 'scrapy.extensions.telnet.TelnetConsole': None,

#}

# Configure item pipelines

# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Enable the project's image pipeline (defined in douyu/pipelines.py).
ITEM_PIPELINES = {
    'douyu.pipelines.ImagesPipeline': 300,
}

# Directory where downloaded images are stored.
IMAGES_STORE = "/root/mySpider/douyu/Images"

# Enable and configure the AutoThrottle extension (disabled by default)

# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = 'httpcache'

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

[root@namenode02 douyu]# cat pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy

from scrapy.utils.project import get_project_settings

from scrapy.pipelines.images import ImagesPipeline

import os

class ImagesPipeline(ImagesPipeline):
    """Download each item's cover image and rename it after the streamer.

    The class keeps the name ``ImagesPipeline`` because the project settings
    enable it as ``douyu.pipelines.ImagesPipeline``. The base class named in
    the parentheses is the ``ImagesPipeline`` imported from Scrapy; it is
    looked up before this definition rebinds the name.
    """

    # Image root directory, read from the IMAGES_STORE project setting.
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Yield the download request for the item's ``imagelink`` URL."""
        yield scrapy.Request(item["imagelink"])

    def item_completed(self, result, item, info):
        """Rename the downloaded image to ``<nickname>.jpg`` and record its path.

        ``result`` is iterated as ``(ok, info)`` pairs; the ``"path"`` entry
        of each successful pair is the file location relative to
        ``IMAGES_STORE``.

        Returns the item with ``imagePath`` set to the renamed file.
        Raises ``DropItem`` when no image was downloaded for the item.
        """
        from scrapy.exceptions import DropItem

        image_path = [x["path"] for ok, x in result if ok]
        if not image_path:
            # Without this guard a failed download crashes on image_path[0].
            raise DropItem("Image download failed for %s" % item["imagelink"])

        # The nickname comes from the remote API; neutralise path separators
        # so it cannot point outside IMAGES_STORE.
        safe_name = item["nickname"].replace("/", "_")
        target = os.path.join(self.IMAGES_STORE, safe_name + ".jpg")
        os.rename(os.path.join(self.IMAGES_STORE, image_path[0]), target)

        # Store the real file name, including the ".jpg" suffix.
        item["imagePath"] = target
        return item

[root@namenode02 spiders]# cat douyumeinv.py

# -*- coding: utf-8 -*-

import scrapy

from douyu.items import DouyuItem

import json

class DouyumeinvSpider(scrapy.Spider):
    """Page through the Douyu getVerticalRoom API and emit one item per entry."""

    name = 'douyumeinv'
    allowed_domains = ['capi.douyucdn.cn']

    # Paging state: the API is queried 20 entries at a time, starting at 0.
    offset = 0
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Yield a ``DouyuItem`` per entry, then request the next page.

        Pagination stops as soon as a page comes back with an empty ``data``
        list; otherwise the spider would keep requesting ever-larger offsets
        forever.
        """
        # Convert the JSON payload to Python objects; "data" holds a list.
        data = json.loads(response.text)["data"]
        if not data:
            return

        for each in data:
            item = DouyuItem()
            item["nickname"] = each["nickname"]
            item["imagelink"] = each["vertical_src"]
            yield item

        # Advance by one page (the URL asks for limit=20 entries per call).
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

[root@namenode02 douyu]# cat pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy

from scrapy.utils.project import get_project_settings

from scrapy.pipelines.images import ImagesPipeline

import os

class ImagesPipeline(ImagesPipeline):
    """Download each item's cover image and rename it after the streamer.

    The class keeps the name ``ImagesPipeline`` because the project settings
    enable it as ``douyu.pipelines.ImagesPipeline``. The base class named in
    the parentheses is the ``ImagesPipeline`` imported from Scrapy; it is
    looked up before this definition rebinds the name.
    """

    # Image root directory, read from the IMAGES_STORE project setting.
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Yield the download request for the item's ``imagelink`` URL."""
        yield scrapy.Request(item["imagelink"])

    def item_completed(self, result, item, info):
        """Rename the downloaded image to ``<nickname>.jpg`` and record its path.

        ``result`` is iterated as ``(ok, info)`` pairs; the ``"path"`` entry
        of each successful pair is the file location relative to
        ``IMAGES_STORE``.

        Returns the item with ``imagePath`` set to the renamed file.
        Raises ``DropItem`` when no image was downloaded for the item.
        """
        from scrapy.exceptions import DropItem

        image_path = [x["path"] for ok, x in result if ok]
        if not image_path:
            # Without this guard a failed download crashes on image_path[0].
            raise DropItem("Image download failed for %s" % item["imagelink"])

        # The nickname comes from the remote API; neutralise path separators
        # so it cannot point outside IMAGES_STORE.
        safe_name = item["nickname"].replace("/", "_")
        target = os.path.join(self.IMAGES_STORE, safe_name + ".jpg")
        os.rename(os.path.join(self.IMAGES_STORE, image_path[0]), target)

        # Store the real file name, including the ".jpg" suffix.
        item["imagePath"] = target
        return item

[root@namenode02 spiders]#

[root@namenode02 spiders]# scrapy crawl douyumeinv

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航