您的位置:首页 > 编程语言 > Python开发

python爬虫网站图片

2017-08-28 18:08 387 查看
1.创建斗鱼工程

[root@namenode02 mySpider]# scrapy startproject douyu

New Scrapy project 'douyu', using template directory '/usr/lib64/python2.7/site-packages/scrapy/templates/project', created in:

    /root/mySpider/douyu

You can start your first spider with:

    cd douyu

    scrapy genspider example example.com

[root@namenode02 mySpider]# cd douyu/
[root@namenode02 douyu]# cd douyu/

2.编辑items文件

[root@namenode02 douyu]# vi items.py 

# -*- coding: utf-8 -*-

# Define here the models for your scraped items

#

# See documentation in:

# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class DouyuItem(scrapy.Item):
    """Container for one Douyu anchor scraped by the douyumeinv spider."""

    # define the fields for your item here like:

    # Anchor's display name (copied from the API's "nickname" field).
    nickname = scrapy.Field()

    # Remote URL of the anchor's room image (the API's "vertical_src").
    imagelink = scrapy.Field()

    # Local path of the downloaded image, filled in by the image pipeline.
    imagePath = scrapy.Field()

[root@namenode02 spiders]# pwd

/root/mySpider/douyu/douyu/spiders

[root@namenode02 spiders]# scrapy  genspider douyumeinv "capi.douyucdn.cn"

Created spider 'douyumeinv' using template 'basic' in module:

  douyu.spiders.douyumeinv

[root@namenode02 douyu]# cat settings.py 

# -*- coding: utf-8 -*-

# Scrapy settings for douyu project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'douyu'

SPIDER_MODULES = ['douyu.spiders']

NEWSPIDER_MODULE = 'douyu.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

#USER_AGENT = 'douyu (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {

    "User-Agent" : "DYZB/1 CFNetwork/808.2.16 Darwin/16.3.0"

#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

#   'Accept-Language': 'en',

}

# Enable or disable spider middlewares

# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {

#    'douyu.middlewares.DouyuSpiderMiddleware': 543,

#}

# Enable or disable downloader middlewares

# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {

#    'douyu.middlewares.MyCustomDownloaderMiddleware': 543,

#}

# Enable or disable extensions

# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {

#    'scrapy.extensions.telnet.TelnetConsole': None,

#}

# Configure item pipelines

# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {

     'douyu.pipelines.ImagesPipeline': 300,

}

#图片存储路径

IMAGES_STORE = "/root/mySpider/douyu/Images"

# Enable and configure the AutoThrottle extension (disabled by default)

# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = 'httpcache'

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

[root@namenode02 douyu]# cat pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy

from scrapy.utils.project import get_project_settings

from scrapy.pipelines.images import ImagesPipeline

import os

class ImagesPipeline(ImagesPipeline):
    """Download each item's image and rename it to <nickname>.jpg.

    NOTE(review): this class shadows scrapy's ImagesPipeline name; kept as-is
    because settings.py registers it as 'douyu.pipelines.ImagesPipeline'.
    """

    # Read the image storage root configured in settings.py (IMAGES_STORE).
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Schedule the item's image URL for download by the pipeline."""
        yield scrapy.Request(item["imagelink"])

    def item_completed(self, result, item, info):
        """Rename the downloaded file to <nickname>.jpg and record its path.

        `result` is a list of (success, info_dict) tuples, one per request.
        """
        image_path = [x["path"] for ok, x in result if ok]
        # Guard against a failed download: the original indexed [0] on an
        # empty list and raised IndexError.
        if not image_path:
            return item
        new_path = self.IMAGES_STORE + "/" + item["nickname"] + ".jpg"
        os.rename(self.IMAGES_STORE + "/" + image_path[0], new_path)
        # Bug fix: store the actual renamed path including the ".jpg" suffix
        # (the original recorded a path without the extension, which does not
        # match the file on disk).
        item["imagePath"] = new_path
        return item

[root@namenode02 spiders]# cat douyumeinv.py

# -*- coding: utf-8 -*-

import scrapy

from douyu.items import DouyuItem 

import json

class DouyumeinvSpider(scrapy.Spider):
    """Spider for Douyu's vertical-room API, paging 20 records at a time."""

    name = 'douyumeinv'
    allowed_domains = ['capi.douyucdn.cn']
    offset = 0
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Parse one JSON page, yield one item per room, then request the next page."""
        # The JSON payload's "data" field is a list of room dicts; use .get()
        # so an error payload without "data" does not raise KeyError.
        data = json.loads(response.text).get("data")
        # Bug fix: stop paginating once the API returns an empty page — the
        # original unconditionally requested the next offset forever.
        if not data:
            return
        for each in data:
            item = DouyuItem()
            item["nickname"] = each["nickname"]
            item["imagelink"] = each["vertical_src"]
            yield item
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

[root@namenode02 douyu]# cat pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy

from scrapy.utils.project import get_project_settings

from scrapy.pipelines.images import ImagesPipeline

import os

class ImagesPipeline(ImagesPipeline):
    """Download each item's image and rename it to <nickname>.jpg.

    NOTE(review): this class shadows scrapy's ImagesPipeline name; kept as-is
    because settings.py registers it as 'douyu.pipelines.ImagesPipeline'.
    """

    # Read the image storage root configured in settings.py (IMAGES_STORE).
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Schedule the item's image URL for download by the pipeline."""
        yield scrapy.Request(item["imagelink"])

    def item_completed(self, result, item, info):
        """Rename the downloaded file to <nickname>.jpg and record its path.

        `result` is a list of (success, info_dict) tuples, one per request.
        """
        image_path = [x["path"] for ok, x in result if ok]
        # Guard against a failed download: the original indexed [0] on an
        # empty list and raised IndexError.
        if not image_path:
            return item
        new_path = self.IMAGES_STORE + "/" + item["nickname"] + ".jpg"
        os.rename(self.IMAGES_STORE + "/" + image_path[0], new_path)
        # Bug fix: store the actual renamed path including the ".jpg" suffix
        # (the original recorded a path without the extension, which does not
        # match the file on disk).
        item["imagePath"] = new_path
        return item

[root@namenode02 spiders]#  

[root@namenode02 spiders]# scrapy crawl douyumeinv
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: