
Getting started with crawlers: scraping 电影天堂 (dytt8.net) movie resources into a local MySQL database

2020-04-26 18:51

I have just started a crawler project as data preparation for my graduation design. After spending a few days learning the basics of web scraping, I wrote a Scrapy project that crawls 电影天堂 (dytt8.net) and saves each movie's download link, poster image, director and other information into a local MySQL database. The fields are: title, translated title (transname), type, year (time), IMDb rating (averageratings), directors, intro, download URL (down_url) and image URL (imageurl); see items.py below.
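For reference, here is a minimal sketch of the MySQL table the pipeline below writes into. The column names come from the INSERT statement in pipelines.py; the column types and lengths are my own assumptions, so adjust them to your data.

# create_table.py -- one-off helper to create the dytt table (column types are assumptions)
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='******', database='javatest', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS dytt (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),          -- movie title
    type VARCHAR(255),           -- category
    time VARCHAR(64),            -- year
    averageratings VARCHAR(16),  -- IMDb rating
    directors VARCHAR(255),      -- directors
    intro TEXT,                  -- synopsis
    url TEXT,                    -- download link
    transname VARCHAR(255),      -- translated title
    image TEXT                   -- poster image URL
)
""")
conn.commit()
conn.close()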

The full code is as follows:
demo_scrapy.py:

# Spider main body
import re

import scrapy
from movie.items import MovieItem
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class DmozSpider(scrapy.spiders.Spider):
    name = "demo"
    allowed_domains = ['www.dytt8.net']
    start_urls = ['https://www.dytt8.net/html/gndy/dyzz/index.html']

    def parse(self, response):
        info_url_xpath = '//td/b/a/@href'
        next_url_xpath = '//div[@class="x"]/td/a[last()-1]/@href'
        # URLs of the movie detail pages on the current list page
        info_urls = response.xpath(info_url_xpath).extract()
        next_urls = response.xpath(next_url_xpath).extract()
        # Follow every detail page
        for info_url in info_urls:
            yield scrapy.Request(url='https://www.dytt8.net' + info_url, callback=self.def_info)
        # Follow the next list page, if there is one
        if next_urls:
            next_url = 'https://www.dytt8.net/html/gndy/dyzz/' + next_urls[0]
            yield scrapy.Request(next_url, callback=self.parse)

    # Extract the movie title, download link and the other fields
    def def_info(self, response):
        i_item = MovieItem()
        data = response.body.decode("gb2312", "ignore")
        down_url_xpath = '//tbody/tr/td/a/text()'
        imageurl_xpath = '//img[@alt=""]/@src'
        imageurl = response.xpath(imageurl_xpath).extract_first()
        down_url = response.xpath(down_url_xpath).extract_first()
        # The labels on the detail pages contain extra (full-width) spaces,
        # copied verbatim into the patterns below
        pat1 = '类  别 (.*?)<br />'
        pat2 = '年  代 (.*?)<br />'
        pat3 = 'IMDb评分 (.*?)/10'
        pat4 = '导  演 (.*?)<br />'
        pat5 = '简  介 <br /><br />  (.*?) <br />'
        pat6 = '片  名 (.*?)<br />'
        pat7 = '译  名 (.*?)<br />'
        name = re.compile(pat6, re.S).findall(data)
        type = re.compile(pat1, re.S).findall(data)
        time = re.compile(pat2, re.S).findall(data)
        averageratings = re.compile(pat3, re.S).findall(data)
        directors = re.compile(pat4, re.S).findall(data)
        intro = re.compile(pat5, re.S).findall(data)
        transname = re.compile(pat7, re.S).findall(data)
        # NB: these re.sub calls have no effect -- they operate on the pattern
        # strings instead of the extracted values and their return values are
        # discarded, so single quotes still reach the SQL in the pipeline
        # (see the notes at the end of the post)
        re.sub("'", ".", pat1)
        re.sub("'", ".", pat2)
        re.sub("'", ".", pat3)
        re.sub("'", ".", pat4)
        re.sub("'", ".", pat5)
        re.sub("'", ".", pat6)
        re.sub("'", ".", pat7)
        if len(averageratings):
            i_item['averageratings'] = averageratings[0]
        else:
            i_item['averageratings'] = '0'

        if len(name):
            i_item['title'] = name[0]
        else:
            i_item['title'] = ' '

        if down_url is not None:
            i_item['down_url'] = down_url
        else:
            i_item['down_url'] = ' '

        if imageurl is not None:
            i_item['imageurl'] = imageurl
        else:
            i_item['imageurl'] = ' '

        if len(type):
            i_item['type'] = type[0]
        else:
            i_item['type'] = ' '

        if len(time):
            i_item['time'] = time[0]
        else:
            i_item['time'] = ' '

        if len(directors):
            i_item['directors'] = directors[0]
        else:
            i_item['directors'] = ' '

        if len(intro):
            i_item['intro'] = intro[0]
        else:
            i_item['intro'] = ' '

        if len(transname):
            i_item['transname'] = transname[0]
        else:
            i_item['transname'] = ' '

        yield i_item
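The exact spacing in these patterns only matches the newer detail pages; as noted at the end of the post, pages from before about 2018 put a different number of spaces between the label characters, so the patterns above miss them. A more tolerant variant, shown here only as a sketch and not part of the project code, lets \s* absorb whatever spacing the page uses (in Python 3, \s also matches the full-width space):

# Hypothetical, more space-tolerant patterns for the detail pages
import re

pat_type = re.compile(r'类\s*别\s*(.*?)<br />', re.S)       # category
pat_year = re.compile(r'年\s*代\s*(.*?)<br />', re.S)       # year
pat_director = re.compile(r'导\s*演\s*(.*?)<br />', re.S)   # director

sample = '◎类  别 剧情<br />'
print(pat_type.findall(sample))   # -> ['剧情']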

items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MovieItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()            # movie title
    down_url = scrapy.Field()         # download link
    type = scrapy.Field()             # category
    time = scrapy.Field()             # year
    averageratings = scrapy.Field()   # IMDb rating
    directors = scrapy.Field()        # directors
    intro = scrapy.Field()            # synopsis
    transname = scrapy.Field()        # translated title
    imageurl = scrapy.Field()         # poster image URL

middlewares.py:

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random

from scrapy import signals

from movie.settings import USER_AGENT_LIST


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        if request.url.startswith("http://"):
            request.meta['proxy'] = "http://" + '222.95.144.65:3000'    # HTTP proxy
        elif request.url.startswith("https://"):
            request.meta['proxy'] = "https://" + '222.95.144.65:3000'   # HTTPS proxy


class MovieSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class MovieDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # Pick a random User-Agent from the list defined in settings.py
        rand_use = random.choice(USER_AGENT_LIST)
        if rand_use:
            request.headers.setdefault('User-Agent', rand_use)

pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class DyttPipeline(object):
    def process_item(self, item, spider):
        # Save the item to MySQL (one connection per item)
        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                               password='******', database='javatest', charset='utf8')
        cursor = conn.cursor()
        # NB: building the SQL with % formatting breaks when a value contains a
        # single quote (see the notes at the end of the post)
        sql = ("insert into dytt(title,type,time,averageratings,directors,intro,url,transname,image) "
               "VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (
                   item['title'], item['type'], item['time'], item['averageratings'],
                   item['directors'], item['intro'], item['down_url'],
                   item['transname'], item['imageurl']))
        cursor.execute(sql)
        conn.commit()
        conn.close()
        return item
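Because the pipeline above interpolates raw strings into the SQL, a title containing a single quote makes the INSERT fail (see the notes at the end of the post). A safer alternative, sketched here only as an illustration and assuming the same dytt table, lets pymysql quote the values and reuses one connection for the whole crawl:

# Hypothetical alternative pipeline using parameterized queries
import pymysql


class DyttSafePipeline(object):
    def open_spider(self, spider):
        # One connection for the whole crawl instead of one per item
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    password='******', database='javatest', charset='utf8')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        sql = ("insert into dytt(title,type,time,averageratings,directors,intro,url,transname,image) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        # Passing the values separately means single quotes in titles no longer break the insert
        self.cursor.execute(sql, (item['title'], item['type'], item['time'],
                                  item['averageratings'], item['directors'], item['intro'],
                                  item['down_url'], item['transname'], item['imageurl']))
        self.conn.commit()
        return item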

settings.py only contains things like the User-Agent list and other request-header settings, so I won't paste it here.
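For completeness, here is a minimal sketch of the settings the code above relies on. The setting names and the movie.* class paths come from the code in this post; the priorities and the example User-Agent strings are assumptions.

# settings.py (sketch) -- only the parts the spider, middlewares and pipeline need
BOT_NAME = 'movie'

SPIDER_MODULES = ['movie.spiders']
NEWSPIDER_MODULE = 'movie.spiders'

ROBOTSTXT_OBEY = False

# User-Agent pool consumed by RandomUserAgentMiddleware (example entries)
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
]

# Enable the custom downloader middlewares from middlewares.py
DOWNLOADER_MIDDLEWARES = {
    'movie.middlewares.RandomUserAgentMiddleware': 400,
    'movie.middlewares.ProxyMiddleware': 410,
}

# Enable the MySQL pipeline from pipelines.py
ITEM_PIPELINES = {
    'movie.pipelines.DyttPipeline': 300,
}

# The crawl is then started with:  scrapy crawl demo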
Some problems I ran into while crawling 电影天堂 with the Scrapy framework

  • Proxies: without one, 电影天堂 starts refusing requests after about two crawls, but the speed of different proxies varies enormously. For the roughly 5,000 entries in total, a good proxy finished the whole crawl in about 10 minutes, while an average one only got through about 1,000 entries in the time it takes to play a game of LoL. I found the proxies on the 西刺 (Xici) free-proxy site and switched immediately whenever one turned out to be slow. (A sketch of a simple random-proxy middleware follows this list.)
  • Regular expressions: the newer detail pages differ from the older ones by a few extra spaces between some tags. Patterns written against the new layout can fail on pages from before about 2018, but they match again once the extra spaces are added.
  • The result of a regex findall is a list: if you don't index into it with [0], you get ['电影'] instead of 电影. Single quotes in the data also clash with the single quotes in the hand-built SQL; I wrote a re.sub to strip them, but it operates on the pattern strings rather than the extracted values and its return value is discarded, so it has no effect, and I didn't dig further. The result is that titles whose English name contains a single quote cannot be inserted into the database (the parameterized pipeline sketched above avoids this).
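As mentioned in the first point, relying on a single hard-coded proxy (as ProxyMiddleware above does) means the whole crawl stalls when that proxy is slow. Here is a minimal sketch of a random proxy-pool middleware, assuming you maintain your own PROXY_LIST of addresses collected from 西刺 or elsewhere (the addresses below are placeholders):

# Hypothetical replacement for ProxyMiddleware: pick a random proxy per request
import random

# Addresses you collected yourself, e.g. from the 西刺 free-proxy site (placeholders)
PROXY_LIST = [
    '222.95.144.65:3000',
    '12.34.56.78:8080',
]


class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXY_LIST)
        if request.url.startswith("https://"):
            request.meta['proxy'] = "https://" + proxy
        else:
            request.meta['proxy'] = "http://" + proxy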