Getting started with web scraping: crawling 电影天堂 (dytt8.net) movie resources into a local MySQL database
2020-04-26 18:51
This scraper is the data-preparation step for my graduation project. After a few days spent learning the basics of web scraping, I put together a Scrapy project that crawls 电影天堂 and saves each movie's download link, poster image, director and other details into a local MySQL database. The fields are: title (片名), translated title (译名), genre, year, IMDb rating, directors, synopsis, download link, and poster image URL.

The code is as follows:
demo_scrapy.py:
```python
# demo_scrapy.py — the spider itself
import re

import scrapy

from movie.items import MovieItem


class DmozSpider(scrapy.Spider):
    name = "demo"
    allowed_domains = ['www.dytt8.net']
    start_urls = ['https://www.dytt8.net/html/gndy/dyzz/index.html']

    def parse(self, response):
        # Relative links to each movie's detail page on the list page
        info_urls = response.xpath('//td/b/a/@href').extract()
        # The "next page" link sits second-from-last in the pager row
        next_urls = response.xpath('//div[@class="x"]/td/a[last()-1]/@href').extract()

        for info_url in info_urls:
            yield scrapy.Request(url='https://www.dytt8.net' + info_url,
                                 callback=self.def_info)

        if next_urls:  # guard against the last page, which has no next link
            next_url = 'https://www.dytt8.net/html/gndy/dyzz/' + next_urls[0]
            yield scrapy.Request(next_url, callback=self.parse)

    # Extract the title, download link and other metadata from a detail page
    def def_info(self, response):
        i_item = MovieItem()
        # dytt8.net serves GB2312; decode leniently for the regex matching below
        data = response.body.decode('gb2312', 'ignore')

        down_url = response.xpath('//tbody/tr/td/a/text()').extract_first()
        imageurl = response.xpath('//img[@alt=""]/@src').extract_first()

        # The metadata lines are plain text in the page body, so regexes are
        # simpler than XPath here (note the spaces inside the labels)
        pat1 = '类 别 (.*?)<br />'                # genre
        pat2 = '年 代 (.*?)<br />'                # year
        pat3 = 'IMDb评分 (.*?)/10'                # IMDb rating
        pat4 = '导 演 (.*?)<br />'                # directors
        pat5 = '简 介 <br /><br /> (.*?) <br />'  # synopsis
        pat6 = '片 名 (.*?)<br />'                # title
        pat7 = '译 名 (.*?)<br />'                # translated title

        def first(pattern, default=' '):
            # findall() returns a list; keep the first hit or fall back
            matches = re.findall(pattern, data, re.S)
            return matches[0] if matches else default

        i_item['title'] = first(pat6)
        i_item['transname'] = first(pat7)
        i_item['type'] = first(pat1)
        i_item['time'] = first(pat2)
        i_item['averageratings'] = first(pat3, default='0')
        i_item['directors'] = first(pat4)
        i_item['intro'] = first(pat5)
        i_item['down_url'] = down_url if down_url is not None else ' '
        i_item['imageurl'] = imageurl if imageurl is not None else ' '
        yield i_item
```
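With all four files in place, the crawl starts from the project root with `scrapy crawl demo` — `demo` being the spider's `name` above; the imports assume the Scrapy project package is called `movie`.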
items.py:
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MovieItem(scrapy.Item):
    title = scrapy.Field()           # movie title (片名)
    down_url = scrapy.Field()        # download link
    type = scrapy.Field()            # genre
    time = scrapy.Field()            # year
    averageratings = scrapy.Field()  # IMDb rating
    directors = scrapy.Field()       # directors
    intro = scrapy.Field()           # synopsis
    transname = scrapy.Field()       # translated title (译名)
    imageurl = scrapy.Field()        # poster image URL
```
middlewares.py:
```python
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random

from scrapy import signals

from movie.settings import USER_AGENT_LIST


class ProxyMiddleware(object):
    """Route every request through a fixed proxy."""

    def process_request(self, request, spider):
        if request.url.startswith("http://"):
            request.meta['proxy'] = "http://" + '222.95.144.65:3000'   # HTTP proxy
        elif request.url.startswith("https://"):
            request.meta['proxy'] = "https://" + '222.95.144.65:3000'  # HTTPS proxy


class MovieSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class MovieDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RandomUserAgentMiddleware(object):
    """Pick a random User-Agent from settings.USER_AGENT_LIST per request."""

    def process_request(self, request, spider):
        rand_use = random.choice(USER_AGENT_LIST)
        if rand_use:
            request.headers.setdefault('User-Agent', rand_use)
```
pipelines.py:
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class DyttPipeline(object):
    def open_spider(self, spider):
        # One connection for the whole crawl instead of one per item
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    password='******', database='javatest',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        # Parameterised query: the driver escapes quotes, so titles
        # containing apostrophes no longer break the INSERT
        sql = ("insert into dytt(title,type,time,averageratings,directors,"
               "intro,url,transname,image) "
               "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        self.cursor.execute(sql, (item['title'], item['type'], item['time'],
                                  item['averageratings'], item['directors'],
                                  item['intro'], item['down_url'],
                                  item['transname'], item['imageurl']))
        self.conn.commit()
        return item
```
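For reference, the `dytt` table has to exist before the pipeline can insert into it. The post never shows the schema, so the column types below are my assumptions, matched to the nine columns the INSERT uses; a minimal pymysql sketch to create it:

```python
import pymysql

# Assumed schema — the original post does not list the column types,
# so the VARCHAR/TEXT choices here are guesses matching the INSERT above.
DDL = """
CREATE TABLE IF NOT EXISTS dytt (
    id             INT AUTO_INCREMENT PRIMARY KEY,
    title          VARCHAR(255),
    type           VARCHAR(64),
    time           VARCHAR(64),
    averageratings VARCHAR(16),
    directors      VARCHAR(255),
    intro          TEXT,
    url            TEXT,
    transname      VARCHAR(255),
    image          VARCHAR(512)
) DEFAULT CHARSET=utf8
"""

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='******', database='javatest', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()
```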
settings.py just contains the browser User-Agent strings and related switches, so I won't paste the original here; a sketch of the parts the other files rely on follows.
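A hedged sketch of what that settings.py plausibly contains. `USER_AGENT_LIST` is imported by `RandomUserAgentMiddleware`, and the class paths assume the project package is `movie`; the sample UA strings, priorities, and switches are my assumptions, not the original file:

```python
# settings.py — a sketch; the original file is not shown in the post
BOT_NAME = 'movie'
SPIDER_MODULES = ['movie.spiders']
NEWSPIDER_MODULE = 'movie.spiders'

# Assumption: a crawl like this usually has to ignore robots.txt
ROBOTSTXT_OBEY = False

# A couple of sample desktop browsers; the original list is not shown
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/13.1 Safari/605.1.15',
]

DOWNLOADER_MIDDLEWARES = {
    # Hand User-Agent selection over to the random middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'movie.middlewares.RandomUserAgentMiddleware': 400,
    'movie.middlewares.ProxyMiddleware': 750,
}

ITEM_PIPELINES = {
    'movie.pipelines.DyttPipeline': 300,
}
```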
Problems I ran into while crawling 电影天堂 with Scrapy
- Proxies: without one, 电影天堂 refuses connections after about two crawls, but proxy speed varies wildly. For the roughly 5,000 records in total, a good proxy finished the whole crawl in about ten minutes, while an average one managed only about 1,000 records in the time it takes to play a round of LoL. I found the proxies on the Xici proxy site (西刺代理); when a proxy turns out to be slow, switch it out right away (a pool-rotation sketch follows after this list).
- Regular expressions: 电影天堂's newer detail pages differ from the older ones — the older pages have a few extra spaces between the label tags. A pattern written against the new layout can fail on essentially every page from before 2018, but adding the extra spaces makes those pages match too (a whitespace-tolerant pattern is sketched after this list).
- A regex extraction returns a list: without indexing with [0] you get ['电影'] rather than 电影. Single quotes in the extracted text also clash with the quoting in a hand-built MySQL INSERT. I originally wrote a re.sub to strip the quotes, but it had no effect — re.sub returns a new string rather than modifying in place, and I had applied it to the patterns instead of the extracted values — so titles containing an apostrophe failed to insert. The parameterised query now used in pipelines.py above sidesteps the problem entirely, since the driver does the escaping.
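On the proxy point above: `ProxyMiddleware` hardcodes a single address, so swapping out a slow proxy means editing code. A minimal rotating sketch, assuming a hypothetical `PROXY_LIST` setting filled with `host:port` strings collected from 西刺代理:

```python
import random


class RandomProxyMiddleware(object):
    # PROXY_LIST is a hypothetical setting, e.g.
    # PROXY_LIST = ['222.95.144.65:3000', '1.2.3.4:8080']
    def __init__(self, proxy_list):
        self.proxy_list = proxy_list

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getlist('PROXY_LIST'))

    def process_request(self, request, spider):
        if not self.proxy_list:
            return
        proxy = random.choice(self.proxy_list)
        # Match the proxy scheme to the request scheme, as the
        # original ProxyMiddleware does
        scheme = 'https' if request.url.startswith('https://') else 'http'
        request.meta['proxy'] = '%s://%s' % (scheme, proxy)
```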
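And on the old-versus-new page layouts: instead of keeping one pattern per layout, the whitespace inside the labels can be made flexible with `\s*`. A small sketch, not the pattern the code above ships with:

```python
import re

# Older detail pages pad the label with extra spaces ("导  演" vs "导 演");
# \s* between and after the characters matches both generations of pages.
pat_director = re.compile(r'导\s*演\s*(.*?)<br />', re.S)

print(pat_director.findall('◎导  演 Frank Darabont<br />'))  # ['Frank Darabont']
print(pat_director.findall('◎导 演 Frank Darabont<br />'))   # ['Frank Darabont']
```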