
Python 3 Crawler: 电影天堂 (dytt8.net)


A Python 3 crawler that scrapes the latest movie information from 电影天堂 (dytt8.net).

# Import the required libraries
import requests
from lxml import etree

# Define the global variables: site root and request headers
BASE_URL = "https://www.dytt8.net"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
# Parse a listing page of the latest movies and collect the detail-page URL of every movie on it
def get_detail_urls(url):
    response = requests.get(url, headers=HEADERS)
    text = response.text
    html = etree.HTML(text)
    detail_urls = html.xpath('//table[@class="tbspan"]//a/@href')
    # for detail_url in detail_urls:
    #     print(BASE_URL + detail_url)
    detail_urls = map(lambda url: BASE_URL + url, detail_urls)
    return detail_urls
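
# Quick sanity check (assuming the site layout is unchanged; the URL pattern below comes from spider() further down):
#     for detail_url in get_detail_urls("https://www.dytt8.net/html/gndy/dyzz/list_23_1.html"):
#         print(detail_url)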
# Parse a movie detail page and extract the movie's information
def parse_detail_page(url):
    movie = {}
    response = requests.get(url, headers=HEADERS)
    # The detail pages are GBK-encoded; a few pages contain bytes that are not valid GBK,
    # so ignore them instead of raising UnicodeDecodeError
    text = response.content.decode("gbk", errors="ignore")
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font/text()")[0]
    movie["title"] = title

    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    cover = imgs[0]
    screenshot = imgs[1]
    movie["cover"] = cover
    movie["screenshot"] = screenshot

    def parse_info(info, rule):
        # Strip the "◎xxx" label from a line and return the remaining value
        return info.replace(rule, "").strip()

    # Every metadata line inside the Zoom block starts with a "◎" label
    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            info = parse_info(info, "◎年  代")
            movie["year"] = info
        elif info.startswith("◎产  地"):
            info = parse_info(info, "◎产  地")
            movie["country"] = info
        elif info.startswith("◎类  别"):
            info = parse_info(info, "◎类  别")
            movie["category"] = info
        elif info.startswith("◎豆瓣评分"):
            info = parse_info(info, "◎豆瓣评分")
            movie["score"] = info
        elif info.startswith("◎片  长"):
            info = parse_info(info, "◎片  长")
            movie["duration"] = info
        elif info.startswith("◎导  演"):
            info = parse_info(info, "◎导  演")
            movie["director"] = info
        elif info.startswith("◎主  演"):
            # The lead actor is on this line; the remaining actors follow one per line
            # until the next "◎" label
            info = parse_info(info, "◎主  演")
            actors = [info]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie["actors"] = actors
        elif info.startswith("◎简  介"):
            # The synopsis follows on the next lines, up to the "【下载地址】" marker
            info = parse_info(info, "◎简  介")
            text = []
            for x in range(index + 1, len(infos)):
                profile = infos[x].strip()
                if profile.startswith("【下载地址】"):
                    break
                text.append(profile)
            movie["profile"] = text[0]

    download_url = html.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
    movie["download_url"] = download_url
    return movie
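
# parse_detail_page() returns a dict; besides title, cover, screenshot and download_url, a key
# (year, country, category, score, duration, director, actors, profile) is only filled when the
# matching "◎" label appears on that page. Example usage, assuming detail_url is any URL
# returned by get_detail_urls():
#     movie = parse_detail_page(detail_url)
#     print(movie["title"], movie["download_url"])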
def spider(n):
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for x in range(1, n):
        # The outer loop controls how many listing pages are crawled (pages 1 to n-1)
        url = base_url.format(x)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            # The inner loop visits every movie detail URL on the current listing page
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    return movies


if __name__ == "__main__":
    spider(2)  # crawl the first listing page; increase the argument to crawl more pages
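
spider() prints each movie dict as it goes and returns the full list, so the results are easy to persist. A minimal sketch of a save helper is shown below; the function name save_movies and the file name movies.json are arbitrary choices, not part of the crawler above:

import json

def save_movies(movies, path="movies.json"):
    # Write the list of movie dicts to disk as UTF-8 JSON so the Chinese text stays readable
    with open(path, "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)

# Usage: save_movies(spider(2))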