您的位置:首页 > 编程语言 > Python开发

python爬虫实战-爬取视频网站下载视频至本地(selenium)

2019-02-25 15:45 696 查看

#python爬虫实战-爬取视频网站下载视频至本地(selenium)

import requests
from lxml import etree
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

# HTTP headers sent with the feed API request made in get_json().
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    ),
}

# Paging cursor for the feed API; get_json() reads it and overwrites it with
# the value the API returns for the next page.
max_behot_time = 0

# Not referenced by any function visible in this script.
video_list = []
def get_json():
    """Fetch one page of the video feed and download every video listed on it.

    The module-level ``max_behot_time`` cursor is sent as the
    ``min_behot_time`` query value, then replaced with the value found under
    ``next.max_behot_time`` in the response so the following call requests
    the next page. ``down_video`` is called once per entry of the response's
    ``data`` list.

    Raises:
        requests.RequestException: if the request times out or the server
            answers with an HTTP error status.
        KeyError: if the response lacks the ``next.max_behot_time`` cursor or
            an entry lacks ``title`` / ``source_url``.
    """
    global max_behot_time
    url = 'http://www.365yg.com/api/pc/feed/?min_behot_time=' + str(max_behot_time)
    # A timeout keeps a stalled connection from hanging the script forever.
    r = requests.get(url, headers=headers, timeout=10)
    # Fail here on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    obj = json.loads(r.text)
    max_behot_time = obj['next']['max_behot_time']
    # Treat a missing or null "data" field as an empty page.
    for video_data in obj.get('data') or []:
        title = video_data['title']
        a_href = 'http://www.365yg.com' + video_data['source_url']
        down_video(title, a_href)

def down_video(title, href):
    """Render a video page in headless Chrome and save its video as an .mp4.

    The page at ``href`` is loaded with selenium, the ``src`` of the first
    ``<video mediatype="video">`` element is extracted from the rendered
    source, and that URL is downloaded to ``video/<title>.mp4``.

    Args:
        title: Video title; used for progress messages and, after replacing
            characters that are invalid in Windows file names, as the output
            file name.
        href: Absolute URL of the page that embeds the video.

    Raises:
        IndexError: if the rendered page contains no matching video source.
        requests.RequestException: if the video download times out or the
            server answers with an HTTP error status.
    """
    import os
    import re
    from urllib.parse import urljoin

    # Use selenium to render the page so the video URL is present in the DOM.
    path = r'E:\Student\python\day05\driver\chromedriver.exe'
    chrome_options = Options()
    # The Chrome switch is lowercase; '--Headless' is not recognised on
    # platforms where switches are case-sensitive.
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # NOTE(review): the embedded double quotes are presumably passed through
    # to Chrome as part of the user-agent value -- confirm this is intended.
    chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"')
    # NOTE(review): executable_path= / chrome_options= are the older selenium
    # keyword names; confirm they match the installed selenium version.
    browser = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
    try:
        browser.get(href)
        # Give the page's scripts time to insert the <video> element.
        time.sleep(5)
        browser.save_screenshot(r'PjPhoto\baidu.png')
        page_source = browser.page_source
    finally:
        # Always shut the browser down, even when loading the page fails,
        # so Chrome/chromedriver processes are not leaked.
        browser.quit()

    # Parse the rendered source and look up the src of the <video> element.
    tree = etree.HTML(page_source)
    sources = tree.xpath('//video[@mediatype="video"]/@src')
    if not sources:
        raise IndexError('no <video mediatype="video"> source found at %s' % href)
    # Resolve against the page URL: a protocol-relative "//host/path" gets the
    # page's scheme, while an already absolute URL is left untouched.
    video_url = urljoin(href, sources[0])

    # Titles can contain characters that are illegal in file names.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    os.makedirs('video', exist_ok=True)
    filepath = 'video/' + safe_title + '.mp4'
    print('正在下载视频 %s' % title)
    # Stream to disk in chunks instead of holding the whole video in memory.
    with requests.get(video_url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(filepath, 'wb') as fp:
            for chunk in r.iter_content(chunk_size=64 * 1024):
                fp.write(chunk)
    print('%s已下载' % title)

def main():
    """Download the configured number of feed pages, one get_json() call each."""
    # Interactive alternative (each page holds 7 videos):
    # page = int(input('请输入你要下载的页数(每页7个):'))
    page = 1
    for _ in range(page):
        get_json()

# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐