您的位置:首页 > 其它

Step05:爬虫小项目,爬取最新电影迅雷下载地址

2018-10-01 14:29 134 查看

1.简述

电影天堂的广告实在令人不胜其烦,但其视频资源却的确有可取之处。因此,趁着学习爬虫技术的这段时间,简单实现了一个完整的小项目。
完整代码——链接

2.技术准备

IDE:PyCharm,Python 3.6.5。使用 requests + re 从电影天堂爬取最新电影资源的下载地址;使用 tkinter 设计简单的界面。其间还用到了多线程技术,Python 的 threading 库对多线程提供了支持,简化了许多工作。

3.项目步骤

进入Pycharm建立project,实现以下目录结构:

\ThunderAndSpider\message_spider\spider_config.py

# Values that appear in both header sets are spelled once here.
_HOST = 'www.dytt8.net'
_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 '
    'Core/1.63.6726.400 QQBrowser/10.2.2265.400'
)

# Site root that the spider starts from.
url_dytt = "http://www.dytt8.net"

# Request headers used when fetching the site's index page.
headers = {
    'Cookie': '37cs_user=37cs63629906334; XLA_CI=3e976860bea5549a9a73e10df8153fcd; 37cs_pidx=2; 37cs_show=253%2C75; cscpvrich5041_fidx=3',
    'Host': _HOST,
    'Accept': _ACCEPT,
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': _USER_AGENT,
}

# Request headers used when fetching each film's detail page.
other_headers = {
    'User-Agent': _USER_AGENT,
    'Referer': 'http://www.dytt8.net/',
    'Host': _HOST,
    'Accept': _ACCEPT,
}

# Captures (relative link, film title) pairs from the index page.
re_strIndex = r'>最新电影下载</a>]<a\shref=\'(.*?)\'>(.*?)</a><br'
# Captures the download link text from a detail page's table cell.
re_strLink = r'<td\sstyle="WORD-WRAP.*?<a\shref=".*?">(.*?)</a></td>'

\ThunderAndSpider\message_spider\dytt_spider.py(简单实现的爬虫类)

import requests
import re

from message_spider.spider_config import *
from requests.exceptions import RequestException

class dytt_spider:
    """Scraper that collects download links for the films listed on the site index.

    The index page is fetched from ``self.url``; every film linked from it is
    then fetched in turn and its download links are extracted with the
    regular expressions from ``spider_config``.
    """

    def __init__(self, url=None):
        """Create a spider.

        Args:
            url: Site root to crawl. Defaults to ``url_dytt`` from the config,
                which is what the zero-argument form always used.
        """
        self.url = url_dytt if url is None else url

    def get_html(self, url, headers, timeout=10):
        """Download *url* and return its decoded text.

        Args:
            url: Page address.
            headers: Request headers to send.
            timeout: Seconds to wait before giving up; without one a stalled
                server would block the caller indefinitely.

        Returns:
            The page text, or ``None`` on any request error or non-200 status.
        """
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except RequestException:
            return None
        if response.status_code != 200:
            return None
        # Let requests guess the charset from the body so Chinese pages
        # are not decoded as mojibake.
        response.encoding = response.apparent_encoding
        return response.text

    def _get_re_findall_items(self, html, re_str):
        """Return every match of *re_str* in *html* (dot matches newlines).

        A missing or empty page yields an empty list instead of raising.
        """
        if not html:
            return []
        return re.compile(re_str, re.S).findall(html)

    def get_index(self, html, re_str):
        """Yield ``{"name", "url"}`` dicts for each film found in the index page.

        *re_str* must capture the link first and the title second.
        """
        # Function-scope import keeps the module's import block untouched.
        from urllib.parse import urljoin

        for item in self._get_re_findall_items(html, re_str):
            yield {
                "name": item[1],
                # urljoin resolves site-relative links against the site root.
                "url": urljoin(self.url, item[0]),
            }

    def get_thunder_link(self, html, re_str):
        """Yield ``{"thunder": link}`` for each download link in a detail page."""
        for item in self._get_re_findall_items(html, re_str):
            yield {"thunder": item}

    def get_all_thunderlink(self):
        """Yield one ``{"影片名:", "磁力链接:"}`` dict per download link on the site.

        Yields nothing if the index page cannot be fetched; detail pages that
        fail to load are skipped.
        """
        index = self.get_html(self.url, headers)
        if not index:
            return
        for item in self.get_index(index, re_strIndex):
            html = self.get_html(item['url'], other_headers)
            if not html:
                continue
            for x in self.get_thunder_link(html, re_strLink):
                yield {
                    "影片名:": item["name"],
                    "磁力链接:": x['thunder'],
                }

if __name__ == "__main__":
    # Manual smoke test: print every scraped record.
    # The constructor is called without arguments, matching its signature;
    # passing url_dytt positionally raised TypeError.
    dytt = dytt_spider()
    for mess in dytt.get_all_thunderlink():
        print(mess)

\ThunderAndSpider\thunder\thunder_config.py

# NOTE: change this to the full path of the Thunder executable.
# Raw string: '\T' is not a valid escape sequence and must not be parsed as one.
thunder_path = r'E:\Thunder.exe'
# NOTE: change this to the directory where Thunder stores downloaded files.
save_path = 'G:\\thunder_download\\'

\ThunderAndSpider\thunder\dytt_thunder.py

import os, time
#import threading
from thunder.thunder_config import *

class my_thunder:
    """Hands one download URL to the Thunder client and checks for its files.

    Uses ``thunder_path`` and ``save_path`` from ``thunder_config``.
    """

    def __init__(self, url):
        """Remember *url* and derive the file name from its last path segment."""
        self.url = url
        self.filename = os.path.split(self.url)[1]
        # Printable command line. Kept as an attribute for existing callers;
        # start_target() launches from an argument list instead.
        self.args = r'"{thunder_path}" {url}'.format(thunder_path=thunder_path, url=url)

    def start_target(self):
        """Launch Thunder with the URL and block until the process returns.

        The URL comes from a scraped web page, so it is passed as a separate
        argument rather than through a shell string: characters such as ``&``
        would otherwise be interpreted by the shell.
        """
        import subprocess

        print("准备下载---{name}".format(name=self.filename))
        try:
            subprocess.call([thunder_path, self.url])
        except OSError as error:
            # os.system never raised on a bad path; stay best-effort here too.
            print("无法启动迅雷:{error}".format(error=error))

    def check_start(self):
        """Return True if ``<filename>.xltd`` exists in the save directory.

        NOTE(review): presumably ``.xltd`` is Thunder's partial-download
        file — confirm against the client's behavior.
        """
        check_file = self.filename + ".xltd"
        return os.path.exists(os.path.join(save_path, check_file))

    def check_end(self):
        """Return True if the finished file exists in the save directory."""
        return os.path.exists(os.path.join(save_path, self.filename))

    # Disabled in the original source (it was wrapped in a string literal);
    # kept here for reference only.
    #
    # def download(self):
    #     self.start_target()
    #     print("正在下载{name}".format(name=self.filename))
    #     if self.check_start():
    #         while True:
    #             time.sleep(60)
    #             if self.check_end():
    #                 print("下载完成")
    #                 return True
    #     else:
    #         print("下载失败")
    #         return False

\ThunderAndSpider\win_gui\main_gui.py(此处设计界面)

from tkinter import *

class MainGUI:
    """Fixed-size 700x500 main window of the film downloader.

    Exposes the widgets that the caller wires up: ``entry_01``, ``text_01``,
    ``entry_02``, ``frm`` and the buttons ``btn_01`` .. ``btn_03``.
    """

    def __init__(self):
        window = Tk()
        window.title("电影下载")
        window.geometry("700x500")
        window.resizable(False, False)
        self.root = window
        self._set_gui()

    def open_gui(self):
        """Enter the Tk main loop."""
        self.root.mainloop()

    def _set_gui(self):
        """Create and grid every widget of the window."""
        parent = self.root

        def caption(text, row):
            # Row label in the left-hand column.
            Label(parent, text=text).grid(row=row, column=0)

        caption("资源来源:", 0)
        self.entry_01 = Entry(parent)
        self.entry_01.grid(row=0, column=1, sticky=W)

        caption("资源种子:", 1)
        self.text_01 = Text(parent)
        self.text_01.grid(row=1, column=1, sticky=W)

        caption("当前电影:", 2)
        self.entry_02 = Entry(parent, width=300)
        self.entry_02.grid(row=2, column=1, sticky=W)

        # The three buttons sit side by side in their own frame.
        self.frm = Frame(parent)
        self.frm.grid(row=3, column=1, sticky=W)
        button_specs = (
            ("btn_01", "上一部"),
            ("btn_02", "下一部"),
            ("btn_03", "下载当前部"),
        )
        for column, (attribute, text) in enumerate(button_specs, start=1):
            button = Button(self.frm, text=text)
            button.grid(row=0, column=column)
            setattr(self, attribute, button)

\ThunderAndSpider\main.py

from win_gui.main_gui import *
from thunder.dytt_thunder import *
from message_spider.dytt_spider import *
from message_spider.spider_config import url_dytt

import threading

# Module-level state used by the functions in this script.
urls = []          # download links, in the order the spider produced them
link_message = ""  # "title / link" listing built from the same records
urls_index = 0     # index into urls of the currently selected link


def get_urls_and_linkmessage(spider):
    """Drain ``spider.get_all_thunderlink()`` into ``urls`` and ``link_message``.

    Each record contributes its link to ``urls`` and a
    ``title\\nlink\\n\\n`` paragraph to ``link_message``.
    """
    global urls
    global link_message
    for record in spider.get_all_thunderlink():
        title = record["影片名:"]
        link = record["磁力链接:"]
        link_message = link_message + title + "\n" + link + "\n\n"
        urls.append(link)

def change_entry_a(entry):
    """Select the previous link, wrapping to the last one, and show it in *entry*.

    Does nothing when no links are available, instead of raising IndexError.
    """
    global urls_index
    if not urls:
        return
    # Modulo gives the wrap-around: index 0 steps back to len(urls) - 1.
    urls_index = (urls_index - 1) % len(urls)
    entry.delete(0, END)
    # The entry was just cleared, so END is the only insertion point.
    entry.insert(END, urls[urls_index])

def change_entry_b(entry):
    """Select the next link, wrapping to the first one, and show it in *entry*.

    Does nothing when no links are available, instead of raising IndexError.
    """
    global urls_index
    if not urls:
        return
    # Modulo gives the wrap-around: the last index steps forward to 0.
    urls_index = (urls_index + 1) % len(urls)
    entry.delete(0, END)
    # The entry was just cleared, so END is the only insertion point.
    entry.insert(END, urls[urls_index])

def download_current():
    """Start downloading the currently selected link on a background thread.

    Does nothing when no links are available, instead of raising IndexError
    inside the button callback.
    """
    if not urls:
        return
    thunder = my_thunder(urls[urls_index])
    # start_target blocks while the client is launched, so run it off the
    # GUI thread.
    new_thread = threading.Thread(target=thunder.start_target)
    new_thread.start()

def mainGUI_config(mainWin):
    """Fill the window's widgets with scraped data and wire up its buttons.

    Args:
        mainWin: Window object exposing ``entry_01``, ``text_01``,
            ``entry_02`` and ``btn_01`` .. ``btn_03``.
    """
    # END inserts after the existing content (INSERT would use the cursor).
    mainWin.entry_01.insert(END, url_dytt)
    mainWin.text_01.insert("1.0", link_message)
    # Nothing may have been scraped; leave the box empty rather than
    # crashing on urls[0].
    if urls:
        mainWin.entry_02.insert(END, urls[urls_index])
    mainWin.btn_01.config(command=lambda: change_entry_a(mainWin.entry_02))
    mainWin.btn_02.config(command=lambda: change_entry_b(mainWin.entry_02))
    mainWin.btn_03.config(command=download_current)

def main():
    """Build the window, scrape the links, populate the window, run the GUI."""
    print("++++主程序启动++++")
    window = MainGUI()
    # Scraping happens here, before the main loop starts.
    get_urls_and_linkmessage(dytt_spider())
    mainGUI_config(window)
    window.open_gui()


if __name__ == "__main__":
    main()

4.项目成果

目标站点:

项目主界面

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: