Step05:爬虫小项目,爬取最新电影迅雷下载地址
2018-10-01 14:29
134 查看
1.简述
电影天堂的广告实在令人不胜其烦,但其视频资源的确有可取之处。因此,趁着学习爬虫技术的这段时间,简单实现了一个完整的小项目。
(完整代码——链接)
2.技术准备
开发环境:PyCharm,Python 3.6.5。使用requests+re从电影天堂爬取最新电影资源的下载地址,使用tkinter设计简单的界面。中间还用到了多线程技术,Python的threading库对多线程提供了支持,简化了许多工作。
3.项目步骤
进入Pycharm建立project,实现以下目录结构:
\ThunderAndSpider\message_spider\spider_config.py
# Configuration for the dytt8 spider: request headers, site root and the
# regular expressions used to pull links out of the pages.

# Literals shared by both header sets, spelled out once.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 '
    'Core/1.63.6726.400 QQBrowser/10.2.2265.400'
)
_ACCEPT = (
    'text/html,application/xhtml+xml,application/xml;q=0.9,'
    'image/webp,image/apng,*/*;q=0.8'
)
_HOST = 'www.dytt8.net'

# Headers sent when fetching the site's front page.
headers = {
    'Cookie': (
        '37cs_user=37cs63629906334; '
        'XLA_CI=3e976860bea5549a9a73e10df8153fcd; '
        '37cs_pidx=2; '
        '37cs_show=253%2C75; '
        'cscpvrich5041_fidx=3'
    ),
    'Host': _HOST,
    'Accept': _ACCEPT,
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': _USER_AGENT,
}

# Headers sent when fetching an individual movie page (carries a Referer).
other_headers = {
    'User-Agent': _USER_AGENT,
    'Referer': 'http://www.dytt8.net/',
    'Host': _HOST,
    'Accept': _ACCEPT,
}

# Root URL of the target site; relative links are appended to it.
url_dytt = "http://www.dytt8.net"

# Front page: captures (relative href, title) of each "最新电影下载" entry.
re_strIndex = r'>最新电影下载</a>]<a\shref=\'(.*?)\'>(.*?)</a><br'
# Movie page: captures the link text inside the WORD-WRAP styled table cell.
re_strLink = r'<td\sstyle="WORD-WRAP.*?<a\shref=".*?">(.*?)</a></td>'
\ThunderAndSpider\message_spider\dytt_spider.py(简单实现的爬虫类)
import re

import requests
from requests.exceptions import RequestException

from message_spider.spider_config import (
    headers,
    other_headers,
    re_strIndex,
    re_strLink,
    url_dytt,
)


class dytt_spider:
    """Minimal spider that collects download links from the dytt8 site.

    It fetches the front page, follows every entry matched by
    ``re_strIndex`` and extracts the links matched by ``re_strLink`` from
    each movie page.
    """

    def __init__(self, url=url_dytt, timeout=10):
        """Create a spider.

        Args:
            url: Site root; also used as the prefix for relative movie links.
                Defaults to ``url_dytt`` so ``dytt_spider()`` keeps working.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.url = url
        self.timeout = timeout

    def get_html(self, url, headers):
        """Fetch ``url`` and return its decoded text.

        Args:
            url: Address to request.
            headers: Dict of HTTP headers to send.

        Returns:
            The page text, or ``None`` if the request raised a
            ``RequestException`` or the status code was not 200.
        """
        try:
            # Without a timeout a stalled server would block the caller forever.
            response = requests.get(url, headers=headers, timeout=self.timeout)
        except RequestException:
            return None
        if response.status_code != 200:
            return None
        # Use the encoding detected from the body so Chinese pages are not
        # decoded as mojibake.
        response.encoding = response.apparent_encoding
        return response.text

    def _get_re_findall_items(self, html, re_str):
        """Return every match of ``re_str`` in ``html`` (``.`` spans newlines)."""
        pattern = re.compile(re_str, re.S)
        return pattern.findall(html)

    def get_index(self, html, re_str):
        """Yield ``{"name", "url"}`` dicts for each index entry found in ``html``.

        ``re_str`` must capture two groups: the relative href and the title.
        The href is joined onto this spider's site root.
        """
        for item in self._get_re_findall_items(html, re_str):
            yield {
                "name": item[1],
                "url": self.url + item[0],
            }

    def get_thunder_link(self, html, re_str):
        """Yield ``{"thunder": link}`` for each link matched by ``re_str``."""
        for item in self._get_re_findall_items(html, re_str):
            yield {
                "thunder": item,
            }

    def get_all_thunderlink(self):
        """Yield one dict per download link found across all indexed movies.

        Each dict has the keys ``"影片名:"`` (movie title) and ``"磁力链接:"``
        (download link). Nothing is yielded if the front page cannot be
        fetched; movie pages that fail to load are skipped.
        """
        index = self.get_html(self.url, headers)
        if not index:
            # Front page unavailable: there is nothing to match against.
            return
        for item in self.get_index(index, re_strIndex):
            html = self.get_html(item['url'], other_headers)
            if not html:
                continue
            for x in self.get_thunder_link(html, re_strLink):
                yield {
                    "影片名:": item["name"],
                    "磁力链接:": x['thunder'],
                }


if __name__ == "__main__":
    dytt = dytt_spider(url_dytt)
    for mess in dytt.get_all_thunderlink():
        print(mess)
\ThunderAndSpider\thunder\thunder_config.py
# == NOTE: change this to the full path of the Thunder executable ==
# Raw string: the backslash in the Windows path must not be read as an escape.
thunder_path = r'E:\Thunder.exe'
# == NOTE: change this to the directory Thunder saves downloads into ==
# Not a raw string because a raw literal cannot end with a backslash.
save_path = 'G:\\thunder_download\\'
\ThunderAndSpider\thunder\dytt_thunder.py
import os
import subprocess
import time
# import threading
from thunder.thunder_config import save_path, thunder_path


class my_thunder:
    """Hand a single download URL over to the Thunder client."""

    def __init__(self, url):
        """Remember ``url`` and derive the target file name from it.

        Args:
            url: Download link; the part after the last path separator is
                taken as the file name.
        """
        self.url = url
        self.filename = os.path.split(self.url)[1]
        # Kept for backward compatibility with code that reads ``args``;
        # start_target() no longer passes this string to a shell.
        self.args = r'"{thunder_path}" {url}'.format(thunder_path=thunder_path, url=url)

    def start_target(self):
        """Launch Thunder with the URL and wait for the launched process to exit.

        The URL comes from a scraped web page, so it is passed as a separate
        argv element instead of being interpolated into a shell command line,
        where characters such as ``&`` or ``|`` would be interpreted.
        """
        print("准备下载---{name}".format(name=self.filename))
        try:
            subprocess.call([thunder_path, self.url])
        except OSError as err:
            # E.g. thunder_path does not point at an executable.
            print("启动迅雷失败---{err}".format(err=err))
        # new_thread = threading.Thread(target=os.system, args=(self.args,))
        # new_thread.start()

    def check_start(self):
        """Return True if ``<filename>.xltd`` exists in ``save_path``.

        NOTE(review): ``.xltd`` is presumably Thunder's in-progress temp
        file; confirm against the client's behaviour.
        """
        check_file = self.filename + ".xltd"
        return os.path.exists(os.path.join(save_path, check_file))

    def check_end(self):
        """Return True if the finished file exists in ``save_path``."""
        return os.path.exists(os.path.join(save_path, self.filename))

    # Disabled sketch of a blocking download that polls the two checks above:
    #
    # def download(self):
    #     self.start_target()
    #     print("正在下载{name}".format(name=self.filename))
    #     if self.check_start():
    #         while True:
    #             time.sleep(60)
    #             if self.check_end():
    #                 print("下载完成")
    #                 return True
    #     else:
    #         print("下载失败")
    #         return False
\ThunderAndSpider\win_gui\main_gui.py(此处设计界面)
from tkinter import *


class MainGUI:
    """Fixed-size 700x500 window titled "电影下载" with the downloader widgets.

    Widgets exposed as attributes: ``entry_01``, ``text_01``, ``entry_02``,
    ``frm`` and the buttons ``btn_01``, ``btn_02``, ``btn_03``. No button has
    a command attached here.
    """

    def __init__(self):
        """Create the root window and populate it."""
        window = Tk()
        window.title("电影下载")
        window.geometry("700x500")
        window.resizable(False, False)
        self.root = window
        self._set_gui()

    def open_gui(self):
        """Enter the Tk main loop."""
        self.root.mainloop()

    def _add_caption(self, row, caption):
        """Place a text label in column 0 of the given grid row."""
        Label(self.root, text=caption).grid(row=row, column=0)

    def _set_gui(self):
        """Build the three captioned input rows and the button bar."""
        # Row 0: source site.
        self._add_caption(0, "资源来源:")
        self.entry_01 = Entry(self.root)
        self.entry_01.grid(row=0, column=1, sticky=W)

        # Row 1: multi-line list of scraped links.
        self._add_caption(1, "资源种子:")
        self.text_01 = Text(self.root)
        self.text_01.grid(row=1, column=1, sticky=W)

        # Row 2: currently selected link.
        self._add_caption(2, "当前电影:")
        self.entry_02 = Entry(self.root, width=300)
        self.entry_02.grid(row=2, column=1, sticky=W)

        # Row 3: frame holding the three buttons in columns 1..3.
        self.frm = Frame(self.root)
        self.frm.grid(row=3, column=1, sticky=W)
        button_specs = (
            ("btn_01", "上一部"),
            ("btn_02", "下一部"),
            ("btn_03", "下载当前部"),
        )
        for col, (attr_name, caption) in enumerate(button_specs, start=1):
            button = Button(self.frm, text=caption)
            setattr(self, attr_name, button)
            button.grid(row=0, column=col)
\ThunderAndSpider\main.py
import threading
from tkinter import END

from win_gui.main_gui import MainGUI
from thunder.dytt_thunder import my_thunder
from message_spider.dytt_spider import dytt_spider
from message_spider.spider_config import url_dytt

# Module-level state shared by the GUI callbacks.
urls = []          # every download link, in scrape order
link_message = ""  # "title\nlink\n\n" for every link, shown in the text box
urls_index = 0     # position in ``urls`` of the link currently displayed


def get_urls_and_linkmessage(spider):
    """Run the spider and fill ``urls`` and ``link_message``.

    Args:
        spider: Object whose ``get_all_thunderlink()`` yields dicts with the
            keys ``"影片名:"`` and ``"磁力链接:"``.
    """
    global link_message
    chunks = []
    for x in spider.get_all_thunderlink():
        chunks.append(x["影片名:"] + "\n" + x["磁力链接:"] + "\n\n")
        urls.append(x["磁力链接:"])
    # Join once instead of growing the string inside the loop.
    link_message += "".join(chunks)


def _show_url(entry, step):
    """Move the current index by ``step`` (wrapping) and show that link."""
    global urls_index
    if not urls:
        # Nothing was scraped; leave the entry untouched instead of raising.
        return
    urls_index = (urls_index + step) % len(urls)
    entry.delete(0, END)
    entry.insert(END, urls[urls_index])


def change_entry_a(entry):
    """Show the previous link in ``entry``, wrapping from first to last."""
    _show_url(entry, -1)


def change_entry_b(entry):
    """Show the next link in ``entry``, wrapping from last to first."""
    _show_url(entry, 1)


def download_current():
    """Start Thunder for the currently displayed link on a worker thread."""
    if not urls:
        print("没有可下载的链接")
        return
    thunder = my_thunder(urls[urls_index])
    # new_thread = threading.Thread(target=thunder.download)
    # The launch runs on its own thread so it does not block the Tk main loop.
    new_thread = threading.Thread(target=thunder.start_target)
    new_thread.start()
    # thunder.download()


def mainGUI_config(mainWin):
    """Fill the window with the scraped data and wire up the buttons.

    Args:
        mainWin: The ``MainGUI`` instance to configure.
    """
    # The INSERT index inserts at the cursor; the END index appends.
    mainWin.entry_01.insert(END, url_dytt)
    mainWin.text_01.insert("1.0", link_message)
    if urls:
        # Guarded: an empty scrape would otherwise raise IndexError here.
        mainWin.entry_02.insert(END, urls[0])
    mainWin.btn_01.config(command=lambda: change_entry_a(mainWin.entry_02))
    mainWin.btn_02.config(command=lambda: change_entry_b(mainWin.entry_02))
    mainWin.btn_03.config(command=download_current)


def main():
    """Scrape the links, build the window and run the GUI."""
    print("++++主程序启动++++")
    mainWin = MainGUI()
    spider = dytt_spider()
    get_urls_and_linkmessage(spider)
    mainGUI_config(mainWin)
    mainWin.open_gui()


if __name__ == "__main__":
    main()
4.项目成果
目标站点:
项目主界面
相关文章推荐
- 用python做一个可以下载电影天堂最新电影的爬虫
- 米奇妙妙屋最新迅雷下载地址!
- 用php实现一个简单的爬虫,抓取电影网站的视频下载地址
- Python 利用requests+BeautifulSoup4编写原生爬虫,爬取电影天堂最新电影,并打造最新电影下载及查询器
- 关于迅雷电影的下载地址
- 电影《绿箭侠第一季》迅雷中英双字下载地址
- (官网搬运)android studio sdk 最新下载地址 2016.7.23 (迅雷可用)
- anroid adt离线下载地址(可自己选最新版本使用迅雷下载)
- 《忠犬八公的故事》电影迅雷下载地址
- repo 最新的下载地址
- Genymotion配置及使用教程(最新最完整版附各部分下载地址)
- sqlserver 2008 r2 直接下载地址,可用迅雷下载
- oracle10G/11G官方下载地址集合 直接迅雷下载
- 在线聊天项目1.4版 使用Gson方法解析Json字符串以便重构request和response的各种请求和响应 解决聊天不畅问题 Gson包下载地址
- Android Studio 3.0 最新国内下载地址
- DateChooser控件发布ASP.NET 2.0新版(我的ASP.NET 2.0控件开发书的第二个阶段项目)[请大家一定注意版本的更新,下载最新版]
- Spring最新下载地址
- 微软Windows原版光盘系列 资源共享(机会难得 迅雷下载 不断更新!)(20070528更新,21个地址)
- 最新最全的Ubuntu 9.10 下载地址
- 团队项目软件下载地址