Python 3.7 + Selenium + BeautifulSoup4 + Requests + Threading: scraping a website with asynchronous (JS) loading
2019-02-15 03:29
A script for scraping a comic site that loads its pages asynchronously through JavaScript to deter crawlers. The details are explained in the inline comments.
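The core of the technique is just two steps: let a headless Chrome execute the site's scripts, then hand `driver.page_source` to BeautifulSoup once the element you need has appeared. A minimal sketch of that pattern before the full script (the URL and the element id `content` are placeholders, not the comic site's, and chromedriver is assumed to be on the PATH):

```python
# Minimal sketch: render a JS-loaded page with headless Chrome, then parse it.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

options = webdriver.ChromeOptions()
options.add_argument('--headless')     # no visible browser window
options.add_argument('--disable-gpu')

driver = webdriver.Chrome(options=options)  # assumes chromedriver on PATH
driver.get('https://example.com/some-js-page')  # placeholder URL
# Block until the JS has injected the element we care about (up to 10 s).
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'content')))
# Hand the fully rendered DOM to BeautifulSoup for the actual extraction.
soup = BeautifulSoup(driver.page_source, 'html.parser')
print(soup.find(id='content'))
driver.quit()
```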
```python
# coding=utf-8
import pdfkit
import requests
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import os, threading
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

HEADS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


# Parse the index page: collect every chapter URL and its title.
def parse_url_to_html(url, name, istart, iend):
    response = requests.get(url, headers=HEADS)
    print(response.status_code)
    soup = BeautifulSoup(response.content, "html.parser")
    mainpages = []   # all chapter URLs
    maintitles = []  # the matching chapter titles
    # The first element with class "cy_plist" holds the chapter list.
    tag_main = soup.find_all(class_="cy_plist")[0]
    for li in tag_main.find_all("li"):
        if li.a is None:
            continue
        mainpages.append(li.a.get('href'))
        maintitles.append(li.a.get_text())
    # The site lists chapters newest-first; reverse into reading order.
    mainpages.reverse()
    maintitles.reverse()
    print("write begin")
    # The HTML skeleton write below is disabled; re-enable it (together with the
    # per-chapter writes in get_htmls) if you want save_pdf to have input:
    # suffix = '_' + str(istart) + '-' + str(iend)
    # with open(name + suffix + ".html", 'w', encoding='utf-8') as f:
    #     f.write("<html><head><meta charset='UTF-8'></head><body>\n")
    print(mainpages)
    return mainpages, maintitles


# Fetch one chapter: drive a headless Chrome through every page of the JS
# image viewer, download each image, and return an HTML fragment for it.
def downloadImage(url, maintitle, chapter, istart, iend):
    tag = ('<br/><div class="pr tbCenter clearfix" id="tbCenter"><br/>\n'
           '<div id="images"><br/>\n'
           '</div><br/><div style="text-align:center">\n')
    tag1 = ('<div class="img-loading" id="imgLoading"><i></i><span><h1><strong>'
            '<p align="center">' + maintitle + '</p></strong></h1></span></div><br/>\n')
    options = webdriver.ChromeOptions()
    # Pretend to be a mobile browser; the site serves it a simpler viewer.
    options.add_argument('user-agent="Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus '
                         'Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) '
                         'Chrome/18.0.1025.133 Mobile Safari/535.19"')
    # Run Chrome headless so no browser window appears.
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # Do not render images in the browser itself; they are fetched with requests.
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome('C:\\Users\\test\\Desktop\\chromedriver_win32\\chromedriver.exe',
                              options=options)
    driver.get(url)
    # Block (up to 1000 s) until the element with id "k_next" exists,
    # i.e. until the JS viewer has finished loading.
    WebDriverWait(driver, 1000).until(EC.presence_of_element_located((By.ID, "k_next")))
    soup = BeautifulSoup(driver.page_source, "lxml")
    # The pager label reads "current/total"; take the page count from it.
    tag_tmp = soup.find_all('a', attrs={'href': 'javascript:;', 'class': 'BarTit'})
    npicture = int(tag_tmp[0].getText().split('/')[1][:-1])
    for ipic in range(npicture):
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Plain BS4 attribute lookup -- no regular expressions needed.
        tag_tmp = soup.find_all('div', attrs={'class': 'mh_box'})
        filepath = tag_tmp[0].img['src']
        filename = str(chapter) + '-' + str(ipic + 1) + '.jpg'
        if not os.path.exists('imagehuyao/' + filename):
            # The image URL redirects; requests resolves it, urlretrieve saves it.
            response = requests.get(filepath, headers=HEADS)
            urlretrieve(response.url, 'imagehuyao/' + filename)
        tag += ('<img src="imagehuyao/' + filename + '" height="1132" width="800">'
                ' <br/> <p class="img_info"></p>\n')
        # Run the site's own JS to flip the viewer to the next page.
        driver.execute_script("a_f_qTcms_Pic_nextUrl_Href();")
    driver.close()
    return tag1 + tag + '</div></div>'


# Walk chapters istart..iend and download every page image.
def get_htmls(url, mainpages, maintitles, name, istart, iend):
    if not url.endswith('/'):
        url = url + '/'
    url = url.split("index.html/")[0]
    suffix = '_' + str(istart) + '-' + str(iend)
    for i in range(istart, iend):
        # urljoin resolves relative and absolute links correctly --
        # never concatenate the URL by hand.
        urll = urljoin(url, mainpages[i])
        print(urll)
        if urll is None:
            continue  # no link for this chapter
        tag = downloadImage(urll, maintitles[i], i, istart, iend)
        # Per-chapter HTML write, disabled like the skeleton in parse_url_to_html:
        # with open(name + suffix + ".html", 'a', encoding='utf-8') as f:
        #     f.write(tag)
        print(" (%s) [%s] download end" % (i, mainpages[i]))


def save_pdf(html, name):
    """Convert the assembled HTML file into a PDF (requires wkhtmltopdf)."""
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [('Accept-Encoding', 'gzip')],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
        'footer-font-name': 'Times New Roman',
        'header-font-name': 'Times New Roman',
        'minimum-font-size': 24,
    }
    pdfkit.from_file(html, name + ".pdf", options=options)


# Collect comic links and titles from a listing page (not used in __main__).
def get_url(url_mainpage):
    response = requests.get(url_mainpage)
    print(response.status_code)
    soup = BeautifulSoup(response.content, "html.parser")
    s = []      # all comic URLs
    title = []  # the matching titles
    # Every element with class "thumbnail" wraps a link and a cover image.
    for i in soup.find_all(class_="thumbnail"):
        tmp = i.a.get('href')
        if tmp.startswith("http"):
            s.append(tmp)
            title.append(i.img.get('alt'))
    return s, title


# One worker: clear stale output, then fetch chapters istart..iend.
def runfile(url, name, istart, iend):
    suffix = '_' + str(istart) + '-' + str(iend)
    if os.path.exists(name + suffix + '.html'):
        os.remove(name + suffix + '.html')
    if os.path.exists(name + suffix + '.pdf'):
        os.remove(name + suffix + '.pdf')
    mainpages, maintitles = parse_url_to_html(url, name, istart, iend)
    get_htmls(url, mainpages, maintitles, name, istart, iend)


if __name__ == '__main__':
    url = '一个漫画网站'  # the comic site's index URL (elided by the author)
    name = '灵剑山'
    threads = []
    start = 400
    end = 450
    step = 1
    # One daemon thread per chapter slice [i, i + step).
    for i in range(start, end, step):
        t = threading.Thread(target=runfile, args=(url, name, i, i + step))
        threads.append(t)
    for t in threads:
        t.daemon = True  # setDaemon() is deprecated
        t.start()
    for t in threads:
        t.join()
```
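Note that the script imports pdfkit and defines save_pdf, but nothing ever calls it, and the HTML writes that would feed it are commented out. A hedged sketch of how a worker might wire the two together, assuming those writes are re-enabled and wkhtmltopdf is installed (`runfile_with_pdf` is a hypothetical helper, not part of the original script):

```python
# Hypothetical wiring for the unused save_pdf step -- assumes the commented-out
# HTML writes in parse_url_to_html/get_htmls are re-enabled, so that
# name + suffix + '.html' actually exists when save_pdf runs.
def runfile_with_pdf(url, name, istart, iend):
    suffix = '_' + str(istart) + '-' + str(iend)
    runfile(url, name, istart, iend)                   # download chapters istart..iend
    save_pdf(name + suffix + '.html', name + suffix)   # render the fragment to a PDF
```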