
Python 3.7 + Selenium + BeautifulSoup4 + Requests + Threading: scraping a site that loads its content asynchronously

2019-02-15 03:29

A script for scraping a comic site that hides its pages behind anti-scraping JavaScript.
The details are in the comments.
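In outline, the script defeats the JavaScript loader by letting a headless Chrome render each page and then handing driver.page_source to BeautifulSoup. A stripped-down sketch of just that step (the URL is a placeholder and chromedriver is assumed to be on the PATH); the full script follows.

from bs4 import BeautifulSoup
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)          # chromedriver must be on the PATH
driver.get('https://example.com/chapter/1.html')    # placeholder chapter URL
soup = BeautifulSoup(driver.page_source, 'lxml')    # parse the DOM after the JS has run
print(soup.find_all('img'))                         # the image tags inserted by the JS
driver.quit()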

# coding=utf-8
import pdfkit
import requests

from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import os, time, threading
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By  # locator strategies
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
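
# Note: every image below is saved into the local 'imagehuyao/' directory, which the
# script assumes already exists. One way to guarantee that (not part of the original
# code) would be:
#     os.makedirs('imagehuyao', exist_ok=True)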

# Parse the index page: collect the chapter URLs and titles
def parse_url_to_html(url, name, istart, iend):
    heads = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    response = requests.get(url, headers=heads)
    print(response.status_code)
    soup = BeautifulSoup(response.content, "html.parser")
    mainpages = []
    maintitles = []
    allpages = []   # all chapter URLs (not actually used below)
    alltitles = []  # matching titles (not actually used below)
    tag_main = soup.find_all(class_="cy_plist")[0]  # first element with class "cy_plist"; it holds the chapter links and titles

    for i in tag_main.find_all("li"):
        if i is None:
            continue
        else:
            mainpages.append(i.a.get('href'))
            maintitles.append(i.a.get_text())
    mainpages.reverse()   # reverse the order in which the index lists the chapters
    maintitles.reverse()

    print("write begin++++++++++++++++>>>>>>>>>>>>>....")
    # only tag fragments are collected, so an <html> header would have to be prepended
    suffix = '_' + str(istart) + '-' + str(iend)
    #htmls = "<html><head><meta charset='UTF-8'></head><body> \n"#+str(tag_main)
    #with open(name+suffix+".html",'w',encoding='utf-8') as f:
    #    f.write(htmls)
    #with open("stat.pic"+suffix,'w',encoding='utf-8') as f:
    #    f.write("stats picture info \n")
    print(mainpages)

    return mainpages, maintitles
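
# Note that runfile() calls parse_url_to_html() once per chapter range, even though the
# returned lists do not depend on istart/iend, so with many threads the index page gets
# re-fetched over and over. Fetching it once up front and passing the lists into each
# thread would be a possible (untested) way to lighten the load on the site.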

def downloadImage(url, maintitles, chapter, istart, iend):

    heads = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    tag = '<br/><div class="pr tbCenter clearfix" id="tbCenter"> <br/> \n \
<div id="images"> <br/> \n \
</div><br/>' + '<div style="text-align:center"> \n '
    tag1 = '<div class="img-loading" id="imgLoading"><i></i><span><h1 <strong> <p align="center">' + maintitles + '</p></strong> </h1></span></div><br/> \n '

    options = webdriver.ChromeOptions()
    options.add_argument('user-agent="Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19"')
    # The next two lines keep Chrome from opening a visible window
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')

    # Do not load images inside the browser itself (they are fetched with requests below)
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome('C:\\Users\\test\\Desktop\\chromedriver_win32\\chromedriver.exe', options=options)
    driver.get(url)

    # Block until an element with id "k_next" appears (the timeout here is 1000 s)
    element = WebDriverWait(driver, 1000).until(EC.presence_of_element_located((By.ID, "k_next")))

    soup = BeautifulSoup(driver.page_source, "lxml")
    tag_tmp = soup.find_all('a', attrs={'href': 'javascript:;', 'class': 'BarTit'})

    # the "current/total" page counter (e.g. "1/20页") gives the number of images in this chapter
    npicture = int(tag_tmp[0].getText().split('/')[1][:-1])

    suffix = '_' + str(istart) + '-' + str(iend)
    for ipic in range(npicture):

        soup = BeautifulSoup(driver.page_source, "html.parser")
        # BS4 attribute filters are used here instead of a regular expression
        tag_tmp = soup.find_all('div', attrs={'class': 'mh_box'})

        filepath = tag_tmp[0].img['src']
        # with open('log_ljm/' + str(chapter) + '-' + str(ipic+1) + '.log','w',encoding='utf-8') as f:
        #    f.write(str(driver.page_source))

        filename = str(chapter) + '-' + str(ipic+1) + '.jpg'

        response = requests.get(filepath, headers=heads)  # only the commented-out stat line below uses this response
        #with open("stat.pic"+suffix,'a',encoding='utf-8') as f:
        #    f.write('%05a' %chapter + '%03a' %ipic + '%03a' %npicture + '  ' + response.url + '\n')

        if os.path.exists('imagehuyao/' + filename):
            pass
        else:
            response = requests.get(filepath, headers=heads)
            urlretrieve(response.url, 'imagehuyao/' + filename)
            # with open("stat.pic",'a',encoding='utf-8') as f:
            #     f.write('%05a' %chapter + '%03a' %ipic + '%03a' %npicture + '  ' + response.url + '\n')
        tag = tag + '<img src="imagehuyao/' + filename + '" height="1132" width="800" style> <br/> <p class="img_info"></p> \n'

        # Use Selenium to run the site's own JS function that switches to the next page
        driver.execute_script("javascript:a_f_qTcms_Pic_nextUrl_Href();")
        #del(url1, filepath, soup, tag_tmp, driver, filename, response)

    driver.close()
    tag = tag1 + tag + '</div>' + '</div>'

    return tag
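
# downloadImage() ends with driver.close(), which only closes the current window; with
# one headless Chrome per chapter the chromedriver processes can pile up. Calling
# driver.quit() instead (my suggestion, not in the original) ends the whole browser session.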

# Walk the chapter pages and build the body HTML
def get_htmls(url, mainpages, maintitles, name, istart, iend):

    if not url.endswith('/'):
        url = url + '/'
    url = url.split("index.html/")[0]

    #istart = 0
    suffix = '_' + str(istart) + '-' + str(iend)
    for i in range(istart, iend, 1):  # len(mainpages):

        urll = urljoin(url, mainpages[i])  # the chapter links are relative, so join them onto the base URL
        print(urll)
        if urll is None:
            h = "<div><h1>" + maintitles[i] + "</h1></div>"
            #htmls = ''
            #htmls = h + htmls #str(tag[0])
            #with open(name+".html",'a',encoding='utf-8') as f:
            #    f.write(htmls)

            continue

        tag = downloadImage(urll, maintitles[i], i, istart, iend)

        #time.sleep(100)
        htmls = tag
        #with open(name+suffix+".html",'a',encoding='utf-8') as f:
        #    f.write(htmls)
        #    print("  (%s)  [%s] download end"%(i,mainpages[i]))

    # close the tags opened in parse_url_to_html
    #htmls = "</body></html> \n"
    #with open(name+".html",'a',encoding='utf-8') as f:
    #    f.write(htmls)
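
# All of the file writes in get_htmls() are commented out, so as published the script only
# saves the images; the per-range HTML file is never written. Re-enabling the
# "with open(name+suffix+'.html','a',...)" block above is needed before the pdfkit step can be used.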

def save_pdf(html, name):
    """
    Convert the generated HTML file into a PDF file
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
        'footer-font-name': 'Times New Roman',
        'header-font-name': 'Times New Roman',
        'minimum-font-size': 24,
    }
    pdfkit.from_file(html, name + ".pdf", options=options)
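
# save_pdf() is defined but never called in this listing. A possible way to wire it in
# (my sketch, assuming the HTML writes above have been re-enabled and that the
# wkhtmltopdf binary required by pdfkit is installed) would be to add, at the end of runfile():
#     save_pdf(name + suffix + '.html', name + suffix)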

def get_url(url_mainpage):
    response = requests.get(url_mainpage)
    print(response.status_code)
    soup = BeautifulSoup(response.content, "html.parser")
    s = []      # all comic URLs
    title = []  # matching titles
    tag = soup.find_all(class_="thumbnail")  # elements with class "thumbnail" hold the links and titles
    for i in tag:
        tmp = i.a.get('href')
        if tmp.startswith("http"):
            s.append(i.a.get('href'))
            title.append(i.img.get('alt'))
    return s, title
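
# get_url() is also never called here; it looks like a helper for scraping a catalogue
# page (elements with class "thumbnail") to collect comic URLs and titles, e.g.
# (hypothetical): pages, titles = get_url('catalogue page URL')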

def runfile(url, name, istart, iend):
    suffix = '_' + str(istart) + '-' + str(iend)
    if os.path.exists(name + suffix + '.html'):
        os.remove(name + suffix + '.html')
    if os.path.exists(name + suffix + '.pdf'):
        os.remove(name + suffix + '.pdf')
    mainpages, maintitles = parse_url_to_html(url, name, istart, iend)
    get_htmls(url, mainpages, maintitles, name, istart, iend)

if __name__ == '__main__':

    url = '一个漫画网站'   # placeholder: index URL of the comic site
    name = '灵剑山'        # comic title, used for the output file names

    threads = []

    start = 400
    end = 450
    step = 1

    # one thread per chapter range of width "step"
    for i in range(start, end, step):

        istart = i
        iend = i + step
        t = threading.Thread(target=runfile, args=(url, name, istart, iend))
        threads.append(t)

    for i in range(len(range(start, end, step))):
        threads[i].daemon = True   # setDaemon() is deprecated; set the attribute instead
        threads[i].start()
    for i in range(len(range(start, end, step))):
        threads[i].join()
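
The main block starts one thread per chapter range, and every thread launches its own headless Chrome, so the 400-450 range above means dozens of browsers running at once. A minimal sketch of capping that with concurrent.futures (the max_workers value and the helper name are my own choices, not part of the original script):

from concurrent.futures import ThreadPoolExecutor

def run_all(url, name, start, end, step=1, max_workers=4):
    # Submit one runfile() job per chapter range, but let at most max_workers
    # of them (and therefore at most that many Chrome instances) run at once.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(runfile, url, name, i, i + step)
                   for i in range(start, end, step)]
        for f in futures:
            f.result()  # surface any exception raised inside a worker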