
Python: a breadth-first multithreaded crawler

2017-11-15 23:46
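The script below crawls breadth-first from a handful of portal seed URLs. Each round it spawns one worker thread per pending URL; every worker downloads its page with requests, extracts all href links with BeautifulSoup, and appends new links to the shared task list under an RLock. Successfully fetched URLs are written to a.txt after each round, and crawling stops once the target count is reached.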
# coding: utf-8

from threading import Thread, RLock
import time

import requests
from bs4 import BeautifulSoup


class Screap(Thread):
    def __init__(self, urlList, targetnum):
        super(Screap, self).__init__()
        self.num = len(urlList)        # how many URLs to dispatch this round
        self.lock = RLock()            # guards the shared task list
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3427.400 QQBrowser/9.6.12513.400"}
        self.taskurlList = urlList     # frontier: URLs waiting to be crawled
        self.threads = []
        self.okurl = set()             # URLs fetched successfully
        self.visitedUrl = []           # URLs already attempted, success or not
        self._targetNum = targetnum    # stop once this many URLs are collected
        self._totalnum = 0

    def now(self):
        # Print a timestamp at the start of each crawl round.
        print(time.strftime("%H:%M:%S", time.localtime()))

    def checkThread(self):
        # Poll once per second until every worker thread has finished.
        while True:
            time.sleep(1)
            if all(not td.is_alive() for td in self.threads):
                break

    def craw(self):
        i = 0
        while i < self._targetNum:
            self.now()
            # Dispatch one worker thread per URL in this round's batch; the
            # min() guard prevents an IndexError when the frontier is shorter
            # than the batch size.
            batch = min(self.num, len(self.taskurlList))
            for j in range(batch):
                threadson = Thread(target=self.download,
                                   args=(self.taskurlList[j], self.header, self.lock))
                threadson.start()
                self.threads.append(threadson)
            for son in self.threads:
                son.join(2)
            self.checkThread()
            # Grow the batch size by one each round; once it reaches 100,
            # reset it to 20.
            self.num += 1
            if self.num >= 100:
                self.num = 20
            self.threads = []
            # Drop already-visited URLs from the frontier before the next round.
            self.taskurlList = list(set(self.taskurlList) - set(self.visitedUrl))
            self._totalnum = 0
            self.writeUrl(self.okurl)
            i = self.checkUrl()
            print("total url:%s" % len(self.taskurlList))
            print("total visitedurl:%s" % len(self.visitedUrl))
            print("collect url:%s" % i)

    def writeUrl(self, okurl):
        # Rewrite a.txt with every URL collected so far.
        with open("a.txt", "w+") as f:
            for i in okurl:
                f.write(i.strip() + "\n")

    def checkUrl(self):
        # Count the lines in a.txt, i.e. how many URLs have been collected.
        with open("a.txt") as f:
            for line in f:
                self._totalnum += 1
        return self._totalnum

    def download(self, url, header, lock):
        try:
            response = requests.get(url, headers=header, timeout=20)
        except requests.RequestException:
            self.visitedUrl.append(url)
        else:
            self.visitedUrl.append(url)
            self.okurl.add(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")
                allTag = soup.find_all(self.has_href)
                # The lock serializes appends to the shared task list.
                with lock:
                    for tag in allTag:
                        hrefvalue = tag.get("href")
                        if hrefvalue and "http" in hrefvalue and "www" in hrefvalue:
                            self.taskurlList.append(hrefvalue)

    def has_href(self, tag):
        return tag.has_attr("href")


if __name__ == '__main__':
    targetUrl = ["http://www.sina.com", "http://www.sohu.com",
                 "https://daohang.qq.com/", "http://www.163.com"]
    a = Screap(targetUrl, 100000)
    a.craw()
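A more idiomatic alternative is to let concurrent.futures manage the worker threads instead of creating Thread objects by hand. Below is a minimal sketch of the same breadth-first loop built on ThreadPoolExecutor; the fetch() helper, the crawl() function, the MAX_WORKERS value, and the per-level cap of 100 URLs are assumptions for illustration, not part of the original script.

# A sketch of the same breadth-first crawl using a thread pool.
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup

MAX_WORKERS = 20  # assumed pool size


def fetch(url):
    # Hypothetical helper: download one page and return its absolute links,
    # or None if the request fails.
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
    except requests.RequestException:
        return None
    soup = BeautifulSoup(response.content, "html.parser")
    return [tag["href"] for tag in soup.find_all(href=True)
            if tag["href"].startswith("http")]


def crawl(seed_urls, target):
    frontier, visited, collected = set(seed_urls), set(), set()
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        while frontier and len(collected) < target:
            # One BFS level per iteration, capped at 100 URLs (assumed limit).
            batch = list(frontier)[:100]
            frontier -= set(batch)
            futures = {pool.submit(fetch, u): u for u in batch}
            for future in as_completed(futures):
                url = futures[future]
                visited.add(url)
                links = future.result()
                if links is None:
                    continue
                collected.add(url)
                frontier.update(l for l in links if l not in visited)
    return collected


if __name__ == "__main__":
    print(len(crawl(["http://www.sina.com", "http://www.163.com"], 1000)))

Because the pool reuses a fixed number of threads, this version avoids the per-round cost of starting and joining up to 100 fresh threads, and the set-based frontier removes the need for an explicit lock around link collection.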