python 广度多线程爬虫
2017-11-15 23:46
232 查看
# coding:utf-8
"""Breadth-first multi-threaded web crawler.

Each round spawns one thread per task URL, harvests absolute hrefs from
the fetched pages back into the task list, and repeats until the number
of collected URLs written to ``a.txt`` reaches the target.
"""
from threading import Thread, RLock
import time

import requests
from bs4 import BeautifulSoup


class Screap(Thread):
    """Breadth-first crawler driven by a growing URL task list.

    Subclasses ``Thread`` only for historical reasons: crawling is driven
    by calling :meth:`craw` directly, not by ``start()``.
    """

    def __init__(self, urlList, targetnum):
        """
        :param urlList: seed URLs to start crawling from
        :param targetnum: stop once this many URLs have been collected
        """
        super(Screap, self).__init__()
        # How many URLs of the task list are crawled per round; grows by
        # one each round and wraps back to 20 once it reaches 100.
        self.num = len(urlList)
        self.lock = RLock()
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3427.400 QQBrowser/9.6.12513.400"}
        self.taskurlList = urlList
        self.threads = []
        self.okurl = set()       # URLs fetched without a request error
        self.visitedUrl = []     # URLs already attempted (success or failure)
        self._targetNum = targetnum
        self._totalnum = 0
        # NOTE: the original opened "a.txt" here and never closed or used
        # the handle (writeUrl reopens the file itself) — removed to fix
        # the file-descriptor leak.

    def now(self):
        """Print the current wall-clock time as HH:MM:SS (progress marker)."""
        print(time.strftime("%H:%M:%S", time.localtime(time.time())))

    def checkThread(self):
        """Block until every worker thread of the current round has died.

        Polls once per second; used as a safety net after the bounded
        ``join(2)`` calls in :meth:`craw`.
        """
        while True:
            time.sleep(1)
            if all(not td.is_alive() for td in self.threads):
                break

    def craw(self):
        """Run crawl rounds until the collected-URL count reaches the target."""
        collected = 0
        while collected < self._targetNum:
            self.now()
            # Guard against self.num outrunning the (pruned) task list —
            # the original indexed taskurlList[j] unconditionally and
            # raised IndexError once the list shrank below self.num.
            batch = min(self.num, len(self.taskurlList))
            for j in range(batch):
                worker = Thread(target=self.download,
                                args=(self.taskurlList[j], self.header, self.lock))
                worker.start()
                self.threads.append(worker)
            # Give each worker a bounded join, then wait for stragglers.
            for worker in self.threads:
                worker.join(2)
            self.checkThread()
            # Widen the next round, wrapping back to 20 at 100.
            self.num += 1
            if self.num >= 100:
                self.num = 20
            self.threads = []
            # Drop already-visited URLs (also deduplicates the task list).
            self.taskurlList = list(set(self.taskurlList) - set(self.visitedUrl))
            self._totalnum = 0
            self.writeUrl(self.okurl)
            collected = self.checkUrl()
            print("total url:%s" % len(self.taskurlList))
            print("total visitedurl:%s" % len(self.visitedUrl))
            print("collect url:%s" % collected)

    def writeUrl(self, okurl):
        """Overwrite a.txt with one collected URL per line."""
        with open("a.txt", "w") as f:
            for url in okurl:
                f.write(url.strip() + "\n")

    def checkUrl(self):
        """Count the lines of a.txt into ``_totalnum`` and return it."""
        with open("a.txt") as f:
            self._totalnum = sum(1 for _ in f)
        return self._totalnum

    def download(self, url, header, lock):
        """Fetch one URL and feed its absolute hrefs back into the task list.

        Runs inside a worker thread; all shared-state mutation happens
        under ``lock``.

        :param url: the URL to fetch
        :param header: HTTP headers dict (the shared User-Agent)
        :param lock: the crawler's RLock guarding shared collections
        """
        try:
            response = requests.get(url, headers=header, timeout=20)
        except requests.RequestException:
            # Narrowed from a bare except: a failed fetch is still "visited",
            # but unrelated errors (e.g. KeyboardInterrupt) now propagate.
            with lock:
                self.visitedUrl.append(url)
            return
        with lock:
            self.visitedUrl.append(url)
            self.okurl.add(url)
        if response.status_code == 200:
            # Explicit parser: BeautifulSoup(context) without one warns and
            # picks whatever parser happens to be installed.
            soup = BeautifulSoup(response.content, "html.parser")
            found = []
            for tag in soup.find_all(self.has_href):
                hrefvalue = tag.get("href", None)
                # Keep only absolute www links, as in the original filter.
                if hrefvalue and "http" in hrefvalue and "www" in hrefvalue:
                    found.append(hrefvalue)
            with lock:
                self.taskurlList.extend(found)

    def has_href(self, tag):
        """find_all predicate: True for tags carrying an href attribute."""
        return tag.has_attr("href")


if __name__ == '__main__':
    targetUrl = ["http://www.sina.com", "http://www.sohu.com",
                 "https://daohang.qq.com/", "http://www.163.com"]
    a = Screap(targetUrl, 100000)
    a.craw()
相关文章推荐
- python多线程爬虫学习--去除字符串中间空格
- python 多线程 爬虫
- 【Python3.6爬虫学习记录】(十一)使用代理IP及用多线程测试IP可用性--刷访问量
- python 多线程+gzip压缩 爬虫
- 一个用Python实现的多入口全网爬的多线程爬虫的实现
- 使用Python多线程爬虫爬取电影天堂资源
- python queue和多线程的爬虫 与 JoinableQueue和多进程的爬虫
- python3.0 图片爬虫(增加多线程)
- 用python写的多线程网页爬虫
- python3 多线程爬虫
- Python小爬虫,(多线程)
- python爬虫:爬取猫眼电影(分数的处理和多线程)
- Python爬虫—多线程的简单示例
- Python爬虫学习笔记(2):多线程入门
- Python 爬虫学习笔记之多线程爬虫
- Python爬虫之多线程下载豆瓣Top250电影图片
- python 多线程爬虫实例(爬取智联招聘信息)
- 多线程网页爬虫 python 实现
- python多线程爬虫学习--Queue