您的位置：首页 > 编程语言 > Python开发

[Python]多线程网址爬虫：控制线程数，爬虫深度

2017-06-01 15:42 435 查看

Python 多线程适用于可以并行处理的函数，一般是那些需要重复执行同类操作的函数。

在网站 URL 爬虫中，我把交给多线程执行的函数设为 getLink()：它根据 url 获取网页内容，再从中提取出页面上的 url。

deadUrl = [] # dead links (URLs whose status check did not return 200)
crawledUrl = [] # every URL crawled so far, kept so that no URL is crawled twice
# Multithreaded crawl class; supplies the arguments for the threaded function
# Input parameters: url, depth, thread. depth = 0 means no crawling is needed
# Returns: the list of crawled URLs
# NOTE(review): the class constructor below only takes url; the depth/thread
# parameters and the return value described here appear to belong to crawlDepth.
class CrawlThread(threading.Thread):
    """Worker thread that collects the same-domain links found on one page.

    The thread fetches ``url``, extracts every ``<a href>``, turns the hrefs
    into absolute http(s) URLs on the same domain, drops dead and
    already-crawled ones, and stores the survivors so the caller can read
    them with ``getDatas()`` after ``join()``.

    Shared state: newly accepted links are appended to the module-level
    ``crawledUrl`` list and failed links to the module-level ``deadUrl`` list.
    """

    # Serialises the check-then-append on crawledUrl so that two worker
    # threads cannot both accept the same link.
    _crawledLock = threading.Lock()
    # href prefixes that never lead to a crawlable page.
    _badProtocols = frozenset(['javascript', 'mailto', 'tel', 'telnet'])
    # Schemes the crawler is able to fetch.
    _crawlProtocols = frozenset(['http', 'https'])

    def __init__(self, url):
        """Create a worker for ``url``; nothing is fetched until ``start()``."""
        threading.Thread.__init__(self)
        self.url = url
        # An empty list (not '' or None) so callers can always iterate it.
        self.linklist = []

    def urlStatus(self, url):
        """Return True if ``url`` answers with HTTP 200, otherwise False.

        Every URL that is not alive -- non-200 status or a failure to open
        it at all -- is appended to the module-level ``deadUrl`` list.
        """
        alive = False
        try:
            page = urllib.urlopen(url)
            try:
                alive = page.code == 200
            finally:
                # Always release the connection, even if reading .code fails.
                page.close()
        except Exception:
            # Unreachable host, malformed URL, ...: treated as a dead link.
            alive = False
        if not alive:
            deadUrl.append(url)
        return alive

    def judgeDomain(self, testLink):
        """Return True if ``testLink`` has the same network location as ``self.url``.

        Host names are case-insensitive, so the comparison ignores case.
        """
        domain = urlparse.urlparse(self.url).netloc.lower()
        return domain == urlparse.urlparse(testLink).netloc.lower()

    def getHtml(self, url):
        """Return the raw body of ``url``, or None if it cannot be fetched."""
        try:
            page = urllib.urlopen(url)
            try:
                return page.read()
            finally:
                page.close()
        except Exception:
            return None

    def _resolveLink(self, base, href):
        """Turn one raw href into an absolute same-domain http(s) URL, or None.

        Returns None for missing/blank hrefs, fragment-only hrefs, hrefs with
        a non-crawlable protocol, and links that leave the current domain.
        """
        if href is None:
            return None
        href = href.strip()
        if not href or href.startswith('#'):
            # Blank or pure in-page anchor: points back at the same page.
            return None
        # Explicit prefix test: URL parsers may not recognise e.g. "tel:123"
        # as a scheme and would otherwise treat it as a relative path.
        if href.split(':', 1)[0].lower() in self._badProtocols:
            return None
        # Resolve against the page the link was found on and drop the
        # fragment so "#a" / "#b" variants of one page count as one URL.
        absolute = urlparse.urldefrag(urlparse.urljoin(base, href))[0]
        if urlparse.urlparse(absolute).scheme.lower() not in self._crawlProtocols:
            return None
        # Checked after resolution so "//other.host/x" cannot slip through.
        if not self.judgeDomain(absolute):
            return None
        return absolute

    def getLink(self, url):
        """Return the new, live, same-domain links found on the page at ``url``.

        This is the function executed by the worker thread. Each returned
        link has been appended to the module-level ``crawledUrl`` list and is
        printed. An empty list is returned when the page cannot be fetched or
        contains no new links.
        """
        html = self.getHtml(url)
        if html is None:
            return []
        soup = BeautifulSoup.BeautifulSoup(html)

        # A set removes duplicate links within the page.
        candidates = set()
        for anchor in soup.findAll('a'):
            absolute = self._resolveLink(url, anchor.get('href'))
            if absolute is not None:
                candidates.add(absolute)

        newLinks = []
        for link in candidates:
            # Cheap membership test first: skip the network round-trip for
            # links that are already known.
            if link in crawledUrl:
                continue
            if not self.urlStatus(link):
                continue
            with self._crawledLock:
                # Re-check under the lock: another worker may have accepted
                # the link while this one was probing it.
                if link in crawledUrl:
                    continue
                crawledUrl.append(link)
            newLinks.append(link)

        for link in newLinks:
            print(link)
        return newLinks

    def run(self):
        """Thread entry point: crawl ``self.url`` and keep the result."""
        self.linklist = self.getLink(self.url)

    def getDatas(self):
        """Return the links collected by ``run()`` (empty list before it ran)."""
        return self.linklist

这里的多线程采用继承类的方式：CrawlThread 继承 threading.Thread 类并重写其 run() 方法，把需要多线程执行的代码放到 run() 方法中即可。
控制线程数：最多同时运行 m 个线程，等这一批 m 个线程全部结束后，再启动下一批 m 个线程。

控制爬虫深度：采用广度优先的爬取策略，在外层加一个循环判断当前爬虫深度即可。

代码如下：

# Breadth-first traversal: crawl every URL reachable within the given depth
def crawlDepth(url, depth, maxThread):
    """Crawl ``url`` breadth-first down to ``depth`` levels.

    Level 1 is crawled by a single worker thread; every further level is
    crawled in batches of at most ``maxThread`` CrawlThread workers, each
    batch being joined before the next one is started.

    Parameters:
        url: start page.
        depth: number of levels to crawl; 0 means "do not crawl".
        maxThread: maximum number of worker threads running at once.

    Returns:
        ``url`` itself (a string) when ``depth`` is 0 -- kept for backward
        compatibility -- otherwise the module-level ``crawledUrl`` list.

    Raises:
        ValueError: if more than one level is requested with ``maxThread``
            below 1, which would otherwise never make progress.
    """
    if depth == 0:
        return url
    if depth > 1 and maxThread < 1:
        # With an empty pool the frontier would never drain (endless loop).
        raise ValueError('maxThread must be at least 1')

    nowDepth = 1
    print('爬虫深度： %s' % nowDepth)
    # Depth 1: a single worker collects the links of the start page.
    th = CrawlThread(url)
    th.daemon = True
    th.start()
    th.join()
    # The worker may hand back None/'' when it found nothing; normalise to a
    # list so the frontier can always be iterated and popped.
    testLinks = list(th.getDatas() or [])
    print('testLinks: %s' % (testLinks,))

    while nowDepth < depth and testLinks:
        nowDepth = nowDepth + 1
        print('爬虫深度： %s' % nowDepth)
        tmpLinks = []  # links discovered on this level
        while testLinks:
            # Start one batch of at most maxThread workers.
            threadpool = []
            while testLinks and len(threadpool) < maxThread:
                t = CrawlThread(testLinks.pop())
                t.daemon = True
                threadpool.append(t)
                t.start()
            # Wait for the whole batch and gather what each worker found.
            for thread in threadpool:
                thread.join()
                tmp = thread.getDatas()
                if tmp:
                    tmpLinks.extend(tmp)
        # The de-duplicated finds of this level are the next level's frontier.
        testLinks = list(set(tmpLinks))
    return crawledUrl

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航