您的位置:首页 > 编程语言 > Python开发

Python 爬虫程序:从根网址开始,不断爬取各个网址

2015-10-07 17:20 405 查看
#coding=utf-8
import urllib2
import urllib
import re
import os
from threading import Thread
import time

class TimeoutException(Exception):
    """Raised when a time-limited call does not finish within its limit."""
# Grab Thread's name-mangled private ``__stop`` method. It is an interpreter
# internal, so fall back to None when it is absent instead of failing at
# import time with AttributeError.
ThreadStop = getattr(Thread, '_Thread__stop', None)
def timelimited(timeout):
def decorator(function):
def decorator2(*args,**kwargs):
class TimeLimited(Thread):
def __init__(self,_error= None,):
Thread.__init__(self)
self._error =  _error

def run(self):
try:
self.result = function(*args,**kwargs)
except Exception,e:
self._error =e

def _stop(self):
if self.isAlive():
ThreadStop(self)

t = TimeLimited()
t.start()
t.join(timeout)

if isinstance(t._error,TimeoutException):
t._stop()
raise TimeoutException('timeout for %s' % (repr(function)))

if t.isAlive():
t._stop()
raise TimeoutException('timeout for %s' % (repr(function)))

if t._error is None:
return t.result

return decorator2
return decorator

@timelimited(5)
def fn_1(url):
    """Download ``url`` and return the raw response body.

    The ``timelimited(5)`` decorator caps the whole call at five seconds.

    Args:
        url: Address to fetch.

    Returns:
        The bytes read from the response.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # Always release the connection, even when the read fails.
        response.close()
# Page the crawl starts from.
urlbase = 'https://baidu.com'

# Matches ``a href="..."`` and captures the quoted value when it starts
# with "http"; relative links are therefore ignored.
patt = 'a href=\"(http.+?)\"'
p = re.compile(patt)
def downHtml(url, max_links=10000):
    """Crawl outward from ``url`` and print every absolute link found.

    The start page is fetched with a browser-like User-Agent. Every link
    matched by the module-level pattern ``p`` is printed with a running
    number and then fetched through ``fn_1``; links found on those pages
    that have not been seen before are appended to the work list. After the
    crawl ends, the complete list is printed once more, with the numbering
    continuing from where the crawl left off.

    Args:
        url: Address of the start page.
        max_links: Stop following links once the work list holds more than
            this many entries.

    Returns:
        None. Results are written to standard output.
    """
    # NOTE: an earlier, disabled draft saved each fetched page under
    # g:/downHtml/<n>/<n>.html. It was dead code (a bare string literal) and
    # has been dropped together with the locals only it used.
    request = urllib2.Request(url)
    request.add_header('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    response = urllib2.build_opener().open(request)
    try:
        doc = response.read()
    finally:
        response.close()

    links = re.findall(p, doc)
    # Track known links once instead of rebuilding a set on every iteration.
    seen = set(links)
    index = 1

    # ``links`` doubles as the work queue: it grows while being processed,
    # so walk it by position rather than iterating it directly.
    position = 0
    while position < len(links):
        link = links[position]
        position += 1

        print('%d %s' % (index, link))
        index += 1
        if len(links) > max_links:
            break

        try:
            data = fn_1(link)
        except Exception:
            # Best effort: skip pages that time out or fail to download,
            # but let KeyboardInterrupt / SystemExit propagate.
            continue
        if data is None:
            continue

        for found in re.findall(p, data):
            if found not in seen:
                seen.add(found)
                links.append(found)

    # Final listing of everything collected; numbering continues.
    for link in links:
        print('%d %s' % (index, link))
        index += 1
# Run the crawl from the configured start page, then report completion.
downHtml(urlbase)
print('down')
内容来自用户分享和网络整理,不保证内容的准确性;如有侵权内容,可联系管理员处理。(点击这里给我发消息)
标签: