Python 爬虫程序：从起始网址出发，持续爬取页面中的各个链接
2015-10-07 17:20
405 查看
# coding=utf-8
"""Breadth-first link crawler.

Starting from ``urlbase``, download the page, collect every absolute
``http...`` target found in an ``a href="..."`` attribute, then download
each collected page in turn and queue the links it contains that have not
been seen before.  Every link is printed as it is visited, and the complete
list is printed once more at the end.

The syntax used here is valid on both Python 2.6+ and Python 3.
"""
import functools
import os
import re
import time
import urllib
from contextlib import closing
from threading import Thread

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request provides urlopen / Request / build_opener.
    import urllib.request as urllib2

FETCH_TIMEOUT = 5   # seconds allowed for one page download
MAX_LINKS = 10000   # default queue size after which no further pages are fetched

urlbase = 'https://baidu.com'
patt = 'a href=\"(http.+?)\"'
p = re.compile(patt)


class TimeoutException(Exception):
    """Raised when a call wrapped by ``timelimited`` exceeds its time limit."""


# Private hook of the Python 2 ``Thread`` class (name-mangled ``__stop``).
# It is absent on Python 3, in which case this is None and is never called.
ThreadStop = getattr(Thread, '_Thread__stop', None)


def timelimited(timeout):
    """Return a decorator that limits a call to ``timeout`` seconds.

    The wrapped function runs on a separate thread and the caller waits for
    at most ``timeout`` seconds.

    Returns (from the wrapped call):
        The function's return value, or ``None`` if the function raised an
        exception other than ``TimeoutException`` (the error is dropped, as
        callers in this file treat ``None`` as "nothing to process").

    Raises (from the wrapped call):
        TimeoutException: the function did not finish within ``timeout``
            seconds, or it raised ``TimeoutException`` itself.
    """
    def decorator(function):
        @functools.wraps(function)
        def decorator2(*args, **kwargs):
            outcome = {}

            def runner():
                try:
                    outcome['result'] = function(*args, **kwargs)
                except Exception as e:
                    outcome['error'] = e

            worker = Thread(target=runner)
            # Daemon, so a worker abandoned after a timeout cannot keep the
            # interpreter from exiting.
            worker.daemon = True
            worker.start()
            worker.join(timeout)

            error = outcome.get('error')
            if worker.is_alive() or isinstance(error, TimeoutException):
                if ThreadStop is not None and worker.is_alive():
                    ThreadStop(worker)
                raise TimeoutException('timeout for %s' % (repr(function)))
            if error is None:
                return outcome.get('result')
            return None
        return decorator2
    return decorator


@timelimited(FETCH_TIMEOUT)
def fn_1(url):
    """Download ``url`` and return the raw response body.

    The socket timeout makes a stalled download end on its own instead of
    leaving the worker thread blocked after ``timelimited`` has given up.
    Failures (including HTTP error statuses raised by ``urlopen``) surface
    as ``None`` or ``TimeoutException`` through the ``timelimited`` wrapper.
    """
    with closing(urllib2.urlopen(url, timeout=FETCH_TIMEOUT)) as response:
        return response.read()


def _to_text(raw):
    """Return ``raw`` as ``str`` so the ``str`` pattern ``p`` can search it."""
    if isinstance(raw, str):
        return raw
    # Python 3 responses are bytes; undecodable bytes are replaced.
    return raw.decode('utf-8', 'replace')


def _extract_links(page):
    """Return every link matched by ``p`` in ``page``, in document order."""
    return p.findall(_to_text(page))


def _add_new(queue, seen, links):
    """Append to ``queue`` each link not yet in ``seen``, recording it."""
    for link in links:
        if link not in seen:
            seen.add(link)
            queue.append(link)


def downHtml(url, max_links=MAX_LINKS):
    """Crawl outward from ``url`` and print every link discovered.

    Args:
        url: page whose links seed the crawl.
        max_links: once the queue holds more than this many links, stop
            fetching further pages.

    Returns:
        The list of distinct links, in discovery order.

    Raises:
        Whatever ``urllib2`` raises if the start page itself cannot be
        fetched; failures on later pages are skipped.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    opener = urllib2.build_opener()
    with closing(opener.open(request, timeout=FETCH_TIMEOUT)) as f:
        doc = f.read()

    m = []
    seen = set()
    _add_new(m, seen, _extract_links(doc))

    index = 1
    position = 0
    # The queue grows while it is being walked, so step through it by
    # position rather than iterating the list directly.
    while position < len(m):
        link = m[position]
        position += 1
        print('%s %s' % (index, link))
        index += 1
        if len(m) > max_links:
            break
        try:
            data = fn_1(link)
        except Exception:
            # Timeout on this page: move on to the next link.
            continue
        if data is None:
            continue
        _add_new(m, seen, _extract_links(data))

    # Final listing of everything collected; numbering continues from above.
    for link in m:
        print('%s %s' % (index, link))
        index += 1
    return m


if __name__ == '__main__':
    downHtml(urlbase)
    print('down')
相关文章推荐
- 简洁之美 -约瑟夫环的python 解法
- python os相关操作
- Python处理JSON
- [译]学习IPython进行交互式计算和数据可视化(三)
- [译]学习IPython进行交互式计算和数据可视化(二)
- [译]学习IPython进行交互式计算和数据可视化(一)
- Python GIL 多线程机制 (C source code)
- Python中基本语法
- python序列处理函数
- 《机器学习实战》kMeans算法(K均值聚类算法)
- 《机器学习实战》二分-kMeans算法(二分K均值聚类)
- 【Python】测试题
- 【Python】Learn Python the hard way, ex15 读取文件
- python zip用法
- 【Python】Learn Python the hard way, ex14 argv参数传值
- 使用 Python 进行线程编程
- python模糊查询
- Python标准库的学习准备
- Python标准库——走马观花
- <PY>感知机