python实现网络爬虫
2012-06-18 22:41
585 查看
一.简介
该爬虫程序包含2个类：一个管理整个crawling进程(Crawler)，一个检索并解析每一个下载的web页面(Retriever)。
二. 程序
#!/usr/bin/env python from sys import argv from os import makedirs,unlink,sep from os.path import dirname,exists,isdir,splitext from string import replace,find,lower from htmllib import HTMLParser from urllib import urlretrieve from urlparse import urlparse,urljoin from formatter import DumbWriter,AbstractFormatter from cStringIO import StringIO class Retriever(object): #download web pages def __init__(self,url): self.url = url self.file = self.filename(url) def filename(self,url,deffile='index.htm'): parsedurl = urlparse(url,'http:',0) ## parse path path = parsedurl[1] + parsedurl[2] ext = splitext(path) if ext[1] == '' : #no file,use default if path[-1] == '/': path += deffile else: path += '/' + deffile ldir = dirname(path) #local directory if sep != '/': # os-indep. path separator ldir = replace(ldir,'/',sep) if not isdir(ldir): # create archive dir if nec. if exists(ldir): unlink(ldir) makedirs(ldir) return path def download(self): #download Web page try: retval = urlretrieve(self.url,self.file) except IOError: retval = ('*** ERROR: invalid URL "%s"' % \ self.url,) return retval def parseAndGetLinks(self): #parse HTML,save links self.parser = HTMLParser(AbstractFormatter(\ DumbWriter(StringIO()))) self.parser.feed(open(self.file).read()) self.parse.close() return self.parser.anchorlist class Crawler(object): #manage entire crawling process count = 0 #static downloaded page counter def __init__(self,url): self.q = [url] self.seen = [] #have seen the url self.dom = urlparse(url)[1] def getPage(self,url): r = Retriever(url) retval = r.download() if retval[0] == '*': # error situation,do not parse print retval,'... 
skipping parse' return Crawler.count += 1 print '\n(',Crawler.count,')' print 'URL:',url print 'FILE:',retval[0] self.seen.append(url) links = r.parseAndGetLinks() #get and process links for eachLink in links: if eachLink[:4] != 'http' and \ find(eachLink,'://') == -1: eachLink = urljoin(url,eachLink) print '* ',eachLink, if find(lower(eachLink),'mailto:') != -1: print '... discarded,mailto link' continue if eachLink not in self.seen: if find(eachLink,self.dom) == -1: print '... discarded, not in domain' else: if eachLink not in self.q: self.q.append(eachLink) print '... new, added to Q' else: print '... discarded, already in Q' else: print '... discarded, already processed' def go(self): # process links in queue while self.q: url = self.q.pop() self.getPage(url) def main(): if len(argv) > 1: url = argv[1] else: try: url = raw_input('Enter starting URL:') except(KeyboardInterrupt,EOFError): url = '' if not url: return robot = Crawler(url) robot.go() if __name__ == '__main__': main()
相关文章推荐
- 利用python实现网络爬虫
- 5行python代码实现简单的网络爬虫
- python实现简单网络爬虫
- Python实现网络爬虫
- python3实现网络爬虫(7)-- 使用ip代理抓取网页
- Python 实现网络爬虫小程序
- Python爬虫实战三之实现山东大学无线网络掉线自动重连
- Python实现简单网络爬虫功能
- 用python实现网络爬虫
- Python 实现网络爬虫小程序
- python网络爬虫——基本概念及代码实现1
- Python实现网络爬虫
- PYTHON 实现 NBA 赛程查询工具(二)—— 网络爬虫
- 【Python开发】【神经网络与深度学习】网络爬虫之python实现
- Python:入门到实现网络爬虫 Day1
- Python 实现网络爬虫小程序
- Python--通过XPath实现网络爬虫
- python3实现网络爬虫(3)--BeautifulSoup使用(2)
- python使用rabbitmq实现网络爬虫示例
- 用Python实现网络爬虫