
Implementing a Web Crawler in Python


1. Introduction

The crawler consists of two classes: Crawler, which manages the entire crawling process, and Retriever, which downloads and parses each individual Web page.

2. The Program

#!/usr/bin/env python

from sys import argv
from os import makedirs,unlink,sep
from os.path import dirname,exists,isdir,splitext
from string import replace,find,lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse,urljoin
from formatter import DumbWriter,AbstractFormatter
from cStringIO import StringIO

class Retriever(object):                # downloads Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)            # local directory
        if sep != '/':                  # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):             # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):                 # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):         # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist

class Crawler(object):                  # manages the entire crawling process
    count = 0                           # static downloaded page counter

    def __init__(self, url):
        self.q = [url]                  # queue of URLs to download
        self.seen = []                  # URLs already processed
        self.dom = urlparse(url)[1]     # domain of the starting URL

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0].startswith('***'): # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and \
                    find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                       # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)

def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
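
The program above is written for Python 2: modules such as htmllib and cStringIO and the string-module functions it imports are not available in Python 3, and urllib/urlparse were reorganized into urllib.request and urllib.parse. Under Python 2 the crawler is started with the target URL as its only command-line argument, e.g. python crawl.py http://www.example.com (crawl.py stands for whatever name the script is saved under). As a minimal sketch of how the link-extraction step in parseAndGetLinks() could be written for Python 3 with html.parser instead of htmllib, assuming the page has already been downloaded to index.htm (the class name AnchorParser is illustrative, not part of the original program):

# Minimal Python 3 sketch of the link-extraction step; AnchorParser is
# an illustrative name, not part of the original program.
from html.parser import HTMLParser

class AnchorParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.anchorlist = []            # collected href values, like parser.anchorlist above

    def handle_starttag(self, tag, attrs):
        if tag == 'a':                  # record the href of every <a> tag
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)

parser = AnchorParser()
with open('index.htm', encoding='utf-8', errors='ignore') as f:
    parser.feed(f.read())
print(parser.anchorlist)

Unlike the old htmllib parser, html.parser.HTMLParser has no anchorlist attribute of its own, so the subclass collects the links itself in handle_starttag().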