您的位置:首页 > 编程语言 > Python开发

提取网址的python练习

2015-07-23 17:40 567 查看
import urllib, urllib2, cookielib
from HTMLParser import HTMLParser
import sys

# Python 2 hack: site.py deletes sys.setdefaultencoding at startup, so the
# module must be reloaded to get it back.  Setting the default to UTF-8 makes
# implicit str<->unicode conversions (e.g. when printing page content) use
# UTF-8 instead of ASCII, avoiding UnicodeDecodeError on Chinese pages.
reload(sys)
sys.setdefaultencoding('utf8')

class WebParser(HTMLParser):
    """Collect hyperlink targets from an HTML document into a shared set.

    Absolute ``http(s)`` hrefs are stored as-is; hrefs beginning with ``/``
    are resolved by prefixing *path* (the page URL used as a base).  Every
    other href form (relative paths, fragments, ``mailto:``, ...) is
    ignored, matching the original behaviour.
    """

    def __init__(self, links, path):
        HTMLParser.__init__(self)
        self.links = links  # shared set; caller reads results from here
        self.path = path    # base URL prepended to root-relative hrefs

    def handle_starttag(self, tag, attrs):
        """Record the href target of each ``<a>`` tag encountered."""
        if tag != 'a':
            return
        for key, val in attrs:
            # Guard against valueless attributes (<a href>), which yield
            # val=None and would crash val.startswith().
            if key != 'href' or not val:
                continue
            if val.startswith('http'):
                self.links.add(val)
            elif val.startswith('/'):
                self.links.add(self.path + val)

class Crawl:
def __init__(self):
self.path = 'http://www.baidu.com'
self.cookie = cookielib.CookieJar()
handler = urllib2.HTTPCookieProcessor(self.cookie)
self.opener = urllib2.build_opener(handler)

def open(self, path):
self.response = self.opener.open(path)

def showCookie(self):
for item in self.cookie:
print 'Name = ' + item.name
print 'value = ' + item.value

def showResponse(self):
print self.response.read()

def getAllUrl(self, links, path):
try:
self.open(path)
res = self.response.read()
parser = WebParser(links, path)
parser.feed(res)
parser.close()
except Exception, e:
print e

def crawl(self):
src_links = set()
result_links = set()
self.getAllUrl(src_links, self.path)
n = 200
while len(src_links) != 0 and n > 0:
link = src_links.pop()
if link in result_links:
pass
result_links.add(link)
self.getAllUrl(src_links, link)
n -= 1
print n

return result_links | src_links

c = Crawl()
rlt = c.crawl()
for link in rlt:
print link
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: