您的位置:首页 > 编程语言 > Python开发

Python HTMLParser usage

2013-12-02 22:07 267 查看
'''

Created on 2013-12-2

http://cloudaice.com/yong-pythonde-htmlparserfen-xi-htmlye-mian/

@author: Administrator

'''

import re
import sys
import urllib
from HTMLParser import HTMLParser

class ParseLinks(HTMLParser):
    """Collect the text of every ``<a>`` element that carries an ``href``.

    Feed HTML with ``feed()``; the text found between each ``<a href=...>``
    and its closing ``</a>`` is stored in ``data`` in document order.

    Attributes:
        data: list of collected link texts (whitespace-normalised, non-empty).
        href: truthy while the parser is inside an ``<a>`` tag that has an
            ``href`` attribute.
        linkname: text accumulated so far for the anchor being read.
    """

    def __init__(self):
        # Initialise the base parser before adding our own state.
        HTMLParser.__init__(self)
        self.data = []
        self.href = False
        self.linkname = ''

    def handle_starttag(self, tag, attrs):
        """Start recording text when an ``<a>`` tag with ``href`` opens."""
        if tag == 'a' and any(name == 'href' for name, _ in attrs):
            self.href = True

    def handle_data(self, data):
        """Append text to the current link name while inside a link."""
        if self.href:
            self.linkname += data

    def handle_endtag(self, tag):
        """On ``</a>``, store the normalised link text and reset the state."""
        if tag != 'a':
            # Closing tags nested inside the anchor (e.g. </b>) must not
            # interrupt the link text being collected.
            return

        # Collapse every run of whitespace (including newlines) to a single
        # space and trim the ends. Joining with '' instead would glue
        # separate words together ("About Us" -> "AboutUs").
        name = ' '.join(self.linkname.split())
        if name:
            self.data.append(name)

        self.linkname = ''
        self.href = False

    def getresult(self):
        """Print each collected link text on its own line."""
        for value in self.data:
            # Parenthesised form prints the same single value on Python 2
            # and is also valid on Python 3.
            print(value)

if __name__ == "__main__":
    # Download the page; the try/finally releases the connection even when
    # read() raises.
    response = urllib.urlopen("http://www.python.org/index.html")
    try:
        page = response.read()
    finally:
        response.close()

    MyParser = ParseLinks()
    MyParser.feed(page)
    # close() makes the parser process any input it is still buffering, so it
    # has to run before the results are printed.
    MyParser.close()
    MyParser.getresult()

上面的 HTMLParser 解析与下面的正则表达式实现的是同样的功能,不过还是用正则更方便:

# Regex-based alternative to the ParseLinks class: capture the inner markup
# of every <a ...>...</a> element found in `html`.
# NOTE(review): `html` is not defined in this snippet; presumably it holds the
# page source as a string -- confirm against the caller.
#
# - `<a\b[^>]*>` matches only anchor tags; the looser `<a.*?>` would also
#   start a match at <abbr>, <address>, <article>, ...
# - re.S lets `.` cross newlines so link text spanning several lines is
#   captured. re.M is not used because it only changes `^`/`$`, which this
#   pattern does not contain.
p = re.compile(r'<a\b[^>]*>(.*?)</a>', re.I | re.S)

match = p.findall(html)

# Parenthesised form prints the same list on Python 2 and parses on Python 3.
print(match)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: