HTMLParser python usage
2013-12-02 22:07
267 查看
'''
Created on 2013-12-2
http://cloudaice.com/yong-pythonde-htmlparserfen-xi-htmlye-mian/
@author: Administrator
'''
from HTMLParser import HTMLParser
import urllib
import sys
class ParseLinks(HTMLParser):
    """Collect the text of every ``<a href=...>`` link fed to the parser.

    After ``feed()``, ``self.data`` holds one entry per anchor that has an
    ``href`` attribute and non-blank text, in document order.  All
    whitespace is removed from the text, so ``"Hello World"`` is stored as
    ``"HelloWorld"``.
    """

    def __init__(self):
        # Explicit base-class call, kept from the original code.
        HTMLParser.__init__(self)
        # Link texts collected so far, in document order.
        self.data = []
        # 1 while inside an <a> element that carries an href, otherwise 0.
        self.href = 0
        # Text accumulated for the link currently being read.
        self.linkname = ''

    def handle_starttag(self, tag, attrs):
        """Start capturing text when an ``<a>`` tag has an ``href``."""
        if tag != 'a':
            return
        for name, _value in attrs:
            if name == 'href':
                self.href = 1
                break  # one href is enough; skip the remaining attributes

    def handle_data(self, data):
        """Append ``data`` to the current link text while inside a link."""
        if self.href:
            self.linkname += data

    def handle_endtag(self, tag):
        """On ``</a>``, store the captured text and reset the capture state."""
        if tag != 'a':
            return
        # split() + join removes every whitespace character, so no further
        # strip() is needed afterwards.
        text = ''.join(self.linkname.split())
        if text:
            self.data.append(text)
        self.linkname = ''
        self.href = 0

    def getresult(self):
        """Print each collected link text on its own line."""
        for value in self.data:
            # Single-argument call form prints the same on Python 2 and 3.
            print(value)
if __name__ == "__main__":
    import contextlib

    MyParser = ParseLinks()
    # closing() guarantees the HTTP response is released even if read() or
    # feed() raises.
    with contextlib.closing(
            urllib.urlopen("http://www.python.org/index.html")) as response:
        MyParser.feed(response.read())
    # close() makes the parser process any input it is still buffering, so
    # it has to run before the results are printed, not after.
    MyParser.close()
    MyParser.getresult()
上面的解析实现了下面的功能,还是正则比较方便
import re

# Regex shortcut for the same job: capture whatever sits between <a ...> and
# </a>.  `html` must already hold the page source as a string.  Unlike the
# parser class, the captured text is raw markup (nested tags and whitespace
# are kept).
#   \b     - stops the pattern from matching other tags that merely start
#            with "a", such as <abbr> or <article>
#   [^>]*  - keeps the attribute part inside the opening tag
#   re.S   - lets "." cross line breaks so link text spanning several lines
#            is found (re.M only changes ^ and $, which are not used here)
p = re.compile(r'<a\b[^>]*>(.*?)</a>', re.I | re.S)
match = p.findall(html)
print(match)
'''
Created on 2013-12-2
http://cloudaice.com/yong-pythonde-htmlparserfen-xi-htmlye-mian/
@author: Administrator
'''
from HTMLParser import HTMLParser
import urllib
import sys
class ParseLinks(HTMLParser):
    """Collect the text of every ``<a href=...>`` link fed to the parser.

    After ``feed()``, ``self.data`` holds one entry per anchor that has an
    ``href`` attribute and non-blank text, in document order.  All
    whitespace is removed from the text, so ``"Hello World"`` is stored as
    ``"HelloWorld"``.
    """

    def __init__(self):
        # Explicit base-class call, kept from the original code.
        HTMLParser.__init__(self)
        # Link texts collected so far, in document order.
        self.data = []
        # 1 while inside an <a> element that carries an href, otherwise 0.
        self.href = 0
        # Text accumulated for the link currently being read.
        self.linkname = ''

    def handle_starttag(self, tag, attrs):
        """Start capturing text when an ``<a>`` tag has an ``href``."""
        if tag != 'a':
            return
        for name, _value in attrs:
            if name == 'href':
                self.href = 1
                break  # one href is enough; skip the remaining attributes

    def handle_data(self, data):
        """Append ``data`` to the current link text while inside a link."""
        if self.href:
            self.linkname += data

    def handle_endtag(self, tag):
        """On ``</a>``, store the captured text and reset the capture state."""
        if tag != 'a':
            return
        # split() + join removes every whitespace character, so no further
        # strip() is needed afterwards.
        text = ''.join(self.linkname.split())
        if text:
            self.data.append(text)
        self.linkname = ''
        self.href = 0

    def getresult(self):
        """Print each collected link text on its own line."""
        for value in self.data:
            # Single-argument call form prints the same on Python 2 and 3.
            print(value)
if __name__ == "__main__":
    import contextlib

    MyParser = ParseLinks()
    # closing() guarantees the HTTP response is released even if read() or
    # feed() raises.
    with contextlib.closing(
            urllib.urlopen("http://www.python.org/index.html")) as response:
        MyParser.feed(response.read())
    # close() makes the parser process any input it is still buffering, so
    # it has to run before the results are printed, not after.
    MyParser.close()
    MyParser.getresult()
上面的解析实现了下面的功能,还是正则比较方便
import re

# Regex shortcut for the same job: capture whatever sits between <a ...> and
# </a>.  `html` must already hold the page source as a string.  Unlike the
# parser class, the captured text is raw markup (nested tags and whitespace
# are kept).
#   \b     - stops the pattern from matching other tags that merely start
#            with "a", such as <abbr> or <article>
#   [^>]*  - keeps the attribute part inside the opening tag
#   re.S   - lets "." cross line breaks so link text spanning several lines
#            is found (re.M only changes ^ and $, which are not used here)
p = re.compile(r'<a\b[^>]*>(.*?)</a>', re.I | re.S)
match = p.findall(html)
print(match)
相关文章推荐
- python web with bottle and session (beaker)
- Beaker 1.6.4 : Python Package Index
- [修]python普通继承方式和super继承方式
- [转]Python tips: 什么是*args和**kwargs?
- Python 程序员应该知道的 10 个库
- Windows下使用Python读取Excel表格数据
- 第一个Python程序的Hello Python,竟然有问题
- Eclipse上运行Python,使用PyDev
- python 模块和包
- python 文件的读取&更新
- python 内建函数
- python 关于配置文件,日志,传参总结
- python switch函数
- python lambda函数
- python 函数多实参处理
- python return语句
- python函数全局变量和局部变量
- python学习笔记
- python函数形参和实参
- 结尾非零数的奇偶性 分类: python 小练习 2013-12-02 18:04 371人阅读 评论(0) 收藏