您的位置:首页 > 编程语言 > Python开发

python解析html/xml

2013-01-05 08:31 344 查看
解析html

from HTMLParser import HTMLParser
import sys

class TestParser(HTMLParser):
    """Collect the text of ``<title>`` and the inner markup of ``<body>``.

    Feed a document through ``feed()``; afterwards ``gettitle()`` returns the
    text seen inside ``<title>`` and ``getbody()`` returns the text plus the
    (attribute-less) tags seen inside ``<body>``.
    """

    def __init__(self):
        self.title = ''
        self.readingtitle = 0   # non-zero while inside <title>
        self.body = ''
        self.readingbody = 0    # non-zero while inside <body>
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Echo opening tags nested in <body> and track title/body state."""
        # The check comes before the flag is set, so the <body> tag itself
        # is not echoed. Attributes are not reproduced.
        if self.readingbody:
            self.body += '<' + tag + '>'
        if tag == 'title':
            self.readingtitle = 1
        elif tag == 'body':
            self.readingbody = 1

    def handle_data(self, data):
        """Append text to the title or the body, whichever is being read."""
        if self.readingtitle:
            self.title += data
        elif self.readingbody:
            self.body += data

    def handle_endtag(self, tag):
        """Echo closing tags nested in <body> and track title/body state."""
        if tag == 'title':
            self.readingtitle = 0
        elif tag == 'body':
            self.readingbody = 0
        # The flag is cleared first, so </body> itself is not echoed.
        if self.readingbody:
            # Closing tags need the slash; the original emitted '<tag>' here.
            self.body += '</' + tag + '>'

    def gettitle(self):
        """Return the accumulated title text."""
        return self.title

    def getbody(self):
        """Return the accumulated body markup."""
        return self.body

# Usage: testparser.py test.html
# (pass sys.argv[1] to open() to take the file name from the command line)
# The with-block closes the file as soon as its contents have been read.
with open("test.html") as fd:
    markup = fd.read()
tp = TestParser()
tp.feed(markup)
# Single-argument print form: same output under Python 2 and Python 3.
print("Title is: %s" % tp.gettitle())
print("Body is: %s" % tp.getbody())


对于不规范的 HTML(比如缺少闭合标签),可以先用 TidyLib 修正后再解析。

test.html

<html>
<head>
<title>Document Title</title>
</head>
<body>
<p>This is a text</p>
</body>
</html>
输出

>>>
Title is: Document Title
Body is:
<p>This is a text<p>
解析xml

from xml.dom import minidom, Node
import sys

def scanNode(node, level = 0):
    """Print one line per DOM node, depth-first, prefixed with its depth.

    Each line is ``<level> <node class name>``; element nodes additionally
    get ``, tag: <tagName>``.

    Args:
        node: the DOM node to start from (e.g. the result of minidom.parse).
        level: depth printed for ``node``; children are printed with
            ``level + 1``.
    """
    msg = node.__class__.__name__
    if node.nodeType == Node.ELEMENT_NODE:
        msg += ", tag: " + node.tagName
    # Single-argument print form: same output under Python 2 and Python 3.
    print("%s %s" % (level, msg))
    # hasChildNodes is a method; the original tested the bound method object
    # itself, which is always true.
    if node.hasChildNodes():
        for child in node.childNodes:
            scanNode(child, level + 1)

# Run as: testparser.py test.xml
# Parse the sample file, then dump its node tree starting from depth 0.
xml_path = "test.xml"
doc = minidom.parse(xml_path)
scanNode(doc, 0)
test.xml

<?xml version="1.0" encoding="utf-8" standalone="no"?>
<books>
<book author="moyan">
<name>fengrufeitun</name>
<price>12</price>
</book>
</books>
输出

>>>
0 Document
1 Element, tag: books
2 Text
2 Element, tag: book
3 Text
3 Element, tag: name
4 Text
3 Text
3 Element, tag: price
4 Text
3 Text
2 Text
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: