您的位置：首页 > 编程语言 > Python开发

python解析html/xml

2013-01-05 08:31 344 查看

解析html

import sys

try:
    from html.parser import HTMLParser  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2

class TestParser(HTMLParser):
    """Collect the <title> text and the inner markup of <body> from HTML.

    Feed markup with ``feed()``; afterwards ``gettitle()`` returns the text
    found inside ``<title>`` and ``getbody()`` returns the text and tags found
    inside ``<body>``. Tags inside the body are re-emitted by name only:
    attributes passed to ``handle_starttag`` are not preserved.
    """

    def __init__(self):
        # Explicit base-class call (not super()) so this also works where
        # HTMLParser is an old-style class, as in Python 2.
        HTMLParser.__init__(self)
        self.title = ''
        self.readingtitle = False
        self.body = ''
        self.readingbody = False

    def handle_starttag(self, tag, attrs):
        """Record an opening tag if inside <body>; track <title>/<body> entry."""
        # Checked before the flags are updated so that the <body> tag itself
        # is not included in the collected body markup.
        if self.readingbody:
            self.body += '<' + tag + '>'
        if tag == 'title':
            self.readingtitle = True
        elif tag == 'body':
            self.readingbody = True

    def handle_data(self, data):
        """Append text to the title or the body, depending on the position."""
        if self.readingtitle:
            self.title += data
        elif self.readingbody:
            self.body += data

    def handle_endtag(self, tag):
        """Record a closing tag if inside <body>; track <title>/<body> exit."""
        if tag == 'title':
            self.readingtitle = False
        elif tag == 'body':
            self.readingbody = False
        # Checked after the flags are updated so that </body> itself is not
        # included in the collected body markup.
        if self.readingbody:
            # Closing tags need the leading slash; emitting '<tag>' here
            # produced output such as '<p>text<p>'.
            self.body += '</' + tag + '>'

    def gettitle(self):
        """Return the text collected from inside <title>."""
        return self.title

    def getbody(self):
        """Return the text and tags collected from inside <body>."""
        return self.body

# Usage: testparser.py [test.html]
# Parses the file named on the command line (default: test.html) and prints
# the title and body collected by TestParser.
path = sys.argv[1] if len(sys.argv) > 1 else "test.html"
with open(path) as fd:  # the file is closed even if read() raises
    html = fd.read()
tp = TestParser()
tp.feed(html)
tp.close()  # process any input the parser is still buffering
# %-formatting with a single argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print("Title is: %s" % tp.gettitle())
print("Body is: %s" % tp.getbody())

对于不严格的 HTML（比如缺少关闭标签），可以先使用 TidyLib 进行修正。

test.html

<html>
<head>
<title>Document Title</title>
</head>
<body>
<p>This is a text</p>
</body>
</html>

输出

>>>
Title is: Document Title
Body is:
<p>This is a text<p>

解析xml

from xml.dom import minidom, Node
import sys

def scanNode(node, level=0):
    """Print one line per DOM node, depth-first, starting at ``node``.

    Each line is the nesting depth followed by the node's class name; for
    element nodes the tag name is appended as ``", tag: <name>"``.

    Args:
        node: The DOM node to start from (e.g. the result of minidom.parse).
        level: Depth printed for ``node``; children are printed with level + 1.
    """
    msg = node.__class__.__name__
    if node.nodeType == Node.ELEMENT_NODE:
        msg += ", tag: " + node.tagName
    # %-formatting prints "level msg" identically under the Python 2 print
    # statement and the Python 3 print function.
    print("%s %s" % (level, msg))
    # hasChildNodes is a method: without the call the bound method object is
    # always truthy and the check does nothing.
    if node.hasChildNodes():
        for child in node.childNodes:
            scanNode(child, level + 1)

# Usage: testparser.py [test.xml]
# Parses the file named on the command line (default: test.xml) and prints
# its DOM tree with scanNode.
xml_path = sys.argv[1] if len(sys.argv) > 1 else "test.xml"
doc = minidom.parse(xml_path)
scanNode(doc)

test.xml

<?xml version="1.0" encoding="utf-8" standalone="no"?>
<books>
<book author="moyan">
<name>fengrufeitun</name>
<price>12</price>
</book>
</books>

输出

>>>
0 Document
1 Element, tag: books
2 Text
2 Element, tag: book
3 Text
3 Element, tag: name
4 Text
3 Text
3 Element, tag: price
4 Text
3 Text
2 Text

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航