您的位置:首页 > 编程语言 > Python开发

[Python下载CSDN博客]2. 使用BeautifulSoup分析HTML(一)

2013-11-13 10:55 766 查看
BeautifulSoup比起HTMLParser操作起来会简单一点
(HTMLParser是边解析边回调使用, BeautifulSoup是全部解析完后再使用)

#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
# 获取博客文章分类列表和文章存档列表
# File: GetCategoryAndMonth.py
import urllib2
import HTMLParser
import httplib
from bs4 import BeautifulSoup

class CHYGetCategoryAndMonth:
    """Scrape a CSDN blog sidebar for either the category list or the
    monthly-archive list.

    Parser(htmlStr, type, list):
        htmlStr -- full HTML of the blog home page
        type    -- 0: collect categories from div#panel_Category;
                   1: collect archive months from the div following
                      div#panel_Archive
        list    -- output parameter; receives [href, text] pairs
    NOTE: parameter names ``type`` and ``list`` shadow builtins but are
    kept unchanged for caller compatibility.
    """

    def Parser(self, htmlStr, type, list):
        doc = BeautifulSoup(htmlStr)
        if 0 == type:
            for panel in doc.find_all("div", id="panel_Category"):
                head = panel.find("ul", class_="panel_head")
                # Only the panel whose header reads "文章分类" holds categories.
                if head.span.text != u"文章分类":
                    continue
                body = panel.find("ul", class_="panel_body")
                for entry in body.find_all("li"):
                    list.append([entry.a["href"], entry.a.text])
                break
        elif 1 == type:
            archive = doc.find("div", id="panel_Archive")
            # The archive entries live in the first <div> after the panel div.
            sibling = archive.find_next("div")
            for entry in sibling.find_all("li"):
                list.append([entry.a["href"], entry.a.text])
# NOTE(review): commented-out manual test harness (Python 2 httplib / print
# syntax), disabled via a bare triple-quoted string; left byte-identical.
'''
# 测试代码
if __name__ == '__main__':
conn = httplib.HTTPConnection("blog.csdn.net")
# 要模拟成IE发送, 否则CSDN不接受Python的请求
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headersP = { 'User-Agent' : user_agent }
conn.request(method = "GET", url = "/bagboy_taobao_com", headers = headersP)
r1 = conn.getresponse() # 获得响应
htmlByte = r1.read() # 获得HTML
htmlStr = htmlByte.decode("utf8") # 需要转换成utf8编码, 否则分析异常
my = CHYGetCategoryAndMonth()
list1 = []
my.Parser(htmlByte, 1, list1)
list0 = []
my.Parser(htmlByte, 0, list0)
print(list1)
print(list0)
'''
#!/usr/bin/env python
# Python 2.7.3
# 获取博客文章
# File: GetArticleList.py
import urllib2
import HTMLParser
import httplib
from bs4 import BeautifulSoup

class CHYGetArticleList:
    """Collect the URL of every article shown on one archive page.

    Parser(htmlStr, list):
        htmlStr -- HTML of an archive page (div#article_list present)
        list    -- output parameter; article hrefs are appended to it
    NOTE: ``list`` shadows the builtin but is kept for caller compatibility.
    """

    def Parser(self, htmlStr, list):
        doc = BeautifulSoup(htmlStr)
        container = doc.find("div", id="article_list")
        for item in container.find_all("div", class_="list_item article_item"):
            title_div = item.find("div", class_="article_title")
            # Each title block nests the link as h3 > span > a.
            list.append(title_div.h3.span.a["href"])

# NOTE(review): commented-out manual test harness (Python 2), disabled via a
# bare triple-quoted string; left byte-identical.
'''
# http://blog.csdn.net/bagboy_taobao_com/article/month/2013/10 # 测试代码
if __name__ == '__main__':
conn = httplib.HTTPConnection("blog.csdn.net")
# 要模拟成IE发送, 否则CSDN不接受Python的请求
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headersP = { 'User-Agent' : user_agent }
conn.request(method = "GET", url = "/bagboy_taobao_com/article/month/2013/10", headers = headersP)
r1 = conn.getresponse()				# 获得响应
htmlByte = r1.read()				# 获得HTML
htmlStr = htmlByte.decode("utf8")	# 需要转换成utf8编码, 否则分析异常
list = []
my = CHYGetArticleList()
my.Parser(htmlByte, list)
print(list)
'''

#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
# 获取博客文章
# File: GetArticle.py
import urllib2
import HTMLParser
import httplib
from bs4 import BeautifulSoup

class CHYGetArticle:
    """Extract one article's title and body text from its detail page.

    Parser(htmlStr, article):
        htmlStr -- HTML of an article detail page
        article -- output parameter, a 2-slot sequence:
                   article[0] receives the cleaned title,
                   article[1] receives the article body text
    """

    def Parser(self, htmlStr, article):
        doc = BeautifulSoup(htmlStr)
        title_div = doc.find("div", class_="article_title")
        # Clean the raw title in a local, then store the final value.
        title = title_div.h3.span.text
        title = title.replace("\n\r", "")
        article[0] = title.strip()
        content_div = doc.find("div", class_="article_content")
        article[1] = content_div.text

# NOTE(review): commented-out manual test harness (Python 2 `print >> f`
# syntax), disabled via a bare triple-quoted string; left byte-identical.
'''
# http://blog.csdn.net/bagboy_taobao_com/article/details/5582868 # 测试代码
if __name__ == '__main__':
conn = httplib.HTTPConnection("blog.csdn.net")
# 要模拟成IE发送, 否则CSDN不接受Python的请求
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headersP = { 'User-Agent' : user_agent }
conn.request(method = "GET", url = "/bagboy_taobao_com/article/details/5582868", headers = headersP)
r1 = conn.getresponse()				# 获得响应
htmlByte = r1.read()				# 获得HTML
htmlStr = htmlByte.decode("utf8")	# 需要转换成utf8编码, 否则分析异常
my = CHYGetArticle()
article = ["", ""]
my.Parser(htmlByte, article)
f = open("data.txt", "w")
print >> f, article[0].encode("utf8"), 			# print最后参数加一个"逗号", 这样就输出最后不换行
print >>  f, article[1].encode("utf8")
'''
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息