[Python下载CSDN博客]2. 使用BeautifulSoup分析HTML(一)
2013-11-13 10:55
766 查看
BeautifulSoup比起HTMLParser操作起来会简单一点
(HTMLParser是边解析边回调使用, BeautifulSoup是全部解析完后再使用)
#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
# 获取博客文章分类列表和文章存档列表
# File: GetCategoryAndMonth.py
import urllib2
import HTMLParser
import httplib
from bs4 import BeautifulSoup
class CHYGetCategoryAndMonth:
    """Extract link lists from a CSDN blog profile page.

    Reconstructed with proper indentation (the scraped source had lost all
    leading whitespace, making the block a SyntaxError as written).
    """

    def Parser(self, htmlStr, type, list):
        """Parse htmlStr and append [href, link-text] pairs onto `list`.

        type == 0: entries of the "文章分类" (article category) panel.
        type == 1: entries of the monthly-archive panel.

        NOTE(review): `type` and `list` shadow builtins; the names are kept
        to preserve the original call signature.
        """
        # No explicit parser argument: relies on bs4 picking its default.
        soup2 = BeautifulSoup(htmlStr)
        if 0 == type:
            # Several panel_Category divs may exist; use the one whose
            # head span reads "文章分类", then stop.
            listDiv = soup2.find_all("div", id="panel_Category")
            for divItem in listDiv:
                ul = divItem.find("ul", class_="panel_head")
                if u"文章分类" != ul.span.text:
                    continue
                ul = divItem.find("ul", class_="panel_body")
                listLi = ul.find_all("li")
                listItem = ["", ""]
                for li in listLi:
                    listItem[0] = li.a["href"]
                    listItem[1] = li.a.text
                    # Copy the pair so each appended item is independent
                    # of the reused listItem buffer.
                    item = listItem[:]
                    list.append(item)
                break
        elif 1 == type:
            div = soup2.find("div", id="panel_Archive")
            # The archive entries live in the first div following
            # the panel_Archive div.
            listDiv = div.find_next("div")
            listLi = listDiv.find_all("li")
            listItem = ["", ""]
            for li in listLi:
                listItem[0] = li.a["href"]
                listItem[1] = li.a.text
                item = listItem[:]
                list.append(item)
# Commented-out (triple-quoted) manual test harness: fetches
# http://blog.csdn.net/bagboy_taobao_com with an IE User-Agent (CSDN
# rejects the default Python UA) and runs Parser for both type values.
# The string content below is left byte-identical.
'''
# 测试代码
if __name__ == '__main__':
conn = httplib.HTTPConnection("blog.csdn.net")
# 要模拟成IE发送, 否则CSDN不接受Python的请求
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headersP = { 'User-Agent' : user_agent }
conn.request(method = "GET", url = "/bagboy_taobao_com", headers = headersP)
r1 = conn.getresponse() # 获得响应
htmlByte = r1.read() # 获得HTML
htmlStr = htmlByte.decode("utf8") # 需要转换成utf8编码, 否则分析异常
my = CHYGetCategoryAndMonth()
list1 = []
my.Parser(htmlByte, 1, list1)
list0 = []
my.Parser(htmlByte, 0, list0)
print(list1)
print(list0)
'''
(HTMLParser是边解析边回调使用, BeautifulSoup是全部解析完后再使用)
#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
# 获取博客文章分类列表和文章存档列表
# File: GetCategoryAndMonth.py
import urllib2
import HTMLParser
import httplib
from bs4 import BeautifulSoup
class CHYGetCategoryAndMonth:
    """Collect category (type 0) or monthly-archive (type 1) links from a
    CSDN blog profile page, appending [href, text] pairs to the given list.
    """

    def Parser(self, htmlStr, type, list):
        """Scan htmlStr and push each matching link onto `list`."""
        soup = BeautifulSoup(htmlStr)
        if type == 0:
            # Look through every panel_Category div for the one headed
            # "文章分类"; harvest its body entries and stop.
            for panel in soup.find_all("div", id="panel_Category"):
                head = panel.find("ul", class_="panel_head")
                if head.span.text != u"文章分类":
                    continue
                body = panel.find("ul", class_="panel_body")
                for entry in body.find_all("li"):
                    list.append([entry.a["href"], entry.a.text])
                break
        elif type == 1:
            anchor = soup.find("div", id="panel_Archive")
            # The first div after panel_Archive holds the archive entries.
            for entry in anchor.find_next("div").find_all("li"):
                list.append([entry.a["href"], entry.a.text])
# Disabled manual test (a triple-quoted string, so it never runs):
# downloads the blog profile page while masquerading as IE — CSDN refuses
# Python's default User-Agent — then exercises Parser with type 1 and 0.
# The string content below is left byte-identical.
'''
# 测试代码
if __name__ == '__main__':
conn = httplib.HTTPConnection("blog.csdn.net")
# 要模拟成IE发送, 否则CSDN不接受Python的请求
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headersP = { 'User-Agent' : user_agent }
conn.request(method = "GET", url = "/bagboy_taobao_com", headers = headersP)
r1 = conn.getresponse() # 获得响应
htmlByte = r1.read() # 获得HTML
htmlStr = htmlByte.decode("utf8") # 需要转换成utf8编码, 否则分析异常
my = CHYGetCategoryAndMonth()
list1 = []
my.Parser(htmlByte, 1, list1)
list0 = []
my.Parser(htmlByte, 0, list0)
print(list1)
print(list0)
'''
#!/usr/bin/env python
# Python 2.7.3
# Fetch the list of blog article links
# File: GetArticleList.py
# (Reconstructed: the scraped source collapsed this whole script onto one
# line, turning it into a single comment.)
import urllib2
import HTMLParser
import httplib
from bs4 import BeautifulSoup


class CHYGetArticleList:
    """Extract the href of every article listed on a CSDN archive page."""

    def Parser(self, htmlStr, list):
        """Append each article link found in htmlStr onto `list`.

        NOTE(review): `list` shadows the builtin; name kept to preserve
        the original call signature.
        """
        soup2 = BeautifulSoup(htmlStr)
        div = soup2.find("div", id="article_list")
        listDiv = div.find_all("div", class_="list_item article_item")
        for divItem in listDiv:
            divA = divItem.find("div", class_="article_title")
            # The link sits at article_title > h3 > span > a.
            a = divA.h3.span.a
            list.append(a["href"])

'''
# http://blog.csdn.net/bagboy_taobao_com/article/month/2013/10
# 测试代码
if __name__ == '__main__':
    conn = httplib.HTTPConnection("blog.csdn.net")
    # 要模拟成IE发送, 否则CSDN不接受Python的请求
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headersP = { 'User-Agent' : user_agent }
    conn.request(method = "GET", url = "/bagboy_taobao_com/article/month/2013/10", headers = headersP)
    r1 = conn.getresponse()
    htmlByte = r1.read()
    htmlStr = htmlByte.decode("utf8")
    list = []
    my = CHYGetArticleList()
    my.Parser(htmlByte, list)
    print(list)
'''
#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
# Fetch a single blog article (title and content)
# File: GetArticle.py
# (Reconstructed: the scraped source collapsed this whole script onto one
# line, turning it into a single comment.)
import urllib2
import HTMLParser
import httplib
from bs4 import BeautifulSoup


class CHYGetArticle:
    """Extract an article's title into article[0] and its body text into
    article[1] from a CSDN article page."""

    def Parser(self, htmlStr, article):
        """Fill the two-element list `article` with [title, content]."""
        soup2 = BeautifulSoup(htmlStr)
        divTitle = soup2.find("div", class_="article_title")
        article[0] = divTitle.h3.span.text
        # str.replace/str.strip return new strings, so the result must be
        # reassigned each time.
        article[0] = article[0].replace("\n\r", "")
        article[0] = article[0].strip()
        divComment = soup2.find("div", class_="article_content")
        article[1] = divComment.text

'''
# http://blog.csdn.net/bagboy_taobao_com/article/details/5582868
# 测试代码
if __name__ == '__main__':
    conn = httplib.HTTPConnection("blog.csdn.net")
    # 要模拟成IE发送, 否则CSDN不接受Python的请求
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headersP = { 'User-Agent' : user_agent }
    conn.request(method = "GET", url = "/bagboy_taobao_com/article/details/5582868", headers = headersP)
    r1 = conn.getresponse()
    htmlByte = r1.read()
    htmlStr = htmlByte.decode("utf8")
    my = CHYGetArticle()
    article = ["", ""]
    my.Parser(htmlByte, article)
    f = open("data.txt", "w")
    print >> f, article[0].encode("utf8"),
    print >> f, article[1].encode("utf8")
'''
相关文章推荐
- [Python下载CSDN博客]2. 使用BeautifulSoup分析HTML(二)
- [Python下载CSDN博客]4. V3版本_使用正则表达式分析HTML(一)
- [Python下载CSDN博客]4. V3版本_使用正则表达式分析HTML(二)
- Web Scraping with Python: 使用 Python 下载 CSDN 博客图片
- Python爬虫小实践:使用BeautifulSoup+Request爬取CSDN博客的个人基本信息
- 使用Python urllib2下载CSDN博客列表到本地
- Python:[转]浅学BeautifulSoup分析html
- Python 爬虫(以赛马数据为例)之使用BeautifulSoup进行Html解析
- Python 超级简单的网站html分析框架BeautifulSoup
- Web Scraping with Python:使用 Python 导出 CSDN 博客全部文章(保留样式)和附带图片
- 使用python统计csdn博客一段时间内的访问量
- [学习]用python的BeautifulSoup分析html
- 使用python爬取csdn博客访问量
- [学习]用python的BeautifulSoup分析html
- 使用BeautifulSoup爬取CSDN博客文章
- 使用python爬取csdn博客访问量
- [Python下载CSDN博客]1. 简单实现(二)
- Python爬虫包 BeautifulSoup 学习(十) 各种html解析器的比较及使用
- 用python的BeautifulSoup分析html
- 使用python爬取csdn博客访问量