您的位置:首页 > 编程语言 > Python开发

Python实现抓取CSDN热门文章列表

2016-08-23 15:51 746 查看
1、使用工具:

Python3.5

BeautifulSoup

2、抓取网站:

CSDN 热门文章列表:http://blog.csdn.net/hot.html

3、分析网站代码:



4、实现代码:

__author__ = 'Administrator'
import urllib.request
import re
from bs4 import BeautifulSoup

########################################################
#
# 抓取csdn首页文章http://blog.csdn.net/?&page=1
#
#
#
########################################################
class CsdnUtils(object):
    """Fetch a CSDN blog listing page and print one line per article.

    The instance only carries the HTTP request headers that are sent with
    every page request.
    """

    def __init__(self):
        """Prepare the browser-like request headers used by getPage()."""
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        self.headers = {
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': user_agent,
        }

    def getPage(self, url=None):
        """Download *url* and return it parsed with BeautifulSoup.

        Args:
            url: Address of the page to download.

        Returns:
            A BeautifulSoup document built with the "html.parser" backend.

        Errors raised by urllib.request (for example on network failure)
        are not caught here and propagate to the caller.
        """
        request = urllib.request.Request(url, headers=self.headers)
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(request) as response:
            html = response.read()
        return BeautifulSoup(html, "html.parser")

    @staticmethod
    def _tag_text(tag):
        """Return tag.string, or None when the tag itself was not found."""
        return None if tag is None else tag.string

    def parsePage(self, url=None, page=None):
        """Print the articles found on the listing page at *url*.

        Every <div class="blog_list"> element is treated as one article.
        Fields whose element is missing are printed as None instead of
        aborting the whole page.

        Args:
            url: Address of the listing page, passed on to getPage().
            page: Label printed in the page header line.
        """
        soup = self.getPage(url)
        print("========================第", page, "页======================================")
        for index, item in enumerate(soup.find_all('div', 'blog_list')):
            # Plain local variables are used so that nothing is stored on the
            # CsdnUtils class (or instance) while iterating.
            author = self._tag_text(item.find('a', 'user_name'))
            post_time = self._tag_text(item.find('span', 'time'))
            views = self._tag_text(item.find('a', 'view'))

            heading = item.find('h1')
            if heading is None:
                category_link = None
                title_link = None
            else:
                # Look the category anchor up directly; articles without one
                # get the literal placeholder "None".
                category_link = heading.find('a', 'category')
                title_link = heading.find('a', attrs={'name': True})

            category = "None" if category_link is None else category_link.string
            title = self._tag_text(title_link)
            link = None if title_link is None else title_link.get("href")

            print("数据:", index + 1, '\t', author, '\t', post_time, '\t',
                  views, '\t', category, '\t', title, '\t', link)

#######     执行入口    ########
if __name__ == "__main__":
    # Listing page to scrape. A paginated alternative would be
    # 'http://blog.csdn.net/?&page={}'.format(page_number).
    # NOTE(review): the same URL is requested on every iteration, so only the
    # printed page label differs between the five passes - confirm intent.
    hot_list_url = "http://blog.csdn.net/hot.html"

    scraper = CsdnUtils()
    for page_number in range(1, 6):
        scraper.parsePage(hot_list_url, page_number)


5、执行结果:

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python 抓取