您的位置:首页 > 编程语言 > Python开发

python爬取博客标题和访问量

2016-04-28 00:14 726 查看
<span style="font-size:24px;">#!/usr/bin/python
#coding:utf-8
#爬取CSDN博客
#网络爬虫技术

import requests
import sys,re
'''
reload(sys)
sys.setdefaultencoding('utf-8')
'''

#拼接目标博客地址,请求并获取响应内容
def blog_acount():
    """Prompt for a CSDN account and print every article title with its view count.

    Builds ``http://blog.csdn.net/<account>/article/list/<page>`` for page
    1, 2, ... and fetches each page with a browser-like User-Agent.  For each
    page the titles and view counts are extracted with ``web_crawl_tile`` and
    ``web_crawl_pagev`` and printed pairwise.  Crawling stops after the first
    page that does not contain the "last page" (尾页) pager link.

    This is Python 2 code (it reads the account with ``raw_input``).

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # CSDN account name, read interactively.
    acount = raw_input('输入csdn的登录账号:')
    base_url = 'http://blog.csdn.net/' + acount

    # CSDN refuses direct (non-browser) access, so pose as a browser.
    # The headers never change, so build them once outside the loop.
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    page_num = 1
    while True:
        # Address of the current article-list page.
        des_url = base_url + '/article/list/' + str(page_num)

        # A timeout keeps a stalled connection from hanging the crawl forever.
        r = requests.get(des_url, headers=headers, timeout=30)
        string = r.text

        # zip() pairs titles with counts and stops at the shorter list, so a
        # page where fewer counts than titles matched no longer raises
        # IndexError.
        for title, views in zip(web_crawl_tile(string), web_crawl_pagev(string)):
            # Single parenthesised argument: valid as a Python 2 print
            # statement and produces the same output as before.
            print('标题为:%s,的文章访问量为:%s' % (title, views))

        # The pager shows a "last page" link only while more pages follow.
        if string.find(u'尾页') == -1:
            break
        page_num += 1

#爬取博客标题
def web_crawl_tile(string):
    """Extract the article titles from a CSDN article-list page.

    Args:
        string: HTML of the page as text (unicode), e.g. ``response.text``.

    Returns:
        A list with one entry per ``link_title`` anchor, in page order.  Each
        title is UTF-8 encoded and has its leading and trailing whitespace
        removed.  The list is empty when nothing matches.
    """
    # The title sits inside the anchor, surrounded by whitespace and newlines;
    # re.S lets '.' run across those newlines.
    stitle = r'<span class="link_title"><a href=".*?">\s(.*?)\s*?</a></span>'

    # Encode first, then trim: callers format the result into UTF-8 byte
    # strings, so the titles must be bytes as well.
    return [raw.encode('utf-8').strip() for raw in re.findall(stitle, string, re.S)]

#爬取文章的访问量
def _as_utf8(value):
    """Return *value* as UTF-8 bytes, leaving input that is already bytes untouched."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def web_crawl_pagev(string):
    """Extract the view counts from a CSDN article-list page.

    Args:
        string: HTML of the page, either as text (unicode) or as UTF-8
            encoded bytes.

    Returns:
        A list with one entry per ``link_view`` span, in page order.  Each
        entry holds the digits found between the parentheses as a byte
        string.  The list is empty when nothing matches.
    """
    # Matching is done on UTF-8 bytes.  Both the page and the pattern go
    # through the same normalisation, so text and bytes are never mixed and
    # input that is already encoded is not encoded a second time.
    string = _as_utf8(string)

    # The span is labelled "阅读次数" (read count) and carries the number in
    # parentheses after the link text.
    srnum = r'<span class="link_view" title="阅读次数"><a href=".*?" title="阅读次数">阅读</a>\((\d*)\)</span>'

    return re.findall(_as_utf8(srnum), string, re.S)

# Script entry point: start the crawl (the account name is read from stdin).
blog_acount()

</span>
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: