您的位置：首页 > 其它

统计个人CSDN的博客文章数量

2017-04-01 12:46 417 查看

统计个人CSDN的博客文章数量

第一版

原始版本比较简单

只能统计第一页，而且没有进行排序

# coding:utf-8
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():
    """Fetch the first article-list page and dump basic statistics to counter.txt.

    The request carries a browser-like User-Agent header.  The first two
    ``<li>`` entries of the ``blog_rank`` element and one line per article
    (title + read-count text + comment-count text) are printed and written
    to ``counter.txt``.

    Returns:
        A ``(totalList, contentList)`` tuple holding the rank texts and the
        article lines.  Both lists are empty when the HTTP request fails.
    """
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url,headers=headers)

    totalList=[]
    contentList=[]
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        # Nothing to parse without a page; return early instead of failing
        # later on an unbound ``html``.
        print(e.code)
        print(e.reason)
        return totalList,contentList

    page = BeautifulSoup(html,'lxml')
    # ``with`` guarantees the report file is closed even if parsing fails.
    with open('counter.txt','w') as fd:
        mytimes=page.find(id='blog_rank')
        rank_entries=mytimes.find_all('li')[:2] if mytimes is not None else []
        for aa in rank_entries:
            print(aa.text)
            fd.write(aa.text)
            fd.write('\n')
            totalList.append(aa.text)

        items = page.find_all('div',class_ ='list_item list_view')
        print('总共有文章%d 篇' % len(items))
        for item in items:
            content=item.find('a')
            read_time=item.find('span',class_ ='link_view')
            comments_time=item.find('span',class_ ='link_comments')
            if content is None or read_time is None or comments_time is None:
                continue  # skip entries that lack one of the expected nodes

            totalcontent=content.text.strip()+read_time.text.strip()+comments_time.text.strip()
            print(totalcontent)
            contentList.append(totalcontent)
            fd.write(totalcontent)
            fd.write('\n')

    return totalList,contentList
urls=getPage()

第二版

再增加一个版本

这个版本，直接能按照访问次数进行排序

2017.4.11日重新更新代码，本次更新内容：

将统计的内容，重新在程序文件下再建立一个文件夹，同时将统计内容放入到以当前时间为名字的文本中

避免了每次统计直接覆盖了上一次统计的数据

第二版

# coding:utf-8
import urllib2,re,datetime,os
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():
    """Fetch the first article-list page and write a report sorted by read count.

    The report is stored in ``count/<timestamp>.txt`` so earlier runs are
    not overwritten.  It contains the first two ``blog_rank`` entries, the
    article total and one tab-separated line per article (title, read-count
    text, comment-count text, URL) in ascending order of read count.

    Returns:
        The list of report lines; empty when the HTTP request fails.
    """
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl='http://blog.csdn.net'
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url,headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        # Nothing to parse without a page; avoid the unbound ``html`` below.
        print(e.code)
        print(e.reason)
        return []

    path='count'
    if not os.path.exists(path):
        os.makedirs(path)
    fname=path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'

    page = BeautifulSoup(html,'lxml')
    contentList=[]
    sortlist=[]
    with open(fname,'w') as fd:
        mytimes=page.find(id='blog_rank')
        rank_entries=mytimes.find_all('li')[:2] if mytimes is not None else []
        for aa in rank_entries:
            print(aa.text)
            fd.write(aa.text)
            fd.write('\n')

        items = page.find_all('div',class_ ='list_item list_view')
        print('总共有文章%d 篇' % len(items))
        fd.write('总共有文章%d 篇' % len(items))
        fd.write('\n')
        for item in items:
            content=item.find('a')
            read_time=item.find('span',class_ ='link_view')
            comments_time=item.find('span',class_ ='link_comments')
            if content is None or read_time is None or comments_time is None:
                continue  # skip entries that lack one of the expected nodes

            tmp=read_time.text.strip()
            # Keep only the ASCII digits of the read-count text; 0 if none.
            number=int(''.join(ch for ch in tmp if ch in '0123456789') or 0)
            sortlist.append({
                'indexs':number,
                'content':content.text.strip(),
                'read_time':tmp,
                'comments_time':comments_time.text.strip(),
                'contemtUrl':baseurl+(content.get('href') or ''),
            })

        # One stable sort on the records themselves.  The earlier nested
        # loop over a separate number list emitted articles sharing a read
        # count several times.
        sortlist.sort(key=lambda entry:entry['indexs'])
        print([entry['indexs'] for entry in sortlist])

        for a in sortlist:
            totalcontent=a['content']+'\t'+a['read_time']+'\t'+a['comments_time']+'\t'+a['contemtUrl']
            print(totalcontent)
            fd.write(totalcontent)
            fd.write('\n')
            contentList.append(totalcontent)
    return contentList
urls=getPage()

第三版

这一个版本比较有意思

#coding:utf-8
import urllib2,re,time,random,os,datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():
    """Fetch the first article-list page and return article URLs by read count.

    A report with the article total and one tab-separated line per article
    (title, read-count text, URL) is written to ``count/<timestamp>.txt``.

    Returns:
        Article URLs in ascending order of read count; empty when the HTTP
        request fails.
    """
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl='http://blog.csdn.net'
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url,headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        # Nothing to parse without a page; avoid the unbound ``html`` below.
        print(e.code)
        print(e.reason)
        return []

    path=u'count'
    if not os.path.exists(path):
        os.makedirs(path)
    fname=path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'
    print(fname)

    page = BeautifulSoup(html,'lxml')
    items = page.find_all('div',class_ ='list_item list_view')
    urlList=[]
    sortlist=[]
    with open(fname,'w') as fd:
        print(u'总共有文章%d 篇' % len(items))
        fd.write('总共有文章%d 篇' % len(items))
        fd.write('\n')
        for item in items:
            content=item.find('a')
            read_time=item.find('span',class_ ='link_view')
            if content is None or read_time is None:
                continue  # skip entries that lack one of the expected nodes

            readtime=read_time.text.strip()
            # Keep only the ASCII digits of the read-count text; 0 if none.
            readtimeNumber=int(''.join(ch for ch in readtime if ch in '0123456789') or 0)
            sortlist.append({
                'indexs':readtimeNumber,
                'content':content.text.strip(),
                'read_time':readtime,
                'contemtUrl':baseurl+(content.get('href') or ''),
            })

        # Stable sort on the records; the earlier nested loop repeated
        # articles that share a read count.
        sortlist.sort(key=lambda entry:entry['indexs'])
        print([entry['indexs'] for entry in sortlist])

        for a in sortlist:
            totalcontent=a['content']+'\t'+a['read_time']+'\t'+a['contemtUrl']
            print(totalcontent)
            fd.write(totalcontent)
            fd.write('\n')
            urlList.append(a['contemtUrl'])
    return urlList

urls=getPage()

# NOTE(review): the loop below opens the blog's own articles in the default
# browser and then force-kills every Chrome process (`taskkill /f /IM
# Chrome.exe`), including unrelated Chrome windows.  Windows only.
count=random.randint(10,50)
print(u'将要打开关闭浏览器次数为： %d' % count)
for preview_url in urls[:5]:
    print(preview_url)

# The former `if j == 15: j=0` reset made the loop endless whenever count
# exceeded 15; a plain bounded loop honours the announced count.
for _ in range(count):
    if not urls:
        break  # nothing was scraped, so there is nothing to open
    # Slices instead of fixed indexes: same entries (38..42) on a long list,
    # no IndexError on a short one.
    for target in urls[38:43]:
        web.open_new_tab(target)
        time.sleep(1)
    web.open_new_tab(random.choice(urls[1:45] or urls))
    time.sleep(1)
    web.open_new_tab('http://blog.csdn.net/qiqiyingse/article/details/51801918')
    time.sleep(3)
    os.system('taskkill /f /IM Chrome.exe')

第四版

博客文章超过50篇以后，文章列表需要分2页显示；本次更新可以同时统计这两页的内容（但最多也只能统计两页）

因此重新更新

#coding:utf-8
import urllib2,re,time,random,os,datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():
    """Fetch list pages 1 and 2 and return article URLs by read count.

    A report with the article total and one tab-separated line per article
    (title, read-count text, URL) is written to ``count/<timestamp>.txt``.

    Returns:
        Article URLs in ascending order of read count; empty when either
        HTTP request fails.
    """
    page_urls = [
        'http://blog.csdn.net/qiqiyingse/article/list/1?viewmode=contents',
        'http://blog.csdn.net/qiqiyingse/article/list/2?viewmode=contents',
    ]
    baseurl='http://blog.csdn.net'
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    try:
        htmls = [urllib2.urlopen(urllib2.Request(url=page_url,headers=headers)).read()
                 for page_url in page_urls]
    except urllib2.HTTPError as e:
        # Nothing to parse without both pages; avoid unbound names below.
        print(e.code)
        print(e.reason)
        return []

    path=u'count'
    if not os.path.exists(path):
        os.makedirs(path)
    fname=path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'
    print(fname)

    # Both pages share one parsing loop instead of two copied ones.
    items=[]
    for html in htmls:
        items.extend(BeautifulSoup(html,'lxml').find_all('div',class_ ='list_item list_view'))

    urlList=[]
    sortlist=[]
    with open(fname,'w') as fd:
        cont_print= u'总共有文章%d 篇' % len(items)
        print(cont_print)
        fd.write(cont_print)
        fd.write('\n')
        for item in items:
            content=item.find('a')
            read_time=item.find('span',class_ ='link_view')
            if content is None or read_time is None:
                continue  # skip entries that lack one of the expected nodes

            readtime=read_time.text.strip()
            # Keep only the ASCII digits of the read-count text; 0 if none.
            readtimeNumber=int(''.join(ch for ch in readtime if ch in '0123456789') or 0)
            sortlist.append({
                'indexs':readtimeNumber,
                'content':content.text.strip(),
                'read_time':readtime,
                'contemtUrl':baseurl+(content.get('href') or ''),
            })

        # Stable sort on the records; the earlier nested loop repeated
        # articles that share a read count.
        sortlist.sort(key=lambda entry:entry['indexs'])
        print([entry['indexs'] for entry in sortlist])

        for a in sortlist:
            totalcontent=a['content']+'\t'+a['read_time']+'\t'+a['contemtUrl']
            print(totalcontent)
            fd.write(totalcontent)
            fd.write('\n')
            urlList.append(a['contemtUrl'])
    return urlList

urls=getPage()

第五版

这次版本对整个函数进行了调整

1.让每一部分看起来更易读

2.可以统计个人名下所有的博客内容了，不管你有多少篇多少页博客，都能给统计到

3.更新了排序算法，修复了之前的一个bug：旧算法在多篇文章阅读次数相同时，会把这些文章重复输出

代码如下：

#coding:utf-8
import urllib2,re,time,random,os,datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

#Custom logging helper
def self_log(msg):
    """Print ``msg`` prefixed with the current local time.

    Args:
        msg: Text to print after the ``YYYY-MM-DD HH:MM:SS: `` prefix.
    """
    # Single-argument print(...) parses as a statement on Python 2 and as a
    # function call on Python 3.
    print(u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))

#Download a page
def get_html(url):
    """Download ``url`` with a browser-like User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The raw response body, or ``None`` when the request fails.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url,headers=headers)
    try:
        return urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        # HTTPError subclasses URLError, so it has to be handled first.
        print(e.code)
    except urllib2.URLError as e:
        print(e.reason)
    # Explicit None: the old ``return html`` hit an unbound local here.
    return None

#Get the total number of blog list pages
def get_last_page(html,fd):
    """Return the number of list pages advertised by the pagination bar.

    Args:
        html: Markup of the blog's main page; falsy when the download failed.
        fd: Open, writable file object that receives the page-count line.

    Returns:
        The page count as a digit string taken from the last pagination
        link, ``1`` when no usable pagination link exists, or ``None`` when
        ``html`` is empty.
    """
    if not html:
        self_log(u'页面错误，停止运行')
        return
    page = BeautifulSoup(html,'lxml')
    pagelist=page.find('div',class_ ='pagelist')
    links=pagelist.find_all('a') if pagelist is not None else []
    if not links:
        return 1
    # Take the whole trailing number of the last link; slicing off only the
    # final character miscounted blogs with ten or more pages.
    match=re.search(r'(\d+)\D*$',links[-1].get('href') or '')
    if match is None:
        return 1
    last_page=match.group(1)
    self_log('总共有%s 页博客' % last_page)
    fd.write('总共有%s 页博客\n' % last_page)
    return last_page

#Get the rank/score summary
def get_rank(html,fd):
    """Log and store the first two entries of the ``blog_rank`` element.

    Args:
        html: Markup of the blog's main page; falsy when the download failed.
        fd: Open, writable file object that receives one line per entry.

    Returns:
        The texts of the entries found (at most two), or ``None`` when
        ``html`` is empty.
    """
    if not html:
        self_log(u'页面错误，停止运行')
        return
    page = BeautifulSoup(html,'lxml')
    rank_list=[]
    rank_content=page.find(id='blog_rank')
    if rank_content is None:
        return rank_list
    # Only the first two <li> entries are reported.
    for rank in rank_content.find_all('li')[:2]:
        self_log(rank.text)
        fd.write(rank.text)
        fd.write('\n')
        rank_list.append(rank.text)
    return rank_list

#Get the article entries of one list page
def get_items(url):
    """Return the article entry nodes found on the list page at ``url``.

    Args:
        url: Address of one article-list page.

    Returns:
        The ``div`` nodes whose class is ``list_item list_view``; an empty
        list when the page could not be downloaded.
    """
    content_html=get_html(url)
    if not content_html:
        return []  # a failed download must not be handed to the parser
    page = BeautifulSoup(content_html,'lxml')
    return page.find_all('div',class_ ='list_item list_view')

#Extract the needed fields from every entry of an item list
def handle_items(items,content_list,read_num_for_sort):
    """Append one record per article entry to the caller's lists.

    Args:
        items: Entry nodes exposing ``find``; the nodes found must expose
            ``text`` and, for the title link, ``get('href')``.
        content_list: Receives a dict per article with the keys ``indexs``
            (numeric read count), ``title``, ``read_times``,
            ``comments_time`` and ``content_url``.
        read_num_for_sort: Receives the numeric read count of each article.

    Entries without a title link or a read-count node are skipped.
    """
    for item in items:
        title=item.find('a')
        read_node=item.find('span',class_ ='link_view')
        if title is None or read_node is None:
            continue
        comments_node=item.find('span',class_ ='link_comments')

        read_times=read_node.text.strip()
        # Keep only the ASCII digits for sorting.  ``int(filter(...))``
        # works on Python 2 only and fails on text without digits.
        digits=''.join(ch for ch in read_times if ch in '0123456789')
        read_number=int(digits) if digits else 0
        read_num_for_sort.append(read_number)

        content_list.append({
            'indexs':read_number,
            'title':title.text.strip(),
            'read_times':read_times,
            'comments_time':comments_node.text.strip() if comments_node is not None else '',
            'content_url':'http://blog.csdn.net'+(title.get('href') or ''),
        })

#Create a folder
def mkdir_folder(path):
    """Create directory ``path`` (with parents) unless it already exists.

    Args:
        path: Directory to create.

    Raises:
        OSError: If creation fails and ``path`` is not a directory afterwards.
    """
    # Try first and inspect afterwards; checking before creating leaves a
    # window in which another process can create the directory.
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise

#Main routine of the program
def run(url):
    """Collect every article of the blog at ``url`` and write a sorted report.

    The report goes to ``count/<timestamp>.txt`` and lists the page count,
    the rank entries and one ``|``-separated line per article in ascending
    order of read count.

    Args:
        url: Blog main-page address; list-page URLs are built from the part
            before ``?``.

    Returns:
        The report lines, one per article; empty when the main page could
        not be loaded.
    """
    read_num_for_sort=[]
    content_list=[]
    content_totle_list=[]

    # Output folder and timestamped report file.
    dir_path='count'
    mkdir_folder(dir_path)
    count_file_name=dir_path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'

    # ``with`` closes the report even when a step below raises.
    with open(count_file_name,'w') as fd:
        # 1. Page count from the main page.
        main_html=get_html(url)
        last_page=get_last_page(main_html,fd)
        if last_page is None:
            return content_totle_list  # main page unavailable; int(None) would fail

        # 2. Rank entries (written to the report by get_rank).
        get_rank(main_html,fd)

        # 3. Load every list page and extract its articles.
        for i in range(1,int(last_page)+1):
            main_url=url.split('?')[0]+'/article/list/%d?viewmode=contents' % i
            self_log('即将获取第%d页的内容，地址是：%s' % (i,main_url))
            items=get_items(main_url)
            handle_items(items,content_list,read_num_for_sort)

        # 4. Sort by read count; ``key=`` replaces the Python-2-only ``cmp=``.
        read_num_for_sort.sort()
        print(read_num_for_sort)
        self_log('总共有%d 篇文章' % len(content_list))
        content_list = sorted(content_list,key=lambda entry:entry['indexs'])

        for article_index,a in enumerate(content_list,1):
            totalcontent= '第'+str(article_index)+'篇  |'+a['title']+'\t|'+a['read_times']+'\t|'+a['comments_time']+'\t|'+a['content_url']
            self_log(totalcontent)
            fd.write(totalcontent)
            fd.write('\n')
            content_totle_list.append(totalcontent)

    return content_totle_list

if __name__ == '__main__':
    # Banner.  The literal used to open with four quotes, which printed a
    # stray "'" on the first line.
    print('''
*****************************************
**    Welcome to Spider of Count CSDN  **
**      Created on 2017-04-12          **
**      @author: Jimy_Fengqi           **
*****************************************''')
    url='http://blog.csdn.net/qiqiyingse?viewmode=contents'
    run(url)

第六版

此次更新，优化一点点内容
主要是可以将文章内容存贮到本地
同时，将统计信息重新放到一个excel 里面
代码如下：

#coding:utf-8
import urllib2,re,time,random,os,datetime
from bs4 import BeautifulSoup
from pyExcelerator import * #导入excel相关包
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

#Custom logging helper
def self_log(msg):
    """Print ``msg`` prefixed with the current local time.

    Args:
        msg: Text to print after the ``YYYY-MM-DD HH:MM:SS: `` prefix.
    """
    # Single-argument print(...) parses as a statement on Python 2 and as a
    # function call on Python 3.
    print(u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))

#Download a page
def get_html(url):
    """Download ``url`` with a browser-like User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The raw response body, or ``None`` when the request fails.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url,headers=headers)
    try:
        return urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        # HTTPError subclasses URLError, so it has to be handled first.
        print(e.code)
    except urllib2.URLError as e:
        print(e.reason)
    # Explicit None: the old ``return html`` hit an unbound local here.
    return None

#Get the total number of blog list pages
def get_last_page(html,fd):
    """Return the number of list pages advertised by the pagination bar.

    Args:
        html: Markup of the blog's main page; falsy when the download failed.
        fd: Open, writable file object that receives the page-count line.

    Returns:
        The page count as a digit string taken from the last pagination
        link, ``1`` when no usable pagination link exists, or ``None`` when
        ``html`` is empty.
    """
    if not html:
        self_log(u'页面错误，停止运行')
        return
    page = BeautifulSoup(html,'lxml')
    pagelist=page.find('div',class_ ='pagelist')
    links=pagelist.find_all('a') if pagelist is not None else []
    if not links:
        return 1
    # Take the whole trailing number of the last link; slicing off only the
    # final character miscounted blogs with ten or more pages.
    match=re.search(r'(\d+)\D*$',links[-1].get('href') or '')
    if match is None:
        return 1
    last_page=match.group(1)
    self_log('总共有%s 页博客' % last_page)
    fd.write('总共有%s 页博客\n' % last_page)
    return last_page

#Get the rank/score summary
def get_rank(html,fd):
    """Log and store the first two entries of the ``blog_rank`` element.

    Args:
        html: Markup of the blog's main page; falsy when the download failed.
        fd: Open, writable file object that receives one line per entry.

    Returns:
        The texts of the entries found (at most two), or ``None`` when
        ``html`` is empty.
    """
    if not html:
        self_log(u'页面错误，停止运行')
        return
    page = BeautifulSoup(html,'lxml')
    rank_list=[]
    rank_content=page.find(id='blog_rank')
    if rank_content is None:
        return rank_list
    # Only the first two <li> entries are reported.
    for rank in rank_content.find_all('li')[:2]:
        self_log(rank.text)
        fd.write(rank.text)
        fd.write('\n')
        rank_list.append(rank.text)
    return rank_list

#Get the article entries of one list page
def get_items(url):
    """Return the article entry nodes found on the list page at ``url``.

    Args:
        url: Address of one article-list page.

    Returns:
        The ``div`` nodes whose class is ``list_item list_view``; an empty
        list when the page could not be downloaded.
    """
    content_html=get_html(url)
    if not content_html:
        return []  # a failed download must not be handed to the parser
    page = BeautifulSoup(content_html,'lxml')
    return page.find_all('div',class_ ='list_item list_view')

#Extract the needed fields from every entry of an item list
def handle_items(items,content_list,read_num_for_sort):
    """Append one record per article entry to the caller's lists.

    Args:
        items: Entry nodes exposing ``find``; the nodes found must expose
            ``text`` and, for the title link, ``get('href')``.
        content_list: Receives a dict per article with the keys ``indexs``
            (numeric read count), ``title``, ``read_times``,
            ``comments_time`` and ``content_url``.
        read_num_for_sort: Receives the numeric read count of each article.

    Entries without a title link or a read-count node are skipped.
    """
    for item in items:
        title=item.find('a')
        read_node=item.find('span',class_ ='link_view')
        if title is None or read_node is None:
            continue
        comments_node=item.find('span',class_ ='link_comments')

        read_times=read_node.text.strip()
        # Keep only the ASCII digits for sorting.  ``int(filter(...))``
        # works on Python 2 only and fails on text without digits.
        digits=''.join(ch for ch in read_times if ch in '0123456789')
        read_number=int(digits) if digits else 0
        read_num_for_sort.append(read_number)

        content_list.append({
            'indexs':read_number,
            'title':title.text.strip(),
            'read_times':read_times,
            'comments_time':comments_node.text.strip() if comments_node is not None else '',
            'content_url':'http://blog.csdn.net'+(title.get('href') or ''),
        })

#Create a folder
def mkdir_folder(path):
    """Create directory ``path`` (with parents) unless it already exists.

    Args:
        path: Directory to create.

    Raises:
        OSError: If creation fails and ``path`` is not a directory afterwards.
    """
    # Try first and inspect afterwards; checking before creating leaves a
    # window in which another process can create the directory.
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise

#Save the text of one article page
def getContent(html):
    """Store the article found in ``html`` as ``count/<title>.txt``.

    Args:
        html: Markup of a single article page; falsy when the download
            failed.

    The page is skipped with a log line when it is empty or lacks the
    ``article_title`` link or the ``article_content`` element.  Write errors
    are printed, not raised.
    """
    if not html:
        self_log('empty article page, skipped')
        return
    page = BeautifulSoup(html,'lxml')
    title_box=page.find('div',class_='article_title')
    title_link=title_box.find('a') if title_box is not None else None
    content=page.find('div',class_='article_content')
    if title_link is None or content is None:
        # Previously a failed title lookup left ``title`` unbound and the
        # second try block reported a confusing NameError.
        self_log('article title or content not found, skipped')
        return

    title=title_link.text.strip()
    # Titles may contain characters that are invalid in file names.
    safe_title=re.sub(r'[\\/:*?"<>|\r\n\t]','_',title)
    dir_path='count'
    # Single extension; the old code appended '.txt' twice.
    artitle_name_path=dir_path+'/'+safe_title+'.txt'
    try:
        with open(artitle_name_path,'w') as f:
            f.write(content.text)
        self_log(u'存贮文章：%s 完毕' % title)
    except (IOError, OSError) as e:
        print(e)

#Save every article locally
def run_to_get_article(content_total_list):
    """Download and store each article named in the report lines.

    Args:
        content_total_list: ``|``-separated report lines whose last field is
            the article URL.
    """
    self_log('start save every article  ')
    for article_content in content_total_list:
        # The URL is always the last field; a fixed index of 4 pointed at
        # the wrong field whenever a title itself contained '|'.
        article_url=article_content.rsplit('|',1)[-1].strip()
        self_log( '将要存贮的地址是： %s ...' % article_url)
        artitle_html=get_html(article_url)
        getContent(artitle_html)

#Save the statistics into an Excel workbook
def run_to_save_info_in_excel(content_total_list):
    """Write the report lines into ``count/count<date>.xls``.

    Args:
        content_total_list: ``|``-separated report lines with the fields
            index, title, read count, comment count and URL.
    """
    self_log('start save info into excel')
    excel_w=Workbook()
    excel_sheet_name=time.strftime('%Y-%m-%d_%H-%M-%S')
    excel_content_handler=excel_w.add_sheet(excel_sheet_name)

    first_line=[u'编号',u'标题',u'阅读次数',u'评论次数',u'文章地址']
    for cols,content in enumerate(first_line):
        excel_content_handler.write(0,cols,content)

    for index,article_content in enumerate(content_total_list,1):
        parts=article_content.split('|')
        if len(parts) > len(first_line):
            # A '|' inside the title must not shift the later columns.
            parts=[parts[0],'|'.join(parts[1:-3])]+parts[-3:]
        for cols,a in enumerate(parts):
            # Strip the tab padding that only served the text report.
            excel_content_handler.write(index,cols,a.strip())

    if not os.path.isdir('count'):
        os.makedirs('count')  # save() needs the target folder to exist
    excel_w.save('count/'+'count'+time.strftime('%Y-%m-%d')+'.xls')

#Main routine of the program
def run(url):
    """Collect every article of the blog at ``url`` and write a sorted report.

    The report goes to ``count/<timestamp>.txt`` and lists the page count,
    the rank entries and one ``|``-separated line per article in ascending
    order of read count.

    Args:
        url: Blog main-page address; list-page URLs are built from the part
            before ``?``.

    Returns:
        The report lines, one per article; empty when the main page could
        not be loaded.
    """
    read_num_for_sort=[]
    content_list=[]
    content_total_list=[]

    # Output folder and timestamped report file.
    dir_path='count'
    mkdir_folder(dir_path)
    count_file_name=dir_path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'

    # ``with`` closes the report even when a step below raises.
    with open(count_file_name,'w') as fd:
        # 1. Page count from the main page.
        main_html=get_html(url)
        last_page=get_last_page(main_html,fd)
        if last_page is None:
            return content_total_list  # main page unavailable; int(None) would fail

        # 2. Rank entries (written to the report by get_rank).
        get_rank(main_html,fd)

        # 3. Load every list page and extract its articles.
        for i in range(1,int(last_page)+1):
            main_url=url.split('?')[0]+'/article/list/%d?viewmode=contents' % i
            self_log('即将获取第%d页的内容，地址是：%s' % (i,main_url))
            items=get_items(main_url)
            handle_items(items,content_list,read_num_for_sort)

        # 4. Sort by read count; ``key=`` replaces the Python-2-only ``cmp=``.
        read_num_for_sort.sort()
        print(read_num_for_sort)
        self_log('总共有%d 篇文章' % len(content_list))
        content_list = sorted(content_list,key=lambda entry:entry['indexs'])

        for article_index,a in enumerate(content_list,1):
            totalcontent= '第'+str(article_index)+'篇  |'+a['title']+'\t|'+a['read_times']+'\t|'+a['comments_time']+'\t|'+a['content_url']
            fd.write(totalcontent)
            fd.write('\n')
            content_total_list.append(totalcontent)

    return content_total_list

if __name__ == '__main__':
    # Banner.  The literal used to open with four quotes, which printed a
    # stray "'" on the first line.
    print('''
*****************************************
**    Welcome to Spider of Count CSDN  **
**      Created on 2017-04-12          **
**      @author: Jimy_Fengqi           **
*****************************************''')
    # The prompt and its fallback check were fused onto one line.
    url=raw_input(u'请输入将要统计的个人csdn主页地址，类似如下：\n http://blog.csdn.net/qiqiyingse?viewmode=contents').strip()
    if not url:
        url='http://blog.csdn.net/qiqiyingse?viewmode=contents'
    content_total_list=run(url)
    run_to_save_info_in_excel(content_total_list)
    run_to_get_article(content_total_list)

第七版

更新一下excel 的使用方法

#coding:utf-8
import urllib2,re,time,random,os,datetime
from bs4 import BeautifulSoup
from pyExcelerator import * #导入excel相关包
import xlrd
import xlwt
from xlutils.copy import copy
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def create_excel(data):
    """Add a sheet with the article statistics to today's workbook in ``count/``.

    The workbook is named ``<YYYY-MM-DD>.xls``.  When it already exists it
    is copied and a new sheet named after the current time is appended;
    otherwise a new workbook is created.

    Args:
        data: ``|``-separated report lines, one per article.
    """
    excle_file_name=str(time.strftime('%Y-%m-%d')+'.xls')  # one workbook per day

    #Check whether the workbook already exists under count/
    def file_is_exist(file_name):
        path = os.path.join(os.getcwd(),'count',file_name)
        print('current file [%s] path is [%s]' % (file_name,path))
        return os.path.exists(path)

    #Read the existing workbook, copy it and append a new sheet
    def read_and_copy_excle(excle_file_name):
        # Open the same count/ path that is checked and saved; the bare
        # file name was resolved against the working directory and failed.
        # formatting_info=True keeps the existing cell formats.
        read_excel_flag=xlrd.open_workbook(os.path.join('count',excle_file_name),formatting_info=True)
        for r in read_excel_flag.sheets():
            print(r.name)  # existing sheet names
        worksheet_copy=copy(read_excel_flag)
        write_excel(worksheet_copy,excle_file_name)

    #Write one sheet and save the workbook
    def write_excel(excel_flag,excle_file_name):
        sheet_name=str(time.strftime('%Y-%m-%d_%H-%M-%S'))
        sheet_flag = excel_flag.add_sheet(sheet_name,cell_overwrite_ok=True)
        first_line=[u'编号',u'标题',u'阅读次数',u'评论次数',u'文章地址']

        # Header row and column widths (title and URL columns are wider).
        for i,heading in enumerate(first_line):
            sheet_flag.write(0,i,heading,set_style('Times New Roman',220,True,40))
            if i == 1:
                sheet_flag.col(i).width=256*150
            elif i == 4:
                sheet_flag.col(i).width=256*80
            else:
                sheet_flag.col(i).width=256*15

        row_index=1
        cols_index=0  # stays bound for the cells below even when data is empty
        for article_content in data:
            cols_index=0
            for data_detail in article_content.split('|'):
                sheet_flag.write(row_index,cols_index,data_detail,set_style('Arial',300,False,cols_index))
                cols_index +=1
            row_index +=1

        # NOTE(review): these two cells look like styling demos rather than
        # report data; confirm whether they should stay.
        style = xlwt.easyxf('font:height 240, color-index red, bold on;align: wrap on, vert centre, horiz center')
        sheet_flag.write(row_index+1,cols_index+1, 'hello world', style)
        sheet_flag.write(row_index+2,cols_index+2,'start',set_style(u'宋体',300,False,20))
        excel_flag.save('count/'+excle_file_name)

    #Build a cell style
    def set_style(name,height,bold,color_index):
        style = xlwt.XFStyle()

        font = xlwt.Font()
        font.name = name
        font.bold = bold
        font.color_index = color_index  # the author notes this seems to have no effect
        font.height = height

        borders= xlwt.Borders()
        borders.left= 6
        borders.right= 6
        borders.top= 6
        borders.bottom= 6

        style.font = font
        if bold:
            style.borders = borders  # only bold (header) cells get borders
        return style

    # Append to an existing workbook, otherwise create a new one.
    if file_is_exist(excle_file_name):
        print('file 【%s】 exist ' % excle_file_name)
        read_and_copy_excle(excle_file_name)
    else:
        print('file 【%s】is not  exist, will create it ' % excle_file_name)
        excel_flag=xlwt.Workbook()
        write_excel(excel_flag,excle_file_name)

#Custom logging helper
def self_log(msg):
    """Print ``msg`` prefixed with the current local time.

    Args:
        msg: Text to print after the ``YYYY-MM-DD HH:MM:SS: `` prefix.
    """
    # Single-argument print(...) parses as a statement on Python 2 and as a
    # function call on Python 3.
    print(u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))

#Download a page
def get_html(url):
    """Download ``url`` with a browser-like User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The raw response body, or ``None`` when the request fails.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url,headers=headers)
    try:
        return urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        # HTTPError subclasses URLError, so it has to be handled first.
        print(e.code)
    except urllib2.URLError as e:
        print(e.reason)
    # Explicit None: the old ``return html`` hit an unbound local here.
    return None

#Get the total number of blog list pages
def get_last_page(html,fd):
    """Return the number of list pages advertised by the pagination bar.

    Args:
        html: Markup of the blog's main page; falsy when the download failed.
        fd: Open, writable file object that receives the page-count line.

    Returns:
        The page count as a digit string taken from the last pagination
        link, ``1`` when no usable pagination link exists, or ``None`` when
        ``html`` is empty.
    """
    if not html:
        self_log(u'页面错误，停止运行')
        return
    page = BeautifulSoup(html,'lxml')
    pagelist=page.find('div',class_ ='pagelist')
    links=pagelist.find_all('a') if pagelist is not None else []
    if not links:
        return 1
    # Take the whole trailing number of the last link; slicing off only the
    # final character miscounted blogs with ten or more pages.
    match=re.search(r'(\d+)\D*$',links[-1].get('href') or '')
    if match is None:
        return 1
    last_page=match.group(1)
    self_log('总共有%s 页博客' % last_page)
    fd.write('总共有%s 页博客\n' % last_page)
    return last_page

#Get the rank/score summary
def get_rank(html,fd):
    """Log and store the first two entries of the ``blog_rank`` element.

    Args:
        html: Markup of the blog's main page; falsy when the download failed.
        fd: Open, writable file object that receives one line per entry.

    Returns:
        The texts of the entries found (at most two), or ``None`` when
        ``html`` is empty.
    """
    if not html:
        self_log(u'页面错误，停止运行')
        return
    page = BeautifulSoup(html,'lxml')
    rank_list=[]
    rank_content=page.find(id='blog_rank')
    if rank_content is None:
        return rank_list
    # Only the first two <li> entries are reported.
    for rank in rank_content.find_all('li')[:2]:
        self_log(rank.text)
        fd.write(rank.text)
        fd.write('\n')
        rank_list.append(rank.text)
    return rank_list

#Get the article entries of one list page
def get_items(url):
    """Return the article entry nodes found on the list page at ``url``.

    Args:
        url: Address of one article-list page.

    Returns:
        The ``div`` nodes whose class is ``list_item list_view``; an empty
        list when the page could not be downloaded.
    """
    content_html=get_html(url)
    if not content_html:
        return []  # a failed download must not be handed to the parser
    page = BeautifulSoup(content_html,'lxml')
    return page.find_all('div',class_ ='list_item list_view')

#Extract the needed fields from every entry of an item list
def handle_items(items,content_list,read_num_for_sort):
    """Append one record per article entry to the caller's lists.

    Args:
        items: Entry nodes exposing ``find``; the nodes found must expose
            ``text`` and, for the title link, ``get('href')``.
        content_list: Receives a dict per article with the keys ``indexs``
            (numeric read count), ``title``, ``read_times``,
            ``comments_time`` and ``content_url``.
        read_num_for_sort: Receives the numeric read count of each article.

    Entries without a title link or a read-count node are skipped.
    """
    for item in items:
        title=item.find('a')
        read_node=item.find('span',class_ ='link_view')
        if title is None or read_node is None:
            continue
        comments_node=item.find('span',class_ ='link_comments')

        read_times=read_node.text.strip()
        # Keep only the ASCII digits for sorting.  ``int(filter(...))``
        # works on Python 2 only and fails on text without digits.
        digits=''.join(ch for ch in read_times if ch in '0123456789')
        read_number=int(digits) if digits else 0
        read_num_for_sort.append(read_number)

        content_list.append({
            'indexs':read_number,
            'title':title.text.strip(),
            'read_times':read_times,
            'comments_time':comments_node.text.strip() if comments_node is not None else '',
            'content_url':'http://blog.csdn.net'+(title.get('href') or ''),
        })

#Create a folder
def mkdir_folder(path):
    """Create directory ``path`` (with parents) unless it already exists.

    Args:
        path: Directory to create.

    Raises:
        OSError: If creation fails and ``path`` is not a directory afterwards.
    """
    # Try first and inspect afterwards; checking before creating leaves a
    # window in which another process can create the directory.
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise

#Save the text of one article page
def getContent(html):
    """Store the article found in ``html`` as ``count/<title>.txt``.

    Args:
        html: Markup of a single article page; falsy when the download
            failed.

    The page is skipped with a log line when it is empty or lacks the
    ``article_title`` link or the ``article_content`` element.  Write errors
    are printed, not raised.
    """
    if not html:
        self_log('empty article page, skipped')
        return
    page = BeautifulSoup(html,'lxml')
    title_box=page.find('div',class_='article_title')
    title_link=title_box.find('a') if title_box is not None else None
    content=page.find('div',class_='article_content')
    if title_link is None or content is None:
        # Previously a failed title lookup left ``title`` unbound and the
        # second try block reported a confusing NameError.
        self_log('article title or content not found, skipped')
        return

    title=title_link.text.strip()
    # Titles may contain characters that are invalid in file names.
    safe_title=re.sub(r'[\\/:*?"<>|\r\n\t]','_',title)
    dir_path='count'
    # Single extension; the old code appended '.txt' twice.
    artitle_name_path=dir_path+'/'+safe_title+'.txt'
    try:
        with open(artitle_name_path,'w') as f:
            f.write(content.text)
        self_log(u'存贮文章：%s 完毕' % title)
    except (IOError, OSError) as e:
        print(e)

#Save every article locally
def run_to_get_article(content_total_list):
    """Download and store each article named in the report lines.

    Args:
        content_total_list: ``|``-separated report lines whose last field is
            the article URL.
    """
    self_log('start save every article  ')
    for article_content in content_total_list:
        # The URL is always the last field; a fixed index of 4 pointed at
        # the wrong field whenever a title itself contained '|'.
        article_url=article_content.rsplit('|',1)[-1].strip()
        self_log( '将要存贮的地址是： %s ...' % article_url)
        artitle_html=get_html(article_url)
        getContent(artitle_html)

#Save the statistics into an Excel workbook
def run_to_save_info_in_excel(content_total_list):
    """Write the report lines into ``count/count_<date>_<hour>-<minute>.xls``.

    Args:
        content_total_list: ``|``-separated report lines with the fields
            index, title, read count, comment count and URL.
    """
    self_log('start save info into excel')
    excel_w=Workbook()
    excel_sheet_name=time.strftime('%Y-%m-%d_%H-%M-%S')
    excel_content_handler=excel_w.add_sheet(excel_sheet_name)

    first_line=[u'编号',u'标题',u'阅读次数',u'评论次数',u'文章地址']
    for cols,content in enumerate(first_line):
        excel_content_handler.write(0,cols,content)

    for index,article_content in enumerate(content_total_list,1):
        parts=article_content.split('|')
        if len(parts) > len(first_line):
            # A '|' inside the title must not shift the later columns.
            parts=[parts[0],'|'.join(parts[1:-3])]+parts[-3:]
        for cols,a in enumerate(parts):
            excel_content_handler.write(index,cols,a.strip())

    if not os.path.isdir('count'):
        os.makedirs('count')  # save() needs the target folder to exist
    excel_w.save('count/'+'count_'+time.strftime('%Y-%m-%d_%H-%M')+'.xls')

#Main routine of the program
def run(url):
    """Collect every article of the blog at ``url`` and write a sorted report.

    The report goes to ``count/<timestamp>.txt`` and lists the page count,
    the rank entries and one ``|``-separated line per article in ascending
    order of read count.

    Args:
        url: Blog main-page address; list-page URLs are built from the part
            before ``?``.

    Returns:
        The report lines, one per article; empty when the main page could
        not be loaded.
    """
    read_num_for_sort=[]
    content_list=[]
    content_total_list=[]

    # Output folder and timestamped report file.
    dir_path='count'
    mkdir_folder(dir_path)
    count_file_name=dir_path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'

    # ``with`` closes the report even when a step below raises.
    with open(count_file_name,'w') as fd:
        # 1. Page count from the main page.
        main_html=get_html(url)
        last_page=get_last_page(main_html,fd)
        if last_page is None:
            return content_total_list  # main page unavailable; int(None) would fail

        # 2. Rank entries (written to the report by get_rank).
        get_rank(main_html,fd)

        # 3. Load every list page and extract its articles.
        for i in range(1,int(last_page)+1):
            main_url=url.split('?')[0]+'/article/list/%d?viewmode=contents' % i
            self_log('即将获取第%d页的内容，地址是：%s' % (i,main_url))
            items=get_items(main_url)
            handle_items(items,content_list,read_num_for_sort)

        # 4. Sort by read count; ``key=`` replaces the Python-2-only ``cmp=``.
        read_num_for_sort.sort()
        print(read_num_for_sort)
        self_log('总共有%d 篇文章' % len(content_list))
        content_list = sorted(content_list,key=lambda entry:entry['indexs'])

        for article_index,a in enumerate(content_list,1):
            # Plain '|' separators so each field maps to one Excel cell.
            totalcontent= '第'+str(article_index)+'篇|'+a['title']+'|'+a['read_times']+'|'+a['comments_time']+'|'+a['content_url']
            print(totalcontent)
            fd.write(totalcontent)
            fd.write('\n')
            content_total_list.append(totalcontent)

    return content_total_list

if __name__ == '__main__':
    # Banner.  The literal used to open with four quotes, which printed a
    # stray "'" on the first line.
    print('''
*****************************************
**    Welcome to Spider of Count CSDN  **
**      Created on 2017-04-12          **
**      @author: Jimy_Fengqi           **
*****************************************''')
    # The prompt and its fallback check were fused onto one line.
    url=raw_input(u'请输入将要统计的个人csdn主页地址，类似如下：\n http://blog.csdn.net/qiqiyingse?viewmode=contents').strip()
    if not url:
        url='http://blog.csdn.net/qiqiyingse?viewmode=contents'
    content_total_list=run(url)
    # This version only writes the styled workbook; the plain Excel export
    # (run_to_save_info_in_excel) and the article download
    # (run_to_get_article) are left disabled.
    create_excel(content_total_list)

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航