您的位置:首页 > 编程语言 > Python开发

python爬虫新浪博客博文

2016-04-20 00:00 302 查看
一、功能描述

通过指定新浪博客主页 URL,自动爬取目录列表中所有文章的文字,并将所有文章的 URL 保存到文件 url_file 中。

二、完整代码

reptile.py

#!/usr/bin/python
#coding=utf-8
# Script: reptile.py
# Author: charlotte
# Date: 2016.4.20
#
# Platform:python
import urllib
import os
import artical_content

# Crawl a Sina blog: starting from the blog home page, locate the article
# directory page, then walk the directory page by page, appending every
# article URL to `filename` and passing it to artical_content.get_article.
filename = 'url_file'
rep_url = 'http://blog.sina.com.cn/twocold'

# The directory link is taken to be the first '<a  href' that follows the
# 'blognavInfo' marker on the home page.
content = urllib.urlopen(rep_url).read()
bloginfo = content.find(r'blognavInfo')
dir_href = content.find(r'<a  href', bloginfo)
dir_html = content.find(r'.html', dir_href)
# +10 presumably skips the 10 characters of '<a  href="' (TODO confirm against
# the live markup); +5 keeps the '.html' suffix itself.
url = content[dir_href + 10:dir_html + 5]

# Walk at most 20 directory pages.
for j in range(20):
    content = urllib.urlopen(url).read()
    html = content.find(r'<a title')

    # Collect at most 50 article links from the current page.
    for i in range(50):
        title = content.find(r'<a title', html)
        html = content.find(r'.html', title)
        # str.find returns -1 when nothing more is found; stop explicitly
        # instead of relying on the slice below happening to come out empty.
        if title == -1 or html == -1:
            break
        # +34 assumes the anchor starts with
        # '<a title="" target="_blank" href="' (34 characters) -- TODO confirm.
        url = content[title + 34:html + 5]
        if not url:
            break
        # Append the URL and close the file right away so the handle is not
        # leaked and the log is on disk before the article is fetched.
        with open(filename, 'a') as url_log:
            url_log.write(url + '\n')
        artical_content.get_article(url)

    # Find the link to the next directory page: the first '<a href=' after the
    # 'SG_pgon' marker (presumably the highlighted current-page entry).
    page_on = content.find(r'SG_pgon')
    if page_on == -1:
        # No pager on this page: there is nothing further to crawl.
        break
    page_next = content.find(r'<a href=', page_on)
    html = content.find(r'.html', page_next)
    span = content.find(r'<span', page_on)
    # A missing link, or a '<span' that comes before the next link, is treated
    # as "this was the last page".
    if page_next == -1 or html == -1 or page_next > span:
        break
    # +9 skips the 9 characters of '<a href="'.
    url = content[page_next + 9:html + 5]
#print i,' ',j

#print content
#filename = url[-26:]
#open(filename,'w').write(content)

artical_content.py

import urllib
import os
# Script: artical_content.py
# Author: charlotte
# Date: 2016.4.20
#
# Platform:python
def get_article(url):
    """Download an article page and print the text blocks of its body.

    The page is scanned with plain substring searches. Starting at the first
    'style' attribute after the 'articalcontent' marker, each block is the
    text between the '>' that closes the styled tag and the next '</div>'.
    Scanning stops after 30 blocks, when the next 'style' lies more than 30
    characters past the previous block's end, or when a marker is missing.

    Args:
        url: Address of the article page to fetch.

    Returns:
        None. The URL and the extracted blocks are written to stdout.
    """
    page = urllib.urlopen(url).read()
    # Markers are matched case-insensitively on a lowered copy, while the text
    # is sliced from the untouched page so the output keeps its original case.
    lowered = page.lower()
    # Parenthesised single-argument print behaves like the Python 2 statement.
    print(url)

    marker = lowered.find('articalcontent')
    if marker == -1:
        return
    first_style = lowered.find('style', marker)
    if first_style == -1:
        return
    block_end = first_style + 1

    for _ in range(30):
        style = lowered.find('style', block_end)
        # A 'style' far from the previous block is taken to lie outside the
        # article body; -1 means there is no further 'style' at all.
        if style == -1 or style - block_end > 30:
            break
        block_start = lowered.find('>', style)
        if block_start == -1:
            break
        next_end = lowered.find('</div>', block_start)
        if next_end == -1:
            break
        block_end = next_end
        # +2 skips the '>' and the single character that follows it.
        text = page[block_start + 2:block_end]
        if text[:3].lower() == '<br':
            # A block that starts with a line break is printed as a blank line.
            print('')
        else:
            print(text)
#get_article('')

三、代码效果

url_file

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: