您的位置:首页 > 编程语言 > Python开发

开源中国推荐博客爬虫

2016-04-16 23:04 519 查看
话不多说,直接上代码。

# coding: utf-8

import urllib
import urllib2
import re
import MySQLdb

# Browser-like User-Agent so oschina.net serves the crawler normal pages.
req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36'}

####################################################################################
# Module-level MySQL connection and cursor shared by every function below.
conn = MySQLdb.connect(host="localhost",user="root",passwd="root",db="osc_recommend_blog",port=3306)
cur  = conn.cursor()
# Force utf8 on the session so Chinese titles/content round-trip correctly.
cur.execute("set names utf8")

def is_exist(id):
    """Return True if a blog row with primary key *id* is already stored.

    Uses a parameterized query instead of the original string
    interpolation, which was vulnerable to SQL injection and broke on
    ids containing quotes.
    """
    cur.execute("select * from blog where id=%s", (id,))
    return len(cur.fetchall()) > 0

def add_blog(id='',url='',title='',user_id='',user_name='',user_url='',content='',post_time='1970-01-01 00:00:00'):
    """Insert one blog row; on insert failure retry with empty content.

    Fixes over the original:
    - parameterized query (the driver escapes values; the manual
      MySQLdb.escape_string + '%s' interpolation was injection-prone),
    - narrow `except MySQLdb.Error` instead of a bare `except:`,
    - `conn.commit()` so rows survive disconnect (MySQLdb turns
      autocommit off by default and nothing else in the file commits).
    """
    sql = ("insert into blog(id,url,title,user_id,user_name,user_url,content,post_time) "
           "values(%s,%s,%s,%s,%s,%s,%s,%s)")
    try:
        cur.execute(sql, (id, url, title, user_id, user_name, user_url, content, post_time))
    except MySQLdb.Error:
        # Best-effort fallback kept from the original: some bodies fail to
        # insert (e.g. oversized/odd content) — store the row without content.
        cur.execute(sql, (id, url, title, user_id, user_name, user_url, '', post_time))
    conn.commit()

####################################################################################

def get_blog_id(url):
    """Return everything after the last '/' in *url* (the numeric blog id)."""
    return url.rsplit('/', 1)[-1]

def get_blog_fromerr(url):
    """Fetch *url* and extract the target of its JS redirect snippet.

    The listing links return a tiny page whose body is
    `<script>location.href="...";</script>`; the quoted path is the
    real blog URL suffix.
    """
    page = get_content(url)
    pattern = re.compile('<script>location.href=.*?;</script>', re.S)
    snippet = re.findall(pattern, page)[0]
    prefix = "<script>location.href=\""
    tail = snippet[snippet.find(prefix) + len(prefix):]
    return tail[:tail.find("\";</script>")]

def get_content(url):
    """Download *url* (30 s timeout) and return the raw response body.

    Fix: the original never closed the urllib2 response object, leaking
    a socket per request across a 25-page crawl; close it in `finally`.
    """
    req = urllib2.Request(url, None, req_header)
    resp = urllib2.urlopen(req, None, 30)
    try:
        return resp.read()
    finally:
        resp.close()

def get_blog_content(content):
    """Return the blog-body markup: from `<div class='BlogContent'>` up to,
    but not including, the trailing `<div class='BlogShare'>` marker."""
    pattern = re.compile("<div class='BlogContent'>.*?<div class='BlogShare'>", re.S)
    chunk = re.search(pattern, content).group(0)
    return chunk[:-len("<div class='BlogShare'>")]

def get_blog_post_time(blog_html):
    """Pull the '(YYYY-mm-dd HH:MM)' stamp out of the BlogStat div and
    return it as 'YYYY-mm-dd HH:MM:00' (seconds appended)."""
    pattern = re.compile("<div class='BlogStat'>.*?发表于.*?前((.*?))  ", re.S)
    # findall yields (outer, inner) tuples because the pattern nests groups;
    # the outer capture includes the literal parentheses from the page text.
    raw = re.findall(pattern, blog_html)[0][0]
    return raw[1:-1] + ":00"

def get_page(page_id):
    """Crawl one 'more recommended blogs' listing page and store new posts.

    For each <li> entry: skip it if its blog id is already in the DB;
    otherwise resolve the JS redirect, download the post, parse content
    and timestamp, and insert the row. Posts whose content cannot be
    parsed are skipped entirely.
    """
    url = 'http://www.oschina.net/blog/more?p=%d'%(page_id)
    print '                    %s'%(url)
    content = get_content(url)
    # Narrow the page down to the <ul class='BlogList'> listing, then
    # split it into individual <li> entries.
    pattern = re.compile("<ul class='BlogList'>.*?</ul>",re.S)
    content = re.findall(pattern,content)[0]
    pattern = re.compile("<li>.*?</li>", re.S)
    blog_list = re.findall(pattern, content)
    for item in blog_list:
        #print item
        # Post link and title from the <h3> heading.
        pattern = re.compile("<h3><a href=\"(.*?)\" target=\'_blank\'>(.*?)</a></h3>", re.S)
        tmp = re.findall(pattern, item)[0]
        url = tmp[0]
        title = tmp[1]
        blog_id = get_blog_id(url)
        if not is_exist (blog_id):
            # '+' marks a newly crawled post in the console output.
            print "%50s +"%(url)
            # Author link, display name and user id from the avatar anchor.
            pattern = re.compile("<a href=\"(.*?)\" class='u' target='_blank'><img src=\".*?\" align=\".*?\" alt=\".*?\" title=\"(.*?)\" class=\".*?\" user=\"(.*?)\"/></a>",re.S)
            tmp = re.findall(pattern, item)[0]
            user_url = tmp[0]
            user_name = tmp[1]
            user_id = tmp[2]
            # The listing URL only serves a JS redirect; follow it manually.
            blog_html = get_content(url + get_blog_fromerr(url))
            try:
                blog_content = get_blog_content(blog_html)
            except:
                # Unparseable post layout — skip this entry.
                continue
            blog_posttime = get_blog_post_time(blog_html)
            add_blog(id=blog_id,url=url,title=title,user_id=user_id,user_name=user_name,user_url=user_url,content=blog_content,post_time=blog_posttime)
        else:
            # Already stored; print without the '+' marker.
            print "%50s"%(url)
    print '------------------------------------------'

if __name__=="__main__":
    # Crawl listing pages 1 through 25, one page at a time.
    page = 1
    while page <= 25:
        get_page(page)
        page += 1
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  Python 爬虫