开源中国推荐博客爬虫
2016-04-16 23:04
519 查看
话不多说，直接上代码。
# coding: utf-8 import urllib import urllib2 import re import MySQLdb req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36'} #################################################################################### conn = MySQLdb.connect(host="localhost",user="root",passwd="root",db="osc_recommend_blog",port=3306) cur = conn.cursor() cur.execute("set names utf8") def is_exist(id): sql = "select * from blog where id='%s'"%(id) cur.execute(sql) result = cur.fetchall() if len(result) > 0: return True else: return False def add_blog(id='',url='',title='',user_id='',user_name='',user_url='',content='',post_time='1970-01-01 00:00:00'): title = MySQLdb.escape_string(title) user_name = MySQLdb.escape_string(user_name) content = MySQLdb.escape_string(content) sql = "insert into blog(id,url,title,user_id,user_name,user_url,content,post_time) values('%s','%s','%s','%s','%s','%s','%s','%s')"%(id,url,title,user_id,user_name,user_url,content,post_time) try: cur.execute(sql) except: content = "" sql = "insert into blog(id,url,title,user_id,user_name,user_url,content,post_time) values('%s','%s','%s','%s','%s','%s','%s','%s')"%(id,url,title,user_id,user_name,user_url,content,post_time) cur.execute(sql) #################################################################################### def get_blog_id (url): while url.find('/') >= 0: url = url[url.find('/')+1:] return url def get_blog_fromerr(url): content = get_content(url) pattern = re.compile('<script>location.href=.*?;</script>',re.S) fromerrs = re.findall(pattern,content) ans = fromerrs[0] ans = ans[ans.find("<script>location.href=\"")+len("<script>location.href=\""):] ans = ans[:ans.find("\";</script>")] return ans def get_content(url): req = urllib2.Request(url, None, req_header) resp = urllib2.urlopen(req,None,30) content = resp.read() return content def get_blog_content(content): pattern = re.compile("<div class='BlogContent'>.*?<div 
class='BlogShare'>", re.S) blog_content = re.findall(pattern, content)[0] blog_content = blog_content[:len(blog_content)-len("<div class='BlogShare'>")] return blog_content def get_blog_post_time(blog_html): pattern = re.compile("<div class='BlogStat'>.*?发表于.*?前((.*?)) ",re.S) post_time = re.findall(pattern, blog_html)[0] ans = post_time[0] ans = ans[1:] ans = ans[:len(ans)-1] ans += ":00" return ans def get_page(page_id): url = 'http://www.oschina.net/blog/more?p=%d'%(page_id) print ' %s'%(url) content = get_content(url) pattern = re.compile("<ul class='BlogList'>.*?</ul>",re.S) content = re.findall(pattern,content)[0] pattern = re.compile("<li>.*?</li>", re.S) blog_list = re.findall(pattern, content) for item in blog_list: #print item pattern = re.compile("<h3><a href=\"(.*?)\" target=\'_blank\'>(.*?)</a></h3>", re.S) tmp = re.findall(pattern, item)[0] url = tmp[0] title = tmp[1] blog_id = get_blog_id(url) if not is_exist (blog_id): print "%50s +"%(url) pattern = re.compile("<a href=\"(.*?)\" class='u' target='_blank'><img src=\".*?\" align=\".*?\" alt=\".*?\" title=\"(.*?)\" class=\".*?\" user=\"(.*?)\"/></a>",re.S) tmp = re.findall(pattern, item)[0] user_url = tmp[0] user_name = tmp[1] user_id = tmp[2] blog_html = get_content(url + get_blog_fromerr(url)) try: blog_content = get_blog_content(blog_html) except: continue blog_posttime = get_blog_post_time(blog_html) add_blog(id=blog_id,url=url,title=title,user_id=user_id,user_name=user_name,user_url=user_url,content=blog_content,post_time=blog_posttime) else: print "%50s"%(url) print '------------------------------------------' if __name__=="__main__": for i in range(1,26): get_page(i)
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- 爬虫笔记
- Python中zip()函数用法举例
- Python中map()函数浅析
- Python将excel导入到mysql中
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定
- introduction to python for statistics,analysis笔记3