
Scraping CSDN blog view counts with Python and saving them to a SQLite3 database

Please credit the source when reposting: http://blog.csdn.net/imred

The pages are parsed with BeautifulSoup. I won't walk through every detail here; the code is commented, and questions are welcome.
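To make the parsing step concrete, here is a minimal, self-contained sketch of the idea first. The HTML snippet is invented for illustration, but it mirrors the structure the script below relies on: a link_title span wrapping the title link, and a link_view span whose first child is the <a> tag and whose second child is the "(count)" text node. The complete script follows after it.

# Parsing sketch only; the snippet is made-up markup in the shape of
# CSDN's article list, not a real page.
from bs4 import BeautifulSoup

snippet = ('<span class="link_title"><a href="/imred/article/details/1">'
           'Hello</a></span>'
           '<span class="link_view"><a href="/imred/article/details/1">'
           '阅读</a>(123)</span>')

soup = BeautifulSoup(snippet, "lxml")
title = soup.find("span", class_="link_title").a.string.strip()
viewSpan = soup.find("span", class_="link_view")
count = viewSpan.contents[1].strip("()")   # contents[1] is the "(123)" text node
link = viewSpan.contents[0]['href']        # contents[0] is the <a> tag
print(title, count, link)                  # Hello 123 /imred/article/details/1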

# -*- coding: utf-8 -*-

import os
import sqlite3
import urllib.error
import urllib.request
from bs4 import BeautifulSoup

# PREFIX + page number + SUFFIX builds each list URL; HOST turns relative links into absolute ones
PREFIX = "http://blog.csdn.net/imred/article/list/"
SUFFIX = "?viewmode=contents"
HOST = "http://blog.csdn.net"

# t_article stores article metadata, t_time stores one timestamp per scrape, t_view stores the view counts
SQL_CREATE_T_ARTICLE = "CREATE TABLE IF NOT EXISTS t_article ( \
    id INTEGER PRIMARY KEY AUTOINCREMENT, \
    title TEXT NOT NULL, \
    link TEXT NOT NULL)"
SQL_CREATE_T_TIME = "CREATE TABLE IF NOT EXISTS t_time ( \
    id INTEGER PRIMARY KEY AUTOINCREMENT, \
    time TEXT DEFAULT (datetime('now', 'localtime')))"
SQL_CREATE_T_VIEW = "CREATE TABLE IF NOT EXISTS t_view ( \
    id INTEGER PRIMARY KEY AUTOINCREMENT, \
    tid INTEGER, \
    aid INTEGER, \
    view INTEGER NOT NULL, \
    FOREIGN KEY(tid) REFERENCES t_time(id) ON DELETE CASCADE, \
    FOREIGN KEY(aid) REFERENCES t_article(id) ON DELETE CASCADE)"

# Get the id of the most recently inserted timestamp
SQL_QUERY_MAX_TID = "SELECT MAX(id) max_tid FROM t_time"
SQL_INSERT_TIME = "INSERT INTO t_time(time) VALUES(datetime('now', 'localtime'))"
# Look up an article's id by its link
SQL_QUERY_ARTICLE = "SELECT id FROM t_article WHERE link=?"
SQL_INSERT_ARTICLE = "INSERT INTO t_article(title, link) VALUES(?, ?)"
SQL_INSERT_VIEW = "INSERT INTO t_view(tid, aid, view) VALUES(?, ?, ?)"

# Simple exception type for this script's own error reporting
class MyError(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)

def getHtml(url):
    # Send a browser-like User-Agent so the request is not rejected
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    try:
        req = urllib.request.Request(url, headers=HEADERS)
        response = urllib.request.urlopen(req)
        html = response.read().decode()
        return html
    except urllib.error.HTTPError as e:
        raise MyError("Request for " + url + " failed: " + str(e))

# Work out how many list pages the blog has in total
def getPageNum(soup):
    soupPageListDiv = soup.find("div", class_="pagelist")

    # No pagination div means the blog has no articles
    if soupPageListDiv is None:
        raise MyError("No articles found")

    # The pager span's text contains "共X页"; pull out the number
    # between 共 and 页
    soupPageNumSpan = soupPageListDiv.span
    strSpan = soupPageNumSpan.string
    left = strSpan.find("共")
    right = strSpan.find("页")
    strNum = strSpan[left + 1:right]
    return int(strNum)

def main():
    html = getHtml(PREFIX + "1" + SUFFIX)

    soup = BeautifulSoup(html, "lxml")
    iPageNum = getPageNum(soup)

    # Title spans from the first list page
    titleListHTML = soup.find_all("span", class_="link_title")
    titleList = []

    # View-count spans (which also contain the article links)
    viewListHTML = soup.find_all("span", class_="link_view")
    viewList = []
    linkList = []

    # Fetch the remaining list pages and collect their spans as well
    if iPageNum > 1:
        for i in range(2, iPageNum + 1):
            tmpHtml = getHtml(PREFIX + str(i) + SUFFIX)
            tempSoup = BeautifulSoup(tmpHtml, "lxml")
            titleListHTML += tempSoup.find_all("span", class_="link_title")
            viewListHTML += tempSoup.find_all("span", class_="link_view")

    for title in titleListHTML:
        titleList.append(title.a.string.strip())
    # contents[0] is the <a> holding the link; contents[1] is the "(count)" text
    for view in viewListHTML:
        viewList.append(view.contents[1].strip("()"))
        linkList.append(HOST + view.contents[0]['href'])

    for i in range(len(titleList)):
        print(titleList[i] + " " + viewList[i] + " " + linkList[i])

    strDbPath = os.path.join(os.path.dirname(__file__), 'blog_stat.db')

    try:
        conn = sqlite3.connect(strDbPath)
        try:
            cursor = conn.cursor()

            # SQLite only enforces the ON DELETE CASCADE clauses above when
            # foreign-key support is switched on for the connection
            cursor.execute("PRAGMA foreign_keys = ON")

            cursor.execute(SQL_CREATE_T_ARTICLE)
            cursor.execute(SQL_CREATE_T_TIME)
            cursor.execute(SQL_CREATE_T_VIEW)

            cursor.execute(SQL_INSERT_TIME)

            for i in range(len(titleList)):
                title = titleList[i]
                link = linkList[i]
                # Check whether t_article already has this article; if not,
                # insert a new row
                cursor.execute(SQL_QUERY_ARTICLE, (link,))
                result = cursor.fetchall()
                if len(result) == 0:
                    cursor.execute(SQL_INSERT_ARTICLE, (title, link))

            # Get the id of the timestamp inserted above
            cursor.execute(SQL_QUERY_MAX_TID)
            result = cursor.fetchone()
            max_tid = result[0]
            for i in range(len(titleList)):
                link = linkList[i]
                view = viewList[i]
                cursor.execute(SQL_QUERY_ARTICLE, (link,))
                result = cursor.fetchone()
                # Get the article id
                aid = result[0]
                # Insert a new view-count record
                cursor.execute(SQL_INSERT_VIEW, (max_tid, aid, view))
        finally:
            cursor.close()
            conn.commit()
    finally:
        conn.close()

if __name__ == "__main__":
    main()
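Once the script has run a few times, the accumulated history can be read back with a join across the three tables. Below is a minimal sketch of such a query; the SELECT statement is my own illustration, not part of the script above.

# Sketch: print the recorded view history, newest scrape first.
# Assumes blog_stat.db sits next to this file, as in the script above.
import os
import sqlite3

strDbPath = os.path.join(os.path.dirname(__file__), 'blog_stat.db')
conn = sqlite3.connect(strDbPath)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT t.time, a.title, v.view \
        FROM t_view v \
        JOIN t_time t ON v.tid = t.id \
        JOIN t_article a ON v.aid = a.id \
        ORDER BY t.time DESC, a.id")
    for time, title, view in cursor.fetchall():
        print(time + " " + title + " " + str(view))
finally:
    conn.close()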


This article is licensed under the CC-BY license.