您的位置:首页 > 编程语言 > Python开发

第一次用python 写的简单爬虫 记录在自己的博客

2013-12-29 23:01 921 查看
#python.py
from bs4 import BeautifulSoup
import urllib.request
from MySqlite import MySqlite

# Running counter used as the primary-key ID for rows inserted by
# GetBlogTileAndName. NOTE: a `global` statement at module level is a
# no-op; kept byte-identical here.
global g_intid
g_intid=0
def GetBlogTileAndName(url):
    """Download *url*, extract every post-title link and persist it.

    Finds all elements with CSS class ``postTitle``, prints each link's
    title and href, and inserts (g_intid, href, title) rows through
    MySqlite, incrementing the module-level counter ``g_intid``.

    Side effects: network I/O, stdout printing, database writes.
    """
    global g_intid
    res = urllib.request.urlopen(url)
    try:
        html = res.read()
    finally:
        # Close even if read() raises (original leaked on error).
        res.close()
    # Bug fix: the original called str(html, 'utf-8') but discarded the
    # result, so raw bytes were handed to BeautifulSoup. Decode and use it.
    text = str(html, 'utf-8')
    soup = BeautifulSoup(text)
    divs = soup.find_all(attrs={"class": "postTitle"})
    for divname in divs:
        print("title:=", divname.a.string, "href:=", divname.a["href"])
        g_intid += 1
        store = MySqlite()
        store.InsertDate(g_intid, divname.a["href"], divname.a.string)
def GetBlogPage(url):
    """Download *url* and print the text of its pager element.

    Looks up the first element with CSS class ``pager`` and prints its
    string content. Side effects: network I/O and stdout printing.
    """
    res = urllib.request.urlopen(url)
    try:
        html = res.read()
    finally:
        # Close even if read() raises (original leaked on error).
        res.close()
    # Bug fix: the original discarded the result of str(html, 'utf-8');
    # decode the bytes and actually use the text.
    text = str(html, 'utf-8')
    soup = BeautifulSoup(text)
    divPager = soup.find(attrs={"class": "pager"})
    print(divPager.string)

# Crawl listing pages 1-7 of the blog and store every post link found.
base_url = r"http://www.cnblogs.com/FCoding/default.html?page="
for page_no in range(1, 8):
    GetBlogTileAndName(base_url + str(page_no))

  

#MySqlite.py

class MySqlite(object):
    """Tiny SQLite helper for persisting crawled blog links.

    Expects a table ``blog (ID integer, url text, title text,
    PRIMARY KEY(ID))`` in the database file. The database path defaults
    to the original hard-coded ``d:\\123.db`` but can now be overridden
    per call (new, backward-compatible parameter).
    """

    def __init__(self, *args):
        # No state of its own; delegate straight to object.__init__.
        return super().__init__(*args)

    def callstr(self, str):
        """Print *str* — small debugging helper.

        NOTE(review): the parameter name shadows the builtin ``str``;
        kept unchanged so keyword callers are unaffected.
        """
        print(str)

    def InsertDate(self, id, url, title, db_path=r"d:\123.db"):
        """Insert one (id, url, title) row into the ``blog`` table.

        Raises sqlite3.IntegrityError if *id* already exists (ID is the
        primary key).
        """
        # Bug fix: the original module never imported sqlite3 for this
        # method (only GetDate imported it locally), so the first call
        # raised NameError. Import locally, matching the file's style.
        import sqlite3
        conn = sqlite3.connect(db_path)
        try:
            c = conn.cursor()
            print(id)
            # Bug fix: the original built the SQL with str.format, which
            # breaks on quotes inside titles and is an SQL-injection
            # hazard. Use a parameterized query instead.
            c.execute("insert into blog values (?, ?, ?)", (id, url, title))
            conn.commit()
            c.close()
        finally:
            # Always release the connection, even on execute errors.
            conn.close()

    def GetDate(self, db_path=r"d:\123.db"):
        """Print the row count and then every column of every row."""
        import sqlite3
        conn = sqlite3.connect(db_path)
        try:
            c = conn.cursor()
            c.execute("select count(*) from blog")
            count_row = c.fetchone()
            print(count_row[0])
            # Print each column value on its own line, row by row.
            for row in c.execute("select * from blog"):
                for value in row:
                    print(value)
            c.close()
        finally:
            conn.close()


 简述一下功能:

通过urllib 下载网页 使用BeautifulSoup 解析

调用find_all(attrs={"class":"postTitle"})

找到 HTML 中所有 class="postTitle" 的 tag

然后遍历 取出title 和href 保存到数据库中

此程序无容错。新手勿笑!
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: