您的位置:首页 > 编程语言 > Python开发

python 爬虫 (错误很多)

2014-02-13 00:00 253 查看
不怎么会用PYTHON写。今天试了一下。

#!/usr/bin/python # vim: set fileencoding=utf-8:
import sys import urllib2 import re import sqlite3 import hashlib import random from BeautifulSoup import BeautifulSoup class SpriderUrl: # 初始化
def __init__(self,url,domain_name): self.url=url self.domain_name=domain_name # 获得URL列表
def getUrl(self): urls=[] # try:
body_text=urllib2.urlopen(self.url).read() soup=BeautifulSoup(body_text) links=soup.findAll('a') # connect sqllite3
md5_str=hashlib.md5(str(random.randint(1,100000))+"aa") print "data_name:"+md5_str.hexdigest() # create sqlite3 data name
con=sqlite3.connect(md5_str.hexdigest()+".db") # create sqlite3 table name
con.execute("""create table url_data(id interger auto_increment primary key,url TEXT not null)""") for link in links: if re.match('(.*)\:\/\/'+self.domain_name,link.get('href')): urls.append(link.get('href')) con.execute("insert into url_data(url)values('"+link.get('href')+"')") con.commit() while len(urls)>0: for url in urls: body_text2=urllib2.urlopen(url).read() soup2=BeautifulSoup(body_text2) links2=soup2.findAll('a') for link2 in links2: if re.match('(.*)\:\/\/'+self.domain_name,link2.get('href')): test=link2.get('href') cur=con.execute("select * from url_data where url='"+test+"'") bool_itm=cur.fetchone() if bool_itm is None: urls.append(link2.get('href')) con.execute("insert into url_data(url)values('"+test+"')") else: continue
else: continue
print "Done" t=SpriderUrl('http://www.baidu.com/',"www.baidu.com") t.getUrl()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: