
Fetching all links with Python, saving them to a database table, and opening them one by one

2011-06-29 09:51
The script below fetches all links on a web page with Python, saves them into an sqlite3 table, and opens them in a browser. If the table already exists, the links are read straight from the table and opened.

The table name is derived from the URL: the leading "http://", the trailing "/" and the port number are stripped, and the remaining '.' and '/' characters are replaced with '_'.
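For example, "http://www.baidu.com:80/" maps to the table name www_baidu_com. As a rough illustration of the rule (a minimal standalone sketch; the helper name url_to_table is mine and does not appear in the script below):

import re

def url_to_table(url):
    # strip the leading "http://" and any trailing "/"
    name = re.sub(r'^http://', '', url).rstrip('/')
    # drop a port number such as ":80"
    name = re.sub(r':\d+', '', name)
    # replace the remaining '.' and '/' with '_'
    return name.replace('.', '_').replace('/', '_')

print url_to_table("http://www.baidu.com:80/")   # -> www_baidu_com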

Python libraries used:

sgmllib, urllib – fetching and parsing the page (Python 2 only; see the sketch after this list)

re – regular expressions

sqlite3 – the database table

subprocess, signal – child-process control
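Note that sgmllib and this flavour of urllib exist only in Python 2, so the listing below is Python 2 code. For comparison, a roughly equivalent link extractor for Python 3 (a minimal sketch using html.parser and urllib.request; it is not part of the original script) could look like this:

from html.parser import HTMLParser
from urllib.request import urlopen

class LinkCollector(HTMLParser):
    def reset(self):
        HTMLParser.reset(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        # collect the href attribute of every <a> tag
        if tag == 'a':
            self.urls.extend(v for k, v in attrs if k == 'href' and v)

parser = LinkCollector()
parser.feed(urlopen("http://www.baidu.com/").read().decode('utf-8', 'ignore'))
print(parser.urls)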

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from sgmllib import SGMLParser
import urllib, re
import sys, os, string, time
import sqlite3
import subprocess, signal

# SGMLParser subclass that collects the href of every <a> tag
class UrlList(SGMLParser):
    def reset(self):
        self.urls = []
        SGMLParser.reset(self)

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)

def get_urls(url):
    # download the page and return the links found on it
    try:
        usock = urllib.urlopen(url)
    except:
        print "get url except " + url
        return []
    result = []
    parser = UrlList()
    parser.feed(usock.read())
    usock.close()
    parser.close()
    urls = parser.urls
    for u in urls:
        # keep only absolute "http://" links and relative "../../" links
        if re.match(r'http://', u) or re.match(r'\.\./\.\./', u):
            result.append(u)
    return result

def find_string(url, sub, rdepth):
    # strip everything after the rdepth-th occurrence of sub, counted from the right
    if rdepth == 0:
        return url
    n = url.rfind(sub)
    return find_string(url[:n], sub, rdepth - 1)

def update_urls(startURL, url_list):
    if len(url_list) == 0:
        return []
    result = []
    # the string up to the third '/' from the right; used to replace "../.."
    s = find_string(startURL, r'/', 3)
    for u in url_list:
        if u.find(r'../../') == 0:
            u = u.replace(r'../..', s)
        result.append(u)
    return result

def write_urls_into_table(urldb, table_name, urls):
    conn = sqlite3.connect(urldb)
    conn.isolation_level = None
    conn.execute('create table if not exists %s(id integer primary key, url varchar(255), comment varchar(128))' % table_name)
    for i, url in enumerate(urls):
        conn.execute("insert into %s values(%d, '%s', '')" % (table_name, i, url))
    conn.commit()
    conn.close()

def read_urls_from_table(urldb, table_name):
    conn = sqlite3.connect(urldb)
    conn.isolation_level = None
    conn.text_factory = str
    cur = conn.cursor()
    try:
        cur.execute("select url from %s" % table_name)
    except sqlite3.Error, e:
        print "An error occurred:", e.args[0]
    res = cur.fetchall()
    cur.close()
    conn.close()
    if len(res):
        print "total urls: %d" % len(res)
        return res
    else:
        print "read table %s null" % table_name
        sys.exit(1)

def open_url(content):
    if len(content) == 0:
        return
    for line in content:
        url = line[0]                   # each row from fetchall() is a one-element tuple
        print "open url " + url
        try:
            # open the link in chrome, give it 8 seconds, then signal its process group
            p = subprocess.Popen(["chrome", url], close_fds=True, preexec_fn=os.setsid)
            time.sleep(8)
            os.killpg(p.pid, signal.SIGUSR1)
            time.sleep(3)
            if p.poll() is not None:    # the browser process has exited
                print '\n'
                continue
            else:
                print "Not kill all child process"
                sys.exit(1)
        except KeyboardInterrupt:
            print "Pressed ctrl+c quit"
            sys.exit(0)
    else:
        print "open urls over"          # runs once the loop finishes normally

def start_run(startUrl, urldb):
    if startUrl is None:
        print "start url is null"
        sys.exit(1)
    if urldb is None:
        print "db is null"
        sys.exit(1)
    # derive the table name from the start URL
    table_name = ''
    if startUrl.find(r'http://') == 0:                      # starts with "http://": strip the prefix
        url = startUrl[7:]
        start = url.find(':')
        if start != -1:                                     # strip the port number
            end = url.find(r'/')
            url = url[:start] + url[end:]
        if startUrl.rfind(r'/') == (len(startUrl) - 1):     # ends with "/": strip it
            url = url[:-1]
        if startUrl.find(r'/', 7) != -1:                    # a path such as http://bj.58.com/wenziluru: turn '/' into '_'
            url = url.replace(r'/', '_')
        table_name = url.replace('.', '_')                  # turn '.' into '_'
    print "table name: %s" % table_name
    conn = sqlite3.connect(urldb)
    conn.isolation_level = None
    try:
        conn.execute("select * from %s" % table_name)       # if the table already exists, just read it back
    except sqlite3.OperationalError:
        print "%s not exists, create ..." % table_name      # the table is missing, so create it
        urls = get_urls(startUrl)
        newurls = update_urls(startUrl, urls)               # replace "../../" with an absolute prefix
        write_urls_into_table(urldb, table_name, newurls)   # write the links into the database
    conn.close()
    content = read_urls_from_table(urldb, table_name)       # read the links back from the table
    open_url(content)                                       # open each link

if __name__ == "__main__":
    startUrl = "http://www.baidu.com:80/"
    urldb = 'urls.db'
    start_run(startUrl, urldb)
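After a run with the default startUrl above, the saved links can be checked directly in urls.db (a short sketch; it assumes the table name works out to www_baidu_com as described earlier):

import sqlite3

conn = sqlite3.connect('urls.db')
for row in conn.execute("select id, url from www_baidu_com limit 5"):
    print row
conn.close()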