您的位置:首页 > 编程语言 > Python开发

用 Python 实现网页下载(包括图片和 CSS)

2013-06-07 20:10 393 查看
import os
import re
import urllib2
import urlparse

from bs4 import BeautifulSoup
# NOTE(review): not referenced by any code visible in this script.
siteUrls = " "

# Address of the page to download; passed to getContent() by the call at
# the end of the script.
url = "http://www.sina.com.cn"
def getContent(url):
    """Download a page together with its stylesheets and images.

    The page is fetched, its stylesheet and image references are rewritten
    to local copies by ``writeCss`` and ``writefileName``, and the result
    is saved as ``<name>.html`` in the current working directory.

    Args:
        url: Address of the page to download.
    """
    content = urllib2.urlopen(url).read()
    content = writeCss(url, content)
    content = writefileName(url, content)

    # The output name is the query string of the URL.  URLs without a
    # usable query string fall back to the host name so that plain
    # addresses do not fail with an IndexError.
    matches = re.findall(r'/[^\?]*\?([^/|^\?]*)$', url)
    if matches and matches[0]:
        fileName = matches[0]
    else:
        fileName = urlparse.urlparse(url).netloc.replace(":", "_") or "index"
    print(fileName)

    # Binary mode writes the downloaded bytes unmodified; the context
    # manager closes the file even if the write fails.
    with open(fileName + ".html", 'wb') as f:
        f.write(content)

def writeCss(url, content):
    """Download the stylesheets linked from a page and rewrite their links.

    Every ``<link type="text/css">`` element that carries an ``href`` is
    fetched and stored as ``<name>/<stem>.css``, where ``<name>`` is derived
    from the page URL and ``<stem>`` from the stylesheet URL.  Each
    occurrence of the original ``href`` text in ``content`` is replaced by
    that relative path.

    Args:
        url: Address of the page ``content`` was downloaded from; relative
            stylesheet links are resolved against it.
        content: Raw HTML of the page.

    Returns:
        ``content`` with the stylesheet references pointing at local copies.
    """
    # The asset directory is named after the query string of the page URL.
    # URLs without a usable query string fall back to the host name so
    # that plain addresses do not fail with an IndexError.
    matches = re.findall(r'/[^\?]*\?([^/|^\?]*)$', url)
    if matches and matches[0]:
        fileName = matches[0]
    else:
        fileName = urlparse.urlparse(url).netloc.replace(":", "_") or "index"
    print(fileName)

    soup = BeautifulSoup(content)
    done = set()
    for css in soup.findAll('link', attrs={'type': 'text/css'}):
        # Read the attribute from the parsed tag; matching a regex against
        # the serialized tag depends on attribute order and trips over the
        # "/" inside type="text/css".
        href = css.get('href')
        if not href or href in done:
            continue
        done.add(href)

        # Resolve against the page URL instead of a fixed host.
        cssurl = urlparse.urljoin(url, href)
        stem = os.path.splitext(
            os.path.basename(urlparse.urlparse(cssurl).path))[0]
        if not stem:
            continue

        csscontent = urllib2.urlopen(cssurl).read()
        if not os.path.isdir(fileName):
            os.mkdir(fileName)
        with open(os.path.join(fileName, stem + ".css"), 'wb') as cssfile:
            cssfile.write(csscontent)

        # The reference written into the HTML always uses "/" as separator.
        content = content.replace(href, fileName + "/" + stem + ".css")
    return content

def writefileName(url, content):
    """Download the images used by a page and rewrite their references.

    Every ``<img>`` element that carries a ``src`` is fetched and stored
    as ``<name>/<basename>``, where ``<name>`` is derived from the page URL
    and ``<basename>`` is the last path segment of the image URL.  Each
    occurrence of the original ``src`` text in ``content`` is replaced by
    that relative path.

    Args:
        url: Address of the page ``content`` was downloaded from; relative
            image links are resolved against it.
        content: Raw HTML of the page.

    Returns:
        ``content`` with the image references pointing at local copies.
    """
    # The asset directory is named after the query string of the page URL.
    # URLs without a usable query string fall back to the host name so
    # that plain addresses do not fail with an IndexError.
    matches = re.findall(r'/[^\?]*\?([^/|^\?]*)$', url)
    if matches and matches[0]:
        fileName = matches[0]
    else:
        fileName = urlparse.urlparse(url).netloc.replace(":", "_") or "index"

    soup = BeautifulSoup(content)
    done = set()
    for img in soup.findAll('img'):
        # Read the attribute from the parsed tag rather than matching a
        # regex against the serialized tag.  Inline "data:" images have
        # nothing to download, and a repeated image is fetched only once.
        src = img.get('src')
        if not src or src.startswith('data:') or src in done:
            continue
        done.add(src)

        # Resolve against the page URL instead of a fixed host.
        imgUrl = urlparse.urljoin(url, src)
        # Keep the file name, extension included, exactly as the URL path
        # gives it, so any image type is handled.
        imgName = os.path.basename(urlparse.urlparse(imgUrl).path)
        if not imgName:
            continue

        imgContent = urllib2.urlopen(imgUrl).read()
        if not os.path.isdir(fileName):
            os.mkdir(fileName)
        # Images are binary data: text mode would corrupt them on platforms
        # that translate line endings.
        with open(os.path.join(fileName, imgName), 'wb') as imgfile:
            imgfile.write(imgContent)

        # The reference written into the HTML always uses "/" as separator.
        content = content.replace(src, fileName + "/" + imgName)
    return content

# Script entry point: download the page named by the module-level `url`.
# This is a top-level statement, so it runs whenever the file is executed
# or imported.
getContent(url)
内容来自用户分享和网络整理,不保证内容的准确性;如有侵权内容,可联系管理员处理。点击这里给我发消息
标签: