您的位置:首页 > 编程语言 > Python开发

无比强大!Python抓取cssmoban站点的模版并下载

2017-08-06 08:07 295 查看
用 Python 抓取 http://www.cssmoban.com/cssthemes 站点上的模板,并将其下载到本地的 template 目录。

实现代码

# -*- coding: utf-8 -*-
import urlparse
import urllib2
import re
import os
import os.path

# Listing page of the template gallery; crawling starts from this address and
# every relative link found on the site is resolved against it.
URL='http://www.cssmoban.com/cssthemes'

# Global timeout setting: default timeout, in seconds, for sockets created
# after this call (reached through the socket module exposed by urllib2).
urllib2.socket.setdefaulttimeout(500)

def getUrlContent(url):
    """Fetch *url* and return the response body.

    The body is returned exactly as read from the response; no decoding is
    applied.  Any error raised by ``urllib2.urlopen`` or by reading the
    response propagates to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Always release the connection, even when the read fails part-way.
        response.close()

def getAllUrl(html):
    """Return every template-detail anchor found in *html*.

    An anchor is matched when its opening tag consists of nothing but
    ``href="/cssthemes/<digits>.shtml"``; the whole element, from ``<a`` to
    the closing ``</a>``, is returned.  ``.`` does not match a newline, so
    the element must sit on a single line.

    Returns a list of strings in document order (empty when nothing matches).
    """
    return re.findall(r'<a\s+href="/cssthemes/\d+\.shtml">.*?</a>', html)

def getDownTitle(html):
    """Return the text of every ``<h1>...</h1>`` element in *html*.

    The first entry is used by the caller as the title of the downloaded
    file.  Only an attribute-less ``<h1>`` whose content sits on one line is
    matched (``.`` does not match a newline).

    Returns a list of strings in document order (empty when nothing matches).
    """
    return re.findall(r'<h1>(.*?)</h1>', html)

def getDownUrl(html):
    """Return every download-button anchor found in *html*.

    A download button is an ``<a>`` element whose opening tag carries
    ``class="button btn-down"``, for example::

        <a href="http://down.cssmoban.com/cssthemes1/cctp_17_jeans.zip"
           target="_blank" class="button btn-down" title="...">...</a>

    (written on a single line in the page).  The whole element is returned.

    Returns a list of strings in document order (empty when nothing matches).
    """
    # ``[^>]*`` keeps the class attribute inside the SAME opening tag.  A
    # plain ``<a.*?class=...`` would start at the first anchor on the line,
    # and the caller would then read the href of an unrelated link.
    return re.findall(r'<a\b[^>]*class="button btn-down".*?</a>', html)

def getNextUrl(html):
    """Return the "next page" (下一页) anchor(s) found in *html*.

    Each result is the complete ``<a ...>...下一页</a>`` element, from which
    the caller extracts the ``href``.

    Returns a list of strings in document order (empty when the page has no
    such link).
    """
    # ``(?:(?!<a\b).)*?`` forbids the match from running across the start of
    # another anchor.  Without it, a pager written on one line
    # (``<a ...>上一页</a><a ...>下一页</a>``) would match from the FIRST anchor
    # and the caller would follow the previous-page link instead.
    return re.findall(r'<a\b(?:(?!<a\b).)*?下一页</a>', html)

def download(title,url):
    """Download *url* and save it as ``template/<title><ext>``.

    title -- UTF-8 encoded byte string used as the file name (without
             extension).
    url   -- address of the file; its path's extension (e.g. ``.zip``) is
             appended to the file name.

    The ``template/`` directory is created when missing.  Network and file
    system errors propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        result = response.read()
    finally:
        response.close()

    if not os.path.exists("template/"):
        os.makedirs("template/")

    # The title is scraped from a web page: keep path separators out of it
    # so the file can neither escape nor miss the template/ directory.
    safeTitle = title.decode('utf-8').replace('/', '_').replace('\\', '_')

    # Take the extension from the URL's path only, so a query string or a
    # dot-less file name cannot leak host/path fragments into the name.
    extension = os.path.splitext(urlparse.urlparse(url).path)[1]

    newname = "template/" + safeTitle + extension
    with open(newname, "wb") as fileobj:
        fileobj.write(result)

def i(msg):
    """Write an info log entry.

    Appends *msg* followed by a newline to ``info.log`` in the current
    working directory, then echoes *msg* to standard output.
    """
    # ``with`` closes the file even when the write raises.
    with open('info.log', 'a') as fileobj:
        fileobj.write(msg + '\n')
    # Single-argument call form: prints the same text under Python 2's print
    # statement and Python 3's print function.
    print(msg)
def e(msg):
    """Write an error log entry.

    Appends *msg* followed by a newline to ``error.log`` in the current
    working directory, then echoes *msg* to standard output.
    """
    # ``with`` closes the file even when the write raises.
    with open('error.log', 'a') as fileobj:
        fileobj.write(msg + '\n')
    # Single-argument call form: prints the same text under Python 2's print
    # statement and Python 3's print function.
    print(msg)
def _extractHref(tag):
    """Return the value of the first ``href="..."`` attribute in *tag* ('' if none)."""
    found = re.search(r'href="([^"]*)"', tag)
    if found:
        return found.group(1)
    return ''


def _downloadTemplate(detailUrl):
    """Fetch one template detail page and download the archive it links to."""
    downHtml = getUrlContent(detailUrl)

    downTitleList = getDownTitle(downHtml)
    downTitle = downTitleList[0] if downTitleList else ''

    downUrlList = getDownUrl(downHtml)
    if not downUrlList:
        # Nothing to fetch: report it instead of trying to open an empty URL.
        e('地址:%s未找到下载链接,已跳过' %(detailUrl))
        return

    downUrl = _extractHref(downUrlList[0])
    i('開始下载:%s,文件名称:%s' %(downUrl,downTitle))
    download(downTitle,downUrl)
    i('%s下载完毕。保存文件名称:%s' %(downUrl,downTitle))


def main():
    """Walk the listing pages from URL onwards and download every template.

    For each listing page, every linked detail page is processed; a failure
    on one template is written to the error log and does not stop the crawl.
    The crawl ends when a page has no "next page" link or when that link
    leads to a page that was already visited.
    """
    # Links are resolved against URL + '/', so relative hrefs land under
    # /cssthemes/ while absolute paths and full URLs are honoured as given.
    base = URL + '/'

    pageUrl = URL
    visited = set([pageUrl])
    html = getUrlContent(pageUrl)
    i('開始下载:%s' %(URL))

    while True:
        # Process the current page BEFORE looking for the next one, so the
        # last page (which has no next link) is downloaded as well.
        for a in getAllUrl(html):
            downGotoUrl = urlparse.urljoin(base, _extractHref(a))
            try:
                _downloadTemplate(downGotoUrl)
            except Exception as err:
                # The exception must not be bound to the name ``e``: that
                # would replace the error-logging function of the same name.
                e('地址:%s下载失败,失败信息:' %(downGotoUrl))
                e(str(err))

        nextPage = getNextUrl(html)
        if not nextPage:
            e('地址:%s。未找到下一页,程序退出' %(pageUrl))
            break

        nextUrl = urlparse.urljoin(base, _extractHref(nextPage[0]))
        if nextUrl in visited:
            # A pager that points back to a visited page would loop forever.
            e('地址:%s已抓取过,程序退出' %(nextUrl))
            break
        visited.add(nextUrl)

        i('-----------------------------------------')
        i('运行下一页:%s' %(nextUrl))
        pageUrl = nextUrl
        html = getUrlContent(nextUrl)


if __name__ == '__main__':
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: