python下载图片(3)
2011-10-19 19:29
330 查看
# -*- coding: utf-8 -*-
"""Small web-image scraper: fetch a page, find *.jpg references, download them.

Originally by metaphy, 2007-04-03, copyleft version 0.2 (Python 2).
Modernized to Python 3 (print(), urllib.request / urllib.parse, http.client)
with fixes: leaked HTTP connection in httpExists(), crash in gGetHtmlLink()
when the page fetch fails, uninitialized counter in gGetHttpAddrFatherAssign(),
bare except clauses narrowed, and no network side effect at import time.
"""
import http.client
import random
import re
import urllib.parse
import urllib.request


def httpExists(url):
    """Return True if a HEAD request for *url* answers 200.

    Follows a single 302 redirect recursively; any other status, a bad
    port number, or a network error yields False (with a diagnostic print).
    """
    host, path = urllib.parse.urlsplit(url)[1:3]
    if ':' in host:
        # Explicit port in the netloc -- validate before connecting.
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print('invalid port number %r' % (port,))
            return False
    else:
        port = None  # no port specified, use default port
    found = False
    try:
        connection = http.client.HTTPConnection(host, port=port)
        try:
            connection.request("HEAD", path)
            resp = connection.getresponse()
            if resp.status == 200:          # normal 'found' status
                found = True
            elif resp.status == 302:        # recurse on temporary redirect
                found = httpExists(
                    urllib.parse.urljoin(url, resp.getheader('location', '')))
            else:                           # everything else -> not found
                print("Status %d %s : %s" % (resp.status, resp.reason, url))
        finally:
            connection.close()  # fix: the original leaked the connection
    except Exception as e:
        print(e.__class__, e, url)
        found = False
    return found


def gGetHtmlLines(url):
    """Fetch *url* and return its body as a list of text lines.

    Returns None when *url* is None, unreachable, or the fetch fails.
    """
    if url is None:
        return
    if not httpExists(url):
        return
    try:
        with urllib.request.urlopen(url) as page:
            # Decode so the regex helpers can work on str; the charset is
            # assumed UTF-8-compatible -- undecodable bytes are dropped.
            # TODO confirm: the original target site served GB2312.
            return [line.decode('utf-8', 'ignore') for line in page]
    except Exception:
        print("gGetHtmlLines() error!")
        return


def gGetHtml(url):
    """Fetch *url* and return its whole body as one string (None on failure)."""
    if url is None:
        return
    if not httpExists(url):
        return
    try:
        with urllib.request.urlopen(url) as page:
            return page.read().decode('utf-8', 'ignore')
    except Exception:
        print("gGetHtml() error!")
        return


def gGetFileName(url):
    """Return the last path component of *url* (None and '' pass through)."""
    if url is None:
        return None
    if url == "":
        return ""
    return url.split("/")[-1]


def gRandFilename(type):
    """Return a random 32-char name ('A0B1...' style) ending in '.<type>'."""
    # NOTE: parameter kept as 'type' (shadows the builtin) to preserve the
    # original signature.
    parts = []
    for _ in range(16):
        parts.append(chr(random.randint(65, 90)))   # A-Z
        parts.append(chr(random.randint(48, 57)))   # 0-9
    return ''.join(parts) + '.' + type


def gGetAbslLink(url, link):
    """Resolve *link* as found on page *url* to an absolute address."""
    if url is None or link is None:
        return
    if url == '' or link == '':
        return url
    if link[0] == '/':                 # site-root relative
        return gGetHttpAddr(url) + link
    if link[:4] == 'http':             # already absolute
        return link
    if link[:2] == '..':               # parent-relative ('../...')
        return gGetHttpAddrFatherAssign(url, link)
    return gGetHttpAddrFather(url) + link   # sibling-relative


def gGetRegList(linesList, regx):
    """Search each line for *regx* (case-insensitive); return the unique
    captured groups in first-seen order.  Returns None for None input."""
    if linesList is None:
        return
    pattern = re.compile(regx, re.IGNORECASE)  # hoisted out of the loop
    rtnList = []
    for line in linesList:
        matchs = pattern.search(line)
        if matchs is not None:
            for foundStr in matchs.groups():
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList


def gDownloadWithFilename(url, savePath, file):
    """Download *url* to savePath + file (binary).  Prints on IO failure."""
    # Parameter checking intentionally omitted (as in the original).
    try:
        with urllib.request.urlopen(url) as fp:
            data = fp.read()
        with open(savePath + file, 'w+b') as out:
            out.write(data)
    except IOError:
        print("download error!" + url)


def gDownload(url, savePath):
    """Download *url* into *savePath*, naming the file after the URL tail."""
    fileName = gGetFileName(url)
    # fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)


def gDownloadHtmlJpg(downloadUrl, savePath):
    """Download every .jpg referenced by src= on the page *downloadUrl*."""
    lines = gGetHtmlLines(downloadUrl)
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx)
    if lists is None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)


def gGetHttpAddr(url):
    """Return scheme + host of *url*, e.g. 'http://example.com'."""
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]


def gGetHttpAddrFather(url):
    """Return the parent-directory URL of *url*, with a trailing '/'."""
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        # Re-append every path segment except the final file name.
        for seg in arr[3:len(arr) - 1]:
            addr += seg + '/'
    return addr


def gGetHttpAddrFatherAssign(url, link):
    """Resolve a '../'-style *link* against *url* and return the result."""
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    numOfFather = 0  # fix: was unbound when link contains no '..'
    partLink = ''
    for i, seg in enumerate(linkArray):
        if seg == '..':
            numOfFather = i + 1   # number of directory levels to strip
        else:
            partLink += '/' + seg
    # Keep the URL segments that survive stripping numOfFather levels
    # plus the trailing file name, re-joined with '/'.
    partUrl = '/'.join(urlArray[:len(urlArray) - 1 - numOfFather])
    return partUrl + partLink


def gGetHtmlLink(url):
    """Return the absolute .htm/.html links found on the page *url*."""
    # Parameter checking intentionally omitted (as in the original).
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r"""href="?(\S+)\.htm"""
    # fix: gGetRegList returns None when the fetch failed; guard the loop
    for link in (gGetRegList(lines, regx) or []):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print(link)
    return rtnList


def gDownloadAllJpg(url, savePath):
    """Download the jpgs on *url* and on every .htm page it links to."""
    gDownloadHtmlJpg(url, savePath)
    for link in gGetHtmlLink(url):
        gDownloadHtmlJpg(link, savePath)


def test():
    """Demo: scrape one news page's jpgs into d:/tmp/."""
    u = 'http://www.gs.xinhuanet.com/news/2007-01/31/content_9188207_1.htm'
    save = 'd:/tmp/'
    print('download pic from [' + u + ']')
    print('save to [' + save + '] ...')
    gDownloadHtmlJpg(u, save)
    print("download finished")


if __name__ == '__main__':  # fix: no network side effect at import time
    test()
相关文章推荐
- python:利用requests库下载图片
- python爬虫图片下载
- Python爬虫实战(五) :下载百度贴吧帖子里的所有图片
- python学习:urllib库学习:制作简易爬虫下载图片
- Python 爬虫5——爬取并下载网页指定规格的图片
- python爬虫:下载百度贴吧图片学习笔记
- python 百度贴吧爬虫(下载图片)
- python 通过google搜索图片并下载
- python下载网页图片(2)
- python3 网页爬虫图片下载无效链接处理 try except
- 【转】python下载网页图片代码
- Python 爬虫5——爬取并下载网页指定规格的图片
- Python爬虫之多线程下载豆瓣Top250电影图片
- 【python】写文件、复制文件、下载url链接图片
- python爬虫(五)图片下载爬虫
- 【Python】下载图片
- Python下载一张图片与有道词典
- Python轻松入门-38 从网站上下载图片
- 我的第一个python爬虫程序(从百度贴吧自动下载图片)
- Python实现从订阅源下载图片的方法