您的位置:首页 > 编程语言 > Python开发

{python·图片下载}

2011-07-03 09:01 381 查看
用 Python 写了一个脚本,用来下载 topit.me 上整个专辑中的图片:
import re
import os
from urllib import request
import threading
from time import sleep,ctime
from html import parser
import getpass
import time
__author__ = 'Dino'
class album:
    """Download every picture of one topit.me album.

    The album overview page is scanned for pagination links, each album page
    is scanned for item links, each item page is scanned for the picture URL,
    and the collected picture URLs are finally handed to
    ``tools.downjpgmutithread``.
    """

    albumid = ""

    def __init__(self, albumid):
        """Remember the id of the album to download (e.g. ``"296107"``)."""
        self.albumid = albumid

    @staticmethod
    def _feed_page(url, html_parser):
        """Fetch *url*, decode the body and feed it to *html_parser*."""
        # The context manager closes the connection even if reading fails.
        with request.urlopen(url) as web:
            charset = web.headers.get_content_charset() or "utf-8"
            # Decode the bytes properly: formatting them with "%s" would feed
            # the parser the "b'...'" repr with escaped non-ASCII characters.
            text = web.read().decode(charset, errors="replace")

        # HTMLParseError was removed from html.parser in Python 3.5. Looking
        # it up lazily keeps the tolerant behaviour on old interpreters
        # without raising AttributeError on new ones (an empty tuple in an
        # ``except`` clause matches nothing).
        parse_errors = getattr(parser, "HTMLParseError", ())
        try:
            # Start from a clean parser state so that an unterminated
            # construct in a previous document cannot swallow this one.
            html_parser.reset()
            html_parser.feed(text)
            html_parser.close()
        except parse_errors:
            print("parser error")

    # Download
    def downloadAlbum(self):
        """Collect the picture URLs of the whole album and download them.

        Side effects: sets ``self.WebUrl``, performs HTTP requests against
        topit.me, prints progress information and finally calls
        ``tools.downjpgmutithread`` with the collected picture URLs.
        """
        self.WebUrl = "http://topit.me/album/%s" % self.albumid
        rootUrl = "http://topit.me"
        pageRoot = "/album/%s?p=" % self.albumid

        pparser = parserPages(self.albumid)
        lparser = parserLinks(self.albumid)
        iparser = parserImageUrl()
        # Start from an empty result list, mirroring what is done for
        # ``lparser`` below, so earlier runs cannot leak into this one.
        iparser.filelist = []
        t = tools(self.albumid)

        # Step 1: the overview page tells us how many pages the album has.
        self._feed_page(self.WebUrl, pparser)
        pages = pparser.getpagelist()
        print("分页总数:", pages)

        # Step 2: visit every album page and every item linked from it.
        for page in range(1, pages + 1):
            lparser.filelist = []
            pageUrl = "%s%s%s" % (rootUrl, pageRoot, page)
            print("访问分页:", pageUrl)
            self._feed_page(pageUrl, lparser)

            # De-duplicate while keeping document order, so that the
            # numbering of the saved files is deterministic.
            imageurllist = list(dict.fromkeys(lparser.getfileurllist()))
            print(imageurllist)
            for url in imageurllist:
                itemUrl = "%s%s" % (rootUrl, url)
                print(itemUrl)
                self._feed_page(itemUrl, iparser)

        # Step 3: download everything once, after all pages were visited.
        # NOTE(review): the picture list accumulates over all pages, so
        # downloading per page would fetch earlier pictures again.
        imagelist = iparser.getfilelist()
        print(imagelist)
        t.downjpgmutithread(imagelist)

class tools:
    """Helpers that fetch picture URLs and store them on the local disk."""

    def __init__(self, albumid):
        """Remember the album id (kept under the attribute ``albumname``)."""
        self.albumname = albumid

    def downjpg(self, fileurl, filepath, FileName="default.jpg"):
        """Download *fileurl* and save it as ``filepath + FileName``.

        *filepath* is used as a plain string prefix, so it must already end
        with a path separator. Failures are reported on stdout and never
        raised, because this method runs as a thread target and one broken
        picture must not abort the other downloads.
        """
        try:
            # ``with`` closes the connection and the file even on errors.
            with request.urlopen(fileurl) as web:
                print("访问网络文件" + fileurl + "\n")
                jpg = web.read()

            print("保存文件" + filepath + FileName + "\n")
            with open(filepath + FileName, "wb") as File:
                File.write(jpg)
        except Exception as exc:
            # Best-effort boundary of the worker thread: say which URL
            # failed and why instead of printing a bare "error".
            print("error: %s: %s\n" % (fileurl, exc))

    def downjpgmutithread(self, filepathlist, dstdir=None):
        """Download all URLs in *filepathlist* concurrently, one thread each.

        The files are stored as ``1.jpg``, ``2.jpg``, ... (in list order) in
        a fresh sub-directory of *dstdir* named after the current local time
        (``YYYYmmddHHMMSS``).

        *dstdir* defaults to ``C:\\Users\\<user>\\Pictures\\topit_me\\``;
        pass another directory to store the pictures elsewhere. The method
        returns after every download thread has finished.
        """
        if dstdir is None:
            uname = getpass.getuser()  # name of the current user
            dstdir = "C:\\Users\\%s\\Pictures\\topit_me\\" % uname
        # ``downjpg`` concatenates directory and file name, hence the
        # trailing separator.
        tempdir = os.path.join(dstdir, time.strftime("%Y%m%d%H%M%S")) + os.sep
        # makedirs also creates a missing parent directory and tolerates a
        # second call within the same second.
        os.makedirs(tempdir, exist_ok=True)

        print("共有%d个文件需要下载" % len(filepathlist))
        for file in filepathlist:
            print(file)

        print("开始多线程下载")
        task_threads = [
            threading.Thread(
                target=self.downjpg,
                args=(file, tempdir, "%d.jpg" % count),
            )
            for count, file in enumerate(filepathlist, 1)
        ]
        for task in task_threads:
            task.start()
        for task in task_threads:
            task.join()  # wait until every download has finished
        print("线程结束")

class parserLinks(parser.HTMLParser):
    """Collect the ``href`` of every ``<a>`` that points to an album item.

    A link is collected when its ``href`` starts with
    ``/album/<albumid>/item``. The results are available through
    ``getfileurllist()`` or the ``filelist`` attribute.
    """

    def __init__(self, albumid):
        parser.HTMLParser.__init__(self)
        self.albumid = albumid
        # Per-instance result list: a class-level list would be shared (and
        # keep growing) across every parser instance.
        self.filelist = []
        # Escape the id so that it is always matched literally.
        self.p = re.compile('/album/%s/item/*' % re.escape(str(albumid)))

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` of matching ``<a>`` tags."""
        if tag != 'a':
            return
        for name, value in attrs:
            # ``value`` is None for a bare attribute such as ``<a href>``.
            if name == 'href' and value and self.p.match(value):
                self.filelist.append(value)

    def getfileurllist(self):
        """Return the list of collected item links, in document order."""
        return self.filelist

class parserPages(parser.HTMLParser):
    """Find the highest page number among the pagination links of an album.

    Pagination links are ``<a>`` tags whose ``href`` starts with
    ``/album/<albumid>?p=<number>``.
    """

    def __init__(self, albumid):
        parser.HTMLParser.__init__(self)
        # Per-instance list: a class-level list would be shared across all
        # parser instances and mix the page numbers of different albums.
        self.pagelist = []
        print('/album/%s[?]p=[0-9]+' % albumid)
        # Escape the id so that it is always matched literally.
        self.page = re.compile(
            '/album/%s[?]p=(?P<page>[0-9]+)' % re.escape(str(albumid))
        )

    def handle_starttag(self, tag, attrs):
        """Record the page number of every pagination link."""
        if tag != 'a':
            return
        for name, value in attrs:
            # ``value`` is None for a bare attribute such as ``<a href>``.
            if name == 'href' and value:
                m = self.page.match(value)
                if m:
                    self.pagelist.append(int(m.group("page")))

    def getpagelist(self):
        """Return the highest page number seen so far.

        Returns 1 when no pagination link was found.
        NOTE(review): this assumes an album without pagination links has
        exactly one page — confirm against the site markup.
        """
        if not self.pagelist:
            return 1
        return max(self.pagelist)

class parserImageUrl(parser.HTMLParser):
    """Collect the ``src`` of every ``<img>`` whose ``class`` is ``"img"``."""

    def __init__(self):
        parser.HTMLParser.__init__(self)
        # Per-instance result list: a class-level list would be shared (and
        # keep growing) across every parser instance.
        self.filelist = []

    def handle_starttag(self, tag, attrs):
        """Record ``src`` of ``<img class="img" ...>`` tags."""
        if tag != "img":
            return
        # Look the attributes up by name so the result does not depend on
        # whether ``src`` is written before or after ``class``.
        attributes = dict(attrs)
        src = attributes.get("src")
        if attributes.get("class") == "img" and src:
            self.filelist.append(src)

    def getfilelist(self):
        """Return the list of collected picture URLs, in document order."""
        return self.filelist

# Script entry point: build a downloader for album id "296107" and start the
# download immediately.
# NOTE(review): these statements also run when the file is imported; consider
# an `if __name__ == "__main__":` guard if it is ever used as a module.
topit_me=album("296107")
topit_me.downloadAlbum()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: