您的位置:首页 > 编程语言 > Python开发

python查找并删除相同文件-UNIQ File-script版本

2016-04-17 20:22 609 查看
今天用 wxPython 做了一个 GUI 程序,用于查找指定目录内的相同文件。主要原理是比较文件的 md5 值:计算前先找出大小相同的文件,然后只计算这些文件的 md5 值,而不是对所有文件都计算,从而大大减少了 md5 的计算量。程序还加入了多线程功能。

以下是其脚本版本(无需安装wxPython)

UNIQFile-script.py

# -*- coding: gbk -*-

'''
Author:@DoNotSpyOnMe
Blog: http://www.cnblogs.com/aaronhoo '''

import hashlib
import os
import threading

def getFileSize(filePath):
    """Return the size in bytes of the file at ``filePath``.

    Thin wrapper around ``os.path.getsize``; any error raised by that
    call (for example for a missing path) propagates to the caller.
    """
    return os.path.getsize(filePath)

def CalcMD5(filepath):
    """Return the hexadecimal MD5 digest of the file at ``filepath``.

    MD5 for ordinary files: the whole content is read into memory in a
    single call, so this is only appropriate for files that comfortably
    fit in memory. ``GetFileMd5`` hashes in chunks instead.
    """
    with open(filepath, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()
def GetFileMd5(filename):
    """Return the hexadecimal MD5 digest of ``filename``, hashing in chunks.

    MD5 for large files: the content is read 8 KiB at a time so the
    whole file never has to be held in memory.

    Returns ``None`` when ``filename`` is not an existing regular file.
    """
    if not os.path.isfile(filename):
        return None
    digest = hashlib.md5()
    # ``with`` guarantees the handle is closed even if a read fails.
    with open(filename, 'rb') as f:
        # iter() with a sentinel keeps calling read() until it returns b''.
        for chunk in iter(lambda: f.read(8 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest()

def GetAllFiles(directory):
    """Return the paths of all files under ``directory``, recursively.

    Each path is the walked directory joined with the file name using
    the platform's separator. The list is sorted by the length of the
    full path string, shortest first; the sort is stable, so paths of
    equal length keep the order in which ``os.walk`` produced them.
    """
    files = []
    for dirpath, _dirnames, filenames in os.walk(directory):
        for name in filenames:
            # os.path.join instead of a hard-coded '\\' so the result is
            # a valid path on every platform.
            files.append(os.path.join(dirpath, name))
    files.sort(key=len)  # order by path length
    return files

def findSameSizeFiles(files):
    """Group ``files`` by size and keep only sizes shared by 2+ files.

    Args:
        files: iterable of file paths; each is passed to ``getFileSize``.

    Returns:
        dict mapping a size in bytes to a single string holding the
        paths of that size joined with ``';'``, in input order. Sizes
        that occur only once are left out. An empty dict means no two
        files share a size.

    NOTE: callers split the joined string on ``';'``, so a path that
    itself contains ``';'`` cannot be recovered from the return value.
    """
    groups = {}
    for path in files:
        groups.setdefault(getFileSize(path), []).append(path)
    # Decide "more than one file" from the list length rather than by
    # searching the joined string for ';', which would misreport a lone
    # file whose own path contains ';'.
    return {size: ';'.join(paths)
            for size, paths in groups.items()
            if len(paths) > 1}

def findSameMD5Files(files):
    """Group ``files`` by MD5 digest and keep digests shared by 2+ files.

    A progress line is printed for every file before it is hashed with
    ``GetFileMd5``.

    Args:
        files: iterable of file paths.

    Returns:
        dict mapping a hex digest to a single string holding the paths
        with that digest joined with ``';'``, in input order. Digests
        that occur only once are left out.

    NOTE: callers split the joined string on ``';'``, so a path that
    itself contains ``';'`` cannot be recovered from the return value.
    """
    groups = {}
    for path in files:
        print('calculating the md5 value of file %s' % path)
        digest = GetFileMd5(path)
        if digest is None:
            # GetFileMd5 returns None for anything that is not a regular
            # file; such paths must not be grouped together as if they
            # were duplicates of each other.
            continue
        groups.setdefault(digest, []).append(path)
    return {digest: ';'.join(paths)
            for digest, paths in groups.items()
            if len(paths) > 1}

def removeSameFile(mydir):
    """Find duplicated files under ``mydir`` and delete the extra copies.

    Files are first grouped by size (``findSameSizeFiles``); only files
    sharing a size are hashed (``findSameMD5Files``). For every group of
    files with the same MD5, the first path in the group is kept and all
    the others are removed with ``os.remove``.

    Progress and the final report are printed; nothing is returned. Any
    exception is caught and printed, and the report accumulated so far
    is still printed, followed by 'Operation finished.'.
    """
    msg = ''
    msgUniq = 'Result:No file is removed since they are all uniq.'
    try:
        files = GetAllFiles(mydir)
        print('%s files found in directory %s\n' % (len(files), mydir))
        sizeGroups = findSameSizeFiles(files)
        if not sizeGroups:
            print(msgUniq)
            return

        # List the duplicated files first.
        duplicates = {}
        for joinedPaths in sizeGroups.values():
            sameMd5 = findSameMD5Files(joinedPaths.split(';'))
            for digest, paths in sameMd5.items():
                msg = msg + 'md5 %s: %s' % (digest, paths) + '\n'
                duplicates[digest] = paths
        if not duplicates:
            msg = msgUniq
            return
        msg = 'Duplicated files:\n' + msg + '\n'

        # Then remove the duplicated files, keeping the first of each group.
        removeCount = 0
        for paths in duplicates.values():
            for path in paths.split(';')[1:]:
                msg = msg + 'Removing file: %s' % path + '\n'
                os.remove(path)
                removeCount = removeCount + 1
        msg = msg + '%s files are removed.\n' % removeCount
    except Exception as e:
        print(e)
    finally:
        # Runs on every exit path, including the early returns above.
        print(msg + '\n' + 'Operation finished.')

def listSameFile(mydir):
    """Find duplicated files under ``mydir`` and print them; delete nothing.

    Files are first grouped by size (``findSameSizeFiles``); only files
    sharing a size are hashed (``findSameMD5Files``). Each group of
    files with the same MD5 is reported on one line.

    Progress and the final report are printed; nothing is returned. Any
    exception is caught and printed, and the report accumulated so far
    is still printed, followed by 'Operation finished.'.
    """
    msg = ''
    msgUniq = 'Result:All files are uniq.'
    try:
        files = GetAllFiles(mydir)
        print('%s files found in directory %s\n' % (len(files), mydir))
        sizeGroups = findSameSizeFiles(files)
        if not sizeGroups:
            print(msgUniq)
            return

        foundDuplicates = False
        for joinedPaths in sizeGroups.values():
            sameMd5 = findSameMD5Files(joinedPaths.split(';'))
            for digest, paths in sameMd5.items():
                foundDuplicates = True
                msg = msg + 'md5 %s: %s' % (digest, paths) + '\n'
        if foundDuplicates:
            msg = 'Duplicated files:\n' + msg
        else:
            msg = msgUniq
    except Exception as e:
        print(e)
    finally:
        # Runs on every exit path, including the early return above.
        print(msg + '\n' + 'Operation finished.')

if __name__ == "__main__":
    # Interactive entry point: repeatedly ask for an action ('f' = list
    # duplicates, 'r' = list and remove them, 'q' = quit) and, for 'f'
    # and 'r', for the directory to scan.

    # raw_input only exists on Python 2; on Python 3 input() has the
    # same "return the typed line as a string" behaviour.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    print('This program is designed for clearing the duplicated files and saving memory space.Select a directory and we will find or remove the duplicated files.')
    print('All rights are reserved by @DoNotSpyOnMe')
    print('\n')

    print("You have three options:")
    print("'f' for finding the duplicated files in the directory that you're required to enter later,or")
    print("'r' for finding and the removing the duplicated file,or")
    print("'q' to quit")
    while True:
        # Lower-case every answer, including re-prompted ones, so that
        # 'F', 'R' and 'Q' are accepted on any attempt.
        option = readLine('Please enter your option:\n').lower()
        while option not in ('f', 'r', 'q'):
            option = readLine('Please enter your option:\n').lower()
        if option == 'q':
            # Same effect as exit(0) without depending on the site module.
            raise SystemExit(0)

        # The path is used exactly as typed: lower-casing it would point
        # at a different (or missing) directory on case-sensitive file
        # systems, and requiring a backslash would reject every valid
        # directory on platforms that do not use '\\' as the separator.
        mydir = readLine('Please enter the direcotry containing files:\n')
        while not os.path.isdir(mydir):
            mydir = readLine('Please enter a valid direcotry containing files:\n')
        if option == 'f':
            listSameFile(mydir)
        else:
            removeSameFile(mydir)
        print('')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: