Duplicate script (duplicate.py)
#!/usr/bin/env python3
# Removes duplicate files in the given directory (and its subdirectories).
# Example:
#   python duplicate.py <directory>
# Prints the number of files removed.
import os
import sys
import hashlib
from bisect import bisect_left

files = set()            # MD5 digests of every file seen so far
duplicates_count = 0


def md5_for_file(path, block_size=8192):
    """Return the MD5 hex digest of the file at `path`, read in block_size chunks."""
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(block_size), b''):
            md5.update(chunk)
    return md5.hexdigest()

def bi_contains(lst, item):
    """Efficient `item in lst` membership test for sorted lists.

    If item is larger than the last element it is not in the list, but bisect
    would return len(lst) as the insertion index, so check that first.
    Otherwise, if the item is in the list it has to be at index
    bisect_left(lst, item).
    """
    return item <= lst[-1] and lst[bisect_left(lst, item)] == item

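# Illustration only (not part of the original script): bi_contains expects a
# sorted list, for example
#   hashes = sorted(files)              # hex digests as strings
#   bi_contains(hashes, some_digest)    # True if some_digest was already seen
# The script keeps `files` as a set, so this helper is never actually called.
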
def analysedir(path):
    """Walk `path` recursively and delete every file whose MD5 was already seen."""
    global duplicates_count
    for entry in os.listdir(path):
        fullpath = os.path.join(path, entry)
        if os.path.isfile(fullpath):
            hashmd5 = md5_for_file(fullpath)
            if hashmd5 in files:
                # Same content as a file seen earlier: remove this copy.
                duplicates_count += 1
                os.remove(fullpath)
            else:
                files.add(hashmd5)
        else:
            # Not a regular file: assume a directory and recurse into it.
            analysedir(fullpath)

if __name__ == "__main__":
    analysedir(sys.argv[1])
    print("finished. removed", duplicates_count, "duplicates")