Script Duplicate

#!/usr/bin/python
# -*- coding: Latin-1 -*-

# Removes duplicate files in the specified directory (and its subdirectories)
# Example:
#   python duplicate.py <directory>
# Prints the number of files removed
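# Example run (directory name and output are hypothetical):
#   $ python duplicate.py Photos
#   finished. removed 3 duplicates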
import os, sys, hashlib
from bisect import bisect_left

files = set()         # MD5 hashes of all files seen so far
duplicates_count = 0  # number of duplicate files removed

def md5_for_file(path, block_size=8192):
	""" Compute the MD5 digest of a file, reading it in chunks of block_size bytes. """
	md5 = hashlib.md5()
	with open(path, 'rb') as f:
		for chunk in iter(lambda: f.read(block_size), b''):
			md5.update(chunk)
	return md5.hexdigest()

def bi_contains(lst, item):
	""" Efficient `item in lst` test for a sorted list, using binary search. """
	# If item is larger than the last element it's not in the list, but bisect
	# would return len(lst) as the insertion index, so check that first. Otherwise,
	# if the item is in the list it must be at index bisect_left(lst, item).
	return bool(lst) and (item <= lst[-1]) and (lst[bisect_left(lst, item)] == item)

def analysedir(path):
	""" Walk path recursively, removing every file whose content was already seen. """
	global duplicates_count
	for name in os.listdir(path):
		fullpath = os.path.join(path, name)
		if os.path.isfile(fullpath):
			hashmd5 = md5_for_file(fullpath)
			if hashmd5 in files:
				# Same content as an earlier file: delete this copy.
				duplicates_count += 1
				os.remove(fullpath)
			else:
				files.add(hashmd5)
		elif os.path.isdir(fullpath):
			analysedir(fullpath)


analysedir(sys.argv[1])
print("finished. removed %d duplicates" % duplicates_count)
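
Note that bi_contains is defined but never called by the script above: the hashes are stored in a set, where `in` is already fast. The helper only pays off if the digests are kept in a sorted list instead. A minimal sketch of that variant, with dummy digest values chosen purely for illustration:

# Hypothetical sketch: duplicate detection with a sorted list and bisect
# instead of a set. The digest strings below are placeholders, not real hashes.
from bisect import bisect_left, insort

def bi_contains(lst, item):
	""" Efficient `item in lst` test for a sorted list. """
	return bool(lst) and (item <= lst[-1]) and (lst[bisect_left(lst, item)] == item)

sorted_hashes = []
for digest in ["d41d8cd9", "9e107d9d", "d41d8cd9"]:
	if bi_contains(sorted_hashes, digest):
		print("duplicate digest: %s" % digest)
	else:
		insort(sorted_hashes, digest)  # insert while keeping the list sorted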