Find duplicate files with Python
I adapted a simple Python script from here to find duplicate files in directories. The script itself is self-explanatory. I have added a file-size check so that hashes are only computed later for files whose sizes collide, saving computer resources. Hashing and comparing the content with the MD5 hash algorithm is enough for this case.
#!/usr/bin/env python3
# dupFinder.py
import os, sys
import hashlib
# Find duplicates
def findDup(parentFolder):
    """Scan parentFolder recursively and return {md5_hash: [paths]}.

    Files with a unique size cannot be duplicates, so only files whose
    size collides with another file's size are actually hashed.
    """
    # Group every regular file under parentFolder by its size in bytes.
    size_groups = {}
    for dir_name, _subdirs, file_list in os.walk(parentFolder):
        print('Scanning %s...' % dir_name)
        for entry in file_list:
            full_path = os.path.join(dir_name, entry)
            if os.path.isfile(full_path):
                file_size = os.stat(full_path).st_size
                size_groups.setdefault(file_size, []).append(full_path)
    print('Calculating hashes...')
    # Hash only the size groups that contain more than one file.
    dups = {}
    for group in size_groups.values():
        if len(group) < 2:
            continue
        for full_path in group:
            dups.setdefault(hashfile(full_path), []).append(full_path)
    return dups
# Joins two dictionaries
def joinDicts(dict1, dict2):
    """Merge dict2 into dict1 in place, concatenating list values on key clash.

    Returns None; dict1 is mutated. Keys only present in dict2 are bound
    to dict2's own list objects (same aliasing as plain assignment).
    """
    for key, value in dict2.items():
        if key in dict1:
            dict1[key] = dict1[key] + value
        else:
            dict1[key] = value
# Compute hash
def hashfile(path, blocksize=65536):
    """Return the MD5 hex digest of the file at *path*.

    Reads the file in *blocksize*-byte chunks so large files are never
    loaded into memory at once.

    Fix: the original opened the file without a context manager, so the
    handle leaked if read()/update() raised; 'with' guarantees it is
    closed on any exit path.
    """
    hasher = hashlib.md5()
    with open(path, 'rb') as afile:
        # iter(callable, sentinel) calls read() until it returns b'' (EOF).
        for chunk in iter(lambda: afile.read(blocksize), b''):
            hasher.update(chunk)
    return hasher.hexdigest()
# Print output
def printResults(dict1):
    """Print every group of identical files from a {hash: [paths]} mapping.

    Groups with a single path are not duplicates and are skipped.
    """
    duplicate_groups = [paths for paths in dict1.values() if len(paths) > 1]
    if not duplicate_groups:
        print('No duplicate files found.')
        return
    print('Duplicates Found:')
    print('The following files are identical. The name could differ, but the content is identical')
    print('___________________')
    for group in duplicate_groups:
        for member in group:
            print('%s' % member)
        print('___________________')
# Main
if __name__ == '__main__':
    if len(sys.argv) > 1:
        # Every command-line argument is a folder to scan for duplicates.
        all_dups = {}
        for folder in sys.argv[1:]:
            # Guard clause: abort on the first path that does not exist.
            if not os.path.exists(folder):
                print('%s is not a valid path, please verify' % folder)
                sys.exit()
            # Merge this folder's duplicates into the running total.
            joinDicts(all_dups, findDup(folder))
        printResults(all_dups)
    else:
        print('Usage: python dupFinder.py folder or python dupFinder.py folder1 folder2')