Find duplicate files with Python

Posted in category «Tech» by fnv

I adapted a simple Python script from here to find duplicate files in directories. The script itself is self-explanatory. I only added a check on file size first, so that hashes are computed only for files of equal size, which saves computing resources. Hashing and comparing the content with the MD5 algorithm is enough for this case.

#!/usr/bin/env python3

# dupFinder.py
import os, sys
import hashlib


# Find duplicates
def findDup(parentFolder):
        # Dups in format {hash: [paths]}
        dups = {}
        sizes = {}
        for dirName, subdirs, fileList in os.walk(parentFolder):
                print('Scanning %s...' % dirName)
                for filename in fileList:
                        # Get the path to the file
                        path = os.path.join(dirName, filename)
                        if os.path.isfile(path):
                                fsize = os.stat(path).st_size
                                if fsize in sizes:
                                        sizes[fsize].append(path)
                                else:
                                        sizes[fsize] = [path]
        print('Calculating hashes...')
        # For same size calculate hashes
        for paths in sizes.values():
                if len(paths) > 1:
                        for path in paths:
                                file_hash = hashfile(path)
                                # Add or append the file path
                                if file_hash in dups:
                                        dups[file_hash].append(path)
                                else:
                                        dups[file_hash] = [path]
        return dups


# Merge dict2 into dict1 (modifies dict1 in place)
def joinDicts(dict1, dict2):
        for key in dict2.keys():
                if key in dict1:
                        dict1[key] = dict1[key] + dict2[key]
                else:
                        dict1[key] = dict2[key]


# Compute the MD5 hash of a file, reading it in blocks
def hashfile(path, blocksize=65536):
        hasher = hashlib.md5()
        with open(path, 'rb') as afile:
                buf = afile.read(blocksize)
                while len(buf) > 0:
                        hasher.update(buf)
                        buf = afile.read(blocksize)
        return hasher.hexdigest()


# Print output
def printResults(dict1):
        results = list(filter(lambda x: len(x) > 1, dict1.values()))
        if len(results) > 0:
                print('Duplicates Found:')
                print('The following files are identical. The names may differ, but the content is identical.')
                print('___________________')
                for result in results:
                        for subresult in result:
                                print('%s' % subresult)
                        print('___________________')

        else:
                print('No duplicate files found.')


# Main 
if __name__ == '__main__':
        if len(sys.argv) > 1:
                dups = {}
                folders = sys.argv[1:]
                # Iterate over the folders given
                for i in folders:
                        if os.path.exists(i):
                                # Find the duplicated files and append them to the dups
                                joinDicts(dups, findDup(i))
                        else:
                                print('%s is not a valid path, please verify' % i)
                                sys.exit()
                printResults(dups)
        else:
                print('Usage: python dupFinder.py folder or python dupFinder.py folder1 folder2')
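
The script is meant to be run from the command line, but the functions can also be reused from another script. Below is a minimal sketch of that, assuming dupFinder.py sits next to the calling script so it can be imported; the folder paths are purely illustrative.

# reuse_example.py - illustrative sketch, assumes dupFinder.py is importable
import dupFinder

folders = ['/home/user/Pictures', '/mnt/backup/Pictures']  # hypothetical paths

dups = {}
for folder in folders:
        # findDup returns {hash: [paths]}; joinDicts merges it into dups in place
        dupFinder.joinDicts(dups, dupFinder.findDup(folder))

dupFinder.printResults(dups)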