Skip to content

Instantly share code, notes, and snippets.

@varikin
Created December 12, 2009 19:26
Show Gist options
  • Select an option

  • Save varikin/255017 to your computer and use it in GitHub Desktop.

Select an option

Save varikin/255017 to your computer and use it in GitHub Desktop.
import hashlib
import os
import shutil

# The legacy md5 module exists only on Python 2; the guard keeps this file
# importable on interpreters where it has been removed.
try:
    import md5
except ImportError:
    md5 = None
def find_duplicate_files(dirs):
    """Group files under the given directories by identical content.

    Every directory in *dirs* is walked recursively with ``os.walk``;
    subdirectories named ``.svn`` are pruned and never descended into.
    Each file is hashed with MD5 and files sharing a digest are reported
    together, so "duplicate" here means "same MD5 digest".

    Args:
        dirs: Iterable of directory paths to scan.

    Returns:
        A list of lists. Each inner list holds the paths (walked directory
        name joined with the file name) of two or more files that have the
        same digest. Files with unique content are omitted.

    Raises:
        IOError: If a file cannot be opened or read (``IOError`` is an
            alias of ``OSError`` on Python 3).
    """
    # Hash in fixed-size chunks so large files are never loaded whole.
    chunk_size = 1024 * 1024
    files_by_digest = {}
    for root in dirs:
        for dirname, subdirs, filenames in os.walk(root):
            # Removing the entry in place stops os.walk from entering .svn.
            if '.svn' in subdirs:
                subdirs.remove('.svn')
            for filename in filenames:
                path = os.path.join(dirname, filename)
                digest = hashlib.md5()
                # The context manager closes the handle even if a read fails.
                with open(path, 'rb') as stream:
                    for chunk in iter(lambda: stream.read(chunk_size), b''):
                        digest.update(chunk)
                files_by_digest.setdefault(digest.hexdigest(), []).append(path)
    # Only digests seen more than once correspond to duplicated content.
    return [paths for paths in files_by_digest.values() if len(paths) > 1]
if __name__ == '__main__':
    import sys

    # sys.argv[0] is the script name; every argument after it is a
    # directory to scan for duplicates.
    dirs = sys.argv[1:]
    duplicate_files = find_duplicate_files(dirs)
    for filename_sets in duplicate_files:
        # Single-argument print(...) produces the same output whether print
        # is a statement (Python 2) or a function (Python 3).
        print('Duplicate files:')
        for filename in filename_sets:
            print('  ' + filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment