Created
December 12, 2009 19:26
-
-
Save varikin/255017 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import shutil | |
| import md5 | |
def _md5_of_file(path, chunk_size=65536):
    """Return the hex MD5 digest of the file at *path*.

    The file is read in ``chunk_size``-byte pieces so that large files are
    never held in memory all at once, and it is closed as soon as hashing
    finishes. MD5 is used only as a content fingerprint here, not for
    security.
    """
    # Function-scope import: ``hashlib`` replaces the ``md5`` module, which
    # is deprecated and no longer exists on Python 3.
    import hashlib

    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        # iter() with a sentinel stops at the empty read that signals EOF.
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def find_duplicate_files(dirs):
    """Find files with identical content under the given directories.

    Every directory in *dirs* is walked recursively with ``os.walk``;
    ``.svn`` directories are pruned from the walk. Files are grouped by the
    MD5 digest of their bytes.

    Args:
        dirs: Iterable of directory paths to scan.

    Returns:
        A list of lists. Each inner list holds the paths (joined onto the
        directory they were found in) of two or more files whose contents
        hash to the same digest. Files with unique content are omitted.

    Raises:
        IOError/OSError: If a file found during the walk cannot be opened
            or read.
    """
    paths_by_digest = {}
    for root in dirs:
        # Distinct local names: the original loop rebound ``dirs`` here,
        # shadowing the parameter that the outer loop is iterating.
        for current_dir, subdirs, filenames in os.walk(root):
            # Removing the entry in place stops os.walk from descending
            # into Subversion metadata directories.
            if '.svn' in subdirs:
                subdirs.remove('.svn')
            for name in filenames:
                path = os.path.join(current_dir, name)
                digest = _md5_of_file(path)
                paths_by_digest.setdefault(digest, []).append(path)

    # Only digests shared by more than one path represent duplicates.
    return [paths for paths in paths_by_digest.values() if len(paths) > 1]
if __name__ == '__main__':
    # Command-line entry point: report groups of duplicate files found under
    # the directories given as arguments.
    import sys

    # Every argument after the script name is a directory to scan.
    dirs = sys.argv[1:]
    duplicate_files = find_duplicate_files(dirs)
    for filename_sets in duplicate_files:
        # Single-argument print(...) calls produce the same output on
        # Python 2 and Python 3; the original print statements are a syntax
        # error on Python 3.
        print('Duplicate files:')
        for filename in filename_sets:
            # Two leading spaces: the original printed ' ' and the filename
            # separated by print's own single space.
            print('  ' + filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment