Created
September 3, 2018 10:50
-
-
Save nuriozbey/f6ab9b12263aa1e6b32b8c3f12622ae2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Thu August 28 10:55:43 2018 | |
| This script finds duplicate files (files with identical content) in one or more given folders. | |
| @author: Nuri Özbey | |
| Email: nuriozbey@gmail.com | |
| """ | |
| import datetime | |
| import os, sys | |
| import hashlib | |
# Paths of files that could not be opened/read while hashing.
# hashfile() appends to this list; main() passes it to printNotOpenedFiles().
global_list = []
def main():
    """Scan the folders given on the command line and report duplicate files.

    Every command-line argument is treated as a folder to scan. The merged
    results are printed to stdout and also written to a timestamped log file
    named ``log-YYYYmmdd-HHMMSS.txt`` in the current working directory,
    followed by the list of files that could not be read.

    With no arguments a usage message is printed. If any given path does not
    exist, an error is printed and the process exits with status 1.
    """
    folders = sys.argv[1:]
    if not folders:
        print('Usage: python %s Folder1 [Folder2 ...]'
              % os.path.basename(sys.argv[0]))
        return

    # Validate every path up front so a typo in a later argument does not
    # throw away the (potentially long) scan of the earlier folders.
    for subfold in folders:
        if not os.path.exists(subfold):
            print('%s is not a valid path, please verify' % subfold)
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)

    dupList = {}
    for subfold in folders:
        # Find the duplicated files and merge them into the overall result.
        addDicts(dupList, findDuplicate(subfold))

    filename = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    # UTF-8 with backslashreplace: file paths may contain characters that the
    # platform default encoding cannot represent; never fail while logging.
    # The context manager closes the log even if reporting raises.
    with open('log-' + filename + '.txt', 'w',
              encoding='utf-8', errors='backslashreplace') as file_log:
        printResults(dupList, file_log)
        printNotOpenedFiles(global_list, file_log)
def findDuplicate(parentFolder):
    """Group all files below ``parentFolder`` by the hash of their content.

    Walks the tree with ``os.walk``, printing each directory as it is
    scanned, and hashes every file with ``hashfile``.

    Args:
        parentFolder: root directory to scan recursively.

    Returns:
        A dict in the format ``{hash: [paths]}``. Entries with more than one
        path are duplicates of each other.
    """
    dupList = {}
    for dirName, _subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            # Calculate hash
            file_hash = hashfile(path)
            if file_hash is None:
                # hashfile() could not read the file. Without this check all
                # unreadable files would share the key None and be reported
                # as duplicates of each other.
                continue
            # Add or append the file path
            dupList.setdefault(file_hash, []).append(path)
    return dupList
def addDicts(dict1, dict2):
    """Merge ``dict2`` into ``dict1`` in place.

    Both dicts map a key to a list of values. For keys present in both, the
    values of ``dict2`` are appended after those of ``dict1``; other keys are
    added to ``dict1``.

    ``dict2`` and its lists are never modified, and ``dict1`` never ends up
    sharing a list object with ``dict2`` (each stored list is a new object),
    so later changes to one dict cannot leak into the other.

    Args:
        dict1: destination dict, updated in place.
        dict2: dict whose entries are merged into ``dict1``.
    """
    for key, values in dict2.items():
        # Concatenation always builds a fresh list, so nothing is aliased.
        dict1[key] = dict1.get(key, []) + list(values)
def hashfile(path, blocksize=65536):
    """Return the MD5 hex digest of the content of the file at ``path``.

    The file is read in chunks of ``blocksize`` bytes, so large files are
    never loaded into memory at once.

    Args:
        path: path of the file to hash.
        blocksize: number of bytes read per chunk.

    Returns:
        The hex digest string, or ``None`` if the file could not be opened
        or read. In that case a message is printed and ``path`` is appended
        to the module-level ``global_list``.
    """
    hasher = hashlib.md5()
    try:
        # The context manager closes the file even when a read fails midway.
        with open(path, 'rb') as afile:
            buf = afile.read(blocksize)
            while buf:
                hasher.update(buf)
                buf = afile.read(blocksize)
    except (IOError, OSError):
        print("Could not open file! ==> "+path)
        global_list.append(path)
        return None
    return hasher.hexdigest()
def printResults(dict1, log):
    """Report every group of duplicate files on stdout and in ``log``.

    Args:
        dict1: dict in the format ``{hash: [paths]}``; only entries with
            more than one path are reported.
        log: writable text stream (anything with a ``write(str)`` method)
            that receives the same report.
    """
    results = [paths for paths in dict1.values() if len(paths) > 1]
    if not results:
        print('No duplicate files found.')
        log.write('No duplicate files found.\n')
        return

    print('Duplicates Found:')
    print('The following files are identical. The name could differ, but the content is identical')
    print('___________________')
    log.write('Duplicates Found:\n')
    log.write('The following files are identical. The name could differ, but the content is identical\n')
    log.write("_____________________________________________\n")
    for result in results:
        for subresult in result:
            print('\t%s' % subresult)
            # Write the path as text. Formatting the result of
            # .encode('utf8') would put the bytes repr (b'...') in the log.
            log.write('\t%s\n' % subresult)
        # Separator after each group of identical files.
        print('___________________')
        log.write("_____________________________________________\n")
def printNotOpenedFiles(files, log):
    """Report the files that could not be opened on stdout and in ``log``.

    Args:
        files: sequence of file paths that could not be opened.
        log: writable text stream that receives the same report.
    """
    # Nothing to report: emit the short notice and stop.
    if len(files) == 0:
        print('no Not opened files found..!')
        log.write('no Not opened files found..!\n')
        return

    # Banner on stdout, then the matching banner in the log.
    for banner_line in (
        "_____________________________________________",
        "____ List of Not Opened Files ______",
        "_____________________________________________",
    ):
        print(banner_line)
    for log_chunk in (
        "\n\n\n*********************************************\n",
        "******* List of Not Opened Files *********\n",
        "*********************************************\n**\n",
    ):
        log.write(log_chunk)

    # One entry per unreadable file.
    for unreadable in files:
        print('\t%s' % unreadable)
        log.write('**\t%s\n' % unreadable)

    # Closing rule.
    print("_____________________________________________")
    log.write("**\n*********************************************\n")
# Run the scan only when executed as a script, so importing this module
# (e.g. to reuse hashfile or findDuplicate) has no side effects.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment