# -*- coding: utf-8 -*-
"""
Created on Thu August 28 10:55:43 2018
This script finds duplicate files in one or more folders by comparing
MD5 content hashes.
@author: Nuri Özbey
Email: nuriozbey@gmail.com
"""
import datetime
import hashlib
import os
import sys

# Paths that could not be opened while hashing are collected here
# and reported at the end of the run.
global_list = []
def main():
    if len(sys.argv) > 1:
        dupList = {}
        folders = sys.argv[1:]
        for subfold in folders:
            # Iterate over the folders given on the command line
            if os.path.exists(subfold):
                # Find the duplicated files and merge them into dupList
                addDicts(dupList, findDuplicate(subfold))
            else:
                print('%s is not a valid path, please verify' % subfold)
                sys.exit()
        # Timestamped log file name, e.g. log-20180828-105543.txt
        filename = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        with open('log-' + filename + '.txt', 'w', encoding='utf-8') as file_log:
            printResults(dupList, file_log)
            printNotOpenedFiles(global_list, file_log)
    else:
        print('Usage: python duplicateFileFinder.py Folder1 [Folder2 ...]')
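
# Example invocations (the folder names below are hypothetical):
#   python duplicateFileFinder.py ~/Downloads
#   python duplicateFileFinder.py ~/Pictures ~/Backup/Pictures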
def findDuplicate(parentFolder):
    # Duplicates in format {hash: [paths]}
    dupList = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the full path to the file
            path = os.path.join(dirName, filename)
            # Calculate the content hash; None means the file could not be read
            file_hash = hashfile(path)
            if file_hash is None:
                continue
            # Add or append the file path under its hash
            if file_hash in dupList:
                dupList[file_hash].append(path)
            else:
                dupList[file_hash] = [path]
    return dupList
# Merge the entries of dict2 into dict1
def addDicts(dict1, dict2):
    for key in dict2:
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]
def hashfile(path, blocksize=65536):
    # Hash the file in fixed-size chunks so large files never have to
    # fit in memory. MD5 is adequate for spotting identical content,
    # though it is not a cryptographically secure choice.
    try:
        hasher = hashlib.md5()
        with open(path, 'rb') as afile:
            buf = afile.read(blocksize)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(blocksize)
        return hasher.hexdigest()
    except IOError:
        print("Could not open file! ==> " + path)
        global_list.append(path)
        return None
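
# An optional alternative sketch (not part of the original gist): on
# Python 3.11+, hashlib.file_digest does the buffered reading itself,
# so the chunked loop above collapses to a couple of lines.
def hashfile_digest(path, algorithm='md5'):
    try:
        with open(path, 'rb') as afile:
            return hashlib.file_digest(afile, algorithm).hexdigest()
    except IOError:
        print("Could not open file! ==> " + path)
        global_list.append(path)
        return None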
def printResults(dict1, log):
    # Keep only hashes that map to more than one path
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    if len(results) > 0:
        print('Duplicates Found:')
        print('The following files are identical. The names may differ, but the content is identical.')
        print('___________________')
        log.write('Duplicates Found:\n')
        log.write('The following files are identical. The names may differ, but the content is identical.\n')
        log.write("_____________________________________________\n")
        for result in results:
            for subresult in result:
                print('\t%s' % subresult)
                # The log is opened with encoding='utf-8', so the path
                # is written as text rather than encoded to bytes.
                log.write('\t%s\n' % subresult)
            print('___________________')
            log.write("_____________________________________________\n")
    else:
        print('No duplicate files found.')
        log.write('No duplicate files found.\n')
def printNotOpenedFiles(files, log):
    if len(files) > 0:
        print("_____________________________________________")
        print("____ List of Not Opened Files ______")
        print("_____________________________________________")
        log.write("\n\n\n*********************************************\n")
        log.write("******* List of Not Opened Files *********\n")
        log.write("*********************************************\n**\n")
        for file in files:
            print('\t%s' % file)
            log.write('**\t%s\n' % file)
        print("_____________________________________________")
        log.write("**\n*********************************************\n")
    else:
        print('No unopened files found.')
        log.write('No unopened files found.\n')
if __name__ == '__main__':
    main()
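
A minimal sketch of driving the same functions programmatically instead of via the command line (assumes the gist is saved as duplicateFileFinder.py on the import path; the folder names are hypothetical — the `__main__` guard above is what makes the module safe to import):

import duplicateFileFinder as dff

dups = {}
for folder in ('photos', 'backup'):
    dff.addDicts(dups, dff.findDuplicate(folder))
with open('log-manual.txt', 'w', encoding='utf-8') as log:
    dff.printResults(dups, log)
    dff.printNotOpenedFiles(dff.global_list, log)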