# -*- coding: utf-8 -*-
"""
Created on Thu August 28 10:55:43 2018
This script finds duplicate files in one or more folders by comparing
MD5 content hashes.
@author: Nuri Özbey
Email: nuriozbey@gmail.com
"""
import datetime
import hashlib
import os
import sys

# Paths that could not be opened while hashing are collected here
# and reported at the end of the run.
global_list = []
def main():
    if len(sys.argv) > 1:
        dupList = {}
        folders = sys.argv[1:]
        for subfold in folders:
            # Iterate over the folders given on the command line
            if os.path.exists(subfold):
                # Find the duplicated files and merge them into dupList
                addDicts(dupList, findDuplicate(subfold))
            else:
                print('%s is not a valid path, please verify' % subfold)
                sys.exit()
        # Timestamped log file name, e.g. log-20180828-105543.txt
        filename = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        with open('log-' + filename + '.txt', 'w', encoding='utf-8') as file_log:
            printResults(dupList, file_log)
            printNotOpenedFiles(global_list, file_log)
    else:
        print('Usage: python duplicateFileFinder.py Folder1 [Folder2 ...]')
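
# Example invocations (the folder names below are hypothetical):
#   python duplicateFileFinder.py ~/Downloads
#   python duplicateFileFinder.py ~/Pictures ~/Backup/Pictures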
def findDuplicate(parentFolder):
    # Duplicates in format {hash: [paths]}
    dupList = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the full path to the file
            path = os.path.join(dirName, filename)
            # Calculate the content hash; None means the file could not be read
            file_hash = hashfile(path)
            if file_hash is None:
                continue
            # Add or append the file path under its hash
            if file_hash in dupList:
                dupList[file_hash].append(path)
            else:
                dupList[file_hash] = [path]
    return dupList
# Merge the entries of dict2 into dict1
def addDicts(dict1, dict2):
    for key in dict2:
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]
def hashfile(path, blocksize=65536):
    # Hash the file in fixed-size chunks so large files never have to
    # fit in memory. MD5 is adequate for spotting identical content,
    # though it is not a cryptographically secure choice.
    try:
        hasher = hashlib.md5()
        with open(path, 'rb') as afile:
            buf = afile.read(blocksize)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(blocksize)
        return hasher.hexdigest()
    except IOError:
        print("Could not open file! ==> " + path)
        global_list.append(path)
        return None
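
# An optional alternative sketch (not part of the original gist): on
# Python 3.11+, hashlib.file_digest does the buffered reading itself,
# so the chunked loop above collapses to a couple of lines.
def hashfile_digest(path, algorithm='md5'):
    try:
        with open(path, 'rb') as afile:
            return hashlib.file_digest(afile, algorithm).hexdigest()
    except IOError:
        print("Could not open file! ==> " + path)
        global_list.append(path)
        return None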
def printResults(dict1, log):
    # Keep only hashes that map to more than one path
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    if len(results) > 0:
        print('Duplicates Found:')
        print('The following files are identical. The names may differ, but the content is identical.')
        print('___________________')
        log.write('Duplicates Found:\n')
        log.write('The following files are identical. The names may differ, but the content is identical.\n')
        log.write("_____________________________________________\n")
        for result in results:
            for subresult in result:
                print('\t%s' % subresult)
                # The log is opened with encoding='utf-8', so the path
                # is written as text rather than encoded to bytes.
                log.write('\t%s\n' % subresult)
            print('___________________')
            log.write("_____________________________________________\n")
    else:
        print('No duplicate files found.')
        log.write('No duplicate files found.\n')
def printNotOpenedFiles(files, log):
    if len(files) > 0:
        print("_____________________________________________")
        print("____ List of Not Opened Files ______")
        print("_____________________________________________")
        log.write("\n\n\n*********************************************\n")
        log.write("******* List of Not Opened Files *********\n")
        log.write("*********************************************\n**\n")
        for file in files:
            print('\t%s' % file)
            log.write('**\t%s\n' % file)
        print("_____________________________________________")
        log.write("**\n*********************************************\n")
    else:
        print('No unopened files found.')
        log.write('No unopened files found.\n')
if __name__ == '__main__':
    main()
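
A minimal sketch of driving the same functions programmatically instead of via the command line (assumes the gist is saved as duplicateFileFinder.py on the import path; the folder names are hypothetical — the `__main__` guard above is what makes the module safe to import):

import duplicateFileFinder as dff

dups = {}
for folder in ('photos', 'backup'):
    dff.addDicts(dups, dff.findDuplicate(folder))
with open('log-manual.txt', 'w', encoding='utf-8') as log:
    dff.printResults(dups, log)
    dff.printNotOpenedFiles(dff.global_list, log)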