# alexfriant/patternEyes.py

## patternEyes.py
#####################################################################################
#
# This script will provide you a basic understanding of the alphanumeric patterns
# which exist in a list. You might get this list from a SQL query or something like
# that.
#
# INPUT: Give this script a file that has a single column of ID type strings.
# EXAMPLES:
#  (from windows command line):
#       > python patternEyes.py "c:\temp\id_list.txt"
#
#  (from Python Console in PyCharm):
#     >>> from patternEyes import *
#     >>> patternEyes(r"C:\LocalData\temp\parcels.csv")
#
# OUTPUT: A printed report in which digits are converted to "#" and alphabetic
# characters to "X". All punctuation stays as it exists.
#
# For example, if you want to see if all records are phone numbers, you might expect
# to see something like this:
#         ###-###-####
# But if you also see something like this, you know the data isn't as "clean" as
# you were hoping, requiring further investigation:
#         ##-XXX-######
# Spaces are now displayed as '·', so something with spaces might look like this:
#         ###·XXXX··
#
#####################################################################################

import re, os.path, sys
from collections import defaultdict
from pathlib import Path


def patternEyes( filePath = r'H:\IGS\GIS\GISADMIN\Parcel Tax System Update\Automation Integration\sql\PCSalesComp_research\all_docnum_values.txt'):
    """Print a ranked report of the character patterns found in a file.

    Every line of the file is split on commas. In each resulting value,
    spaces become '·', digits become '#' and the letters matched by
    ``[a-z]`` (case-insensitively) become 'X'; every other character is
    kept as it is. Empty values are tallied under 'No Data'. Patterns are
    printed most frequent first; patterns with equal counts keep the order
    in which they were first seen.

    Args:
        filePath: Path of the text file to analyse. When it does not point
            at an existing file, a short message is printed instead of a
            report.

    Returns:
        None. The report is written to standard output.
    """
    input_file = filePath

    # Guard clause: nothing to analyse when the path is not a regular file.
    if not os.path.isfile(input_file):
        print( "\nSorry, there is no file here: {}".format(input_file))
        return

    digit_re = re.compile(r'\d')
    alpha_re = re.compile(r'[a-z]', re.IGNORECASE)

    # 'with' guarantees the handle is closed even if reading raises.
    values = []
    with open(input_file, 'r') as source:
        for line in source:
            values.extend(line.strip('\n').split(','))

    pattern_counts = defaultdict(int)
    for value in values:
        # Order matters: mark spaces first, then digits, then letters.
        pattern = value.replace(' ', '·')
        pattern = digit_re.sub('#', pattern)
        pattern = alpha_re.sub('X', pattern)
        # An empty value has no pattern of its own; group those together.
        pattern_counts[pattern if pattern != '' else 'No Data'] += 1

    # sorted() is stable, so ties keep their first-seen order.
    ranked = sorted(pattern_counts.items(), key=lambda item: item[1], reverse=True)

    print("\nREPORT FOR: {}".format(Path(input_file).resolve()))
    print("\n{0:40} | {1:10}".format("PATTERN", "COUNT"))
    print("-"*50)
    for pattern, count in ranked:
        print("{0:40} | {1:10}".format(pattern, str(count)))

def main( inputs ):
    """Run patternEyes on the path given in an argv-style list.

    Args:
        inputs: Sequence shaped like ``sys.argv``. When it holds more than
            one item, the item at index 1 is used as the file path;
            otherwise patternEyes runs with its built-in default path.
    """
    # No path supplied: fall back to the default baked into patternEyes.
    if len(inputs) <= 1:
        patternEyes()
        return

    target = inputs[1]
    print('okay, gonna try and run this: ' + target)
    patternEyes(target)
# Script entry point: hand the raw command-line arguments to main().
if __name__ == "__main__":
    main(sys.argv)
	#####################################################################################
	#
	# This script will provide you a basic understanding of the alphanumeric patterns
	# which exist in a list. You might get this list from a SQL query or something like
	# that.
	#
	# INPUT: Give this script a file that has a single column of ID type strings.
	# EXAMPLES:
	# (from windows command line):
	# > python patternEyes.py "c:\temp\id_list.txt"
	#
	# (from Python Console in PyCharm):
	# >>> from patternEyes import *
	# >>> patternEyes(r"C:\LocalData\temp\parcels.csv")
	#
	# OUTPUT: Comes in the form of print statement converting numbers to "#" and alpha
	# characters to "X". All punctuation stays as it exists.
	#
	# For example, if you want to see if all records are phone numbers, you might expect
	# to see something like this:
	# ###-###-####
	# But if you also see something like this, you know the data isn't as "clean" as
	# you were hoping, requiring further investigation:
	# ##-XXX-######
	# Spaces are now displayed as '·', so something with spaces might look like this:
	# ###·XXXX··
	#
	#####################################################################################

	import re, os.path, sys
	from collections import defaultdict
	from pathlib import Path


	def patternEyes( filePath = r'H:\IGS\GIS\GISADMIN\Parcel Tax System Update\Automation Integration\sql\PCSalesComp_research\all_docnum_values.txt'):
	    """Print a ranked report of the character patterns found in a file.

	    Every line of the file is split on commas. In each resulting value,
	    spaces become '·', digits become '#' and the letters matched by
	    ``[a-z]`` (case-insensitively) become 'X'; every other character is
	    kept as it is. Empty values are tallied under 'No Data'. Patterns are
	    printed most frequent first; patterns with equal counts keep the order
	    in which they were first seen.

	    Args:
	        filePath: Path of the text file to analyse. When it does not point
	            at an existing file, a short message is printed instead of a
	            report.

	    Returns:
	        None. The report is written to standard output.
	    """
	    input_file = filePath

	    # Guard clause: nothing to analyse when the path is not a regular file.
	    if not os.path.isfile(input_file):
	        print( "\nSorry, there is no file here: {}".format(input_file))
	        return

	    digit_re = re.compile(r'\d')
	    alpha_re = re.compile(r'[a-z]', re.IGNORECASE)

	    # 'with' guarantees the handle is closed even if reading raises.
	    values = []
	    with open(input_file, 'r') as source:
	        for line in source:
	            values.extend(line.strip('\n').split(','))

	    pattern_counts = defaultdict(int)
	    for value in values:
	        # Order matters: mark spaces first, then digits, then letters.
	        pattern = value.replace(' ', '·')
	        pattern = digit_re.sub('#', pattern)
	        pattern = alpha_re.sub('X', pattern)
	        # An empty value has no pattern of its own; group those together.
	        pattern_counts[pattern if pattern != '' else 'No Data'] += 1

	    # sorted() is stable, so ties keep their first-seen order.
	    ranked = sorted(pattern_counts.items(), key=lambda item: item[1], reverse=True)

	    # The column separator is a plain '|'; no backslash belongs in the output.
	    print("\nREPORT FOR: {}".format(Path(input_file).resolve()))
	    print("\n{0:40} | {1:10}".format("PATTERN", "COUNT"))
	    print("-"*50)
	    for pattern, count in ranked:
	        print("{0:40} | {1:10}".format(pattern, str(count)))

	def main( inputs ):
	    """Run patternEyes on the path given in an argv-style list.

	    Args:
	        inputs: Sequence shaped like ``sys.argv``. When it holds more than
	            one item, the item at index 1 is used as the file path;
	            otherwise patternEyes runs with its built-in default path.
	    """
	    if len( inputs ) > 1:
	        print('okay, gonna try and run this: ' + inputs[1])
	        patternEyes( inputs[1] )
	    else:
	        # No path supplied: fall back to the default baked into patternEyes.
	        patternEyes()
	# Script entry point: hand the raw command-line arguments to main().
	if __name__ == "__main__":
	    main(sys.argv)
No results found