Skip to content

Instantly share code, notes, and snippets.

@alexfriant
Last active June 3, 2025 19:19
Show Gist options
  • Select an option

  • Save alexfriant/3d1bc1102d770361d21e66850a1146aa to your computer and use it in GitHub Desktop.

Select an option

Save alexfriant/3d1bc1102d770361d21e66850a1146aa to your computer and use it in GitHub Desktop.
This Python script will provide a visual summary of alphanumeric patterns which exist in a list of values
#####################################################################################
#
# This script will provide you a basic understanding of the alphanumeric patterns
# which exist in a list. You might get this list from a SQL query or something like
# that.
#
# INPUT: Give this script a file that has a single column of ID type strings.
# EXAMPLES:
# (from windows command line):
# > python patternEyes.py "c:\temp\id_list.txt"
#
# (from Python Console in PyCharm):
# >>> from patternEyes import *
# >>> patternEyes(r"C:\LocalData\temp\parcels.csv")
#
# OUTPUT: Printed to the console as a ranked list of patterns, with each digit shown
# as "#" and each letter shown as "X". Punctuation is left unchanged, except commas,
# which are treated as separators between values.
#
# For example, if you want to see if all records are phone numbers, you might expect
# to see something like this:
# ###-###-####
# But if you also see something like this, you know the data isn't as "clean" as
# you were hoping, requiring further investigation:
# ##-XXX-######
# Spaces are now displayed as '·', so something with spaces might look like this:
# ###·XXXX··
#
#####################################################################################
import re, os.path, sys
from collections import defaultdict
from pathlib import Path
def patternEyes( filePath = r'H:\IGS\GIS\GISADMIN\Parcel Tax System Update\Automation Integration\sql\PCSalesComp_research\all_docnum_values.txt'):
    """Print a ranked summary of the character patterns found in a file of values.

    Every line of the file is split on commas. In each resulting value a space
    becomes '·', a digit becomes '#' and a letter becomes 'X'; all other
    characters are kept as they are. Empty values are tallied under 'No Data'.
    The distinct patterns are then printed with their counts, most frequent
    first (patterns with equal counts keep the order in which they first
    appeared in the file).

    Args:
        filePath: Path of the text file to analyse. The default points at a
            machine-specific location, so callers normally pass their own path.

    Returns:
        None. The report, or a "no file here" message when ``filePath`` is not
        an existing file, is written to standard output.
    """
    input_file = filePath
    if not os.path.isfile(input_file):
        print("\nSorry, there is no file here: {}".format(input_file))
        return

    digit_re = re.compile(r'\d')
    alpha_re = re.compile(r'[a-z]', re.IGNORECASE)

    pattern_counts = defaultdict(int)
    # The context manager closes the file even if reading fails part-way.
    with open(input_file, 'r') as source:
        for line in source:
            # Only the newline is stripped: leading/trailing spaces are part
            # of the value and must show up in the pattern.
            for value in line.strip('\n').split(','):
                # Spaces first, so they are shown as '·' in the final pattern.
                pattern = value.replace(' ', '·')
                pattern = digit_re.sub('#', pattern)
                pattern = alpha_re.sub('X', pattern)
                pattern_counts[pattern if pattern != '' else 'No Data'] += 1

    # sorted() is stable, so ties stay in first-seen (dict insertion) order.
    pattern_rank = sorted(
        pattern_counts.items(), key=lambda item: item[1], reverse=True
    )

    print("\nREPORT FOR: {}".format(Path(input_file).resolve()))
    print("\n{0:40} | {1:10}".format("PATTERN", "COUNT"))
    print("-"*50)
    for pattern, count in pattern_rank:
        print("{0:40} | {1:10}".format(pattern, str(count)))
def main( inputs ):
    """Run patternEyes from an argv-style sequence.

    ``inputs[0]`` is ignored (it is the script name when ``sys.argv`` is
    passed). When a second element is present it is announced and used as the
    file path; otherwise patternEyes runs with its built-in default path.
    """
    if len(inputs) <= 1:
        # No path supplied: fall back to patternEyes' default argument.
        patternEyes()
        return

    target = inputs[1]
    print('okay, gonna try and run this: ' + target)
    patternEyes(target)
# Only run the report when this file is executed as a script, not when it is
# imported (e.g. ``from patternEyes import *`` in a console).
if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment