Pairwise comparison for personal data
# Dependencies
# pip install textdistance
import random

import textdistance

####
# Define the main algorithm
####

def jaro_winkler_distance(R1, R2):
    # Note: despite the name, textdistance.jaro_winkler returns a normalized
    # similarity in [0, 1] (1.0 = identical strings), which is what compare() needs.
    return textdistance.jaro_winkler(R1, R2)

def levenshtein_distance(R1, R2):
    return textdistance.levenshtein(R1, R2)

def levenshtein_probability(R1, R2):
    # Turn the edit distance into a similarity in [0, 1]: identical strings
    # score 1.0, completely different strings score 0.0.
    D = levenshtein_distance(R1, R2)
    max_len = max(len(R1), len(R2))
    if max_len == 0:
        return 1.0  # two empty strings are identical
    P = 1 - D / max_len
    return P
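
# Worked example (illustrative, not from the original gist):
# levenshtein_probability("Jon", "John") has D = 1 (one insertion) and
# max_len = 4, so P = 1 - 1/4 = 0.75.
assert levenshtein_probability("Jon", "John") == 0.75
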
def normalized_weighted_average(values, weights):
    # Weighted mean; dividing by sum(weights) means the weights need not sum to 1.
    return sum(weights[i] * values[i] for i in range(len(values))) / sum(weights)

def compute_normalized_weighted_average(x, y, distance_function, weights):
    # Score each field of the two records, then combine with the field weights.
    distances = [distance_function(x[i], y[i]) for i in range(len(x))]
    return normalized_weighted_average(distances, weights)
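
# Worked example (illustrative): normalized_weighted_average([1.0, 0.5], [3, 1])
# = (3 * 1.0 + 1 * 0.5) / (3 + 1) = 0.875.
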
def compare(x, y, w):
    # Blend the two field-weighted similarities, favouring Jaro-Winkler 60/40.
    jw_s1 = compute_normalized_weighted_average(x, y, jaro_winkler_distance, w)
    lv_s2 = compute_normalized_weighted_average(x, y, levenshtein_probability, w)
    s = normalized_weighted_average([jw_s1, lv_s2], [0.6, 0.4])
    # print(f'{s} <- {jw_s1} {lv_s2}')
    return s
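
# Example (illustrative): compare(["Jon", "Doe", "us", "1990-01-01"],
# ["John", "Doe", "us", "1990-01-01"], [0.25, 0.25, 0.1, 0.4]) scores close
# to 1.0: three of the four fields match exactly and the first names are a
# near-match under both measures.
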
def identify_duplicates(x, Y, w, threshold):
    # Keep every candidate whose blended similarity to x meets the threshold.
    duplicates = [y for y in Y if compare(x, y, w) >= threshold]
    return duplicates
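
# Usage sketch (illustrative values): every candidate in Y scoring at least
# `threshold` against x is returned. This is a linear scan, one compare() per
# candidate; for large datasets a blocking/indexing step is the usual way to
# cut the number of pairwise comparisons.
# dups = identify_duplicates(x, Y, [0.25, 0.25, 0.1, 0.4], 0.9)
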
####
# Create a Random Dataset
####

def generate_dataset(n, x):
    """
    Return the seed record x plus a list Y of n random records (Y includes x),
    e.g. x = ["Johannes", "Dough", "uk", "1990-01-01"]
    """
    first_names = ["John", "Jon", "Jonathan", "Johnny", "Jonny", "Johannes", "Juan", "Joan", "Jean", "Giovanni"]
    last_names = ["Doe", "Dow", "Dough", "Doh", "Do", "Dou", "Doww", "Dowe", "Dohh", "Doughh"]
    nationalities = ["us", "ca", "uk", "au", "de", "fr", "es", "it", "nl", "pt"]
    years = [str(year) for year in range(1980, 2000)]
    Y = [x]  # include x itself, so at least one exact duplicate exists
    for _ in range(n):
        first_name = random.choice(first_names)
        last_name = random.choice(last_names)
        nationality = random.choice(nationalities)
        year = random.choice(years)
        date_of_birth = f"{year}-01-01"
        Y.append([first_name, last_name, nationality, date_of_birth])
    return x, Y
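
# For reproducible test runs you may want to seed the RNG first (assumption:
# determinism helps when tuning weights/threshold; remove to get fresh data):
# random.seed(42)
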
####
# Testing the thing
####

# Field weights (first name, last name, nationality, date of birth) and threshold
w = [0.25, 0.25, 0.1, 0.4]
th = 0.97
# Generate 10000 random records (plus the seed record x itself)
x = ["Johannes", "Dough", "uk", "1990-01-01"]
x, Y = generate_dataset(10000, x)
print(x)
print(Y[:5])  # Print the first 5 records
print("-----")
print(identify_duplicates(x, Y, w, th))
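
# Sanity check (illustrative, not from the original gist): a near-duplicate of
# x should clear the 0.97 threshold, while an unrelated record should not.
near = ["Johanes", "Dough", "uk", "1990-01-01"]  # one-letter typo in the first name
far = ["Giovanni", "Doe", "us", "1985-01-01"]
print(compare(x, near, w))  # expected: above th
print(compare(x, far, w))   # expected: well below th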