Skip to content

Instantly share code, notes, and snippets.

@sergioloppe
Last active September 11, 2023 20:21
Show Gist options
  • Select an option

  • Save sergioloppe/d1fa6c987b0123ca1edabdc67d5049e4 to your computer and use it in GitHub Desktop.

Select an option

Save sergioloppe/d1fa6c987b0123ca1edabdc67d5049e4 to your computer and use it in GitHub Desktop.
Pairwise comparison for personal data
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# Dependencies
# pip install mmh3 textdistance
import random
import mmh3
import textdistance
####
# Define the main algorithm
####
def jaro_winkler_distance(R1, R2):
    """Score two strings with the ``textdistance`` Jaro-Winkler algorithm.

    NOTE(review): despite the "distance" in the name, ``compare`` uses the
    result as a similarity (higher means more alike, checked with ``>=``
    against a threshold) -- confirm against the textdistance documentation.

    Args:
        R1: First string.
        R2: Second string.

    Returns:
        Whatever ``textdistance.jaro_winkler(R1, R2)`` returns.
    """
    score = textdistance.jaro_winkler(R1, R2)
    return score
def levenshtein_distance(R1, R2):
    """Measure two strings with the ``textdistance`` Levenshtein algorithm.

    Args:
        R1: First string.
        R2: Second string.

    Returns:
        Whatever ``textdistance.levenshtein(R1, R2)`` returns; the caller
        ``levenshtein_probability`` treats it as an edit count that it
        divides by the longer string's length.
    """
    edit_count = textdistance.levenshtein(R1, R2)
    return edit_count
def levenshtein_probability(R1, R2):
    """Turn the Levenshtein distance of two strings into a similarity score.

    The distance is divided by the length of the longer string and the
    quotient is subtracted from 1, so identical strings score 1.0 and the
    score falls as more edits are needed.

    Args:
        R1: First string.
        R2: Second string.

    Returns:
        ``1 - distance / max(len(R1), len(R2))``, or ``1.0`` when both
        strings are empty.
    """
    max_len = max(len(R1), len(R2))
    if max_len == 0:
        # Two empty strings are identical; without this guard the division
        # below would raise ZeroDivisionError.
        return 1.0
    D = levenshtein_distance(R1, R2)
    return 1 - D / max_len
def normalized_weighted_average(values, weights):
    """Return the weighted mean ``sum(w_i * v_i) / sum(w_i)``.

    Args:
        values: Sequence of numbers to average.
        weights: Sequence of numeric weights, one per value.

    Returns:
        The weighted mean of ``values``.

    Raises:
        ValueError: If ``values`` and ``weights`` differ in length. Extra
            weights would otherwise still be counted in the denominator
            and silently skew the result.
        ZeroDivisionError: If the weights sum to zero.
    """
    if len(values) != len(weights):
        raise ValueError(
            f"values and weights must have the same length "
            f"(got {len(values)} and {len(weights)})"
        )
    weighted_sum = sum(weight * value for weight, value in zip(weights, values))
    return weighted_sum / sum(weights)
def compute_normalized_weighted_average(x, y, distance_function, weights):
    """Score two records field by field and combine the scores by weight.

    Args:
        x: First record, a sequence of field values.
        y: Second record; it is indexed at every position of ``x``.
        distance_function: Callable taking two field values and returning
            a number.
        weights: Per-field weights passed on to
            ``normalized_weighted_average``.

    Returns:
        The weighted average of the per-field scores.
    """
    per_field_scores = []
    for position, left_field in enumerate(x):
        # Index y by position (not zip) so a shorter y still raises IndexError.
        per_field_scores.append(distance_function(left_field, y[position]))
    return normalized_weighted_average(per_field_scores, weights)
def compare(x, y, w):
    """Blend the Jaro-Winkler and Levenshtein scores of two records.

    Each metric is first averaged over the record fields using the
    per-field weights ``w``; the two results are then combined with fixed
    weights of 0.6 (Jaro-Winkler) and 0.4 (Levenshtein).

    Args:
        x: First record, a sequence of field strings.
        y: Second record, a sequence of field strings.
        w: Per-field weights.

    Returns:
        The blended score of the two records.
    """
    metric_scores = [
        compute_normalized_weighted_average(x, y, metric, w)
        for metric in (jaro_winkler_distance, levenshtein_probability)
    ]
    return normalized_weighted_average(metric_scores, [0.6, 0.4])
def identify_duplicates(x, Y, w, threshold):
    """Collect the records of ``Y`` that score at least ``threshold`` against ``x``.

    Args:
        x: Reference record.
        Y: Iterable of candidate records.
        w: Per-field weights forwarded to ``compare``.
        threshold: Minimum ``compare`` score for a candidate to be kept.

    Returns:
        A list of the matching candidates, in their original order.
    """
    matches = []
    for candidate in Y:
        if compare(x, candidate, w) >= threshold:
            matches.append(candidate)
    return matches
####
# Create a Random Dataset
####
def generate_dataset(n, x):
    """Build a random list of person records that starts with ``x``.

    Args:
        n: Number of random records to generate after the reference one.
        x: Reference record, e.g.
            ``["Johannes", "Dough", "uk", "1990-01-01"]``. It becomes the
            first element of the dataset (the same object, not a copy).

    Returns:
        A tuple ``(x, Y)`` where ``Y`` is a list of ``n + 1`` records.
        Every generated record is
        ``[first_name, last_name, nationality, date_of_birth]`` with a
        birth date of January 1st in a year from 1980 to 1999.
    """
    given_name_pool = ["John", "Jon", "Jonathan", "Johnny", "Jonny", "Johannes", "Juan", "Joan", "Jean", "Giovanni"]
    family_name_pool = ["Doe", "Dow", "Dough", "Doh", "Do", "Dou", "Doww", "Dowe", "Dohh", "Doughh"]
    nationality_pool = ["us", "ca", "uk", "au", "de", "fr", "es", "it", "du", "pt"]
    birth_year_pool = list(map(str, range(1980, 2000)))

    records = [x]
    # The four draws stay in the order name, surname, nationality, year so
    # that a seeded random generator yields the same records as before.
    records.extend(
        [
            random.choice(given_name_pool),
            random.choice(family_name_pool),
            random.choice(nationality_pool),
            f"{random.choice(birth_year_pool)}-01-01",
        ]
        for _ in range(n)
    )
    return x, records
####
# Testing the thing
####
# Per-field weights (first name, last name, nationality, date of birth)
# and the minimum blended score for a record to count as a duplicate.
w = [0.25, 0.25, 0.1, 0.4]
th = 0.97
# Reference record plus 10000 randomly generated ones; Y[0] is x itself.
x = ["Johannes", "Dough", "uk", "1990-01-01"]
x, Y = generate_dataset(10000, x)
print(x)
print(Y[:5])  # Print the first 5 records
print("-----")
# Bind and print the matches: a bare call expression shows nothing when
# this file is run as a script.
duplicates = identify_duplicates(x, Y, w, th)
print(duplicates)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment