A small, composable JVM library for matching entities across two lists using a set of field names. Designed for Java/Kotlin backends, favors composition, data classes, and ADTs. No reflection required unless you opt into it.
-
Overview
-
Core API (Kotlin, Java-callable)
-
Strategies
- Exact
- Fuzzy
- Hybrid (blocking + inner)
- Optional: One-to-one global matching via Hungarian adapter
-
Similarities
-
Conflict policy
-
Field extractors
-
Examples (Kotlin + Java)
-
Gradle setup
-
Tests (sample)
-
Extensibility notes
You pass:
left: List<T>right: List<T>fields: Set<String>that name the properties to compareextractor: FieldExtractor<T>that knows how to read those fieldsconflictPolicy: ConflictPolicy
You choose a MatchingStrategy<T>:
ExactMatchStrategyFuzzyMatchStrategyHybridBlockingStrategyHungarianGlobalStrategy(optional, for global one-to-one)
Outputs are explicit ADTs:
MatchSet<T>withmatches,leftUnmatched,rightUnmatchedMatched<T>holds(left, right, score)Unmatched<T>holds the entity
package io.edgar.matching
typealias FieldName = String
data class Matched<T>(val left: T, val right: T, val score: Double)
data class Unmatched<T>(val entity: T)
data class MatchSet<T>(
val matches: List<Matched<T>>,
val leftUnmatched: List<Unmatched<T>>,
val rightUnmatched: List<Unmatched<T>>
)
fun interface FieldExtractor<T> {
fun get(entity: T, field: FieldName): Any?
}
data class ExtractorRegistry<T>(
val byField: Map<FieldName, (T) -> Any?>
) : FieldExtractor<T> {
override fun get(entity: T, field: FieldName): Any? = byField[field]?.invoke(entity)
}
sealed interface MatchingStrategy<T> {
fun match(
left: List<T>,
right: List<T>,
fields: Set<FieldName>,
extractor: FieldExtractor<T>,
conflictPolicy: ConflictPolicy
): MatchSet<T>
}
sealed interface ConflictPolicy {
data object First : ConflictPolicy
data object UniqueOnly : ConflictPolicy
data object All : ConflictPolicy
data object BestScore : ConflictPolicy
}package io.edgar.matching.strategy
import io.edgar.matching.*
class ExactMatchStrategy<T> : MatchingStrategy<T> {
private data class CompositeKey(val values: List<Any?>)
private fun keyOf(entity: T, fields: Set<FieldName>, extractor: FieldExtractor<T>) =
CompositeKey(fields.map { extractor.get(entity, it) })
override fun match(
left: List<T>,
right: List<T>,
fields: Set<FieldName>,
extractor: FieldExtractor<T>,
conflictPolicy: ConflictPolicy
): MatchSet<T> {
val index = right.groupBy { keyOf(it, fields, extractor) }.toMutableMap()
val usedRight = mutableSetOf<T>()
val matches = mutableListOf<Matched<T>>()
val leftUnmatched = mutableListOf<Unmatched<T>>()
for (l in left) {
val k = keyOf(l, fields, extractor)
val candidates = index[k].orEmpty().filter { it !in usedRight }
when (conflictPolicy) {
is ConflictPolicy.All -> {
if (candidates.isEmpty()) {
leftUnmatched += Unmatched(l)
} else {
for (r in candidates) {
usedRight += r
matches += Matched(l, r, 1.0)
}
}
}
is ConflictPolicy.UniqueOnly -> {
if (candidates.size == 1) {
val r = candidates.single()
usedRight += r
matches += Matched(l, r, 1.0)
} else {
leftUnmatched += Unmatched(l)
}
}
is ConflictPolicy.First, is ConflictPolicy.BestScore -> {
if (candidates.isEmpty()) {
leftUnmatched += Unmatched(l)
} else {
val r = candidates.first()
usedRight += r
matches += Matched(l, r, 1.0)
}
}
}
}
val rightUnmatched = right.filter { it !in usedRight }.map(::Unmatched)
return MatchSet(matches, leftUnmatched, rightUnmatched)
}
}package io.edgar.matching.strategy
import io.edgar.matching.*
fun interface Similarity<T> {
fun score(a: T, b: T, fields: Set<FieldName>, extractor: FieldExtractor<T>): Double
}
class DefaultStringSimilarity<T> : Similarity<T> {
private fun sim(a: String, b: String): Double {
if (a.isEmpty() && b.isEmpty()) return 1.0
val la = a.lowercase()
val lb = b.lowercase()
if (la == lb) return 1.0
val minLen = minOf(la.length, lb.length)
var common = 0
var i = 0
while (i < minLen && la[i] == lb[i]) {
common += 1
i += 1
}
return common.toDouble() / maxOf(la.length, lb.length)
}
override fun score(a: T, b: T, fields: Set<FieldName>, extractor: FieldExtractor<T>): Double {
if (fields.isEmpty()) return 0.0
val scores = fields.map {
val av = extractor.get(a, it)?.toString().orEmpty()
val bv = extractor.get(b, it)?.toString().orEmpty()
sim(av, bv)
}
return scores.average()
}
}
class FuzzyMatchStrategy<T>(
private val similarity: Similarity<T>,
private val threshold: Double = 0.9
) : MatchingStrategy<T> {
override fun match(
left: List<T>,
right: List<T>,
fields: Set<FieldName>,
extractor: FieldExtractor<T>,
conflictPolicy: ConflictPolicy
): MatchSet<T> {
val usedRight = mutableSetOf<T>()
val matches = mutableListOf<Matched<T>>()
val leftUnmatched = mutableListOf<Unmatched<T>>()
for (l in left) {
val scored = right.asSequence()
.filter { it !in usedRight }
.map { r -> r to similarity.score(l, r, fields, extractor) }
.filter { (_, s) -> s >= threshold }
.sortedByDescending { it.second }
.toList()
when (conflictPolicy) {
is ConflictPolicy.All -> {
if (scored.isEmpty()) leftUnmatched += Unmatched(l)
for ((r, s) in scored) {
usedRight += r
matches += Matched(l, r, s)
}
}
is ConflictPolicy.UniqueOnly -> {
if (scored.size == 1) {
val (r, s) = scored.single()
usedRight += r
matches += Matched(l, r, s)
} else {
leftUnmatched += Unmatched(l)
}
}
is ConflictPolicy.First, is ConflictPolicy.BestScore -> {
if (scored.isEmpty()) {
leftUnmatched += Unmatched(l)
} else {
val (r, s) = scored.first()
usedRight += r
matches += Matched(l, r, s)
}
}
}
}
val rightUnmatched = right.filter { it !in usedRight }.map(::Unmatched)
return MatchSet(matches, leftUnmatched, rightUnmatched)
}
}package io.edgar.matching.strategy
import io.edgar.matching.*
class HybridBlockingStrategy<T>(
private val blockingFields: Set<FieldName>,
private val inner: MatchingStrategy<T>
) : MatchingStrategy<T> {
private data class BlockKey(val values: List<Any?>)
private fun keyOf(entity: T, extractor: FieldExtractor<T>) =
BlockKey(blockingFields.map { extractor.get(entity, it) })
override fun match(
left: List<T>,
right: List<T>,
fields: Set<FieldName>,
extractor: FieldExtractor<T>,
conflictPolicy: ConflictPolicy
): MatchSet<T> {
val rightBlocks = right.groupBy { keyOf(it, extractor) }.toMutableMap()
val matches = mutableListOf<Matched<T>>()
val leftUnmatched = mutableListOf<Unmatched<T>>()
val usedRight = mutableSetOf<T>()
for (l in left) {
val block = rightBlocks[keyOf(l, extractor)].orEmpty().filter { it !in usedRight }
if (block.isEmpty()) {
leftUnmatched += Unmatched(l)
continue
}
val res = inner.match(listOf(l), block, fields, extractor, conflictPolicy)
matches += res.matches
usedRight += res.matches.map { it.right }
if (res.matches.isEmpty()) leftUnmatched += Unmatched(l)
}
val rightUnmatched = right.filter { it !in usedRight }.map(::Unmatched)
return MatchSet(matches, leftUnmatched, rightUnmatched)
}
}This adapter builds a score matrix from a Similarity<T>, converts to a cost matrix, and runs assignment to maximize total score. It respects a threshold by rejecting pairs below it.
package io.edgar.matching.strategy
import io.edgar.matching.*
import kotlin.math.max
class HungarianGlobalStrategy<T>(
private val similarity: Similarity<T>,
private val threshold: Double = 0.9
) : MatchingStrategy<T> {
override fun match(
left: List<T>,
right: List<T>,
fields: Set<FieldName>,
extractor: FieldExtractor<T>,
conflictPolicy: ConflictPolicy
): MatchSet<T> {
if (left.isEmpty() || right.isEmpty()) return MatchSet(emptyList(), left.map(::Unmatched), right.map(::Unmatched))
val n = left.size
val m = right.size
val size = max(n, m)
val scores = Array(size) { DoubleArray(size) { 0.0 } }
var i = 0
while (i < n) {
var j = 0
while (j < m) {
scores[i][j] = similarity.score(left[i], right[j], fields, extractor)
j += 1
}
i += 1
}
val cost = Array(size) { DoubleArray(size) { 1.0 } }
i = 0
while (i < size) {
var j2 = 0
while (j2 < size) {
val s = if (i < n && j2 < m) scores[i][j2] else 0.0
cost[i][j2] = 1.0 - s
j2 += 1
}
i += 1
}
val assignment = hungarian(cost)
val matches = mutableListOf<Matched<T>>()
val leftUnmatched = mutableListOf<Unmatched<T>>()
val usedRight = mutableSetOf<Int>()
for ((li, rj) in assignment) {
if (li < n && rj < m) {
val s = scores[li][rj]
if (s >= threshold) {
matches += Matched(left[li], right[rj], s)
usedRight += rj
} else {
leftUnmatched += Unmatched(left[li])
}
}
}
val matchedLeftIdx = assignment.map { it.first }.toSet()
val extraLeft = (0 until n).filter { it !in matchedLeftIdx }.map { Unmatched(left[it]) }
val rightUnmatched = (0 until m).filter { it !in usedRight }.map { Unmatched(right[it]) }
return MatchSet(matches, leftUnmatched + extraLeft, rightUnmatched)
}
private fun hungarian(cost: Array<DoubleArray>): List<Pair<Int, Int>> {
val n = cost.size
val u = DoubleArray(n + 1)
val v = DoubleArray(n + 1)
val p = IntArray(n + 1)
val way = IntArray(n + 1)
var i = 1
while (i <= n) {
p[0] = i
var j0 = 0
val minv = DoubleArray(n + 1) { Double.POSITIVE_INFINITY }
val used = BooleanArray(n + 1)
do {
used[j0] = true
val i0 = p[j0]
var delta = Double.POSITIVE_INFINITY
var j1 = 0
var j = 1
while (j <= n) {
if (!used[j]) {
val cur = cost[i0 - 1][j - 1] - u[i0] - v[j]
if (cur < minv[j]) {
minv[j] = cur
way[j] = j0
}
if (minv[j] < delta) {
delta = minv[j]
j1 = j
}
}
j += 1
}
var j2 = 0
while (j2 <= n) {
if (used[j2]) {
u[p[j2]] += delta
v[j2] -= delta
} else {
minv[j2] -= delta
}
j2 += 1
}
j0 = j1
} while (p[j0] != 0)
do {
val j1b = way[j0]
p[j0] = p[j1b]
j0 = j1b
} while (j0 != 0)
i += 1
}
val result = mutableListOf<Pair<Int, Int>>()
var j = 1
while (j <= n) {
if (p[j] != 0) result += (p[j] - 1) to (j - 1)
j += 1
}
return result
}
}You can compose per-field metrics and average or weight them. A weighted example:
package io.edgar.matching.sim
import io.edgar.matching.FieldExtractor
import io.edgar.matching.FieldName
import io.edgar.matching.strategy.Similarity
data class WeightedSimilaritySpec(val weights: Map<FieldName, Double>)
class WeightedSimilarity<T>(
private val base: (String, String) -> Double,
private val spec: WeightedSimilaritySpec
) : Similarity<T> {
override fun score(a: T, b: T, fields: Set<FieldName>, extractor: FieldExtractor<T>): Double {
if (fields.isEmpty()) return 0.0
var sum = 0.0
var wsum = 0.0
for (f in fields) {
val w = spec.weights[f] ?: 1.0
val av = extractor.get(a, f)?.toString().orEmpty()
val bv = extractor.get(b, f)?.toString().orEmpty()
sum += w * base(av, bv)
wsum += w
}
return if (wsum == 0.0) 0.0 else sum / wsum
}
}You can plug in Jaro–Winkler, Levenshtein, Damerau, Jaccard, phonetic keys, numeric deltas, or TF-IDF cosine by swapping the (String, String) -> Double.
package io.edgar.matching.java
import io.edgar.matching.*
import io.edgar.matching.strategy.*
object EntityMatcherFacade {
@JvmStatic
fun <T> exact(
left: List<T>,
right: List<T>,
fields: Set<String>,
extractor: FieldExtractor<T>,
policy: ConflictPolicy
): MatchSet<T> = ExactMatchStrategy<T>().match(left, right, fields, extractor, policy)
@JvmStatic
fun <T> fuzzy(
left: List<T>,
right: List<T>,
fields: Set<String>,
extractor: FieldExtractor<T>,
threshold: Double,
policy: ConflictPolicy
): MatchSet<T> = FuzzyMatchStrategy(DefaultStringSimilarity(), threshold)
.match(left, right, fields, extractor, policy)
@JvmStatic
fun <T> hybridBlockingFuzzy(
left: List<T>,
right: List<T>,
blockingFields: Set<String>,
matchFields: Set<String>,
extractor: FieldExtractor<T>,
threshold: Double,
policy: ConflictPolicy
): MatchSet<T> = HybridBlockingStrategy(
blockingFields,
FuzzyMatchStrategy(DefaultStringSimilarity(), threshold)
).match(left, right, matchFields, extractor, policy)
@JvmStatic
fun <T> globalHungarian(
left: List<T>,
right: List<T>,
fields: Set<String>,
extractor: FieldExtractor<T>,
threshold: Double
): MatchSet<T> = HungarianGlobalStrategy(DefaultStringSimilarity(), threshold)
.match(left, right, fields, extractor, ConflictPolicy.BestScore)
}package demo
data class User(val id: String, val email: String, val fullName: String)import io.edgar.matching.*
import io.edgar.matching.strategy.*
import demo.User
val usersA = listOf(
User("1", "alice@ex.com", "Alice Smith"),
User("2", "bob@ex.com", "Bob Stone"),
User("3", "carol@ex.com", "Carol Ann")
)
val usersB = listOf(
User("1", "alice@ex.com", "Alice S."),
User("2", "bob@ex.com", "Robert Stone"),
User("X", "carol@ex.com", "Carol Ann")
)
val extractor = ExtractorRegistry<User>(
mapOf(
"id" to { u: User -> u.id },
"email" to { u: User -> u.email },
"fullName" to { u: User -> u.fullName }
)
)
val result = ExactMatchStrategy<User>().match(
left = usersA,
right = usersB,
fields = setOf("id"),
extractor = extractor,
conflictPolicy = ConflictPolicy.UniqueOnly
)import io.edgar.matching.*
import io.edgar.matching.strategy.*
import demo.User
val hybrid = HybridBlockingStrategy(
blockingFields = setOf("email"),
inner = FuzzyMatchStrategy<User>(DefaultStringSimilarity(), threshold = 0.85)
).match(
left = usersA,
right = usersB,
fields = setOf("fullName", "email"),
extractor = extractor,
conflictPolicy = ConflictPolicy.BestScore
)import io.edgar.matching.*
import io.edgar.matching.strategy.*
import demo.User
val global = HungarianGlobalStrategy<User>(
similarity = DefaultStringSimilarity(),
threshold = 0.85
).match(
left = usersA,
right = usersB,
fields = setOf("fullName", "email"),
extractor = extractor,
conflictPolicy = ConflictPolicy.BestScore
)import io.edgar.matching.*;
import io.edgar.matching.strategy.*;
import io.edgar.matching.java.EntityMatcherFacade;
import java.util.*;
public class Example {
public static final class User {
private final String id;
private final String email;
private final String fullName;
public User(String id, String email, String fullName) { this.id = id; this.email = email; this.fullName = fullName; }
public String getId() { return id; }
public String getEmail() { return email; }
public String getFullName() { return fullName; }
}
public static void main(String[] args) {
List<User> a = Arrays.asList(
new User("1","alice@ex.com","Alice Smith"),
new User("2","bob@ex.com","Bob Stone")
);
List<User> b = Arrays.asList(
new User("1","alice@ex.com","Alice S."),
new User("3","charlie@ex.com","Charlie Brown")
);
FieldExtractor<User> extractor = (entity, field) -> switch (field) {
case "id" -> entity.getId();
case "email" -> entity.getEmail();
case "fullName" -> entity.getFullName();
default -> null;
};
MatchSet<User> exact = EntityMatcherFacade.exact(
a, b, Set.of("id"), extractor, ConflictPolicy.UniqueOnly.INSTANCE
);
MatchSet<User> fuzzy = EntityMatcherFacade.fuzzy(
a, b, Set.of("fullName"), extractor, 0.85, ConflictPolicy.BestScore.INSTANCE
);
}
}// settings.gradle.kts
rootProject.name = "entity-matching-lib"// build.gradle.kts
plugins {
kotlin("jvm") version "2.0.0"
}
group = "io.edgar"
version = "0.1.0"
repositories {
mavenCentral()
}
dependencies {
testImplementation(kotlin("test"))
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
}
tasks.test {
useJUnitPlatform()
}package io.edgar.matching
import io.edgar.matching.strategy.*
import kotlin.test.Test
import kotlin.test.assertEquals
data class User(val id: String, val email: String, val fullName: String)
class MatchingTests {
private val extractor = ExtractorRegistry<User>(
mapOf(
"id" to { u: User -> u.id },
"email" to { u: User -> u.email },
"fullName" to { u: User -> u.fullName }
)
)
@Test
fun exactById() {
val a = listOf(
User("1", "alice@ex.com", "Alice Smith"),
User("2", "bob@ex.com", "Bob Stone")
)
val b = listOf(
User("1", "alice@ex.com", "Alice S."),
User("3", "charlie@ex.com", "Charlie Brown")
)
val res = ExactMatchStrategy<User>().match(
a, b, setOf("id"), extractor, ConflictPolicy.UniqueOnly
)
assertEquals(1, res.matches.size)
assertEquals(1, res.leftUnmatched.size)
assertEquals(1, res.rightUnmatched.size)
}
@Test
fun fuzzyOnName() {
val a = listOf(User("x", "a@ex.com", "Alice Smith"))
val b = listOf(User("y", "b@ex.com", "Alice S."))
val res = FuzzyMatchStrategy<User>(DefaultStringSimilarity(), 0.7).match(
a, b, setOf("fullName"), extractor, ConflictPolicy.BestScore
)
assertEquals(1, res.matches.size)
}
@Test
fun hybridBlockByEmail() {
val a = listOf(
User("1", "same@ex.com", "Alice Smith"),
User("2", "diff@ex.com", "Bob Stone")
)
val b = listOf(
User("x", "same@ex.com", "Alice S."),
User("y", "other@ex.com", "Bob Stone")
)
val res = HybridBlockingStrategy<User>(
blockingFields = setOf("email"),
inner = FuzzyMatchStrategy(DefaultStringSimilarity(), 0.7)
).match(
a, b, setOf("fullName"), extractor, ConflictPolicy.BestScore
)
assertEquals(1, res.matches.size)
}
}- Add field-aware similarities by composing
Similarity<T>with per-field functions or weights. - Add phonetic normalization or tokenization before similarity scoring.
- Use
HybridBlockingStrategywith multiple blocking layers. - Replace
DefaultStringSimilaritywith domain-specific functions for dates, numbers, and addresses. - Switch to
HungarianGlobalStrategywhen you need strict one-to-one mapping that maximizes the total score.
- Clean IDs available →
ExactMatchStrategy - IDs flaky, emails stable →
HybridBlockingStrategywithblockingFields = {"email"}andFuzzyMatchStrategyon names - Many near-duplicates →
HungarianGlobalStrategywith a good similarity and a confident threshold - Noisy text → Jaccard/phonetic layers, then fuzzy
If you want this split into actual src/ files, say the word and I’ll scaffold the project layout exactly as a Gradle module.