#meta
Implements the most common similarity algorithms:
- Common Prefix Length,
- Common Suffix Length,
- Levenshtein Distance,
- Hamming Distance,
- Longest Common Subsequence,
- Cosine Similarity,
- Camberra Distance and
- Jaro-Winkler Distance.
simfn = simfn or {}
simfn.__index = simfn
-- Common Prefix Length
function simfn.cpl(s1, s2)
local l1, l2 = #s1, #s2
local lmax = l1 < l2 and l1 or l2
for i = 1, lmax do
if s1:byte(i) ~= s2:byte(i) then
return i - 1
end
end
return lmax
end
-- Common Suffix Length
function simfn.csl(s1, s2)
local l1, l2 = #s1, #s2
local lmax = l1 < l2 and l1 or l2
for i = 1, lmax do
if s1:byte(l1 - i + 1) ~= s2:byte(l2 - i + 1) then
return i - 1
end
end
return lmax
end
-- Levenshtein Distance
function simfn.ld(s1, s2)
local l1, l2 = #s1, #s2
if l1 == 0 then return l2 end
if l2 == 0 then return l1 end
local m = {} -- matrix
for i = 0, l1 do
m[i] = {}
m[i][0] = i
end
for j = 0, l2 do
m[0][j] = j
end
for i = 1, l1 do
for j = 1, l2 do
local cost = s1:byte(i) == s2:byte(j) and 0 or 1
m[i][j] = math.min(
m[i - 1][j ] + 1,
m[i ][j - 1] + 1,
m[i - 1][j - 1] + cost)
end
end
return m[l1][l2]
end
-- Hamming Distance
function simfn.hd(s1, s2)
local l1, l2 = #s1, #s2
if l1 ~= l2 then
return 1/0 -- positive infinity (math.huge)
end
local dist = 0
for i = 1, l1 do
if s1:byte(i) ~= s2:byte(i) then
dist = dist + 1
end
end
return dist
end
-- Longest Common Subsequence
function simfn.lcs(s1, s2)
local l1, l2 = #s1, #s2
local m = {} -- matrix
for i = 0, l1 do
m[i] = {}
m[i][0] = 0
end
for j = 0, l2 do
m[0][j] = 0
end
for i = 1, l1 do
for j = 1, l2 do
if s1:byte(i) == s2:byte(j) then
m[i][j] = m[i - 1][j - 1] + 1
else
m[i][j] = math.max(m[i - 1][j], m[i][j - 1])
end
end
end
return m[l1][l2]
end
-- Cosine Similarity
function simfn.cs(s1, s2)
local function cf(s)
local f = {}
for i = 1, #s do
local c = s:byte(i)
f[c] = (f[c] or 0) + 1
end
return f
end
local f1 = cf(s1)
local f2 = cf(s2)
local cs = {}
for c in pairs(f1) do
cs[c] = true
end
for c in pairs(f2) do
cs[c] = true
end
local dp, nm1, nm2 = 0, 0, 0
for c in pairs(cs) do
local f1 = f1[c] or 0
local f2 = f2[c] or 0
dp = dp + f1*f2
nm1 = nm1 + f1*f1
nm2 = nm2 + f2*f2
end
local mag = math.sqrt(nm1)*math.sqrt(nm2)
if mag == 0 then
return 0
end
return dp/mag
end
-- Camberra Distance
function simfn.cd(s1, s2)
local function cf(s)
local f = {}
for i = 1, #s do
local c = s:byte(i)
f[c] = (f[c] or 0) + 1
end
return f
end
local f1 = cf(s1)
local f2 = cf(s2)
local cs = {}
for c in pairs(f1) do
cs[c] = true
end
for c in pairs(f2) do
cs[c] = true
end
local sum = 0
for c in pairs(cs) do
local fr1 = f1[c] or 0
local fr2 = f2[c] or 0
local num = math.abs(fr1 - fr2)
local den = fr1 + fr2
if den > 0 then
sum = sum + num/den
end
end
return sum
end
-- Jaro-Winkler Distance
function simfn.jwd(s1, s2)
local l1, l2 = #s1, #s2
if l1 == 0 and l2 == 0 then
return 1.0
end
if l1 == 0 or l2 == 0 then
return 0.0
end
local md = math.max(l1, l2)//2 - 1
if md < 0 then
md = 0
end
local s1m = {}
local s2m = {}
local mc = 0
for i = 1, l1 do
local start = math.max(1, i - md)
local finish = math.min(i + md, l2)
for j = start, finish do
if not s2m[j] and s1:byte(i) == s2:byte(j) then
s1m[i] = true
s2m[j] = true
mc = mc + 1
break
end
end
end
if mc == 0 then
return 0.0
end
local tps = 0
local k = 1
for i = 1, l1 do
if s1m[i] then
while not s2m[k] do
k = k + 1
end
if s1:byte(i) ~= s2:byte(k) then
tps = tps + 1
end
k = k + 1
end
end
local jaro = (mc/l1 + mc/l2 + (mc - tps/2)/mc)/3
if jaro < 0.7 then
return jaro
end
local plen = 0
for i = 1, math.min(l1, l2, 4) do
if s1:byte(i) == s2:byte(i) then
plen = plen + 1
else
break
end
end
return jaro + 0.1*plen*(1 - jaro)
end
- Implements words clustering.
- Intended to be used together with the
simfnclass methods.
words = words or {}
words.__index = words
local function cluster_by_sim(words, thresh, sim_fn, cmp_fn)
local n = #words
local u = {}
local gs = {}
local gc = 0
cmp_fn = cmp_fn or function(sim, thresh)
return sim >= thresh
end
for i = 1, n do
if not u[i] then
local g = {words[i]}
local gsz = 1
for j = i + 1, n do
if not u[j] then
local sim = sim_fn(words[i], words[j])
if cmp_fn(sim, thresh) then
gsz = gsz + 1
g[gsz] = words[j]
u[j] = true
end
end
end
if gsz > 1 then
gc = gc + 1
gs[gc] = g
u[i] = true
end
end
end
return gs
end
function words.cluster_by_pfx(words, min_len)
return cluster_by_sim(words, min_len, simfn.cpl)
end
function words.cluster_by_sfx(words, min_len)
return cluster_by_sim(words, min_len, simfn.csl)
end
function words.cluster_by_ld(words, max_dist)
return cluster_by_sim(words, max_dist, simfn.ld,
function(dist, max)
return dist <= max
end)
end
function words.cluster_by_hd(words, max_dist)
return cluster_by_sim(words, max_dist, simfn.hd,
function(dist, max)
return dist <= max
end)
end
function words.cluster_by_lcs(words, min_len)
return cluster_by_sim(words, min_len, simfn.lcs)
end
function words.cluster_by_cs(words, min_sim)
return cluster_by_sim(words, min_sim, simfn.cs)
end
function words.cluster_by_cd(words, max_dist)
return cluster_by_sim(words, max_dist, simfn.cd,
function(dist, max)
return dist <= max
end)
end
function words.cluster_by_jwd(words, min_sim)
return cluster_by_sim(words, min_sim, simfn.jwd)
end
-- Custom classification
function words.cluster_by_cls(words, classes)
classes = classes or {}
local n = #words
local u = {}
local cl = {}
for cn, patterns in pairs(classes) do
for i = 1, n do
if not u[i] then
local word = words[i]
for j = 1, #patterns do
if string.match(word, patterns[j]) then
cl[#cl + 1] = { class = cn, word = word }
u[i] = true
break
end
end
end
end
end
local gs = {}
local max_per_class = {}
for i = 1, #cl do
local cn = cl[i].class
max_per_class[cn] = (max_per_class[cn] or 0) + 1
end
for i = 1, #cl do
local cn = cl[i].class
local word = cl[i].word
for j = 1, max_per_class[cn] do
gs[j] = gs[j] or {}
if not gs[j][cn] then
gs[j][cn] = word
break
end
end
end
return gs
end
${words.cluster_by_pfx(query[[
from
index.tag 'tag'
select
'#' .. name
]],
3
)}Result:
${words.cluster_by_pfx(query[[ from index.tag 'tag' select '#' .. name ]], 3 )}
${words.cluster_by_jwd(query[[
from
index.tag 'tag'
select
'#' .. name
]],
0.8
)}Result:
${words.cluster_by_jwd(query[[ from index.tag 'tag' select '#' .. name ]], 0.8 )}
${words.cluster_by_cls(query[[
from
index.tag 'tag'
select
'#' .. name
]],
-- array of arrays of regular expressions
{
{'meta/.*'},
{'TODO', 'BUG', 'PR[0-9]*', 'ISSUE[0-9]*'},
{'home', 'personal', 'family'}
}
)}Result:
${words.cluster_by_cls(query[[ from index.tag 'tag' select '#' .. name ]], -- array of arrays of regular expressions { {'meta/.'}, {'TODO', 'BUG', 'PR[0-9]', 'ISSUE[0-9]*'}, {'home', 'personal', 'family'} } )}