Last active
July 10, 2018 15:16
-
-
Save regiskuckaertz/caf4839430de259f8e3f8605d58280ad to your computer and use it in GitHub Desktop.
Edit distance with the Wagner-Fischer algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| module Levenshtein where | |
| pairs :: [a] -> [(a,a)] | |
| pairs [x,y] = [(x,y)] | |
| pairs (x:y:xs) = (x,y) : pairs (y:xs) | |
| vowels :: String | |
| vowels = "aeiou" | |
| cost :: Rational -> Rational -> Rational -> Char -> Char -> Rational | |
| cost a b c s t | s == t = b | |
| | otherwise = (a + insert t) `min` (b + subst s t) `min` (c + delete s) | |
| -- Cost of substitution | |
| subst :: Char -> Char -> Rational | |
| subst s t | s `elem` vowels = 0.5 | |
| | t `elem` vowels = 0.5 | |
| | otherwise = 1 | |
| -- Cost of insertion/removal | |
| insert, delete :: Char -> Rational | |
| insert s | s `elem` vowels = 0.5 | |
| | otherwise = 1 | |
| delete = insert | |
| -- Coefficients for row `i` of the matrix given coefficients for row `i-1` | |
| costs :: [Rational] -> Rational -> Char -> String -> [Rational] | |
| costs row i s ts = reverse . foldl f [i] $ xss | |
| where f (a:as) ((b,c), t) = cost a b c s t : (a:as) | |
| xss = zip (pairs row) ts | |
| levenshtein :: String -> String -> Double | |
| levenshtein "" ts = fromRational . sum . map insert $ ts | |
| levenshtein ss "" = fromRational . sum . map insert $ ss | |
| levenshtein ss ts = fromRational . last $ row | |
| where row = foldl f row0 (reverse $ foldl h [(0,'-')] ss) | |
| row0 = reverse $ foldl g [0] ts | |
| f row (i,s) = costs row i s ts | |
| g l@(x:xs) t = x + delete t : l | |
| h l@((x,_):xs) s = (x + insert s, s) : l | |
| -- The cost matrix of `levenshtein "Pneumonoultramicroscopicsilicovolcanoconiosis" "Pseudopseudohypoparathyroidism" | |
| -- | |
| -- P s e u d o p s e u d o h y p o p a r a t h y r o i d i s m | |
| -- 0 1 2 2.5 3 4 4.5 5.5 6.5 7 7.5 8.5 9 10 11 12 12.5 13.5 14 15 15.5 16.5 17.5 18.5 19.5 20 20.5 21.5 22 23 24 | |
| -- P 1 0 1 1.5 2 3 3.5 4.5 5.5 6 6.5 7.5 8 9 10 11 11.5 12.5 13 14 14.5 15.5 16.5 17.5 18.5 19 19.5 20.5 21 22 23 | |
| -- n 2 1 1 1.5 2 3 3.5 4.5 5.5 6 6.5 7.5 8 9 10 11 11.5 12.5 13 14 14.5 15.5 16.5 17.5 18.5 19 19.5 20.5 21 22 23 | |
| -- e 2.5 1.5 1.5 1 1.5 2.5 3 4 5 5.5 6 7 7.5 8.5 9.5 10.5 11 12 12.5 13.5 14 15 16 17 18 18.5 19 20 20.5 21.5 22.5 | |
| -- u 3 2 2 1.5 1 2 2.5 3.5 4.5 5 5.5 6.5 7 8 9 10 10.5 11.5 12 13 13.5 14.5 15.5 16.5 17.5 18 18.5 19.5 20 21 22 | |
| -- m 4 3 3 2.5 2 2 2.5 3.5 4.5 5 5.5 6.5 7 8 9 10 10.5 11.5 12 13 13.5 14.5 15.5 16.5 17.5 18 18.5 19.5 20 21 21 | |
| -- o 4.5 3.5 3.5 3 2.5 2.5 2 3 4 4.5 5 6 6.5 7.5 8.5 9.5 10 11 11.5 12.5 13 14 15 16 17 17.5 18 19 19.5 20.5 21.5 | |
| -- n 5.5 4.5 4.5 4 3.5 3.5 3 3 4 4.5 5 6 6.5 7.5 8.5 9.5 10 11 11.5 12.5 13 14 15 16 17 17.5 18 19 19.5 20.5 21.5 | |
| -- o 6 5 5 4.5 4 4 3.5 3.5 3.5 4 4.5 5.5 6 7 8 9 9.5 10.5 11 12 12.5 13.5 14.5 15.5 16.5 17 17.5 18.5 19 20 21 | |
| -- u 6.5 5.5 5.5 5 4.5 4.5 4 4 4 4 4 5 5.5 6.5 7.5 8.5 9 10 10.5 11.5 12 13 14 15 16 16.5 17 18 18.5 19.5 20.5 | |
| -- l 7.5 6.5 6.5 6 5.5 5.5 5 5 5 4.5 4.5 5 5.5 6.5 7.5 8.5 9 10 10.5 11.5 12 13 14 15 16 16.5 17 18 18.5 19.5 20.5 | |
| -- t 8.5 7.5 7.5 7 6.5 6.5 6 6 6 5.5 5 5.5 5.5 6.5 7.5 8.5 9 10 10.5 11.5 12 12 13 14 15 15.5 16 17 17.5 18.5 19.5 | |
| -- r 9.5 8.5 8.5 8 7.5 7.5 7 7 7 6.5 6 6 6 6.5 7.5 8.5 9 10 10.5 10.5 11 12 13 14 14 14.5 15 16 16.5 17.5 18.5 | |
| -- a 10 9 9 8.5 8 8 7.5 7.5 7.5 7 6.5 6.5 6.5 6.5 7 8 8.5 9.5 10 11 10.5 11.5 12.5 13.5 14.5 14.5 15 15.5 16 17 18 | |
| -- m 11 10 10 9.5 9 9 8.5 8.5 8.5 8 7.5 7.5 7 7.5 7.5 8 8.5 9.5 10 11 11.5 11.5 12.5 13.5 14.5 15 15 16 16 17 17 | |
| -- i 11.5 10.5 10.5 10 9.5 9.5 9 9 9 8.5 8 8 7.5 7.5 8 8 8.5 9 9.5 10.5 11 12 12 13 14 14.5 15 15.5 16 16.5 17.5 | |
| -- c 12.5 11.5 11.5 11 10.5 10.5 10 10 10 9.5 9 9 8.5 8.5 8.5 9 8.5 9.5 9.5 10.5 11 12 13 13 14 14.5 15 16 16 17 17.5 | |
| -- r 13.5 12.5 12.5 12 11.5 11.5 11 11 11 10.5 10 10 9.5 9.5 9.5 9.5 9.5 9.5 10 9.5 10 11 12 13 13 13.5 14 15 15.5 16.5 17.5 | |
| -- o 14 13 13 12.5 12 12 11.5 11.5 11.5 11 10.5 10.5 10 10 10 10 9.5 10 10 10 10 10.5 11.5 12.5 13.5 13 13.5 14.5 15 16 17 | |
| -- s 15 14 13 13.5 13 13 12.5 12.5 11.5 12 11.5 11.5 11 11 11 11 10.5 10.5 10.5 11 10.5 11 11.5 12.5 13.5 14 13.5 14.5 15 15 16 | |
| -- c 16 15 14 13.5 14 14 13.5 13.5 12.5 12 12.5 12.5 12 12 12 12 11.5 11.5 11 11.5 11.5 11.5 12 12.5 13.5 14 14.5 14.5 15 16 16 | |
| -- o 16.5 15.5 14.5 14 14 14.5 14 14 13 12.5 12.5 13 12.5 12.5 12.5 12.5 12 12 11.5 11.5 12 12 12 12.5 13 13.5 14 15 15 15.5 16.5 | |
| -- p 17.5 16.5 15.5 15 14.5 15 15 14 14 13.5 13 13.5 13.5 13.5 13.5 12.5 13 12 12.5 12.5 12 13 13 13 13.5 13.5 14 15 15.5 16 16.5 | |
| -- i 18 17 16 15.5 15 15 15.5 14.5 14.5 14 13.5 13.5 14 14 14 13 13 12.5 12.5 13 12.5 12.5 13.5 13.5 13.5 14 13.5 14.5 15 16 16.5 | |
| -- c 19 18 17 16.5 16 16 15.5 15.5 15.5 15 14.5 14.5 14 15 15 14 13.5 13.5 13 13.5 13.5 13.5 13.5 14.5 14.5 14 14.5 14.5 15 16 17 | |
| -- s 20 19 18 17.5 17 17 16.5 16.5 15.5 16 15.5 15.5 15 15 16 15 14.5 14.5 14 14 14 14.5 14.5 14.5 15.5 15 14.5 15.5 15 15 16 | |
| -- i 20.5 19.5 18.5 18 17.5 17.5 17 17 16 16 16 16 15.5 15.5 15.5 15.5 15 15 14.5 14.5 14.5 14.5 15 15 15 15.5 15 15 15.5 15.5 15.5 | |
| -- l 21.5 20.5 19.5 19 18.5 18.5 18 18 17 16.5 16.5 17 16.5 16.5 16.5 16.5 16 16 15.5 15.5 15 15.5 15.5 16 16 15.5 16 16 15.5 16.5 16.5 | |
| -- i 22 21 20 19.5 19 19 18.5 18.5 17.5 17 17 17 17 17 17 17 16.5 16.5 16 16 15.5 15.5 16 16 16.5 16 15.5 16.5 16 16 17 | |
| -- c 23 22 21 20.5 20 20 19.5 19.5 18.5 18 17.5 18 17.5 18 18 18 17.5 17.5 17 17 16.5 16.5 16.5 17 17 17 16.5 16.5 17 17 17 | |
| -- o 23.5 22.5 21.5 21 20.5 20.5 20 20 19 18.5 18 18 18 18 18.5 18.5 18 18 17.5 17.5 17 17 17 17 17.5 17 17 17 17 17.5 17.5 | |
| -- v 24.5 23.5 22.5 22 21.5 21.5 21 21 20 19.5 19 19 18.5 19 19 19.5 19 19 18.5 18.5 18 18 18 18 18 18 17.5 18 17.5 18 18.5 | |
| -- o 25 24 23 22.5 22 22 21.5 21.5 20.5 20 19.5 19.5 19 19 19.5 19.5 19.5 19.5 19 19 18.5 18.5 18.5 18.5 18.5 18 18 18 18 18 18.5 | |
| -- l 26 25 24 23.5 23 23 22.5 22.5 21.5 21 20.5 20.5 20 20 20 20.5 20 20.5 20 20 19.5 19.5 19.5 19.5 19.5 19 18.5 19 18.5 19 19 | |
| -- c 27 26 25 24.5 24 24 23.5 23.5 22.5 22 21.5 21.5 21 21 21 21 21 21 21 21 20.5 20.5 20.5 20.5 20.5 20 19.5 19.5 19.5 19.5 20 | |
| -- a 27.5 26.5 25.5 25 24.5 24.5 24 24 23 22.5 22 22 21.5 21.5 21.5 21.5 21.5 21.5 21 21.5 21 21 21 21 21 20.5 20 20 20 20 20 | |
| -- n 28.5 27.5 26.5 26 25.5 25.5 25 25 24 23.5 23 23 22.5 22.5 22.5 22.5 22 22.5 22 22 22 22 22 22 22 21.5 21 21 20.5 21 21 | |
| -- o 29 28 27 26.5 26 26 25.5 25.5 24.5 24 23.5 23.5 23 23 23 23 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22 21.5 21.5 21 21 21.5 | |
| -- c 30 29 28 27.5 27 27 26.5 26.5 25.5 25 24.5 24.5 24 24 24 24 23.5 23.5 23 23.5 23 23.5 23.5 23.5 23.5 23 22.5 22.5 22 22 22 | |
| -- o 30.5 29.5 28.5 28 27.5 27.5 27 27 26 25.5 25 25 24.5 24.5 24.5 24.5 24 24 23.5 23.5 23.5 23.5 24 24 24 23.5 23 23 22.5 22.5 22.5 | |
| -- n 31.5 30.5 29.5 29 28.5 28.5 28 28 27 26.5 26 26 25.5 25.5 25.5 25.5 25 25 24.5 24.5 24 24.5 24.5 25 25 24.5 24 24 23.5 23.5 23.5 | |
| -- i 32 31 30 29.5 29 29 28.5 28.5 27.5 27 26.5 26.5 26 26 26 26 25.5 25.5 25 25 24.5 24.5 25 25 25.5 25 24.5 24.5 24 24 24 | |
| -- o 32.5 31.5 30.5 30 29.5 29.5 29 29 28 27.5 27 27 26.5 26.5 26.5 26.5 26 26 25.5 25.5 25 25 25 25.5 25.5 25.5 25 25 24.5 24.5 24.5 | |
| -- s 33.5 32.5 31.5 31 30.5 30.5 30 30 29 28.5 28 28 27.5 27.5 27.5 27.5 27 27 26.5 26.5 26 26 26 26 26.5 26 26 26 25.5 24.5 25.5 | |
| -- i 34 33 32 31.5 31 31 30.5 30.5 29.5 29 28.5 28.5 28 28 28 28 27.5 27.5 27 27 26.5 26.5 26.5 26.5 26.5 26.5 26 26.5 26 25 25 | |
| -- s 35 34 33 32.5 32 32 31.5 31.5 30.5 30 29.5 29.5 29 29 29 29 28.5 28.5 28 28 27.5 27.5 27.5 27.5 27.5 27 27 27 27 26 26 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment