Jaro similarity: Difference between revisions
No edit summary |
(→{{header|Haskell}}: Slight disaggregation for less cluttered reading) |
||
Line 671: | Line 671: | ||
then 0 |
then 0 |
||
else (1 / 3) * ((m / s1) + (m / s2) + ((m - t) / m)) |
else (1 / 3) * ((m / s1) + (m / s2) + ((m - t) / m)) |
||
where |
|||
matches :: String -> String -> [(Int, Char)] |
|||
matches s1 s2 = |
|||
let [(l1, xs), (l2, ys)] = |
|||
sortBy (comparing fst) ((length >>= (,)) <$> [s1, s2]) |
|||
r = quot l2 2 - 1 |
|||
in foldr |
|||
(\(c, n) a -> |
|||
let offset = max 0 (n - (r + 1)) -- initial chars out of range ? |
|||
-- Any index for this char within range ? |
|||
in case elemIndex c (drop offset (take (n + r) ys)) of |
|||
Just i -> (offset + i, c) : a |
|||
Nothing -> a) |
|||
[] |
|||
(zip xs [1 ..]) |
|||
transpositions :: [(Int, Char)] -> Int |
|||
transpositions = |
|||
foldr |
|||
let f (x, y) = |
|||
if fst x > fst y |
|||
if x > y -- This character was expected before the previous one. |
|||
then succ |
|||
else id |
|||
in foldr f 0 . (zip <*> tail) |
|||
(zip ics (tail ics)) |
|||
-- TEST ---------------------------------------------------------------------- |
-- TEST ---------------------------------------------------------------------- |
Revision as of 21:40, 23 June 2017
You are encouraged to solve this task according to the task description, using any language you may know.
The Jaro distance is a measure of similarity between two strings.
The higher the Jaro distance for two strings is, the more similar the strings are.
The score is normalized such that 0 equates to no similarity and 1 is an exact match.
- Definition
The Jaro distance of two given strings and is
Where:
- is the number of matching characters;
- is half the number of transpositions.
Two characters from and respectively, are considered matching only if they are the same and not farther than .
Each character of is compared with all its matching characters in .
The number of matching (but different sequence order) characters divided by 2 defines the number of transpositions.
- Example
Given the strings DWAYNE and DUANE we find:
We find a Jaro score of:
- Task
Implement the Jaro-distance algorithm and show the distances for each of the following pairs:
- ("MARTHA", "MARHTA")
- ("DIXON", "DICKSONX")
- ("JELLYFISH", "SMELLYFISH")
- See also
- Jaro–Winkler distance on Wikipedia.
AWK
<lang AWK>
- syntax: GAWK -f JARO_DISTANCE.AWK
BEGIN {
main("DWAYNE","DUANE") main("MARTHA","MARHTA") main("DIXON","DICKSONX") main("JELLYFISH","SMELLYFISH") exit(0)
} function main(str1,str2) {
printf("%9.7f '%s' '%s'\n",jaro(str1,str2),str1,str2)
} function jaro(str1,str2, begin,end,i,j,k,leng1,leng2,match_distance,matches,str1_arr,str2_arr,transpositions) {
leng1 = length(str1) leng2 = length(str2) if (leng1 == 0 && leng2 == 0) { # both strings are empty return(1) } if (leng1 == 0 || leng2 == 0) { # only one string is empty return(0) } match_distance = int(max(leng1,leng2)/2-1) for (i=1; i<=leng1; i++) { # find matches begin = int(max(0,i-match_distance)) end = int(min(i+match_distance+1,leng2)) for (j=begin; j<=end; j++) { if (str2_arr[j]) { continue } if (substr(str1,i,1) != substr(str2,j,1)) { continue } str1_arr[i] = 1 str2_arr[j] = 1 matches++ break } } if (matches == 0) { return(0) } k = 0 for (i=1; i<=leng1; i++) { # count transpositions if (!str1_arr[i]) { continue } while (!str2_arr[k]) { k++ } if (substr(str1,i,1) != substr(str2,k,1)) { transpositions++ } k++ } transpositions /= 2 return((matches/leng1)+(matches/leng2)+((matches-transpositions)/matches))/3
} function max(x,y) { return((x > y) ? x : y) } function min(x,y) { return((x < y) ? x : y) } </lang>
- Output:
0.8222222 'DWAYNE' 'DUANE' 0.9444444 'MARTHA' 'MARHTA' 0.7666667 'DIXON' 'DICKSONX' 0.8962963 'JELLYFISH' 'SMELLYFISH'
C
<lang C>#include <stdlib.h>
- include <string.h>
- include <ctype.h>
- include <stdio.h>
- define TRUE 1
- define FALSE 0
- define max(a, b) ((a) > (b) ? (a) : (b))
- define min(a, b) ((a) < (b) ? (a) : (b))
double jaro(const char *str1, const char *str2) {
// length of the strings int str1_len = strlen(str1); int str2_len = strlen(str2);
// if both strings are empty return 1 // if only one of the strings is empty return 0 if (str1_len == 0) return str2_len == 0 ? 1.0 : 0.0;
// max distance between two chars to be considered matching // floor() is ommitted due to integer division rules int match_distance = (int) max(str1_len, str2_len)/2 - 1;
// arrays of bools that signify if that char in the matching string has a match int *str1_matches = calloc(str1_len, sizeof(int)); int *str2_matches = calloc(str2_len, sizeof(int));
// number of matches and transpositions double matches = 0.0; double transpositions = 0.0;
// find the matches for (int i = 0; i < str1_len; i++) { // start and end take into account the match distance int start = max(0, i - match_distance); int end = min(i + match_distance + 1, str2_len);
for (int k = start; k < end; k++) { // if str2 already has a match continue if (str2_matches[k]) continue; // if str1 and str2 are not if (str1[i] != str2[k]) continue; // otherwise assume there is a match str1_matches[i] = TRUE; str2_matches[k] = TRUE; matches++; break; } }
// if there are no matches return 0 if (matches == 0) { free(str1_matches); free(str2_matches); return 0.0; }
// count transpositions int k = 0; for (int i = 0; i < str1_len; i++) { // if there are no matches in str1 continue if (!str1_matches[i]) continue; // while there is no match in str2 increment k while (!str2_matches[k]) k++; // increment transpositions if (str1[i] != str2[k]) transpositions++; k++; }
// divide the number of transpositions by two as per the algorithm specs // this division is valid because the counted transpositions include both // instances of the transposed characters. transpositions /= 2.0;
// free the allocated memory free(str1_matches); free(str2_matches);
// return the Jaro distance return ((matches / str1_len) + (matches / str2_len) + ((matches - transpositions) / matches)) / 3.0;
}
int main() {
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));
}</lang>
- Output:
0.944444 0.766667 0.896296
C++
<lang cpp>#include <algorithm>
- include <iostream>
- include <string>
double jaro(const std::string s1, const std::string s2) {
const uint l1 = s1.length(), l2 = s2.length(); if (l1 == 0) return l2 == 0 ? 1.0 : 0.0; const uint match_distance = std::max(l1, l2) / 2 - 1; bool s1_matches[l1]; bool s2_matches[l2]; std::fill(s1_matches, s1_matches + l1, false); std::fill(s2_matches, s2_matches + l2, false); uint matches = 0; for (uint i = 0; i < l1; i++) { const int end = std::min(i + match_distance + 1, l2); for (int k = std::max(0u, i - match_distance); k < end; k++) if (!s2_matches[k] && s1[i] == s2[k]) { s1_matches[i] = true; s2_matches[k] = true; matches++; break; } } if (matches == 0) return 0.0; double t = 0.0; uint k = 0; for (uint i = 0; i < l1; i++) if (s1_matches[i]) { while (!s2_matches[k]) k++; if (s1[i] != s2[k]) t += 0.5; k++; }
const double m = matches; return (m / l1 + m / l2 + (m - t) / m) / 3.0;
}
int main() {
using namespace std; cout << jaro("MARTHA", "MARHTA") << endl; cout << jaro("DIXON", "DICKSONX") << endl; cout << jaro("JELLYFISH", "SMELLYFISH") << endl; return 0;
}</lang>
Clojure
<lang clojure> (ns test-project-intellij.core
(:gen-class))
(defn find-matches [s t]
" find match locations in the two strings " " s_matches is set to true wherever there is a match in t and t_matches is set conversely " (let [s_len (count s) t_len (count t) match_distance (int (- (/ (max s_len t_len) 2) 1)) matches 0 transpositions 0 fn-start (fn [i] (max 0 (- i match_distance))) ; function to compute starting position fn-end (fn [i] (min (+ i match_distance 1) (- t_len 1))) ] ; function to compute end position (loop [i 0 start (fn-start i) end (fn-end i) k start s_matches (vec (repeat (count s) false)) t_matches (vec (repeat (count t) false)) matches 0]
(if (< i s_len)
(if (<= k end)
(if (get t_matches k) ; continue with next k (recur i start end (inc k) s_matches t_matches matches)
(if (= (get s i) (get t k)) ; match a position, so update matches, s_matches, t_matches to reflect match (recur (inc i) (fn-start (inc i)) (fn-end (inc i)) (fn-start (inc i)) (assoc s_matches i true) (assoc t_matches k true) (inc matches)) ; no match so try next k (recur i start end (inc k) s_matches t_matches matches)))
; End of k iterations, so increment i and set k to start based upon i (recur (inc i) (fn-start (inc i)) (fn-end (inc i)) (fn-start (inc i)) s_matches t_matches matches))
; End of i iterations [matches s_matches t_matches]))))
(defn count-transpositions [s t s_matches t_matches]
" Utility function to count the number of transpositions " (let [s_len (count s)] (loop [i 0 k 0 transpositions 0]
(if (< i s_len) ; still elements in s (since i < s_len) (if (not (get s_matches i nil)) ; skip to next i since there are no matches in s (recur (inc i) k transpositions) ; checking for match in t (if (not (get t_matches k nil)) ; keeping looping around as long as there are no matches in t (recur i (inc k) transpositions) (if (not= (get s i) (get t k)) ; increment transposition count (if strings don't equal at match location) (recur (inc i) (inc k) (inc transpositions)) ; was a match, so advance i and k without increasing transpositions count (recur (inc i) (inc k) transpositions)))) ; Return count transpositions))))
(defn jaro [s t]
" Main Jaro Distance routine" (if (= s t) 1 (let [[matches s_matches t_matches] (find-matches s t)] (if (= 0 matches) 0 (let [s_len (count s) t_len (count t) transpositions (count-transpositions s t s_matches t_matches)] (float (/ (+ (/ matches s_len) (/ matches t_len) (/ (- matches (/ transpositions 2)) matches)) 3)))))))
(println (jaro "MARTHA" "MARHTA"))
(println (jaro "DIXON" "DICKSONX"))
(println (jaro "JELLYFISH" "SMELLYFISH"))
</lang>
- Output:
0.9444444 0.76666665 0.8962963
CoffeeScript
<lang coffeescript>jaro = (s1, s2) ->
l1 = s1.length l2 = s2.length if l1 == 0 then return if l2 == 0 then 1.0 else 0.0 match_distance = Math.max(l1, l2) / 2 - 1 s1_matches = [] s2_matches = [] m = 0 for i in [0...l1] end = Math.min(i + match_distance + 1, l2) for k in [Math.max(0, i - match_distance)...end] if !s2_matches[k] and s1[i] == s2[k] s1_matches[i] = true s2_matches[k] = true m++ break if m == 0 0.0 else t = 0.0 k = 0 for i in [0...l1] if s1_matches[i] until s2_matches[k] then k++ if s1[i] != s2[k++] then t += 0.5 (m / l1 + m / l2 + (m - t) / m) / 3.0
console.log jaro "MARTHA", "MARHTA" console.log jaro "DIXON", "DICKSONX" console.log jaro "JELLYFISH", "SMELLYFISH"</lang>
- Output:
0.9444444444444445 0.7666666666666666 0.8962962962962964
D
<lang D>auto jaro(in string s1, in string s2) {
int s1_len = cast(int) s1.length; int s2_len = cast(int) s2.length; if (s1_len == 0 && s2_len == 0) return 1;
import std.algorithm.comparison: min, max; auto match_distance = max(s1_len, s2_len) / 2 - 1; auto s1_matches = new bool[s1_len]; auto s2_matches = new bool[s2_len]; int matches = 0; for (auto i = 0; i < s1_len; i++) { auto start = max(0, i - match_distance); auto end = min(i + match_distance + 1, s2_len); for (auto j = start; j < end; j++) if (!s2_matches[j] && s1[i] == s2[j]) { s1_matches[i] = true; s2_matches[j] = true; matches++; break; } } if (matches == 0) return 0;
auto t = 0.0; auto k = 0; for (auto i = 0; i < s1_len; i++) if (s1_matches[i]) { while (!s2_matches[k]) k++; if (s1[i] != s2[k++]) t += 0.5; } double m = matches; return (m / s1_len + m / s2_len + (m - t) / m) / 3.0;
}
void main() {
import std.stdio: writeln; writeln(jaro( "MARTHA", "MARHTA")); writeln(jaro( "DIXON", "DICKSONX")); writeln(jaro("JELLYFISH", "SMELLYFISH"));
}</lang>
0.944444 0.766667 0.896296
Elixir
<lang elixir>defmodule Jaro do
def distance(s, t) when is_binary(s) and is_binary(t), do: distance(to_charlist(s), to_charlist(t)) def distance(x, x), do: 1.0 def distance(s, t) do s_len = length(s) t_len = length(t) {s_matches, t_matches, matches} = matching(s, t, s_len, t_len) if matches == 0 do 0.0 else {k, transpositions} = transposition(s, t, s_matches, t_matches) ((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3 end end defp matching(s, t, s_len, t_len) do match_distance = div(max(s_len, t_len), 2) - 1 ac0 = {List.duplicate(false, s_len), List.duplicate(false, t_len), 0} Enum.reduce(0..s_len-1, ac0, fn i,acc -> j_start = max(0, i-match_distance) j_end = min(i+match_distance, t_len-1) Enum.reduce_while(j_start..j_end, acc, fn j,{sm,tm,m} -> if Enum.at(tm, j) or Enum.at(s, i) != Enum.at(t, j) do {:cont, {sm, tm, m}} else {:halt, { List.replace_at(sm, i, true), List.replace_at(tm, j, true), m + 1 }} end end) end) end defp transposition(s, t, s_matches, t_matches) do Enum.reduce(0..length(s)-1, {0,0}, fn i,{k,transpositions} -> if Enum.at(s_matches, i) do k = k + (Enum.drop(t_matches, k) |> Enum.take_while(fn matche -> not matche end) |> length) if Enum.at(s, i) == Enum.at(t, k), do: {k+1, transpositions}, else: {k+1, transpositions+1} else {k, transpositions} end end) end
end
~w( MARTHA MARHTA
DIXON DICKSONX JELLYFISH SMELLYFISH )c
|> Enum.chunk(2) |> Enum.each(fn [s,t] ->
:io.format "jaro(~s, ~s) = ~.10f~n", [inspect(s), inspect(t), Jaro.distance(s, t)] end)</lang>
- Output:
jaro('MARTHA', 'MARHTA') = 0.9444444444 jaro('DIXON', 'DICKSONX') = 0.7666666667 jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963
Elixir has a built-in function (String.jaro_distance
).
FreeBASIC
<lang freebasic>' version 09-10-2016 ' compile with: fbc -s console
- Macro max(x, y)
IIf((x) > (y), (x), (y))
- EndMacro
- Macro min(x, y)
IIf((x) < (y), (x), (y))
- EndMacro
Function jaro(word1 As String, word2 As String) As Double
If Len(word1) > Len(word2) Then Swap word1, word2
Dim As Long i, j, j1, m, t Dim As Long s1 = Len(word1) Dim As Long s2 = Len(word2) Dim As Long max_dist = s2 \ 2 -1 ' integer division
For i = 0 To s1 -1 If word1[i] = word2[j] Then m = m +1 word2[j] = 32 Else For j1 = max(0, i - max_dist) To min(s2 -1, i + max_dist) If word1[i] = word2[j1] Then t = t +1 m = m +1 word2[j1] = 32 If j1 > j Then j = j1 End If Next End If j = j + 1 Next If m = 0 Then Return 0
t = t \ 2 Return (m / s1 + m / s2 + ((m - t) / m)) / 3
End Function
' ------=< MAIN >=------
Print Print " jaro (MARTHA, MARHTA) ="; jaro("MARTHA", "MARHTA") Print " jaro (DIXON, DICKSONX) ="; jaro("DIXON", "DICKSONX") Print " jaro (JELLYFISH, SMELLYFISH) ="; jaro("JELLYFISH", "SMELLYFISH")
' empty keyboard buffer
While Inkey <> "" : Wend
Print : Print "hit any key to end program"
Sleep
End</lang>
- Output:
jaro (MARTHA, MARHTA) = 0.9444444444444444 jaro (DIXON, DICKSONX) = 0.7666666666666667 jaro (JELLYFISH, SMELLYFISH) = 0.8962962962962963
Go
<lang go>package main
import "fmt"
func jaro(str1, str2 string) float64 {
if len(str1) == 0 && len(str2) == 0 { return 1 } if len(str1) == 0 || len(str2) == 0 { return 0 } match_distance := len(str1) if len(str2) > match_distance { match_distance = len(str2) } match_distance = match_distance/2 - 1 str1_matches := make([]bool, len(str1)) str2_matches := make([]bool, len(str2)) matches := 0. transpositions := 0. for i := range str1 { start := i - match_distance if start < 0 { start = 0 } end := i + match_distance + 1 if end > len(str2) { end = len(str2) } for k := start; k < end; k++ { if str2_matches[k] { continue } if str1[i] != str2[k] { continue } str1_matches[i] = true str2_matches[k] = true matches++ break } } if matches == 0 { return 0 } k := 0 for i := range str1 { if !str1_matches[i] { continue } for !str2_matches[k] { k++ } if str1[i] != str2[k] { transpositions++ } k++ } transpositions /= 2 return (matches/float64(len(str1)) + matches/float64(len(str2)) + (matches-transpositions)/matches) / 3
}
func main() {
fmt.Printf("%f\n", jaro("MARTHA", "MARHTA")) fmt.Printf("%f\n", jaro("DIXON", "DICKSONX")) fmt.Printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"))
}</lang>
- Output:
0.944444 0.766667 0.896296
Haskell
<lang Haskell>import Data.Ord (comparing) import Text.Printf (printf) import Data.List (sortBy, elemIndex, intercalate)
jaro :: String -> String -> Float jaro x y =
let f = (fromIntegral . length) [m, t] = [f, fromIntegral . transpositions] <*> [matches x y] [s1, s2] = [f] <*> [x, y] in if m == 0 then 0 else (1 / 3) * ((m / s1) + (m / s2) + ((m - t) / m))
matches :: String -> String -> [(Int, Char)] matches s1 s2 =
let [(l1, xs), (l2, ys)] = sortBy (comparing fst) ((length >>= (,)) <$> [s1, s2]) r = quot l2 2 - 1 in foldr (\(c, n) a -> let offset = max 0 (n - (r + 1)) -- initial chars out of range ? -- Any index for this char within range ? in case elemIndex c (drop offset (take (n + r) ys)) of Just i -> (offset + i, c) : a Nothing -> a) [] (zip xs [1 ..])
transpositions :: [(Int, Char)] -> Int transpositions =
let f (x, y) = if fst x > fst y then succ else id in foldr f 0 . (zip <*> tail)
-- TEST ---------------------------------------------------------------------- main :: IO () main =
mapM_ putStrLn $ (\(s1, s2) -> intercalate " -> " [s1, s2, printf "%.3f\n" $ jaro s1 s2]) <$> [ ("DWAYNE", "DUANE") , ("MARTHA", "MARHTA") , ("DIXON", "DICKSONX") , ("JELLYFISH", "SMELLYFISH") ]</lang>
- Output:
DWAYNE -> DUANE -> 0.822 MARTHA -> MARHTA -> 0.944 DIXON -> DICKSONX -> 0.767 JELLYFISH -> SMELLYFISH -> 0.896
Haxe
<lang Haxe>class Jaro {
private static function jaro(s1: String, s2: String): Float { var s1_len = s1.length; var s2_len = s2.length; if (s1_len == 0 && s2_len == 0) return 1; var match_distance = Std.int(Math.max(s1_len, s2_len)) / 2 - 1; var matches = { s1: [for(n in 0...s1_len) false], s2: [for(n in 0...s2_len) false] }; var m = 0; for (i in 0...s1_len) { var start = Std.int(Math.max(0, i - match_distance)); var end = Std.int(Math.min(i + match_distance + 1, s2_len)); for (j in start...end) if (!matches.s2[j] && s1.charAt(i) == s2.charAt(j)) {
matches.s1[i] = true; matches.s2[j] = true; m++; break;
} } if (m == 0) return 0; var k = 0; var t = 0.; for (i in 0...s1_len) if (matches.s1[i]) { while (!matches.s2[k]) k++; if (s1.charAt(i) != s2.charAt(k++)) t += 0.5; } return (m / s1_len + m / s2_len + (m - t) / m) / 3.0; } public static function main() { Sys.println(jaro( "MARTHA", "MARHTA")); Sys.println(jaro( "DIXON", "DICKSONX")); Sys.println(jaro("JELLYFISH", "SMELLYFISH")); }
}</lang>
- Output:
0.944444444444445 0.766666666666667 0.896296296296296
J
Implementation:
<lang J>jaro=: dyad define
d=. ((x >.&# y)%2)-1 e=. (x =/y) * d >: |x -/&(i.@#) y xm=. (+./"1 e)#x ym=. (+./"2 e)#y m=. xm <.&# ym t=. (+/xm ~:&(m&{.) ym)%2 s1=. #x s2=. #y ((m%s1)+(m%s2)+(m-t)%m)%3
)</lang>
Task examples:
<lang J> 'MARTHA' jaro 'MARHTA' 0.944444
'DIXON' jaro 'DICKSONX'
0.766667
'JELLYFISH' jaro 'SMELLYFISH'
0.896296</lang>
Java
<lang java>public class JaroDistance {
public static double jaro(String s, String t) { int s_len = s.length(); int t_len = t.length();
if (s_len == 0 && t_len == 0) return 1;
int match_distance = Integer.max(s_len, t_len) / 2 - 1;
boolean[] s_matches = new boolean[s_len]; boolean[] t_matches = new boolean[t_len];
int matches = 0; int transpositions = 0;
for (int i = 0; i < s_len; i++) { int start = Integer.max(0, i-match_distance); int end = Integer.min(i+match_distance+1, t_len);
for (int j = start; j < end; j++) { if (t_matches[j]) continue; if (s.charAt(i) != t.charAt(j)) continue; s_matches[i] = true; t_matches[j] = true; matches++; break; } }
if (matches == 0) return 0;
int k = 0; for (int i = 0; i < s_len; i++) { if (!s_matches[i]) continue; while (!t_matches[k]) k++; if (s.charAt(i) != t.charAt(k)) transpositions++; k++; }
return (((double)matches / s_len) + ((double)matches / t_len) + (((double)matches - transpositions/2.0) / matches)) / 3.0; }
public static void main(String[] args) { System.out.println(jaro( "MARTHA", "MARHTA")); System.out.println(jaro( "DIXON", "DICKSONX")); System.out.println(jaro("JELLYFISH", "SMELLYFISH")); }
}</lang>
- Output:
0.9444444444444445 0.7666666666666666 0.8962962962962964
jq
def jaro(s1; s2): def when(p; q): if p then q else . end; (s1|length) as $len1 | (s2|length) as $len2 | (( [$len1, $len2] | max ) / 2 - 1) as $match_standard | {m:0, p:0} | reduce range(0; $len1) as $l1 (.; s1[$l1:$l1+1] as $t1 | reduce range(0; $len2) as $l2 (.; s2[$l2:$l2+1] as $t2 | when( $t1 == $t2; when( ($l2-$l1) <= $match_standard and ($l1-$l2) <= $match_standard; .m+=1) | when($l2 == $l1; .p += 1) ) ) ) | ((.m-.p)/2) as $t | ( (.m/$len1) + (.m/$len2) + ((.m-$t)/.m) ) / 3 ; jaro("MARTHA";"MARHTA") , jaro("DIXON"; "DICKSONX") , jaro("JELLYFISH";"SMELLYFISH")
Output:
0.9444444444444444 0.6833333333333332 0.8870370370370371
Julia
function jaro(s1,s2) s1::AbstractString s2::AbstractString m=0 t=0 p=0 l1=0 l2=0 match_standard=((max(length(s1),length(s2)))/2)-1 for i in s1[1:end] l1+=1 l2=0 for j in s2[1:end] l2+=1 if i==j if l2-l1<=match_standard && l1-l2<=match_standard m+=1 end if l2==l1 p+=1 end end end end t=(m-p)/2 d=1/3*((m/length(s1))+(m/length(s2))+((m-t)/m)) println(d) end
"MARTHA","MARHTA" : 0.9444444444444444444 "DIXON","DICKSONX" : 0.6833333333333333333 "JELLYFISH","SMELLYFISH" : 0.8870370370370
Kotlin
<lang scala>object Jaro {
fun distance(s1: String, s2: String): Double { val s1_len = s1.length val s2_len = s2.length if (s1_len == 0 && s2_len == 0) return 1.0 val match_distance = Math.max(s1_len, s2_len) / 2 - 1 val s1_matches = BooleanArray(s1_len) val s2_matches = BooleanArray(s2_len) var matches = 0 for (i in 0..s1_len - 1) { val start = Math.max(0, i - match_distance) val end = Math.min(i + match_distance + 1, s2_len) (start..end - 1).find { j -> !s2_matches[j] && s1[i] == s2[j] } ?. let { s1_matches[i] = true s2_matches[it] = true matches++ } } if (matches == 0) return 0.0 var t = 0.0 var k = 0 (0..s1_len - 1).filter { s1_matches[it] }.forEach { i -> while (!s2_matches[k]) k++ if (s1[i] != s2[k]) t += 0.5 k++ }
val m = matches.toDouble() return (m / s1_len + m / s2_len + (m - t) / m) / 3.0 }
}
fun main(args: Array<String>) {
println(Jaro.distance("MARTHA", "MARHTA")) println(Jaro.distance("DIXON", "DICKSONX")) println(Jaro.distance("JELLYFISH", "SMELLYFISH"))
}</lang>
PARI/GP
This version was translated from Java and Perl.
<lang parigp> \\Jaro distance between 2 strings s1 and s2. \\ 4/12/16 aev jaroDist(s1,s2)={ my(vt1=Vecsmall(s1),vt2=Vecsmall(s2),n1=#s1,n2=#s2,d,
md=max(n1,n2)\2-1,cs,ce,mc=0,tr=0,k=1,ds, s1m=vector(n1,z,0),s2m=vector(n2,z,0));
if(!n1||!n2, return(0)); for(i=1,n1,
cs=max(1,i-md); ce=min(i+md+1,n2); for(j=cs,ce, if(s2m[j],next); if(vt1[i]!=vt2[j], next); mc++; s1m[i]=1; s2m[j]=1; break; );\\fend j
);\\fend i if(!mc, return(0)); for(i=1,n1,
if(!s1m[i], next); while(!s2m[k], k++); if(vt1[i]!=vt2[k], tr++); k++
);\\fend i d=(mc/n1+mc/n2+(mc-tr/2)/mc)/3.0; ds=Strprintf("%.5f",d); print(" *** Jaro distance is: ",ds," for strings: ",s1,", ",s2); return(d); }
{ \\ Testing: jaroDist("MARTHA","MARHTA"); jaroDist("DIXON","DICKSONX"); jaroDist("JELLYFISH","SMELLYFISH"); jaroDist("DWAYNE","DUANE"); } </lang>
- Output:
*** Jaro distance is: 0.94444 for strings: MARTHA, MARHTA *** Jaro distance is: 0.76667 for strings: DIXON, DICKSONX *** Jaro distance is: 0.89630 for strings: JELLYFISH, SMELLYFISH *** Jaro distance is: 0.82222 for strings: DWAYNE, DUANE
Perl
<lang perl>use List::Util qw(min max);
sub jaro {
my ($s, $t) = @_;
my $s_len = length($s); my $t_len = length($t);
return 1 if $s_len == 0 and $t_len == 0;
my $match_distance = int(max($s_len, $t_len) / 2) - 1;
my @s_matches; my @t_matches;
my @s = split(//, $s); my @t = split(//, $t);
my $matches = 0; foreach my $i (0 .. $#s) {
my $start = max(0, $i - $match_distance); my $end = min($i + $match_distance + 1, $t_len);
foreach my $j ($start .. $end - 1) { $t_matches[$j] and next; $s[$i] eq $t[$j] or next; $s_matches[$i] = 1; $t_matches[$j] = 1; $matches++; last; } }
return 0 if $matches == 0;
my $k = 0; my $transpositions = 0;
foreach my $i (0 .. $#s) { $s_matches[$i] or next; until ($t_matches[$k]) { ++$k } $s[$i] eq $t[$k] or ++$transpositions; ++$k; }
(($matches / $s_len) + ($matches / $t_len) + (($matches - $transpositions / 2) / $matches)) / 3;
}
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
- Output:
0.944444 0.766667 0.896296
Perl 6
<lang perl6>sub jaro ($s, $t) {
return 1 if $s eq $t;
my $s_len = + my @s = $s.comb; my $t_len = + my @t = $t.comb;
my $match_distance = ($s_len max $t_len) div 2 - 1;
my @s_matches; my @t_matches; my $matches = 0;
for ^@s -> $i {
my $start = 0 max $i - $match_distance; my $end = $i + $match_distance min $t_len;
for $start .. $end -> $j { @t_matches[$j] and next; @s[$i] eq @t[$j] or next; @s_matches[$i] = 1; @t_matches[$j] = 1; $matches++; last; } }
return 0 if $matches == 0;
my $k = 0; my $transpositions = 0;
for ^@s -> $i { @s_matches[$i] or next; until @t_matches[$k] { ++$k } @s[$i] eq @t[$k] or ++$transpositions; ++$k; }
($matches / $s_len + $matches / $t_len + (($matches - $transpositions / 2) / $matches)) / 3;
}
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
- Output:
0.944444 0.766667 0.896296
Python
<lang python>from __future__ import division
def jaro(s, t):
s_len = len(s) t_len = len(t)
if s_len == 0 and t_len == 0: return 1
match_distance = (max(s_len, t_len) // 2) - 1
s_matches = [False] * s_len t_matches = [False] * t_len
matches = 0 transpositions = 0
for i in range(s_len): start = max(0, i-match_distance) end = min(i+match_distance+1, t_len)
for j in range(start, end): if t_matches[j]: continue if s[i] != t[j]: continue s_matches[i] = True t_matches[j] = True matches += 1 break
if matches == 0: return 0
k = 0 for i in range(s_len): if not s_matches[i]: continue while not t_matches[k]: k += 1 if s[i] != t[k]: transpositions += 1 k += 1
return ((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
for s,t in [( 'MARTHA', 'MARHTA'),
( 'DIXON', 'DICKSONX'), ('JELLYFISH', 'SMELLYFISH')]: print("jaro(%r, %r) = %.10f" % (s, t, jaro(s, t)))</lang>
- Output:
jaro('MARTHA', 'MARHTA') = 0.9444444444 jaro('DIXON', 'DICKSONX') = 0.7666666667 jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963
Racket
(kinda)
Returns an exact value for the Jaro distance.
<lang racket>#lang racket/base
(require data/bit-vector)
(define (jaro-distance str1 str2)
(define str1-len (string-length str1)) (define str2-len (string-length str2)) (cond [(and (zero? str1-len) (zero? str2-len)) 0] [(or (zero? str1-len) (zero? str2-len)) 1] [else ;; vectors of bools that signify if that char in the matching string has a match (define str1-matches (make-bit-vector str1-len)) (define str2-matches (make-bit-vector str2-len)) (define matches ;; max distance between two chars to be considered matching (let ((match-distance (sub1 (quotient (max str1-len str2-len) 2)))) (for/fold ((matches 0)) ((i (in-range 0 str1-len)) (c1 (in-string str1))) (define start (max 0 (- i match-distance))) (define end (min (+ i match-distance 1) str2-len)) (for/fold ((matches matches)) ((k (in-range start end)) (c2 (in-string str2 start)) #:unless (bit-vector-ref str2-matches k) ; if str2 already has a match continue #:when (char=? c1 c2) ; if str1 and str2 are not #:final #t) ;; otherwise assume there is a match (bit-vector-set! str1-matches i #t) (bit-vector-set! str2-matches k #t) (add1 matches))))) (cond [(zero? matches) 0] [else (define-values (transpositions*2 k+) (for/fold ((transpositions 0) (k 0)) ((i (in-range 0 str1-len)) (c1 (in-string str1)) (b1 (in-bit-vector str1-matches)) ;; if there are no matches in str1 continue #:when b1) (define k+ (for/first ((k+ (in-range k str2-len)) (b2 (in-bit-vector str2-matches k)) #:when b2) k+)) (values (+ transpositions (if (char=? c1 (string-ref str2 k+)) 0 1)) ; increment transpositions (add1 k+)))) ;; while there is no match in str2 increment k ;; divide the number of transpositions by two as per the algorithm specs ;; this division is valid because the counted transpositions include both ;; instances of the transposed characters. (define transpositions (quotient transpositions*2 2)) ;; return the Jaro distance (/ (+ (/ matches str1-len) (/ matches str2-len) (/ (- matches transpositions) matches)) 3)])]))
(module+ test
(jaro-distance "MARTHA" "MARHTA"); 0.944444 (exact->inexact (jaro-distance "MARTHA" "MARHTA")); 0.944444 (jaro-distance "DIXON" "DICKSONX"); 0.766667 (exact->inexact (jaro-distance "DIXON" "DICKSONX")); 0.766667 (jaro-distance "JELLYFISH" "SMELLYFISH"); 0.896296 (exact->inexact (jaro-distance "JELLYFISH" "SMELLYFISH"))); 0.896296</lang>
- Output:
The exact->inexact
calls in the tests give an inexact, floating point version of the rational values.
17/18 0.9444444444444444 23/30 0.7666666666666667 121/135 0.8962962962962963
REXX
<lang rexx>/*REXX program computes the Jaro distance between two strings (or a list of strings).*/ @.= /*define a default for the @. array. */ parse arg @.1 /*obtain an optional character string. */ if @.1= then do /*nothing specified? Use the defaults.*/
@.1= 'MARTHA MARHTA' @.2= 'DIXON DICKSONX' @.3= 'JELLYFISH SMELLYFISH' @.4= 'DWAYNE DUANE' end
do j=1 while @.j\== /*process all the strings in the list. */ d=jaroDist(@.j) say 'Jaro distance is ' format(d, , 5) " for strings: " @.j end /*j*/ /* └──── digits past the decimal point.*/
exit /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ jaroDist: procedure; arg s.1 s.2 .; L1=length(s.1); L2=length(s.2); m=0
if L1==0 | L2==0 then return 0 /*check if any string is a null string.*/ f=max(L1,L2)%2 - 1 /*calculate furthest distanced allowed.*/ r.=0 /* [↓] see if the char is near enough.*/ do k=1 for L1; p=pos(substr(s.1, k, 1), s.2, max(1, k-f)); r.k=p if p\==0 & abs(p-k)<=f then m=m+1 /*if near enough, count it as a match. */ else r.k=0 /* ··· otherwise, don't count it.*/ end /*k*/ t=0 do o=1 for L1; om=o-1 if pos( substr(s.1, o, 1), s.2)==0 | r.o==0 | r.om==0 then iterate if r.o<r.om then t=t+1 end /*o*/ /* [↑] count number of transpositions.*/
if m==0 then return 0 return (m/L1 + m/L2 + (m-t)/m) / 3</lang>
output when using the default inputs:
Jaro distance is 0.94444 for strings: MARTHA MARHTA Jaro distance is 0.76667 for strings: DIXON DICKSONX Jaro distance is 0.89630 for strings: JELLYFISH SMELLYFISH Jaro distance is 0.82222 for strings: DWAYNE DUANE
Ruby
<lang ruby>def jaro(s, t)
return 1.0 if s == t s_len = s.size t_len = t.size match_distance = ([s_len, t_len].max / 2) - 1 s_matches = [] t_matches = [] matches = 0.0 s_len.times do |i| j_start = [0, i-match_distance].max j_end = [i+match_distance, t_len-1].min (j_start..j_end).each do |j| t_matches[j] && next s[i] == t[j] || next s_matches[i] = true t_matches[j] = true matches += 1.0 break end end return 0.0 if matches == 0.0 k = 0 transpositions = 0.0 s_len.times do |i| s_matches[i] || next k += 1 until t_matches[k] s[i] == t[k] || (transpositions += 1.0) k += 1 end ((matches / s_len) + (matches / t_len) + ((matches - transpositions/2.0) / matches)) / 3.0
end
%w(
MARTHA MARHTA DIXON DICKSONX JELLYFISH SMELLYFISH
).each_slice(2) do |s,t|
puts "jaro(#{s.inspect}, #{t.inspect}) = #{'%.10f' % jaro(s, t)}"
end</lang>
- Output:
jaro("MARTHA", "MARHTA") = 0.9444444444 jaro("DIXON", "DICKSONX") = 0.7666666667 jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963
Rust
<lang rust>use std::cmp;
pub fn jaro(s1: &str, s2: &str) -> f64 {
let s1_len = s1.len(); let s2_len = s2.len(); if s1_len == 0 && s2_len == 0 { return 1.0; } let match_distance = cmp::max(s1_len, s2_len) / 2 - 1; let mut s1_matches = vec![false; s1_len]; let mut s2_matches = vec![false; s2_len]; let mut m: isize = 0; for i in 0..s1_len { let start = cmp::max(0, i as isize - match_distance as isize) as usize; let end = cmp::min(i + match_distance + 1, s2_len); for j in start..end { if !s2_matches[j] && s1.as_bytes()[i] == s2.as_bytes()[j] { s1_matches[i] = true; s2_matches[j] = true; m += 1; break; } } } if m == 0 { return 0.0; } let mut t = 0.0; let mut k = 0; for i in 0..s1_len { if s1_matches[i] { while !s2_matches[k] { k += 1; } if s1.as_bytes()[i] != s2.as_bytes()[k] { t += 0.5; } k += 1; } }
let m = m as f64; (m / s1_len as f64 + m / s2_len as f64 + (m - t) / m) / 3.0
}
fn main() {
let pairs = [("MARTHA", "MARHTA"), ("DIXON", "DICKSONX"), ("JELLYFISH", "SMELLYFISH")]; for p in pairs.iter() { println!("{}/{} = {}", p.0, p.1, jaro(p.0, p.1)); }
}</lang>
- Output:
MARTHA/MARHTA = 0.9444444444444445 DIXON/DICKSONX = 0.7666666666666666 JELLYFISH/SMELLYFISH = 0.8962962962962964
Scala
<lang scala>object Jaro extends App {
def distance(s1: String, s2: String): Double = { val s1_len = s1.length val s2_len = s2.length if (s1_len == 0 && s2_len == 0) return 1.0 val match_distance = Math.max(s1_len, s2_len) / 2 - 1 val s1_matches = Array.ofDim[Boolean](s1_len) val s2_matches = Array.ofDim[Boolean](s2_len) var matches = 0 for (i <- 0 until s1_len) { val start = Math.max(0, i - match_distance) val end = Math.min(i + match_distance + 1, s2_len) start until end find { j => !s2_matches(j) && s1(i) == s2(j) } match { case Some(j) => s1_matches(i) = true s2_matches(j) = true matches += 1 case None => } } if (matches == 0) return 0.0 var t = 0.0 var k = 0 0 until s1_len filter s1_matches foreach { i => while (!s2_matches(k)) k += 1 if (s1(i) != s2(k)) t += 0.5 k += 1 }
val m = matches.toDouble (m / s1_len + m / s2_len + (m - t) / m) / 3.0 }
val strings = List(("MARTHA", "MARHTA"), ("DIXON", "DICKSONX"), ("JELLYFISH", "SMELLYFISH")) strings.foreach { s => println(distance(s._1, s._2)) }
}</lang>
Sidef
<lang ruby>func jaro(s, t) {
return 1 if (s == t)
var s_len = s.len var t_len = t.len
var match_distance = ((s_len `max` t_len) // 2 - 1)
var s_matches = [] var t_matches = []
var matches = 0 var transpositions = 0
for i (^s_len) { var start = (0 `max` i-match_distance) var end = (i+match_distance `min` t_len-1)
for k (start..end) { t_matches[k] && next s[i] == t[k] || next s_matches[i] = true t_matches[k] = true matches++ break } }
return 0 if (matches == 0)
var k = 0 for i (^s_len) { s_matches[i] || next while (!t_matches[k]) { ++k } s[i] == t[k] || ++transpositions ++k }
((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
}
for pair in [
[%c"MARTHA", %c"MARHTA"], [%c"DIXON", %c"DICKSONX"], [%c"JELLYFISH", %c"SMELLYFISH"],
] {
say "jaro(#{pair.map{.join.dump}.join(', ')}) = #{'%.10f' % jaro(pair...)}"
}</lang>
- Output:
jaro("MARTHA", "MARHTA") = 0.9444444444 jaro("DIXON", "DICKSONX") = 0.7666666667 jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963
Tcl
<lang Tcl>proc jaro {s1 s2} {
set l1 [string length $s1] set l2 [string length $s2] set dmax [expr {max($l1, $l2)/2 - 1}] ;# window size to scan for matches set m1 {} ;# match indices set m2 {} for {set i 0} {$i < $l1} {incr i} { set jmin [expr {$i - $dmax}] ;# don't worry about going out-of-bounds set jmax [expr {$i + $dmax}] ;# because [string index] will return {} safely for {set j $jmin} {$j <= $jmax} {incr j} { if {$j in $m2} continue ;# don't double-count matches if {[string index $s1 $i] eq [string index $s2 $j]} { lappend m1 $i lappend m2 $j break } } } set T 0 ;# number of transpositions set oj -1 foreach j $m2 { if {$j < $oj} {incr T} set oj $j } set T [expr {$T / 2.0}] set M [expr {1.0 * [llength $m1]}] ;# number of matches expr { ( ($M / $l1) + ($M / $l2) + (($M - $T) / $M) ) / 3.0 }
}
foreach {s t} {
DWAYNE DUANE MARTHA MARHTA DIXON DICKSONX JELLYFISH SMELLYFISH
} {
puts "[jaro $s $t]:\t$s / $t"
}</lang>
- Output:
0.8222222222222223: DWAYNE / DUANE 0.9722222222222222: MARTHA / MARHTA 0.7666666666666666: DIXON / DICKSONX 0.8962962962962964: JELLYFISH / SMELLYFISH
zkl
<lang zkl> //-->String of matched characters, ordered fcn _jaro(str1,str2, matchDistance){
cs:=Sink(String); foreach i,c in ([0..].zip(str1)){ str2.find(c,(0).max(i - matchDistance),i + matchDistance) : if(Void!=_) cs.write(c); } cs.close()
}
fcn jaro(str1,str2){
s1Len,s2Len,matchDistance := str1.len(), str2.len(), s1Len.max(s2Len)/2 - 1; cs12,cs21 := _jaro(str1,str2, matchDistance), _jaro(str2,str1, matchDistance);
matches:=cs12.len().toFloat(); if(not matches) return(0.0); transpositions:=cs12.walker().zipWith('!=,cs21).filter().sum(0)/2; ( matches/s1Len + matches/s2Len + ((matches - transpositions)/matches) ) / 3.0
}</lang> <lang zkl>foreach s,t in (T(
T("MARTHA","MARHTA"), T("DIXON","DICKSONX"), T("JELLYFISH","SMELLYFISH"))){ println(0'|jaro("%s","%s") = %.10f|.fmt(s,t,jaro(s,t)));
}</lang>
- Output:
jaro("MARTHA","MARHTA") = 0.9444444444 jaro("DIXON","DICKSONX") = 0.7666666667 jaro("JELLYFISH","SMELLYFISH") = 0.8962962963
ZX Spectrum Basic
<lang zxbasic>10 LET a$="MARTHA": LET b$="MARHTA": PRINT a$;", ";b$;": ";: GO SUB 1000: PRINT jaro 20 LET a$="DIXON": LET b$="DICKSONX": PRINT a$;", ";b$;": ";: GO SUB 1000: PRINT jaro 30 LET a$="JELLYFISH": LET b$="SMELLYFISH": PRINT a$;", ";b$;": ";: GO SUB 1000: PRINT jaro 900 STOP 1000 REM Jaro subroutine 1010 LET s1=LEN a$: LET s2=LEN b$: LET j=1: LET m=0: LET t=0 1030 IF s1>s2 THEN LET z$=a$: LET a$=b$: LET b$=z$: LET z=s1: LET s1=s2: LET s2=z 1035 LET maxdist=INT (s2/2) 1040 FOR i=1 TO s1 1050 IF a$(i)=b$(j) THEN LET m=m+1: LET b$(j)=" ": GO TO 2000 1080 FOR k=FN x(1,i-maxdist) TO FN n(s2,i+maxdist) 1090 IF a$(i)=b$(k) THEN LET t=t+1: LET m=m+1: LET b$(k)=" ": IF k>j THEN LET j=k 1100 NEXT k 2000 IF j<s2 THEN LET j=j+1: 2010 NEXT i 2020 IF m=0 THEN LET jaro=0: RETURN 2030 LET t=INT (t/2) 2040 LET jaro=(m/s1+m/s2+((m-t)/m))/3 2050 RETURN 5000 REM Functions 5010 DEF FN x(a,b)=(a AND a>b)+(b AND a<b)+(a AND a=b): REM max function 5020 DEF FN n(a,b)=(a AND a<b)+(b AND a>b)+(a AND a=b): REM min function</lang>
- Output:
MARTHA, MARHTA: 0.94444444 DIXON, DICKSONX: 0.76666667 JELLYFISH, SMELLYFISH: 0.8962963