Jaro-Winkler distance: Difference between revisions

julia example
(→‎{{header|Raku}}: Add a Raku example)
(julia example)
Line 72:
:*   Comparing string similarity algorithms. [https://medium.com/@appaloosastore/string-similarity-algorithms-compared-3f7b4d12f0ff Comparison of algorithms on Medium]
<br><br>
 
 
=={{header|Julia}}==
<lang julia># download("http://users.cs.duke.edu/~ola/ap/linuxwords", "linuxwords.txt")
const words = read("linuxwords.txt", String) |> split .|> strip
 
function jarowinklerdistance(s1, s2)
if length(s1) < length(s2)
s1, s2 = s2, s1
end
len1, len2 = length(s1), length(s2)
len2 == 0 && return 0.0
delta = max(0, len2 ÷ 2 - 1)
flag = zeros(Bool, len2) # flags for possible transpositions, begin as false
ch1_match = eltype(s1)[]
for (i, ch1) in enumerate(s1)
for (j, ch2) in enumerate(s2)
if (j <= i + delta) && (j >= i - delta) && (ch1 == ch2) && !flag[j]
flag[j] = true
push!(ch1_match, ch1)
break
end
end
end
matches = length(ch1_match)
matches == 0 && return 1.0
transpositions, i = 0, 0
for (j, ch2) in enumerate(s2)
if flag[j]
i += 1
transpositions += (ch2 != ch1_match[i])
end
end
jaro = (matches / len1 + matches / len2 + (matches - transpositions/2) / matches) / 3.0
commonprefix = count(i -> s1[i] == s2[i], 1:min(len2, 4))
return 1 - (jaro + commonprefix * 0.1 * (1 - jaro))
end
 
function closewords(s, maxdistance, maxtoreturn)
jw = 0.0
arr = [(w, jw) for w in words if (jw = jarowinklerdistance(s, w)) <= maxdistance]
sort!(arr, lt=(x, y) -> x[2] < y[2])
return length(arr) <= maxtoreturn ? arr : arr[1:maxtoreturn]
end
 
for s in ["accomodate", "definately", "goverment", "occured", "publically",
"recieve", "seperate", "untill", "wich"]
println("\nClose dictionary words ( distance < 0.15 using Jaro-Winkler distance) to '$s' are:")
println(" Word | Distance")
for (w, jw) in closewords(s, 0.15, 5)
println(rpad(w, 14), "| ", Float16(jw))
end
end
</lang>{{out}}
<pre>
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to 'accomodate' are:
Word | Distance
accommodate | 0.01819
accommodated | 0.03333
accommodates | 0.03333
accommodating | 0.08154
accommodation | 0.08154
 
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to 'definately' are:
Word | Distance
definitely | 0.04
defiantly | 0.04224
define | 0.08
definite | 0.085
definable | 0.0872
 
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to 'goverment' are:
Word | Distance
government | 0.05334
govern | 0.06665
governments | 0.0697
movement | 0.081
governmental | 0.0833
 
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to 'occured' are:
Word | Distance
occurred | 0.025
occur | 0.05713
occupied | 0.07855
occurs | 0.09045
accursed | 0.0917
 
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to 'publically' are:
Word | Distance
publicly | 0.04
public | 0.08
publicity | 0.10443
publication | 0.1327
biblically | 0.14
 
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to 'recieve' are:
Word | Distance
receive | 0.03333
received | 0.0625
receiver | 0.0625
receives | 0.0625
relieve | 0.06665
 
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to 'seperate' are:
Word | Distance
desperate | 0.07086
separate | 0.0917
temperate | 0.1042
separated | 0.1144
separates | 0.1144
 
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to 'untill' are:
Word | Distance
until | 0.03333
untie | 0.1067
untimely | 0.10834
Antilles | 0.1263
untidy | 0.1333
 
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to 'wich' are:
Word | Distance
witch | 0.05334
which | 0.06
witches | 0.11426
rich | 0.11664
wick | 0.11664
</pre>
 
 
4,103

edits