Jaro-Winkler distance: Difference between revisions

Content added Content deleted
(Added C++ solution)
mNo edit summary
Line 72: Line 72:
:*   Comparing string similarity algorithms. [https://medium.com/@appaloosastore/string-similarity-algorithms-compared-3f7b4d12f0ff Comparison of algorithms on Medium]
:*   Comparing string similarity algorithms. [https://medium.com/@appaloosastore/string-similarity-algorithms-compared-3f7b4d12f0ff Comparison of algorithms on Medium]
<br><br>
<br><br>

=={{header|Elm}}==
{{Author | zh5}}
<lang Elm>module JaroWinkler exposing (similarity)


commonPrefixLength : List a -> List a -> Int -> Int
commonPrefixLength xs ys counter =
case ( xs, ys ) of
( x :: xs_, y :: ys_ ) ->
if x == y then
commonPrefixLength xs_ ys_ (counter + 1)

else
counter

_ ->
counter

similarity : String -> String -> Float
similarity s1 s2 =
let
chars1 =
String.toList s1

chars2 =
String.toList s2

jaroScore =
jaro chars1 chars2

l =
toFloat <| min (commonPrefixLength chars1 chars2 0) 4

p =
0.1
in
jaroScore + (l * p * (1.0 - jaroScore))


containtsInNextN : Int -> a -> List a -> Bool
containtsInNextN i a items =
case ( i, items ) of
( 0, _ ) ->
False

( _, [] ) ->
False

( _, item :: rest ) ->
if item == a then
True

else
containtsInNextN (i - 1) a rest


exists : Int -> Int -> List a -> a -> Bool
exists startAt endAt items i =
if endAt < startAt then
False

else if startAt == 0 then
case items of
first :: rest ->
if i == first then
True

else
exists 0 (endAt - 1) rest i

[] ->
False

else
exists 0 (endAt - startAt) (List.drop startAt items) i


existsInWindow : a -> List a -> Int -> Int -> Bool
existsInWindow item items offset radius =
let
startAt =
max 0 (offset - radius)

endAt =
min (offset + radius) (List.length items - 1)
in
exists startAt endAt items item


transpositions : List a -> List a -> Int -> Int
transpositions xs ys counter =
case ( xs, ys ) of
( [], _ ) ->
counter

( _, [] ) ->
counter

( x :: xs_, y :: ys_ ) ->
if x /= y then
transpositions xs_ ys_ (counter + 1)

else
transpositions xs_ ys_ counter


commonItems : List a -> List a -> Int -> List a
commonItems items1 items2 radius =
items1
|> List.indexedMap
(\index item ->
if existsInWindow item items2 index radius then
[ item ]

else
[]
)
|> List.concat


jaro : List Char -> List Char -> Float
jaro chars1 chars2 =
let
minLenth =
min (List.length chars1) (List.length chars2)

matchRadius =
minLenth // 2 + (minLenth |> modBy 2)

c1 =
commonItems chars1 chars2 matchRadius

c2 =
commonItems chars2 chars1 matchRadius

c1length =
toFloat (List.length c1)

c2length =
toFloat (List.length c2)

mismatches =
transpositions c1 c2 0

transpositionScore =
(toFloat mismatches + abs (c1length - c2length)) / 2.0

s1length =
toFloat (List.length chars1)

s2length =
toFloat (List.length chars2)

tLength =
max c1length c2length

result =
(c1length / s1length + c2length / s2length + (tLength - transpositionScore) / tLength) / 3.0
in
if isNaN result then
0.0

else
result
</lang>