Jaro similarity: Difference between revisions
(added FreeBASIC) |
(→{{header|Ruby}}: simplify) |
||
Line 829: | Line 829: | ||
=={{header|Ruby}}== |
=={{header|Ruby}}== |
||
<lang ruby>def jaro(s, t) |
<lang ruby>def jaro(s, t) |
||
⚫ | |||
s_len = s.size |
s_len = s.size |
||
t_len = t.size |
t_len = t.size |
||
⚫ | |||
match_distance = ([s_len, t_len].max / 2) - 1 |
match_distance = ([s_len, t_len].max / 2) - 1 |
||
s_matches = [] |
s_matches = [] |
||
t_matches = [] |
t_matches = [] |
||
matches = 0.0 |
matches = 0.0 |
||
⚫ | |||
⚫ | |||
⚫ | |||
j_start = [0, i-match_distance].max |
j_start = [0, i-match_distance].max |
||
j_end = [i+match_distance, t_len-1].min |
j_end = [i+match_distance, t_len-1].min |
||
⚫ | |||
(j_start..j_end).each do |j| |
(j_start..j_end).each do |j| |
||
t_matches[j] && next |
t_matches[j] && next |
||
Line 855: | Line 852: | ||
end |
end |
||
end |
end |
||
return 0.0 if matches == 0.0 |
return 0.0 if matches == 0.0 |
||
k = 0 |
k = 0 |
||
⚫ | |||
s_len.times do |i| |
|||
s_matches[i] || next |
s_matches[i] || next |
||
until t_matches[k] |
k += 1 until t_matches[k] |
||
k += 1 |
|||
⚫ | |||
s[i] == t[k] || (transpositions += 1.0) |
s[i] == t[k] || (transpositions += 1.0) |
||
k += 1 |
k += 1 |
||
end |
end |
||
((matches / s_len) + |
((matches / s_len) + |
||
(matches / t_len) + |
|||
((matches - transpositions/2.0) / matches)) / 3.0 |
|||
end |
end |
||
%w( |
|||
MARTHA MARHTA |
|||
DIXON DICKSONX |
|||
JELLYFISH |
JELLYFISH SMELLYFISH |
||
).each_slice(2) |
).each_slice(2) do |s,t| |
||
puts "jaro(#{s.inspect}, #{t.inspect}) = #{'%.10f' % jaro(s, t)}" |
puts "jaro(#{s.inspect}, #{t.inspect}) = #{'%.10f' % jaro(s, t)}" |
||
end</lang> |
end</lang> |
Revision as of 02:50, 16 October 2016
You are encouraged to solve this task according to the task description, using any language you may know.
The Jaro distance is a measure of similarity between two strings.
The higher the Jaro distance for two strings is, the more similar the strings are.
The score is normalized such that 0 equates to no similarity and 1 is an exact match.
- Definition
The Jaro distance of two given strings and is
Where:
- is the number of matching characters;
- is half the number of transpositions.
Two characters from and respectively, are considered matching only if they are the same and not farther than .
Each character of is compared with all its matching characters in .
The number of matching (but different sequence order) characters divided by 2 defines the number of transpositions.
- Example
Given the strings DWAYNE and DUANE we find:
We find a Jaro score of:
- Task
Implement the Jaro-distance algorithm and show the distances for each of the following pairs:
- ("MARTHA", "MARHTA")
- ("DIXON", "DICKSONX")
- ("JELLYFISH", "SMELLYFISH")
- See also
- Jaro–Winkler distance on Wikipedia.
C
<lang C>#include <stdlib.h>
- include <string.h>
- include <ctype.h>
- include <stdio.h>
- define TRUE 1
- define FALSE 0
- define max(a, b) ((a) > (b) ? (a) : (b))
- define min(a, b) ((a) < (b) ? (a) : (b))
double jaro(const char *str1, const char *str2) {
// length of the strings int str1_len = strlen(str1); int str2_len = strlen(str2);
// if both strings are empty return 1 // if only one of the strings is empty return 0 if (str1_len == 0) return str2_len == 0 ? 1.0 : 0.0;
// max distance between two chars to be considered matching // floor() is ommitted due to integer division rules int match_distance = (int) max(str1_len, str2_len)/2 - 1;
// arrays of bools that signify if that char in the matching string has a match int *str1_matches = calloc(str1_len, sizeof(int)); int *str2_matches = calloc(str2_len, sizeof(int));
// number of matches and transpositions double matches = 0.0; double transpositions = 0.0;
// find the matches for (int i = 0; i < str1_len; i++) { // start and end take into account the match distance int start = max(0, i - match_distance); int end = min(i + match_distance + 1, str2_len);
for (int k = start; k < end; k++) { // if str2 already has a match continue if (str2_matches[k]) continue; // if str1 and str2 are not if (str1[i] != str2[k]) continue; // otherwise assume there is a match str1_matches[i] = TRUE; str2_matches[k] = TRUE; matches++; break; } }
// if there are no matches return 0 if (matches == 0) { free(str1_matches); free(str2_matches); return 0.0; }
// count transpositions int k = 0; for (int i = 0; i < str1_len; i++) { // if there are no matches in str1 continue if (!str1_matches[i]) continue; // while there is no match in str2 increment k while (!str2_matches[k]) k++; // increment transpositions if (str1[i] != str2[k]) transpositions++; k++; }
// divide the number of transpositions by two as per the algorithm specs // this division is valid because the counted transpositions include both // instances of the transposed characters. transpositions /= 2.0;
// free the allocated memory free(str1_matches); free(str2_matches);
// return the Jaro distance return ((matches / str1_len) + (matches / str2_len) + ((matches - transpositions) / matches)) / 3.0;
}
int main() {
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));
}</lang>
- Output:
0.944444 0.766667 0.896296
Clojure
<lang clojure> (ns test-project-intellij.core
(:gen-class))
(defn find-matches [s t]
" find match locations in the two strings " " s_matches is set to true wherever there is a match in t and t_matches is set conversely " (let [s_len (count s) t_len (count t) match_distance (int (- (/ (max s_len t_len) 2) 1)) matches 0 transpositions 0 fn-start (fn [i] (max 0 (- i match_distance))) ; function to compute starting position fn-end (fn [i] (min (+ i match_distance 1) (- t_len 1))) ] ; function to compute end position (loop [i 0 start (fn-start i) end (fn-end i) k start s_matches (vec (repeat (count s) false)) t_matches (vec (repeat (count t) false)) matches 0]
(if (< i s_len)
(if (<= k end)
(if (get t_matches k) ; continue with next k (recur i start end (inc k) s_matches t_matches matches)
(if (= (get s i) (get t k)) ; match a position, so update matches, s_matches, t_matches to reflect match (recur (inc i) (fn-start (inc i)) (fn-end (inc i)) (fn-start (inc i)) (assoc s_matches i true) (assoc t_matches k true) (inc matches)) ; no match so try next k (recur i start end (inc k) s_matches t_matches matches)))
; End of k iterations, so increment i and set k to start based upon i (recur (inc i) (fn-start (inc i)) (fn-end (inc i)) (fn-start (inc i)) s_matches t_matches matches))
; End of i iterations [matches s_matches t_matches]))))
(defn count-transpositions [s t s_matches t_matches]
" Utility function to count the number of transpositions " (let [s_len (count s)] (loop [i 0 k 0 transpositions 0]
(if (< i s_len) ; still elements in s (since i < s_len) (if (not (get s_matches i nil)) ; skip to next i since there are no matches in s (recur (inc i) k transpositions) ; checking for match in t (if (not (get t_matches k nil)) ; keeping looping around as long as there are no matches in t (recur i (inc k) transpositions) (if (not= (get s i) (get t k)) ; increment transposition count (if strings don't equal at match location) (recur (inc i) (inc k) (inc transpositions)) ; was a match, so advance i and k without increasing transpositions count (recur (inc i) (inc k) transpositions)))) ; Return count transpositions))))
(defn jaro [s t]
" Main Jaro Distance routine" (if (= s t) 1 (let [[matches s_matches t_matches] (find-matches s t)] (if (= 0 matches) 0 (let [s_len (count s) t_len (count t) transpositions (count-transpositions s t s_matches t_matches)] (float (/ (+ (/ matches s_len) (/ matches t_len) (/ (- matches (/ transpositions 2)) matches)) 3)))))))
(println (jaro "MARTHA" "MARHTA"))
(println (jaro "DIXON" "DICKSONX"))
(println (jaro "JELLYFISH" "SMELLYFISH"))
</lang>
- Output:
0.9444444 0.76666665 0.8962963
FreeBASIC
<lang freebasic>' version 09-10-2016 ' compile with: fbc -s console
- Macro max(x, y)
IIf((x) > (y), (x), (y))
- EndMacro
- Macro min(x, y)
IIf((x) < (y), (x), (y))
- EndMacro
Function jaro(word1 As String, word2 As String) As Double
If Len(word1) > Len(word2) Then Swap word1, word2
Dim As Long i, j, j1, m, t Dim As Long s1 = Len(word1) Dim As Long s2 = Len(word2) Dim As Long max_dist = s2 \ 2 -1 ' integer division
For i = 0 To s1 -1 If word1[i] = word2[j] Then m = m +1 word2[j] = 32 Else For j1 = max(0, i - max_dist) To min(s2 -1, i + max_dist) If word1[i] = word2[j1] Then t = t +1 m = m +1 word2[j1] = 32 If j1 > j Then j = j1 End If Next End If j = j + 1 Next If m = 0 Then Return 0
t = t \ 2 Return (m / s1 + m / s2 + ((m - t) / m)) / 3
End Function
' ------=< MAIN >=------
Print Print " jaro (MARTHA, MARHTA) ="; jaro("MARTHA", "MARHTA") Print " jaro (DIXON, DICKSONX) ="; jaro("DIXON", "DICKSONX") Print " jaro (JELLYFISH, SMELLYFISH) ="; jaro("JELLYFISH", "SMELLYFISH")
' empty keyboard buffer
While Inkey <> "" : Wend
Print : Print "hit any key to end program"
Sleep
End</lang>
- Output:
jaro (MARTHA, MARHTA) = 0.9444444444444444 jaro (DIXON, DICKSONX) = 0.7666666666666667 jaro (JELLYFISH, SMELLYFISH) = 0.8962962962962963
Go
<lang go>package main
import "fmt"
func jaro(str1, str2 string) float64 {
if len(str1) == 0 && len(str2) == 0 { return 1 } if len(str1) == 0 || len(str2) == 0 { return 0 } match_distance := len(str1) if len(str2) > match_distance { match_distance = len(str2) } match_distance = match_distance/2 - 1 str1_matches := make([]bool, len(str1)) str2_matches := make([]bool, len(str2)) matches := 0. transpositions := 0. for i := range str1 { start := i - match_distance if start < 0 { start = 0 } end := i + match_distance + 1 if end > len(str2) { end = len(str2) } for k := start; k < end; k++ { if str2_matches[k] { continue } if str1[i] != str2[k] { continue } str1_matches[i] = true str2_matches[k] = true matches++ break } } if matches == 0 { return 0 } k := 0 for i := range str1 { if !str1_matches[i] { continue } for !str2_matches[k] { k++ } if str1[i] != str2[k] { transpositions++ } k++ } transpositions /= 2 return (matches/float64(len(str1)) + matches/float64(len(str2)) + (matches-transpositions)/matches) / 3
}
func main() {
fmt.Printf("%f\n", jaro("MARTHA", "MARHTA")) fmt.Printf("%f\n", jaro("DIXON", "DICKSONX")) fmt.Printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"))
}</lang>
- Output:
0.944444 0.766667 0.896296
J
Implementation:
<lang J>jaro=: dyad define
d=. ((x >.&# y)%2)-1 e=. (x =/y) * d >: |x -/&(i.@#) y xm=. (+./"1 e)#x ym=. (+./"2 e)#y m=. xm <.&# ym t=. (+/xm ~:&(m&{.) ym)%2 s1=. #x s2=. #y ((m%s1)+(m%s2)+(m-t)%m)%3
)</lang>
Task examples:
<lang J> 'MARTHA' jaro 'MARHTA' 0.944444
'DIXON' jaro 'DICKSONX'
0.766667
'JELLYFISH' jaro 'SMELLYFISH'
0.896296</lang>
Java
<lang java>public class JaroDistance {
public static double jaro(String s, String t) { int s_len = s.length(); int t_len = t.length();
if (s_len == 0 && t_len == 0) return 1;
int match_distance = Integer.max(s_len, t_len) / 2 - 1;
boolean[] s_matches = new boolean[s_len]; boolean[] t_matches = new boolean[t_len];
int matches = 0; int transpositions = 0;
for (int i = 0; i < s_len; i++) { int start = Integer.max(0, i-match_distance); int end = Integer.min(i+match_distance+1, t_len);
for (int j = start; j < end; j++) { if (t_matches[j]) continue; if (s.charAt(i) != t.charAt(j)) continue; s_matches[i] = true; t_matches[j] = true; matches++; break; } }
if (matches == 0) return 0;
int k = 0; for (int i = 0; i < s_len; i++) { if (!s_matches[i]) continue; while (!t_matches[k]) k++; if (s.charAt(i) != t.charAt(k)) transpositions++; k++; }
return (((double)matches / s_len) + ((double)matches / t_len) + (((double)matches - transpositions/2.0) / matches)) / 3.0; }
public static void main(String[] args) { System.out.println(jaro( "MARTHA", "MARHTA")); System.out.println(jaro( "DIXON", "DICKSONX")); System.out.println(jaro("JELLYFISH", "SMELLYFISH")); }
}</lang>
- Output:
0.9444444444444445 0.7666666666666666 0.8962962962962964
PARI/GP
This version was translated from Java and Perl.
<lang parigp> \\Jaro distance between 2 strings s1 and s2. \\ 4/12/16 aev jaroDist(s1,s2)={ my(vt1=Vecsmall(s1),vt2=Vecsmall(s2),n1=#s1,n2=#s2,d,
md=max(n1,n2)\2-1,cs,ce,mc=0,tr=0,k=1,ds, s1m=vector(n1,z,0),s2m=vector(n2,z,0));
if(!n1||!n2, return(0)); for(i=1,n1,
cs=max(1,i-md); ce=min(i+md+1,n2); for(j=cs,ce, if(s2m[j],next); if(vt1[i]!=vt2[j], next); mc++; s1m[i]=1; s2m[j]=1; break; );\\fend j
);\\fend i if(!mc, return(0)); for(i=1,n1,
if(!s1m[i], next); while(!s2m[k], k++); if(vt1[i]!=vt2[k], tr++); k++
);\\fend i d=(mc/n1+mc/n2+(mc-tr/2)/mc)/3.0; ds=Strprintf("%.5f",d); print(" *** Jaro distance is: ",ds," for strings: ",s1,", ",s2); return(d); }
{ \\ Testing: jaroDist("MARTHA","MARHTA"); jaroDist("DIXON","DICKSONX"); jaroDist("JELLYFISH","SMELLYFISH"); jaroDist("DWAYNE","DUANE"); } </lang>
- Output:
*** Jaro distance is: 0.94444 for strings: MARTHA, MARHTA *** Jaro distance is: 0.76667 for strings: DIXON, DICKSONX *** Jaro distance is: 0.89630 for strings: JELLYFISH, SMELLYFISH *** Jaro distance is: 0.82222 for strings: DWAYNE, DUANE
Perl
<lang perl>use List::Util qw(min max);
sub jaro {
my ($s, $t) = @_;
my $s_len = length($s); my $t_len = length($t);
return 1 if $s_len == 0 and $t_len == 0;
my $match_distance = int(max($s_len, $t_len) / 2) - 1;
my @s_matches; my @t_matches;
my @s = split(//, $s); my @t = split(//, $t);
my $matches = 0; foreach my $i (0 .. $#s) {
my $start = max(0, $i - $match_distance); my $end = min($i + $match_distance + 1, $t_len);
foreach my $j ($start .. $end - 1) { $t_matches[$j] and next; $s[$i] eq $t[$j] or next; $s_matches[$i] = 1; $t_matches[$j] = 1; $matches++; last; } }
return 0 if $matches == 0;
my $k = 0; my $transpositions = 0;
foreach my $i (0 .. $#s) { $s_matches[$i] or next; until ($t_matches[$k]) { ++$k } $s[$i] eq $t[$k] or ++$transpositions; ++$k; }
(($matches / $s_len) + ($matches / $t_len) + (($matches - $transpositions / 2) / $matches)) / 3;
}
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
- Output:
0.944444 0.766667 0.896296
Perl 6
<lang perl6>sub jaro ($s, $t) {
return 1 if $s eq $t;
my $s_len = + my @s = $s.comb; my $t_len = + my @t = $t.comb;
my $match_distance = ($s_len max $t_len) div 2 - 1;
my @s_matches; my @t_matches; my $matches = 0;
for ^@s -> $i {
my $start = 0 max $i - $match_distance; my $end = $i + $match_distance min $t_len;
for $start .. $end -> $j { @t_matches[$j] and next; @s[$i] eq @t[$j] or next; @s_matches[$i] = 1; @t_matches[$j] = 1; $matches++; last; } }
return 0 if $matches == 0;
my $k = 0; my $transpositions = 0;
for ^@s -> $i { @s_matches[$i] or next; until @t_matches[$k] { ++$k } @s[$i] eq @t[$k] or ++$transpositions; ++$k; }
($matches / $s_len + $matches / $t_len + (($matches - $transpositions / 2) / $matches)) / 3;
}
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
- Output:
0.944444 0.766667 0.896296
Python
<lang python>from __future__ import division
def jaro(s, t):
s_len = len(s) t_len = len(t)
if s_len == 0 and t_len == 0: return 1
match_distance = (max(s_len, t_len) // 2) - 1
s_matches = [False] * s_len t_matches = [False] * t_len
matches = 0 transpositions = 0
for i in range(s_len): start = max(0, i-match_distance) end = min(i+match_distance+1, t_len)
for j in range(start, end): if t_matches[j]: continue if s[i] != t[j]: continue s_matches[i] = True t_matches[j] = True matches += 1 break
if matches == 0: return 0
k = 0 for i in range(s_len): if not s_matches[i]: continue while not t_matches[k]: k += 1 if s[i] != t[k]: transpositions += 1 k += 1
return ((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
for s,t in [( 'MARTHA', 'MARHTA'),
( 'DIXON', 'DICKSONX'), ('JELLYFISH', 'SMELLYFISH')]: print("jaro(%r, %r) = %.10f" % (s, t, jaro(s, t)))</lang>
- Output:
jaro('MARTHA', 'MARHTA') = 0.9444444444 jaro('DIXON', 'DICKSONX') = 0.7666666667 jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963
Racket
(kinda)
Returns an exact value for the Jaro distance.
<lang racket>#lang racket/base
(require data/bit-vector)
(define (jaro-distance str1 str2)
(define str1-len (string-length str1)) (define str2-len (string-length str2)) (cond [(and (zero? str1-len) (zero? str2-len)) 0] [(or (zero? str1-len) (zero? str2-len)) 1] [else ;; vectors of bools that signify if that char in the matching string has a match (define str1-matches (make-bit-vector str1-len)) (define str2-matches (make-bit-vector str2-len)) (define matches ;; max distance between two chars to be considered matching (let ((match-distance (sub1 (quotient (max str1-len str2-len) 2)))) (for/fold ((matches 0)) ((i (in-range 0 str1-len)) (c1 (in-string str1))) (define start (max 0 (- i match-distance))) (define end (min (+ i match-distance 1) str2-len)) (for/fold ((matches matches)) ((k (in-range start end)) (c2 (in-string str2 start)) #:unless (bit-vector-ref str2-matches k) ; if str2 already has a match continue #:when (char=? c1 c2) ; if str1 and str2 are not #:final #t) ;; otherwise assume there is a match (bit-vector-set! str1-matches i #t) (bit-vector-set! str2-matches k #t) (add1 matches))))) (cond [(zero? matches) 0] [else (define-values (transpositions*2 k+) (for/fold ((transpositions 0) (k 0)) ((i (in-range 0 str1-len)) (c1 (in-string str1)) (b1 (in-bit-vector str1-matches)) ;; if there are no matches in str1 continue #:when b1) (define k+ (for/first ((k+ (in-range k str2-len)) (b2 (in-bit-vector str2-matches k)) #:when b2) k+)) (values (+ transpositions (if (char=? c1 (string-ref str2 k+)) 0 1)) ; increment transpositions (add1 k+)))) ;; while there is no match in str2 increment k ;; divide the number of transpositions by two as per the algorithm specs ;; this division is valid because the counted transpositions include both ;; instances of the transposed characters. (define transpositions (quotient transpositions*2 2)) ;; return the Jaro distance (/ (+ (/ matches str1-len) (/ matches str2-len) (/ (- matches transpositions) matches)) 3)])]))
(module+ test
(jaro-distance "MARTHA" "MARHTA"); 0.944444 (exact->inexact (jaro-distance "MARTHA" "MARHTA")); 0.944444 (jaro-distance "DIXON" "DICKSONX"); 0.766667 (exact->inexact (jaro-distance "DIXON" "DICKSONX")); 0.766667 (jaro-distance "JELLYFISH" "SMELLYFISH"); 0.896296 (exact->inexact (jaro-distance "JELLYFISH" "SMELLYFISH"))); 0.896296</lang>
- Output:
The exact->inexact
calls in the tests give an inexact, floating point version of the rational values.
17/18 0.9444444444444444 23/30 0.7666666666666667 121/135 0.8962962962962963
REXX
<lang rexx>/*REXX program computes the Jaro distance between two strings (or a list of strings).*/ @.= /*define a default for the @. array. */ parse arg @.1 /*obtain an optional character string. */ if @.1= then do /*nothing specified? Use the defaults.*/
@.1= 'MARTHA MARHTA' @.2= 'DIXON DICKSONX' @.3= 'JELLYFISH SMELLYFISH' @.4= 'DWAYNE DUANE' end
do j=1 while @.j\== /*process all the strings in the list. */ d=jaroDist(@.j) say 'Jaro distance is ' format(d, , 5) " for strings: " @.j end /*j*/ /* └──── digits past the decimal point.*/
exit /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ jaroDist: procedure; arg s.1 s.2 .; L1=length(s.1); L2=length(s.2); m=0
if L1==0 | L2==0 then return 0 /*check if any string is a null string.*/ f=max(L1,L2)%2 - 1 /*calculate furthest distanced allowed.*/ r.=0 /* [↓] see if the char is near enough.*/ do k=1 for L1; p=pos(substr(s.1, k, 1), s.2, max(1, k-f)); r.k=p if p\==0 & abs(p-k)<=f then m=m+1 /*if near enough, count it as a match. */ else r.k=0 /* ··· otherwise, don't count it.*/ end /*k*/ t=0 do o=1 for L1; om=o-1 if pos( substr(s.1, o, 1), s.2)==0 | r.o==0 | r.om==0 then iterate if r.o<r.om then t=t+1 end /*o*/ /* [↑] count number of transpositions.*/
if m==0 then return 0 return (m/L1 + m/L2 + (m-t)/m) / 3</lang>
output when using the default inputs:
Jaro distance is 0.94444 for strings: MARTHA MARHTA Jaro distance is 0.76667 for strings: DIXON DICKSONX Jaro distance is 0.89630 for strings: JELLYFISH SMELLYFISH Jaro distance is 0.82222 for strings: DWAYNE DUANE
Ruby
<lang ruby>def jaro(s, t)
return 1.0 if s == t s_len = s.size t_len = t.size match_distance = ([s_len, t_len].max / 2) - 1 s_matches = [] t_matches = [] matches = 0.0 s_len.times do |i| j_start = [0, i-match_distance].max j_end = [i+match_distance, t_len-1].min (j_start..j_end).each do |j| t_matches[j] && next s[i] == t[j] || next s_matches[i] = true t_matches[j] = true matches += 1.0 break end end return 0.0 if matches == 0.0 k = 0 transpositions = 0.0 s_len.times do |i| s_matches[i] || next k += 1 until t_matches[k] s[i] == t[k] || (transpositions += 1.0) k += 1 end ((matches / s_len) + (matches / t_len) + ((matches - transpositions/2.0) / matches)) / 3.0
end
%w(
MARTHA MARHTA DIXON DICKSONX JELLYFISH SMELLYFISH
).each_slice(2) do |s,t|
puts "jaro(#{s.inspect}, #{t.inspect}) = #{'%.10f' % jaro(s, t)}"
end</lang>
- Output:
jaro("MARTHA", "MARHTA") = 0.9444444444 jaro("DIXON", "DICKSONX") = 0.7666666667 jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963
Sidef
<lang ruby>func jaro(s, t) {
return 1 if (s == t)
var s_len = s.len var t_len = t.len
var match_distance = ((::max(s_len, t_len) // 2) - 1)
var s_matches = [] var t_matches = []
var matches = 0 var transpositions = 0
for i in ^s_len { var start = ::max(0, i-match_distance) var end = ::min(i+match_distance, t_len-1)
for k in (start .. end) { t_matches[k] && next s[i] == t[k] || next s_matches[i] = true t_matches[k] = true matches++ break } }
return 0 if (matches == 0)
var k = 0 for i in ^s_len { s_matches[i] || next while (!t_matches[k]) { ++k } s[i] == t[k] || ++transpositions ++k }
((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
}
for pair in [
[%c"MARTHA", %c"MARHTA"], [%c"DIXON", %c"DICKSONX"], [%c"JELLYFISH", %c"SMELLYFISH"],
] {
say "jaro(#{pair.map{.join.dump}.join(', ')}) = #{'%.10f' % jaro(pair...)}"
}</lang>
- Output:
jaro("MARTHA", "MARHTA") = 0.9444444444 jaro("DIXON", "DICKSONX") = 0.7666666667 jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963
zkl
<lang zkl> //-->String of matched characters, ordered fcn _jaro(str1,str2, matchDistance){
cs:=Sink(String); foreach i,c in ([0..].zip(str1)){ str2.find(c,(0).max(i - matchDistance),i + matchDistance) : if(Void!=_) cs.write(c); } cs.close()
}
fcn jaro(str1,str2){
s1Len,s2Len,matchDistance := str1.len(), str2.len(), s1Len.max(s2Len)/2 - 1; cs12,cs21 := _jaro(str1,str2, matchDistance), _jaro(str2,str1, matchDistance);
matches:=cs12.len().toFloat(); if(not matches) return(0.0); transpositions:=cs12.walker().zipWith('!=,cs21).filter().sum(0)/2; ( matches/s1Len + matches/s2Len + ((matches - transpositions)/matches) ) / 3.0
}</lang> <lang zkl>foreach s,t in (T(
T("MARTHA","MARHTA"), T("DIXON","DICKSONX"), T("JELLYFISH","SMELLYFISH"))){ println(0'|jaro("%s","%s") = %.10f|.fmt(s,t,jaro(s,t)));
}</lang>
- Output:
jaro("MARTHA","MARHTA") = 0.9444444444 jaro("DIXON","DICKSONX") = 0.7666666667 jaro("JELLYFISH","SMELLYFISH") = 0.8962962963