Jaro-Winkler distance: Difference between revisions

m
typo
No edit summary
m (typo)
 
(6 intermediate revisions by 4 users not shown)
Line 2:
 
The Jaro-Winkler distance is a metric for measuring the edit distance between words.
It is similar to the more basic LevensteinLevenshtein distance but the Jaro distance also accounts
for transpositions between letters in the words. With the Winkler modification to the Jaro
metric, the Jaro-Winkler distance also adds an increase in similarity for words which
Line 76:
{{trans|Python}}
 
<langsyntaxhighlight lang="11l">V WORDS = File(‘linuxwords.txt’).read_lines()
V MISSPELLINGS = [‘accomodate’,
‘definately’,
Line 123:
print("\nClose dictionary words ( distance < 0.15 using Jaro-Winkler distance) to \" "STR" \" are:\n Word | Distance")
L(w) within_distance(0.15, STR, 5)
print(‘#14 | #.4’.format(w, jaro_winkler_distance(STR, w)))</langsyntaxhighlight>
 
{{out}}
Line 155:
=={{header|Elm}}==
Author: zh5
<langsyntaxhighlight Elmlang="elm">module JaroWinkler exposing (similarity)
 
 
Line 317:
else
result
</syntaxhighlight>
</lang>
 
=={{header|ALGOL 68}}==
Line 326:
<br>
Prints the 6 closest matches regarddless of their distance (i.e. we don't restrict it to matches closer that 0.15).
<langsyntaxhighlight lang="algol68">PROC jaro sim = ( STRING sp1, sp2 )REAL:
IF STRING s1 = sp1[ AT 0 ];
STRING s2 = sp2[ AT 0 ];
Line 457:
print( ( newline ) )
OD
FI</langsyntaxhighlight>
{{out}}
<pre>
Line 535:
=={{header|C++}}==
{{trans|Swift}}
<langsyntaxhighlight lang="cpp">#include <algorithm>
#include <cstdlib>
#include <fstream>
Line 638:
}
return EXIT_SUCCESS;
}</langsyntaxhighlight>
 
{{out}}
Line 718:
=={{header|F_Sharp|F#}}==
This task uses [http://www.rosettacode.org/wiki/Jaro_distance#F.23 Jaro Distance (F#)]
<langsyntaxhighlight lang="fsharp">
// Calculate Jaro-Winkler Similarity of 2 Strings. Nigel Galloway: August 7th., 2020
let Jw P n g=let L=float(let i=Seq.map2(fun n g->n=g) n g in (if Seq.length i>4 then i|>Seq.take 4 else i)|>Seq.takeWhile id|>Seq.length)
Line 727:
["accomodate";"definately";"goverment";"occured";"publically";"recieve";"seperate";"untill";"wich"]|>
List.iter(fun n->printfn "%s" n;fN n|>Array.take 5|>Array.iter(fun n->printf "%A" n);printfn "\n")
</syntaxhighlight>
</lang>
{{out}}
<pre>
Line 760:
=={{header|Go}}==
This uses unixdict and borrows code from the [[Jaro_distance#Go]] task. Otherwise it is a translation of the Wren entry.
<langsyntaxhighlight lang="go">package main
 
import (
Line 889:
fmt.Println()
}
}</langsyntaxhighlight>
 
{{out}}
Line 967:
Implementation:
 
<langsyntaxhighlight Jlang="j">jaro=: {{
Eq=. (x=/y)*(<.<:-:x>.&#y)>:|x -/&i.&# y
xM=. (+./"1 Eq)#x
Line 981:
simj=. x jaro y
-.simj + l*p*-.simj
}}</langsyntaxhighlight>
 
Task example:
 
<langsyntaxhighlight Jlang="j">task=: {{
words=. <;._2 fread '/usr/share/dict/words'
for_word. ;:'accomodate definately goverment occured publically recieve seperate untill wich' do.
Line 1,040:
0.0533333 winch
0.0533333 witch
</syntaxhighlight>
</lang>
=={{header|Java}}==
{{trans|C++}}
<langsyntaxhighlight lang="java">import java.io.*;
import java.util.*;
 
Line 1,152:
}
}
}</langsyntaxhighlight>
 
{{out}}
Line 1,236:
This entry, which uses unixdict.txt, borrows the implementation in jq of the Jaro similarity measure as defined at
[[Jaro_similarity#jq]]; since it is quite long, it is not repeated here.
<langsyntaxhighlight lang="jq"># See [[Jaro_similarity#jq]] for the implementation of jaro/2
 
def length_of_common_prefix($s1; $s2):
Line 1,267:
(.[] | "\(.[0] | lpad(21)) : \(.[-1] * 1000 | round / 1000)") ;
 
task</langsyntaxhighlight>
{{out}}
Invocation: jq -rRn -f program.jq unixdict.txt
Line 1,322:
 
=={{header|Julia}}==
<langsyntaxhighlight lang="julia"># download("http://users.cs.duke.edu/~ola/ap/linuxwords", "linuxwords.txt")
const words = read("linuxwords.txt", String) |> split .|> strip
 
Line 1,372:
end
end
</langsyntaxhighlight>{{out}}
<pre>
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to 'accomodate' are:
Line 1,448:
 
=={{header|Mathematica}}/{{header|Wolfram Language}}==
<langsyntaxhighlight Mathematicalang="mathematica">ClearAll[JWD]
JWD[a_][b_]:=Experimental`JaroWinklerDistance[a,b]
dict=DictionaryLookup[];
Line 1,459:
TakeSmallestBy[dict->{"Element","Value"},JWD["seperate"],5]//Grid
TakeSmallestBy[dict->{"Element","Value"},JWD["untill"],5]//Grid
TakeSmallestBy[dict->{"Element","Value"},JWD["wich"],5]//Grid</langsyntaxhighlight>
{{out}}
<pre>accommodate 0.0181818
Line 1,517:
=={{header|Nim}}==
{{trans|Go}}
<langsyntaxhighlight Nimlang="nim">import lenientops
 
func jaroSim(s1, s2: string): float =
Line 1,589:
echo &"{c.dist:0.4f} {c.word}"
if i == 5: break
echo()</langsyntaxhighlight>
 
{{out}}
Line 1,662:
 
=={{header|Perl}}==
<langsyntaxhighlight lang="perl">use strict;
use warnings;
use List::Util qw(min max head);
Line 1,714:
printf "%15s : %0.4f\n", $_, $J{$_}
for head 5, sort { $J{$a} <=> $J{$b} or $a cmp $b } grep { $J{$_} < 0.15 } keys %J;
}</langsyntaxhighlight>
{{out}}
<pre style="height:40ex">Closest 5 dictionary words with a Jaro-Winkler distance < .15 from 'accomodate':
Line 1,781:
=={{header|Phix}}==
Uses jaro() from [[Jaro_distance#Phix]] (reproduced below for your convenience) and the standard unix_dict()
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #008080;">function</span> <span style="color: #000000;">jaro</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">str1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">str2</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">str1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">upper</span><span style="color: #0000FF;">(</span><span style="color: #000000;">str1</span><span style="color: #0000FF;">))</span>
Line 1,858:
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<!--</langsyntaxhighlight>-->
Output identical to <del>Go/Wren</del> Algol68
 
=={{header|Python}}==
<langsyntaxhighlight lang="python">"""
Test Jaro-Winkler distance metric.
linuxwords.txt is from http://users.cs.duke.edu/~ola/ap/linuxwords
Line 1,928:
for w in within_distance(0.15, STR, 5):
print('{:>14} | {:6.4f}'.format(w, jaro_winkler_distance(STR, w)))
</langsyntaxhighlight>{{out}}
<pre>
Close dictionary words ( distance < 0.15 using Jaro-Winkler distance) to " accomodate " are:
Line 2,009:
using the unixdict.txt file from www.puzzlers.org
 
<syntaxhighlight lang="raku" perl6line>sub jaro-winkler ($s, $t) {
 
return 0 if $s eq $t;
Line 2,070:
 
printf "%15s : %0.4f\n", .key, .value for %result.grep({ .value < .15 }).sort({+.value, ~.key}).head(5);
}</langsyntaxhighlight>
{{out}}
<pre>Closest 5 dictionary words with a Jaro-Winkler distance < .15 from accomodate:
Line 2,136:
=={{header|Rust}}==
{{trans|Python}}
<langsyntaxhighlight lang="rust">use std::fs::File;
use std::io::{self, BufRead};
 
Line 2,246:
Err(error) => eprintln!("{}", error),
}
}</langsyntaxhighlight>
 
{{out}}
Line 2,326:
=={{header|Swift}}==
{{trans|Rust}}
<langsyntaxhighlight lang="swift">import Foundation
 
func loadDictionary(_ path: String) throws -> [String] {
Line 2,415:
} catch {
print(error.localizedDescription)
}</langsyntaxhighlight>
 
{{out}}
Line 2,493:
</pre>
 
=={{header|VlangTypescript}}==
{{trans|goJava}}
<syntaxhighlight lang="typescript">
<lang vlang>import os
var fs = require('fs')
 
// Jaro Winkler Distance Formula
function jaroDistance(string1: string, string2: string): number{
// Compute Jaro-Winkler distance between two string
// Swap strings if string1 is shorter than string 2
if (string1.length < string2.length){
const tempString: string = string1;
string1 = string2;
string2 = tempString
}
let len1: number = string1.length
let len2: number = string2.length
if (!len2){
return 0.0
}
const delta: number = Math.max(1, len1 / 2.0) - 1.0;
// Flags for transpositions
let flag: boolean[] = Array(len2).fill(false)
let ch1Match: string[] = Array(len1).fill('')
// Count number of matching characters
let matches = 0
// Check if characters on both string matches
for (let i: number = 0; i < len1; i++){
const ch1: string = string1[i]
for (let j = 0; j < len2; j++){
const ch2: string = string2[j]
if (j <= i + delta && j + delta >= 1 && ch1 == ch2 && !flag[j]){
flag[j] = true
ch1Match[matches++] = ch1;
break;
}
}
}
if (!matches){
return 1.0
}
// Count number of transpositions (shared characters placed in different positions)
let transpositions: number = 0.0
for (let i: number = 0, j: number = 0; j < len2; j++){
if (flag[j]){
if (string2[j] != ch1Match[i]){
transpositions++
}
i++
}
}
const m: number = matches
// Jaro Similarity Formula simj = ( (m / length of s1) + (m / length of s2) + (m - t) / m ) / 3
const jaro: number = (m / len1 + m / len2 + (m - transpositions / 2.0) / m) / 3.0
// Length of common prefix between string up to 4 characters
let commonPrefix: number = 0.0
len2 = Math.min(4, len2)
for (let i: number = 0; i < len2; i++){
if (string1[i] == string2[i]){
commonPrefix++
}
}
// Jaro Winkler Distance Formula simw = simj + lp(1 - simj)
return 1.0 - (jaro + commonPrefix * 0.1 * (1.0 - jaro))
}
 
// Compute Jaro Winkler Distance for every word on the dictionary against the misspelled word
function withinDistance(words: string[] ,maxDistance: number, string: string, maxToReturn: number): (string | number)[][]{
let result: (string | number)[][] = new Array()
words.forEach(word =>{
const distance = jaroDistance(word, string)
// check if computed jaro winkler distance is within the set distance parameter
if (distance <= maxDistance){
const tuple = [distance, word]
result.push(tuple)
}
})
result.sort()
// Limit of matches set to maxtoReturn
return result.length <= maxToReturn ? result : result.slice(0, maxToReturn)
}
 
function loadDictionary(fileName: string): string[]{
let words: string[] = new Array()
try{
//attacomsian.com/blog/reading-a-file-line-by-line-in-nodejs
const data = fs.readFileSync(fileName, 'utf-8')
const lines: string[] = data.split(/\r?\n/)
lines.forEach(line => {
words.push(line)
})
return words
}
catch(error){
console.log("Error reading dictionary")
}
}
 
function main(): void{
try {
const misspellings = [
"accomodate​",
"definately​",
"goverment",
"occured",
"publically",
"recieve",
"seperate",
"untill",
"wich"
]
//unixdict.txt from users.cs.duke.edu/~ola/ap/linuxwords
let words: string[] = loadDictionary("unixdict.txt")
 
misspellings.forEach(spelling =>{
console.log("Misspelling:", spelling)
const closeWord = withinDistance(words, 0.15, spelling, 5)
closeWord.forEach(word =>{
console.log((word[0] as number).toFixed(4) + " " + word[1])
})
console.log("")
})
}
catch(error) {
console.log("Error on main")
}
}
main();
</syntaxhighlight>
{{out}}
<pre>
Misspelling: accomodate​
0.0364 accommodate
0.0515 accommodated
0.0515 accommodates
0.0979 accommodating
0.0979 accommodation
 
Misspelling: definately​
0.0564 definitely
0.0586 defiantly
0.0909 define
0.0977 definite
0.1013 defiant
 
Misspelling: goverment
0.0533 government
0.0667 govern
0.0697 governments
0.0833 governmental
0.0952 governs
 
Misspelling: occured
0.0250 occurred
0.0571 occur
0.0786 occupied
0.0905 occurs
0.0917 accursed
 
Misspelling: publically
0.0400 publicly
0.0800 public
0.1044 publicity
0.1327 publication
0.1400 biblically
 
Misspelling: recieve
0.0333 receive
0.0625 received
0.0625 receiver
0.0625 receives
0.0667 relieve
 
Misspelling: seperate
0.0708 desperate
0.1042 temperate
0.1083 separate
0.1167 repeated
0.1167 sept
 
Misspelling: untill
0.0333 until
0.1067 untie
0.1083 untimely
0.1111 till
0.1264 Antilles
 
Misspelling: wich
0.0533 witch
0.0600 which
0.1111 switch
0.1111 twitch
0.1143 witches
</pre>
 
=={{header|V (Vlang)}}==
{{trans|Go}}
<syntaxhighlight lang="v (vlang)">import os
 
fn jaro_sim(str1 string, str2 string) f64 {
Line 2,611 ⟶ 2,805:
println('')
}
}</langsyntaxhighlight>
 
{{out}}
Line 2,689 ⟶ 2,883:
{{libheader|Wren-sort}}
This uses unixdict and borrows code from the [[Jaro_distance#Wren]] task.
<langsyntaxhighlight ecmascriptlang="wren">import "io" for File
import "./fmt" for Fmt
import "./sort" for Sort
 
var jaroSim = Fn.new { |s1, s2|
Line 2,761 ⟶ 2,955:
for (c in closest.take(6)) Fmt.print("$0.4f $s", c[1], c[0])
System.print()
}</langsyntaxhighlight>
 
{{out}}
6,951

edits