Jaro-Winkler distance: Difference between revisions
Content added Content deleted
m (to task status) |
|||
Line 961: | Line 961: | ||
0.1111 switch |
0.1111 switch |
||
0.1111 twitch |
0.1111 twitch |
||
</pre> |
|||
=={{header|jq}}== |
|||
{{works with|jq}} |
|||
'''Works with gojq, the Go implementation of jq''' |
|||
This entry, which uses unixdict.txt, borrows the implementation in jq of the Jaro similarity measure as defined at |
|||
[[Jaro_similarity#jq]]; since it is quite long, it is not repeated here. |
|||
<lang jq># See [[Jaro_similarity#jq]] for the implementation of jaro/2 |
|||
# Number of leading codepoints that $s1 and $s2 have in common.
# The scan is bounded by the shorter of the two strings; if no mismatch is
# found within that bound, the shorter length itself is the answer.
def length_of_common_prefix($s1; $s2):
  ($s1 | explode) as $a
  | ($s2 | explode) as $b
  | ([$a, $b | length] | min) as $limit
  # index of the first differing codepoint, or $limit when there is none
  | first( range(0; $limit) | select($a[.] != $b[.]) ) // $limit ;
|||
# Jaro-Winkler distance with a caller-supplied common-prefix multiplier.
# Input: ignored.
# $p is the weight given to each matching leading character. Only the first
# four characters of each string are compared for the prefix, so $p should
# not exceed 0.25 if the result is to stay non-negative (this assumes jaro/2,
# which is defined elsewhere, yields a similarity between 0 and 1).
# Output: 0 for identical strings, otherwise 1 - (j + $p * l * (1 - j)),
# where j is jaro($s1; $s2) and l is the length of the common prefix.
def jaro_winkler($s1; $s2; $p):
  if $s1 == $s2 then 0
  else jaro($s1; $s2) as $j
  | length_of_common_prefix($s1[:4]; $s2[:4]) as $l
  | 1 - ($j + $p * $l * (1 - $j))
  end ;

# Output: the Jaro-Winkler distance using 0.1 as the common-prefix multiplier
def jaro_winkler($s1; $s2):
  jaro_winkler($s1; $s2; 0.1) ;
|||
# Input: an array of words
# Output: an array of [match, distance] pairs, one for each input word whose
# jaro_winkler distance from $word is no greater than $threshold; the pairs
# appear in the same order as the input words.
def candidates($word; $threshold):
  [ .[]
    | . as $entry
    | jaro_winkler($word; $entry) as $distance
    | select($distance <= $threshold)
    | [$entry, $distance] ] ;
|||
# Render the input as a string and left-pad it with spaces to a width of $len.
# Input that is already $len characters or longer is returned unpadded.
def lpad($len):
  tostring as $text
  | ($len - ($text | length)) as $gap
  # the slice trims the repeated blanks to exactly $gap characters
  | ((" " * $gap)[:$gap]) + $text ;
|||
# Read every input line as the dictionary, then for each misspelled word emit
# a header line followed by up to five dictionary words whose distance is at
# most 0.15, closest first, with the distance rounded to three decimals.
def task:
  [inputs] as $dictionary
  | ("accomodate", "definately", "goverment", "occured", "publically", "recieve", "seperate", "untill", "wich") as $word
  | ($dictionary
     | candidates($word; 0.15)
     | sort_by(.[-1])
     | .[:5]) as $closest
  | "Matches for \($word|lpad(10)): Distance",
    ($closest[] | "\(.[0] | lpad(21)) : \(.[-1] * 1000 | round / 1000)") ;
|||
task</lang> |
|||
{{out}} |
|||
Invocation: jq -rRn -f program.jq unixdict.txt |
|||
<pre> |
|||
Matches for accomodate: Distance |
|||
accommodate : 0.018 |
|||
accordant : 0.104 |
|||
accolade : 0.114 |
|||
acclimate : 0.122 |
|||
accompanist : 0.133 |
|||
Matches for definately: Distance |
|||
define : 0.08 |
|||
definite : 0.085 |
|||
defiant : 0.089 |
|||
definitive : 0.12 |
|||
deflate : 0.127 |
|||
Matches for goverment: Distance |
|||
govern : 0.08 |
|||
governor : 0.13 |
|||
governess : 0.133 |
|||
governance : 0.149 |
|||
Matches for occured: Distance |
|||
occurred : 0.025 |
|||
occur : 0.057 |
|||
occurrent : 0.095 |
|||
occlude : 0.106 |
|||
concurred : 0.122 |
|||
Matches for publically: Distance |
|||
public : 0.08 |
|||
publication : 0.133 |
|||
Matches for recieve: Distance |
|||
receive : 0.063 |
|||
reeve : 0.1 |
|||
relieve : 0.105 |
|||
recife : 0.108 |
|||
recipe : 0.108 |
|||
Matches for seperate: Distance |
|||
desperate : 0.079 |
|||
separate : 0.092 |
|||
temperate : 0.116 |
|||
sept : 0.117 |
|||
septate : 0.131 |
|||
Matches for untill: Distance |
|||
until : 0.033 |
|||
till : 0.111 |
|||
huntsville : 0.133 |
|||
unital : 0.142 |
|||
Matches for wich: Distance |
|||
winch : 0.107 |
|||
witch : 0.107 |
|||
which : 0.12 |
|||
wichita : 0.126 |
|||
</pre> |
</pre> |
||