Jaro-Winkler distance: Difference between revisions
Content added Content deleted
(→{{header|Swift}}: Separated Vlang entry.) |
(Added TypeScript implementation) |
||
Line 2,491: | Line 2,491: | ||
witches | 0.1143 |
witches | 0.1143 |
||
</pre> |
|||
=={{header|Typescript}}== |
|||
{{trans|Java}} |
|||
<lang typescript> |
|||
var fs = require('fs') |
|||
// Jaro Winkler Distance Formula |
|||
function jaroDistance(string1: string, string2: string): number{ |
|||
// Compute Jaro-Winkler distance between two string |
|||
// Swap strings if string1 is shorter than string 2 |
|||
if (string1.length < string2.length){ |
|||
const tempString: string = string1; |
|||
string1 = string2; |
|||
string2 = tempString |
|||
} |
|||
let len1: number = string1.length |
|||
let len2: number = string2.length |
|||
if (!len2){ |
|||
return 0.0 |
|||
} |
|||
const delta: number = Math.max(1, len1 / 2.0) - 1.0; |
|||
// Flags for transpositions |
|||
let flag: boolean[] = Array(len2).fill(false) |
|||
let ch1Match: string[] = Array(len1).fill('') |
|||
// Count number of matching characters |
|||
let matches = 0 |
|||
// Check if characters on both string matches |
|||
for (let i: number = 0; i < len1; i++){ |
|||
const ch1: string = string1[i] |
|||
for (let j = 0; j < len2; j++){ |
|||
const ch2: string = string2[j] |
|||
if (j <= i + delta && j + delta >= 1 && ch1 == ch2 && !flag[j]){ |
|||
flag[j] = true |
|||
ch1Match[matches++] = ch1; |
|||
break; |
|||
} |
|||
} |
|||
} |
|||
if (matches == 0){ |
|||
return 1.0 |
|||
} |
|||
// Count number of transpositions (shared characters placed in different positions) |
|||
let transpositions: number = 0.0 |
|||
for (let i: number = 0, j: number = 0; j < len2; j++){ |
|||
if (flag[j]){ |
|||
if (string2[j] != ch1Match[i]){ |
|||
transpositions++ |
|||
} |
|||
i++ |
|||
} |
|||
} |
|||
const m: number = matches |
|||
// Jaro Similarity Formula simj = ( (m / length of s1) + (m / length of s2) + (m - t) / m ) / 3 |
|||
const jaro: number = (m / len1 + m / len2 + (m - transpositions / 2.0) / m) / 3.0 |
|||
// Length of common prefix between string up to 4 characters |
|||
let commonPrefix: number = 0.0 |
|||
len2 = Math.min(4, len2) |
|||
for (let i: number = 0; i < len2; i++){ |
|||
if (string1[i] == string2[i]){ |
|||
commonPrefix++ |
|||
} |
|||
} |
|||
// Jaro Winkler Distance Formula simw = simj + lp(1 - simj) |
|||
return 1.0 - (jaro + commonPrefix * 0.1 * (1.0 - jaro)) |
|||
} |
|||
// Compute Jaro Winkler Distance for every word on the dictionary against the misspelled word |
|||
function withinDistance(words: string[] ,maxDistance: number, string: string, maxToReturn: number): (string | number)[][]{ |
|||
let result: (string | number)[][] = new Array() |
|||
words.forEach(word =>{ |
|||
const distance = jaroDistance(word, string) |
|||
// check if computed jaro winkler distance is within the set distance parameter |
|||
if (distance <= maxDistance){ |
|||
const tuple = [distance, word] |
|||
result.push(tuple) |
|||
} |
|||
}) |
|||
result.sort() |
|||
// Limit of matches set to maxtoReturn |
|||
return result.length <= maxToReturn ? result : result.slice(0, maxToReturn) |
|||
} |
|||
function loadDictionary(fileName: string): string[]{ |
|||
let words: string[] = new Array() |
|||
try{ |
|||
//attacomsian.com/blog/reading-a-file-line-by-line-in-nodejs |
|||
const data = fs.readFileSync(fileName, 'utf-8') |
|||
const lines: string[] = data.split(/\r?\n/) |
|||
lines.forEach(line => { |
|||
words.push(line) |
|||
}) |
|||
return words |
|||
} |
|||
catch(error){ |
|||
console.log("Error reading dictionary") |
|||
} |
|||
} |
|||
function main(): void{ |
|||
try { |
|||
const misspellings = [ |
|||
"accomodate", |
|||
"definately", |
|||
"goverment", |
|||
"occured", |
|||
"publically", |
|||
"recieve", |
|||
"seperate", |
|||
"untill", |
|||
"wich" |
|||
] |
|||
//unixdict.txt from users.cs.duke.edu/~ola/ap/linuxwords |
|||
let words: string[] = loadDictionary("unixdict.txt") |
|||
misspellings.forEach(spelling =>{ |
|||
console.log("Misspelling:", spelling) |
|||
const closeWord = withinDistance(words, 0.15, spelling, 5) |
|||
closeWord.forEach(word =>{ |
|||
console.log((word[0] as number).toFixed(4) + " " + word[1]) |
|||
}) |
|||
console.log("") |
|||
}) |
|||
} |
|||
catch(error) { |
|||
console.log("Error on main") |
|||
} |
|||
} |
|||
main(); |
|||
</lang> |
|||
{{out}} |
|||
<pre> |
|||
Misspelling: accomodate |
|||
0.0364 accommodate |
|||
0.0515 accommodated |
|||
0.0515 accommodates |
|||
0.0979 accommodating |
|||
0.0979 accommodation |
|||
Misspelling: definately |
|||
0.0564 definitely |
|||
0.0586 defiantly |
|||
0.0909 define |
|||
0.0977 definite |
|||
0.1013 defiant |
|||
Misspelling: goverment |
|||
0.0533 government |
|||
0.0667 govern |
|||
0.0697 governments |
|||
0.0833 governmental |
|||
0.0952 governs |
|||
Misspelling: occured |
|||
0.0250 occurred |
|||
0.0571 occur |
|||
0.0786 occupied |
|||
0.0905 occurs |
|||
0.0917 accursed |
|||
Misspelling: publically |
|||
0.0400 publicly |
|||
0.0800 public |
|||
0.1044 publicity |
|||
0.1327 publication |
|||
0.1400 biblically |
|||
Misspelling: recieve |
|||
0.0333 receive |
|||
0.0625 received |
|||
0.0625 receiver |
|||
0.0625 receives |
|||
0.0667 relieve |
|||
Misspelling: seperate |
|||
0.0708 desperate |
|||
0.1042 temperate |
|||
0.1083 separate |
|||
0.1167 repeated |
|||
0.1167 sept |
|||
Misspelling: untill |
|||
0.0333 until |
|||
0.1067 untie |
|||
0.1083 untimely |
|||
0.1111 till |
|||
0.1264 Antilles |
|||
Misspelling: wich |
|||
0.0533 witch |
|||
0.0600 which |
|||
0.1111 switch |
|||
0.1111 twitch |
|||
0.1143 witches |
|||
</pre> |
</pre> |
||