Text completion: Difference between revisions
m (→{{header|Phix}}: use unix_dict(), and a properly transpiled apply()) |
|||
Line 570: | Line 570: | ||
=={{header|Phix}}== |
=={{header|Phix}}== |
||
uses levenshtein() from [[Levenshtein_distance#Phix]] and |
uses levenshtein() from [[Levenshtein_distance#Phix]] (reproduced below for your convenience) and the standard unix_dict(). |
||
<lang Phix> |
<!--<lang Phix>(phixonline)--> |
||
<span style="color: #008080;">function</span> <span style="color: #000000;">levenshtein</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">a</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">b</span><span style="color: #0000FF;">)</span> |
|||
sequence words = get_text(join_path({"demo","unixdict.txt"}),GT_LF_STRIPPED), |
|||
<span style="color: #004080;">integer</span> <span style="color: #000000;">n</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">a</span><span style="color: #0000FF;">),</span> |
|||
leven = apply(words,levenshtein,word) |
|||
<span style="color: #000000;">m</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">b</span><span style="color: #0000FF;">)</span> |
|||
function ln(string /*w*/, integer i, n) return leven[i]=n end function |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">m</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
for n=1 to 4 do |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">m</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">n</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
printf(1,"Words at Levenshtein distance of %d (%g%% similarity) from \"%s\": \n%s\n", |
|||
<span style="color: #004080;">sequence</span> <span style="color: #000000;">d</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">m</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span> |
|||
{n,100-round(100*n/length(word)),word,join_by(filter(words,ln,n),1,6)}) |
|||
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">n</span> <span style="color: #008080;">do</span> |
|||
⚫ | |||
<span style="color: #000000;">d</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">i</span> |
|||
<span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">m</span> <span style="color: #008080;">do</span> |
|||
<span style="color: #000000;">d</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">][</span><span style="color: #000000;">j</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">j</span> |
|||
<span style="color: #000080;font-style:italic;">-- next := min({ prev +substitution, deletion, or insertion }):</span> |
|||
<span style="color: #000000;">d</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">][</span><span style="color: #000000;">j</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">min</span><span style="color: #0000FF;">({</span><span style="color: #000000;">d</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">j</span><span style="color: #0000FF;">]+(</span><span style="color: #000000;">a</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]!=</span><span style="color: #000000;">b</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">]),</span> <span style="color: #000000;">d</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">j</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">d</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">][</span><span style="color: #000000;">j</span><span style="color: #0000FF;">]+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">})</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span> |
|||
<span style="color: #008080;">return</span> <span style="color: #000000;">d</span><span style="color: #0000FF;">[$][$]</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span> |
|||
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span> |
|||
<span style="color: #7060A8;">requires</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"1.0.1"</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- (this needs apply() to be properly transpiled!)</span> |
|||
<span style="color: #004080;">string</span> <span style="color: #000000;">word</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"complition"</span> |
|||
<span style="color: #004080;">sequence</span> <span style="color: #000000;">words</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">unix_dict</span><span style="color: #0000FF;">(),</span> |
|||
<span style="color: #000000;">leven</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">apply</span><span style="color: #0000FF;">(</span><span style="color: #000000;">words</span><span style="color: #0000FF;">,</span><span style="color: #000000;">levenshtein</span><span style="color: #0000FF;">,</span><span style="color: #000000;">word</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #008080;">function</span> <span style="color: #000000;">ln</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000080;font-style:italic;">/*w*/</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">return</span> <span style="color: #000000;">leven</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]=</span><span style="color: #000000;">n</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> |
|||
<span style="color: #008080;">for</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">4</span> <span style="color: #008080;">do</span> |
|||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"Words at Levenshtein distance of %d (%g%% similarity) from \"%s\": \n%s\n"</span><span style="color: #0000FF;">,</span> |
|||
<span style="color: #0000FF;">{</span><span style="color: #000000;">n</span><span style="color: #0000FF;">,</span><span style="color: #000000;">100</span><span style="color: #0000FF;">-</span><span style="color: #7060A8;">round</span><span style="color: #0000FF;">(</span><span style="color: #000000;">100</span><span style="color: #0000FF;">*</span><span style="color: #000000;">n</span><span style="color: #0000FF;">/</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">word</span><span style="color: #0000FF;">)),</span><span style="color: #000000;">word</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">join_by</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">filter</span><span style="color: #0000FF;">(</span><span style="color: #000000;">words</span><span style="color: #0000FF;">,</span><span style="color: #000000;">ln</span><span style="color: #0000FF;">,</span><span style="color: #000000;">n</span><span style="color: #0000FF;">),</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #000000;">6</span><span style="color: #0000FF;">)})</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span> |
|||
⚫ | |||
{{out}} |
{{out}} |
||
(matches Delphi/Go/Wren) |
(matches Delphi/Go/Wren) |
Revision as of 03:50, 5 August 2021
- Task
Write a program that takes in a user inputted word and prints out possible words that are valid in the English dictionary. Please state any dictionaries or files/binaries/dependencies used in your program. Do show the similarity of the inputted word and outcome as a percentage. Any algorithm can be used to accomplish this task.
- Resources
Github Repo
Raw Text, Save as .txt file
Hamming Distance
Jaro-Winkler Distance
SoundEx Algorithm
SoundEx Algorithm Wiki
Dice's Coefficient
Dice Coefficient Wiki
- Possible Output
Input word: complition compaction : 80.00% similar. completion : 90.00% similar. completions : 81.82% similar. complexion : 80.00% similar.
- Extension
- How can you make the accuracy of your program higher?
C++
<lang cpp>#include <algorithm>
- include <fstream>
- include <iostream>
- include <numeric>
- include <string>
- include <vector>
// See https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows int levenshtein_distance(const std::string& str1, const std::string& str2) {
size_t m = str1.size(), n = str2.size(); std::vector<int> cost(n + 1); std::iota(cost.begin(), cost.end(), 0); for (size_t i = 0; i < m; ++i) { cost[0] = i + 1; int prev = i; for (size_t j = 0; j < n; ++j) { int c = (str1[i] == str2[j]) ? prev : 1 + std::min(std::min(cost[j + 1], cost[j]), prev); prev = cost[j + 1]; cost[j + 1] = c; } } return cost[n];
}
template <typename T> void print_vector(const std::vector<T>& vec) {
auto i = vec.begin(); if (i == vec.end()) return; std::cout << *i++; for (; i != vec.end(); ++i) std::cout << ", " << *i;
}
int main(int argc, char** argv) {
if (argc != 3) { std::cerr << "usage: " << argv[0] << " dictionary word\n"; return EXIT_FAILURE; } std::ifstream in(argv[1]); if (!in) { std::cerr << "Cannot open file " << argv[1] << '\n'; return EXIT_FAILURE; } std::string word(argv[2]); if (word.empty()) { std::cerr << "Word must not be empty\n"; return EXIT_FAILURE; } constexpr size_t max_dist = 4; std::vector<std::string> matches[max_dist + 1]; std::string match; while (getline(in, match)) { int distance = levenshtein_distance(word, match); if (distance <= max_dist) matches[distance].push_back(match); } for (size_t dist = 0; dist <= max_dist; ++dist) { if (matches[dist].empty()) continue; std::cout << "Words at Levenshtein distance of " << dist << " (" << 100 - (100 * dist)/word.size() << "% similarity) from '" << word << "':\n"; print_vector(matches[dist]); std::cout << "\n\n"; } return EXIT_SUCCESS;
}</lang>
- Output:
Using the same dictionary as the Julia example:
Words at Levenshtein distance of 1 (90% similarity) from 'complition': completion Words at Levenshtein distance of 2 (80% similarity) from 'complition': coalition, competition, compilation, composition Words at Levenshtein distance of 3 (70% similarity) from 'complition': companion, competitions, completing, complications, computation, condition Words at Levenshtein distance of 4 (60% similarity) from 'complition': collection, combination, commission, comparison, compensation, competing, competitive, complaint, complete, completed, completely, complexity, compliance, compliant, compression, computing, conclusion, conditions, connection, convention, conviction, cooperation, corporation, correction, correlation, corruption, nomination, opinion, opposition, option, pollution, population, position, simulation, solution
Delphi
Using unixdict.txt. <lang Delphi> program Text_Completion;
{$APPTYPE CONSOLE}
{$R *.res}
uses
System.SysUtils, System.Classes, System.Math, System.Generics.Collections;
type
TSujestions = TDictionary<Integer, string>;
function Levenshtein(s, t: string): integer; var
d: array of array of integer; i, j, cost: integer;
begin
SetLength(d, Length(s) + 1); for i := Low(d) to High(d) do begin SetLength(d[i], Length(t) + 1); end;
for i := Low(d) to High(d) do begin d[i, 0] := i; for j := Low(d[i]) to High(d[i]) do begin d[0, j] := j; end; end;
for i := Low(d) + 1 to High(d) do begin for j := Low(d[i]) + 1 to High(d[i]) do begin if s[i] = t[j] then begin cost := 0; end else begin cost := 1; end;
d[i, j] := Min(Min(d[i - 1, j] + 1, //deletion d[i, j - 1] + 1), //insertion d[i - 1, j - 1] + cost //substitution ); end; end; Result := d[Length(s), Length(t)];
end;
function FindSujestions(Search: string; dict: TStringList): TSujestions; var
I, ld: Integer; w: string;
begin
Result := TSujestions.Create; for I := 0 to 3 do Result.Add(I, );
for I := 0 to dict.Count - 1 do begin w := dict[I]; ld := Levenshtein(Search, w); if ld < 4 then Result[ld] := Result[ld] + ' ' + w; end;
end;
function Similarity(Search: string; Distance: Integer): Double; var
ALength: Double;
begin
ALength := Search.Length; Result := (ALength - Distance) * 100 / ALength;
end;
var
dict: TStringList; Search: string; i: Integer; Sujestions: TSujestions;
begin
dict := TStringList.Create; dict.LoadFromFile('unixdict.txt'); Search := 'complition';
Sujestions := FindSujestions(Search, dict);
Writeln('Input word: '#10, Search);
for i := 1 to 3 do begin Writeln(Format('Words which are %4.1f%% similar:', [Similarity(Search, i)])); Writeln(sujestions[i], #10); end;
Sujestions.Free; dict.Free;
Readln;
end.
</lang>
- Output:
Input word: complition Words which are 90,0% similar: completion Words which are 80,0% similar: coalition competition compilation complexion composition Words which are 70,0% similar: cognition collision combustion commotion companion compassion complain complicity compton compulsion compunction computation condition contrition demolition incompletion volition
Factor
<lang factor>USING: formatting fry http.client io kernel lcs literals math math.ranges namespaces prettyprint.config sequences splitting ;
CONSTANT: words $[
"https://www.mit.edu/~ecprice/wordlist.10000" http-get nip "\n" split harvest
] CONSTANT: word "complition"
- lev-dist-of ( str n -- n )
[ words ] 2dip '[ _ levenshtein _ = ] filter ;
- similarity ( n -- x ) word length / 100 * 100 swap - ;
10000 margin set ! Prevent prettyprinter from wrapping sequences 4 [1,b] [
dup [ similarity ] [ drop word ] [ word swap lev-dist-of ] tri "Words at Levenshtein distance of %d (%.1f%% similarity) from %u:\n%u\n\n" printf
] each</lang>
- Output:
Words at Levenshtein distance of 1 (90.0% similarity) from "complition": { "completion" } Words at Levenshtein distance of 2 (80.0% similarity) from "complition": { "coalition" "competition" "compilation" "composition" } Words at Levenshtein distance of 3 (70.0% similarity) from "complition": { "companion" "competitions" "completing" "complications" "computation" "condition" } Words at Levenshtein distance of 4 (60.0% similarity) from "complition": { "collection" "combination" "commission" "comparison" "compensation" "competing" "competitive" "complaint" "complete" "completed" "completely" "complexity" "compliance" "compliant" "compression" "computing" "conclusion" "conditions" "connection" "convention" "conviction" "cooperation" "corporation" "correction" "correlation" "corruption" "nomination" "opinion" "opposition" "option" "pollution" "population" "position" "simulation" "solution" }
Go
<lang go>package main
import (
"bytes" "fmt" "io/ioutil" "log"
)
func levenshtein(s, t string) int {
d := make([][]int, len(s)+1) for i := range d { d[i] = make([]int, len(t)+1) } for i := range d { d[i][0] = i } for j := range d[0] { d[0][j] = j } for j := 1; j <= len(t); j++ { for i := 1; i <= len(s); i++ { if s[i-1] == t[j-1] { d[i][j] = d[i-1][j-1] } else { min := d[i-1][j] if d[i][j-1] < min { min = d[i][j-1] } if d[i-1][j-1] < min { min = d[i-1][j-1] } d[i][j] = min + 1 } }
} return d[len(s)][len(t)]
}
func main() {
search := "complition" b, err := ioutil.ReadFile("unixdict.txt") if err != nil { log.Fatal("Error reading file") } words := bytes.Fields(b) var lev [4][]string for _, word := range words { s := string(word) ld := levenshtein(search, s) if ld < 4 { lev[ld] = append(lev[ld], s) } } fmt.Printf("Input word: %s\n\n", search) for i := 1; i < 4; i++ { length := float64(len(search)) similarity := (length - float64(i)) * 100 / length fmt.Printf("Words which are %4.1f%% similar:\n", similarity) fmt.Println(lev[i]) fmt.Println() }
}</lang>
- Output:
Input word: complition Words which are 90.0% similar: [completion] Words which are 80.0% similar: [coalition competition compilation complexion composition] Words which are 70.0% similar: [cognition collision combustion commotion companion compassion complain complicity compton compulsion compunction computation condition contrition demolition incompletion volition]
Java
Github Repo Uses dependencies given. <lang Java> import java.io.File; import java.io.IOException; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Scanner;
//uses https://github.com/dwyl/english-words
public class textCompletionConcept {
public static int correct = 0; public static ArrayList<String> listed = new ArrayList<>(); public static void main(String[]args) throws IOException, URISyntaxException { Scanner input = new Scanner(System.in); System.out.println("Input word: "); String errorRode = input.next(); File file = new File(new File(textCompletionConcept.class.getProtectionDomain().getCodeSource().getLocation().toURI()).getPath() + File.separator + "words.txt"); Scanner reader = new Scanner(file); while(reader.hasNext()){ double percent; String compareToThis = reader.nextLine(); char[] s1 = errorRode.toCharArray(); char[] s2 = compareToThis.toCharArray(); int maxlen = Math.min(s1.length, s2.length); for (int index = 0; index < maxlen; index++) { String x = String.valueOf(s1[index]); String y = String.valueOf(s2[index]); if (x.equals(y)) { correct++; } } double length = Math.max(s1.length, s2.length); percent = correct / length; percent *= 100; boolean perfect = false; if (percent >= 80 && compareToThis.charAt(0) == errorRode.charAt(0)) { if(String.valueOf(percent).equals("100.00")){ perfect = true; } String addtoit = compareToThis + " : " + String.format("%.2f", percent) + "% similar."; listed.add(addtoit); } if(compareToThis.contains(errorRode) && !perfect && errorRode.length() * 2 > compareToThis.length()){ String addtoit = compareToThis + " : 80.00% similar."; listed.add(addtoit); } correct = 0; }
for(String x : listed){ if(x.contains("100.00% similar.")){ System.out.println(x); listed.clear(); break; } }
for(String x : listed){ System.out.println(x); } }
} </lang>
- Output
Input word: complition compaction : 80.00% similar. completion : 90.00% similar. completions : 81.82% similar. complexion : 80.00% similar. Process finished with exit code 0
Julia
See https://en.wikipedia.org/wiki/Levenshtein_distance, the number of one character edits to obtain one word from another. <lang julia>using StringDistances
const fname = download("https://www.mit.edu/~ecprice/wordlist.10000", "wordlist10000.txt") const words = read(fname, String) |> split .|> strip .|> string const wrd = "complition"
levdistof(n, string) = filter(w -> Levenshtein()(string, w) == n, words)
for n in 1:4
println("Words at Levenshtein distance of $n (", 100 - Int(round(100 * n / length(wrd))), "% similarity) from \"$wrd\": \n", levdistof(n, wrd), "\n")
end
</lang>
- Output:
Words at Levenshtein distance of 1 (90% similarity) from "complition": ["completion"] Words at Levenshtein distance of 2 (80% similarity) from "complition": ["coalition", "competition", "compilation", "composition"] Words at Levenshtein distance of 3 (70% similarity) from "complition": ["companion", "competitions", "completing", "complications", "computation", "condition"] Words at Levenshtein distance of 4 (60% similarity) from "complition": ["collection", "combination", "commission", "comparison", "compensation", "competing", "competitive", "complaint", "complete", "completed", "completely", "complexity", "compliance", "compliant", "compression", "computing", "conclusion", "conditions", "connection", "convention", "conviction", "cooperation", "corporation", "correction", "correlation", "corruption", "nomination", "opinion", "opposition", "option", "pollution", "population", "position", "simulation", "solution"]
Nim
We use the function editDistance
from the std/editdistance
module to get the Levenshtein distance (computed by considering in Unicode codepoints).
<lang Nim>import std/editdistance, sequtils, strformat, strutils
let search = "complition" let words = toSeq("unixdict.txt".lines) var lev: array[4, seq[string]] for word in words:
let ld = editDistance(search, word) if ld < 4: lev[ld].add word
echo &"Input word: {search}\n" let length = float(search.len) for i in 0..3:
if lev[i].len == 0: continue # No result. let similarity = (length - float(i)) * 100 / length echo &"Words which are {similarity:4.1f}% similar:" echo lev[i].join(" ") echo()</lang>
- Output:
Input word: complition Words which are 90.0% similar: completion Words which are 80.0% similar: coalition competition compilation complexion composition Words which are 70.0% similar: cognition collision combustion commotion companion compassion complain complicity compton compulsion compunction computation condition contrition demolition incompletion volition
Perl
Inspired by Raku Sorenson-Dice implementation (doesn't handle Unicode, but module Text::Dice
can).
<lang perl>use strict;
use warnings;
use feature 'say';
use Path::Tiny;
use List::Util 'head';
sub bi_gram { (lc shift) =~ /(?<=\K.)./g }
sub score {
my($phrase, $word) = @_; my %count;
my @match = bi_gram $phrase; $count{$_}++ for @match, @$word; 2 * (grep { $count{$_} > 1 } keys %count) / (@match + @$word);
}
sub sorenson {
my($dict,$word,$cutoff) = @_; $cutoff //= 0.55; my(%matches,$s);
($s = score($word, $$dict{$_})) > $cutoff and $matches{$_} = $s for keys %$dict; %matches;
}
my %dict = map { $_ => [ bi_gram($_) ] } path('unixdict.txt')->slurp =~ /.{3,}/gm;
for my $word (<complition inconsqual>) {
my(%scored,@ranked);
%scored = sorenson(\%dict,$word); push @ranked, sprintf "%.3f $_", $scored{$_} for sort { $scored{$b} <=> $scored{$a} } keys %scored; say "\n$word:\n" . join("\n", head 10, @ranked);
}</lang>
- Output:
complition: 0.778 completion 0.737 competition 0.737 composition 0.706 coalition 0.700 incompletion 0.667 complexion 0.667 complicity 0.667 decomposition 0.632 compilation 0.632 compunction inconsqual: 0.609 inconsequential 0.588 continual 0.571 squall 0.556 conceptual 0.556 continuant 0.556 inconstant
Phix
uses levenshtein() from Levenshtein_distance#Phix (reproduced below for your convenience) and the standard unix_dict().
function levenshtein(string a, b) integer n = length(a), m = length(b) if n=0 then return m end if if m=0 then return n end if sequence d = repeat(repeat(0, m+1), n+1) for i=1 to n do d[i+1][1] = i for j=1 to m do d[1][j+1] = j -- next := min({ prev +substitution, deletion, or insertion }): d[i+1][j+1] = min({d[i][j]+(a[i]!=b[j]), d[i][j+1]+1, d[i+1][j]+1}) end for end for return d[$][$] end function with javascript_semantics requires("1.0.1") -- (this needs apply() to be properly transpiled!) string word = "complition" sequence words = unix_dict(), leven = apply(words,levenshtein,word) function ln(string /*w*/, integer i, n) return leven[i]=n end function for n=1 to 4 do printf(1,"Words at Levenshtein distance of %d (%g%% similarity) from \"%s\": \n%s\n", {n,100-round(100*n/length(word)),word,join_by(filter(words,ln,n),1,6)}) end for
- Output:
(matches Delphi/Go/Wren)
Words at Levenshtein distance of 1 (90% similarity) from "complition": completion Words at Levenshtein distance of 2 (80% similarity) from "complition": coalition competition compilation complexion composition Words at Levenshtein distance of 3 (70% similarity) from "complition": cognition collision combustion commotion companion compassion complain complicity compton compulsion compunction computation condition contrition demolition incompletion volition Words at Levenshtein distance of 4 (60% similarity) from "complition": abolition admonition ambition ammunition campion caption collusion combination commission communion compactify comparison compatriot competitive competitor complaint complete compliant complicate compliment compline compositor compression conception concision conclusion concretion congestion consolation contention convention convolution corruption cotillion decomposition depletion dominion gumption implosion imposition munition oblivion opinion opposition option pollution position simpleton soliton solution sorption templeton temptation tomlinson
Raku
(formerly Perl 6)
Hamming distance
<lang perl6>sub MAIN ( Str $user_word = 'complition', Str $filename = 'words.txt' ) {
my @s1 = $user_word.comb; my @listed = gather for $filename.IO.lines -> $line { my @s2 = $line.comb;
my $correct = 100 * sum( @s1 Zeq @s2) / max(+@s1, +@s2);
my $score = ( $correct >= 100 and @s1[0] eq @s2[0] ) ?? 100 !! ( $correct >= 80 and @s1[0] eq @s2[0] ) ?? $correct !! ( $line.contains($user_word) and @s1 * 2 > @s2 ) ?? 80 !! 0; take [$score, $line] if $score; }
@listed = @listed[$_] with @listed.first: :k, { .[0] == 100 };
say "{.[0].fmt('%.2f')}% {.[1]}" for @listed;
}</lang>
- Output:
80.00% compaction 90.00% completion 81.82% completions 80.00% complexion
Sorenson-Dice
Sorenson-Dice tends to return relatively low percentages even for small differences, especially for short words. We need to "lower the bar" to get any results at all. Different variations of the algorithm do or don't regularize case. This one does, though it doesn't much matter for the tested words.
Using unixdict.txt from www.puzzlers.org
<lang perl6>sub sorenson ($phrase, %hash) {
my $match = bigram $phrase; %hash.race.map: { [(2 * ($match ∩ .value) / ($match + .value)).round(.001), .key] }
}
sub bigram (\these) { Bag.new( flat these.fc.words.map: { .comb.rotor(2 => -1)».join } ) }
- Load the dictionary
my %hash = './unixdict.txt'.IO.slurp.words.race.map: { $_ => .&bigram };
- Testing
for <complition inconsqual Sørenson> -> $w {
say "\n$w:"; .say for sorenson($w, %hash).grep(*.[0] >= .55).sort({-.[0],~.[1]}).head(10);
}</lang>
- Output:
complition: [0.778 completion] [0.737 competition] [0.737 composition] [0.706 coalition] [0.7 incompletion] [0.667 complexion] [0.667 complicity] [0.667 decomposition] [0.632 compilation] [0.632 compunction] inconsqual: [0.609 inconsequential] [0.588 continual] [0.571 squall] [0.556 conceptual] [0.556 inconstant] Sørenson: [0.714 sorenson] [0.667 benson] [0.615 swenson] [0.571 evensong] [0.571 sorensen]
REXX
The method used to find a word (that is contained in a specified dictionary) closest to a specified string of letters is:
Perform any of (three methods of actions):
- delete any letter
- insert any letter
- (any letter is inserted at any location)
- (this includes prefixing any letter)
- (this includes suffixing any letter)
- substitute any letter at any location
(Only lowercase letters were used for the [above] three methods).
Perform any of the above along with any combination of any method.
This will, in effect, delete/insert/substitute any two letters:
- delete + delete
- delete + insert
- delete + substitute
- insert + delete
- insert + insert
- insert + substitute
- substitute + delete
- substitute + insert
- substitute + substitute
No attempt was made to change (by any method) three letters or more. <lang rexx>/*REXX pgm finds (dictionary) words which can be found in a specified word wheel (grid).*/ parse arg what iFID . /*obtain optional arguments from the CL*/ if what==|what=="," then what= 'complition' /*Not specified? Then use the default.*/ if iFID==|iFID=="," then iFID= 'UNIXDICT.TXT' /* " " " " " " */ @abc= 'abcdefghijklmnopqrstuvwxyz' /*(Latin) lowercase letters to be used.*/ L= length(@abc) /* " " " the Latin letters. */ wrds= 0 /*# words that are in the dictionary. */ dups= 0 /*" " " " duplicates. */ ills= 0 /*" " " contain "not" letters.*/ say ' Reading the file: ' iFID /*align the text. */ @.= . /*non─duplicated dictionary words. */ $= /*the list of dictionary words in grid.*/
do recs=0 while lines(iFID)\==0 /*process all words in the dictionary. */ x= space( linein(iFID), 0) /*elide any blanks in the dictinary. */ if @.x\==. then do; dups= dups+1; iterate; end /*is this a duplicate? */ if \datatype(x,'M') then do; ills= ills+1; iterate; end /*has word non─letters? */ @.x= /*signify that X is a dictionary word*/ wrds= wrds + 1 /*bump the number of "good" dist. words*/ end /*recs*/
a= say ' number of records (words) in the dictionary: ' right( commas(recs), 9) say ' number of ill─formed words in the dictionary: ' right( commas(ills), 9) say ' number of duplicate words in the dictionary: ' right( commas(dups), 9) say ' number of acceptable words in the dictionary: ' right( commas(wrds), 9) say ' the "word" to be used for text completion: ' what say call del what; a= a result; call del what,1; a= a result call ins what; a= a result; call ins what,1; a= a result call sub what; a= a result; call sub what,1; a= a result call prune
- = words($)
say commas(#) ' similar words found:'
do j=1 for #; _= word($, j); say right( count(_,what), 24) _ end /*j*/
exit # /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ commas: parse arg _; do ?=length(_)-3 to 1 by -3; _= insert(',', _, ?); end; return _ prune: do k=1 for words(a); _= word(a,k); if wordpos(_,$)==0 then $= $ _; end; return recur: $= $ del(z); $= $ ins(z); $= $ sub(z); return /*──────────────────────────────────────────────────────────────────────────────────────*/ count: procedure; parse arg x,y; cnt= 0; w= length(x)
do j=1 for w; p= pos( substr(x, j, 1), y); if p==0 then iterate y= overlay(., y, p); cnt= cnt + 1 end /*j*/ return ' ' left("("format(cnt/w*100,,2)/1'%)', 9) /*express as a percent.*/
/*──────────────────────────────────────────────────────────────────────────────────────*/ del: procedure expose @. @abc L; parse arg y,r; $=
do j=1 for length(y); z= space(left(y,j-1) || substr(y,j+1), 0) if @.z\==. then $= $ z; if r==1 then call recur end /*j*/; return space($)
/*──────────────────────────────────────────────────────────────────────────────────────*/ ins: procedure expose @. @abc L; parse arg y,r; $=
do j=1 for length(y) do k=1 for L; z= space(left(y,j-1) || substr(@abc,k,1) || substr(y,j), 0) if @.z\==. then $= $ z; if r==1 then call recur end /*k*/ end /*j*/; return space($)
/*──────────────────────────────────────────────────────────────────────────────────────*/ sub: procedure expose @. @abc L; parse arg y,r; $=
do j=1 for length(y) do k=1 for L; z= space(left(y,j-1) || substr(@abc,k,1) || substr(y,j+1), 0) if @.z\==. then $= $ z; if r==1 then call recur end /*k*/ end /*j*/; return space($)</lang>
- output when using the default inputs:
Reading the file: UNIXDICT.TXT number of records (words) in the dictionary: 25,104 number of ill─formed words in the dictionary: 126 number of duplicate words in the dictionary: 0 number of acceptable words in the dictionary: 24,978 the "word" to be used for text completion: complition 6 similar words found: (88.89%) coalition (90%) completion (81.82%) competition (90.91%) compilation (81.82%) composition (80%) complexion
The input file is the same dictionary that the Java entry used.
- output when using the inputs of: , GitHub.dict
Reading the file: GitHub.dict number of records (words) in the dictionary: 466,551 number of ill─formed words in the dictionary: 50,254 number of duplicate words in the dictionary: 0 number of acceptable words in the dictionary: 416,297 the "word" to be used for text completion: complition 11 similar words found: (88.89%) coalition (90%) completion (81.82%) commolition (81.82%) comparition (81.82%) competition (90.91%) compilation (81.82%) composition (81.82%) complection (83.33%) complication (80%) compaction (80%) complexion
- output when using the inputs of: , my.dict
This is my personal dictionary.
Reading the file: my.dict number of records (words) in the dictionary: 947,364 number of ill─formed words in the dictionary: 192,683 number of duplicate words in the dictionary: 6 number of acceptable words in the dictionary: 754,675 the "word" to be used for text completion: complition 12 similar words found: (88.89%) coalition (90%) completion (81.82%) commolition (81.82%) comparition (81.82%) competition (90.91%) compilation (90.91%) compiletion (81.82%) composition (81.82%) complection (83.33%) complication (80%) compaction (80%) complexion
Wren
This uses 'unixdict' and the Levenshtein distance algorithm to test for similarity. <lang ecmascript>import "io" for File import "/fmt" for Fmt
var levenshtein = Fn.new { |s, t|
var ls = s.count var lt = t.count var d = List.filled(ls + 1, null) for (i in 0..ls) { d[i] = List.filled(lt + 1, 0) d[i][0] = i } for (j in 0..lt) d[0][j] = j for (j in 1..lt) { for (i in 1..ls) { if (s[i-1] == t[j-1]) { d[i][j] = d[i-1][j-1] } else { var min = d[i-1][j] if (d[i][j-1] < min) min = d[i][j-1] if (d[i-1][j-1] < min) min = d[i-1][j-1] d[i][j] = min + 1 } } } return d[-1][-1]
}
var search = "complition" var words = File.read("unixdict.txt").split("\n").map { |w| w.trim() }.where { |w| w != "" } var lev = [[], [], [], []] var ld for (word in words) {
if ((ld = levenshtein.call(search, word)) < 4) { lev[ld].add(word) }
}
System.print("Input word: %(search)\n") for (i in 1..3) {
var count = search.count var similarity = (count - i) * 100 / count Fmt.print("Words which are $4.1f\% similar:", similarity) System.print(lev[i]) System.print()
} </lang>
- Output:
Words which are 90.0% similar: [completion] Words which are 80.0% similar: [coalition, competition, compilation, complexion, composition] Words which are 70.0% similar: [cognition, collision, combustion, commotion, companion, compassion, complain, complicity, compton, compulsion, compunction, computation, condition, contrition, demolition, incompletion, volition]