Levenshtein distance: Difference between revisions

From Rosetta Code
Content added Content deleted
(added ocaml)
Line 397: Line 397:
edocattesor-->drowsyhtesiar: 8
edocattesor-->drowsyhtesiar: 8
</pre>
</pre>

=={{header|OCaml}}==

Translation of the pseudo-code of the Wikipedia article:

<lang ocaml>let minimum a b c =
min a (min b c)

let levenshtein_distance s t =
let m = String.length s
and n = String.length t in
(* for all i and j, d.(i).(j) will hold the Levenshtein distance between
the first i characters of s and the first j characters of t *)
let d = Array.make_matrix (m+1) (n+1) 0 in

for i = 0 to m do
d.(i).(0) <- i (* the distance of any first string to an empty second string *)
done;
for j = 0 to n do
d.(0).(j) <- j (* the distance of any second string to an empty first string *)
done;

for j = 1 to n do
for i = 1 to m do

if s.[i-1] = t.[j-1] then
d.(i).(j) <- d.(i-1).(j-1) (* no operation required *)
else
d.(i).(j) <- minimum
(d.(i-1).(j) + 1) (* a deletion *)
(d.(i).(j-1) + 1) (* an insertion *)
(d.(i-1).(j-1) + 1) (* a substitution *)
done;
done;

d.(m).(n)
;;

let test s t =
Printf.printf " %s -> %s = %d\n" s t (levenshtein_distance s t);
;;

let () =
test "kitten" "sitting";
test "rosettacode" "raisethysword";
;;</lang>


=={{header|PicoLisp}}==
=={{header|PicoLisp}}==

Revision as of 20:40, 11 June 2011

Task
Levenshtein distance
You are encouraged to solve this task according to the task description, using any language you may know.
This page uses content from Wikipedia. The original article was at Levenshtein distance. The list of authors can be seen in the page history. As with Rosetta Code, the text of Wikipedia is available under the GNU FDL. (See links for details on variance)

In information theory and computer science, the Levenshtein distance is a metric for measuring the amount of difference between two sequences (i.e. an edit distance). The Levenshtein distance between two strings is defined as the minimum number of edits needed to transform one string into the other, with the allowable edit operations being insertion, deletion, or substitution of a single character.

For example, the Levenshtein distance between "kitten" and "sitting" is 3, since the following three edits change one into the other, and there is no way to do it with fewer than three edits:

  1. kitten → sitten (substitution of 'k' with 's')
  2. sitten → sittin (substitution of 'e' with 'i')
  3. sittin → sitting (insert 'g' at the end).

The Levenshtein distance between "rosettacode", "raisethysword" is 8; The distance between two strings is same as that when both strings is reversed.

Task : Implements a Levenshtein distance function, or uses a library function, to show the Levenshtein distance between "kitten" and "sitting".

Other edit distance at Rosetacode.org :

Ada

<lang Ada>with Ada.Text_IO;

procedure Main is

  function Levenshtein_Distance (S, T : String) return Natural is
     D : array (0 .. S'Length, 0 .. T'Length) of Natural;
  begin
     for I in D'Range (1) loop
        D (I, 0) := I;
     end loop;
     for I in D'Range (2) loop
        D (0, I) := I;
     end loop;
     for J in T'Range loop
        for I in S'Range loop
           if S (I) = T (J) then
              D (I, J) := D (I - 1, J - 1);
           else
              D (I, J) :=
                 Natural'Min
                   (Natural'Min (D (I - 1, J) + 1, D (I, J - 1) + 1),
                    D (I - 1, J - 1) + 1);
           end if;
        end loop;
     end loop;
     return D (S'Length, T'Length);
  end Levenshtein_Distance;

begin

  Ada.Text_IO.Put_Line
    ("kitten -> sitting:" &
     Integer'Image (Levenshtein_Distance ("kitten", "sitting")));
  Ada.Text_IO.Put_Line
    ("rosettacode -> raisethysword:" &
     Integer'Image (Levenshtein_Distance ("rosettacode", "raisethysword")));

end Main;</lang>

output:

kitten -> sitting: 3
rosettacode -> raisethysword: 8

C#

This is a straightforward translation of the Wikipedia pseudocode.

<lang csharp> using System;

namespace LevenshteinDistance {

   class Program
   {
       static int LevenshteinDistance(string s, string t)
       {
           int[,] d = new int[s.Length + 1, t.Length + 1];
           for (int i = 0; i <= s.Length; i++)
               d[i, 0] = i;
           for (int j = 0; j <= t.Length; j++)
               d[0, j] = j;
           for (int j = 1; j <= t.Length; j++)
               for (int i = 1; i <= s.Length; i++)
                   if (s[i - 1] == t[j - 1])
                       d[i, j] = d[i - 1, j - 1];  //no operation
                   else
                       d[i, j] = Math.Min(Math.Min(
                           d[i - 1, j] + 1,    //a deletion
                           d[i, j - 1] + 1),   //an insertion
                           d[i - 1, j - 1] + 1 //a substitution
                           );
           return d[s.Length, t.Length];
       }
       static void Main(string[] args)
       {
           if (args.Length == 2)
               Console.WriteLine("{0} -> {1} = {2}",
                   args[0], args[1], LevenshteinDistance(args[0], args[1]));
           else
               Console.WriteLine("Usage:-\n\nLevenshteinDistance <string1> <string2>");
       }
   }

} </lang> Example output:-

> LevenshteinDistance kitten sitting
kitten -> sitting = 3

> LevenshteinDistance rosettacode raisethysword
rosettacode -> raisethysword = 8

D

The standard library std.algorithm module includes a Levenshtein distance function: <lang d>import std.stdio, std.algorithm;

void main() {

   writeln(levenshteinDistance("kitten", "sitting"));

}</lang>

Output:

3

From the Java version: <lang d>import std.stdio, std.algorithm;

int distance(string s1, string s2) {

 auto costs = new int[s2.length + 1];
 foreach (i; 0 .. s1.length + 1) {
   int lastValue = i;
   foreach (j; 0 .. s2.length + 1) {
     if (i == 0)
       costs[j] = j;
     else {
       if (j > 0) {
         int newValue = costs[j - 1];
         if (s1[i - 1] != s2[j - 1])
           newValue = min(newValue, lastValue, costs[j]) + 1;
         costs[j - 1] = lastValue;
         lastValue = newValue;
       }
     }
   }
   if (i > 0)
     costs[s2.length] = lastValue;
 }
 return costs[s2.length];

}

void main() {

 foreach(p; [["kitten", "sitting"], ["rosettacode", "raisethysword"]])
   writefln("distance(%s, %s): %d", p[0], p[1], distance(p[0], p[1]));

}</lang> A recursive version, as suggested. I think the result distance is the maximum depth of recursion. A distance of 8 may cost an aeon time to finish! <lang d>import std.stdio, std.algorithm, std.range;

int lDistR(T) (T[] a, T[] b) {

   if (a.length == 0)
       return b.length;
   if (b.length == 0)
       return a.length;
   if (a.length == b.length) {
       auto r = iota(a.length);
       auto u = until!((i){ return a[i] != b[i]; })(r);
       if (equal(r, u)) // u is a short-circuit test until any
                        // mismatch
           return 0;    // u is equivalent to r, means no
                        // mismatch found
   }
   T[][] candidate;
   immutable alen = a.length;
   immutable blen = b.length;
   // mutate _a_ by 1 edit to create members of candidate
   // delete an _a_ element
   if (alen > blen)
       foreach (i; 0 .. alen)
           candidate ~= a[0 .. i] ~ a[i+1 .. $];
   // insert matching _b_ element to _a_
   if (alen < blen) {
       foreach (i; 0 .. alen+1) // from left
           candidate ~=  a[0 .. i] ~ b[i] ~ a[i .. $];
       foreach (i; 0 .. alen+1) // from right
           candidate ~=  a[0 .. $-i] ~ b[$ - i - 1] ~ a[$-i .. $];
   }
   // subsistute matching _a_ element with _b_'s
   if (alen == blen)
       foreach (i; 0 .. alen)
           if (a[i] != b[i])
               candidate ~= a[0..i] ~ b[i] ~ a[i+1 .. $];
   // exclusive cases, so only 1 edit is make to create each
   // new candidate
   auto minDist = int.max; // minimum distance on this run
   foreach (e; candidate) {
       // _a_ &  _e_ is 1 edit away
       immutable tempDist = lDistR(e, b) + 1;
       if (minDist > tempDist)
           minDist = tempDist;
   }
   return minDist;

}

void main() {

   writeln(lDistR("kitten", "sitting"));

}</lang>

Frink

Frink has a built-in function to calculate the Levenshtein edit distance between two strings: <lang frink> println[editDistance["kitten","sitting"]] </lang>

Go

WP algorithm: <lang go>package main

import "fmt"

func main() {

   fmt.Println(ld("kitten", "sitting"))

}

func ld(s, t string) int {

   d := make([][]int, len(s)+1)
   for i := range d {
       d[i] = make([]int, len(t)+1)
   }
   for i := range d {
       d[i][0] = i
   }
   for j := range d[0] {
       d[0][j] = j
   }
   for j := 1; j <= len(t); j++ {
       for i := 1; i <= len(s); i++ {
           if s[i-1] == t[j-1] {
               d[i][j] = d[i-1][j-1]
           } else {
               min := d[i-1][j]
               if d[i][j-1] < min {
                   min = d[i][j-1]
               }
               if d[i-1][j-1] < min {
                   min = d[i-1][j-1]
               }
               d[i][j] = min + 1
           }
       }
   }
   return d[len(s)][len(t)]

}</lang> Output:

3

Haskell

<lang haskell>levenshtein s1 s2 = last $ foldl transform [0 .. length s1] s2

 where transform ns@(n:ns') c = scanl calc (n+1) $ zip3 s1 ns ns'
         where calc z (c', x, y) = minimum [y+1, z+1, x + fromEnum (c' /= c)]

main = print (levenshtein "kitten" "sitting")</lang> Output:

3

Icon and Unicon

<lang unicon>procedure main()

   every process(!&input)

end

procedure process(s)

   s ? (s1 := tab(upto(' \t')), s2 := (tab(many(' \t')), tab(0))) | fail
   write("'",s1,"' -> '",s2,"' = ", levenshtein(s1,s2))

end

procedure levenshtein(s, t)

   if (n := *s+1) = 1 then return *t
   if (m := *t+1) = 1 then return *s

   every !(d := list(n,0)) := list(m, 0)
   every i := 1 to max(n,m) do d[i,1] := d[1,i] := i
   every d[1(i := 2 to n, s_i := s[iM1 := i-1]), j := 2 to m] :=
            min(d[iM1,j], d[i,jM1:=j-1],
                d[iM1,jM1] + if s_i == t[jM1] then -1 else 0) + 1
   return d[n,m]-1

end</lang>

->leven
kitten  sitting
'kitten' -> 'sitting' = 3
->

J

One approach would be a literal transcription of the wikipedia implementation:

<lang j>levenshtein=:4 :0

 D=. x +/&i.&>:&# y
 for_i.1+i.#x do.
   for_j.1+i.#y do.
     if. ((<:i){x)=(<:j){y do.
       D=.(D {~<<:i,j) (<i,j)} D
     else.
       min=. 1+<./D{~(i,j) <@:-"1#:1 2 3
       D=. min (<i,j)} D
     end.
   end.
 end.
 {:{:D

)</lang>

However, this is a rather slow and bulky approach. Another alternative would be:

<lang j>levD=: i.@-@>:@#@] ,~ >:@i.@-@#@[ ,.~ ~:/ lev=: [: {. {."1@((1&{ ,~ (1 1 , {.) <./@:+ }.)@,/\.)@,./@levD</lang>

First, we setup up an matrix of costs, with 0 or 1 for unexplored cells (1 being where the character pair corresponding to that cell position has two different characters). Note that the "cost to reach the empty string" cells go on the bottom and the right, instead of the top and the left, because this works better with J's "reduce" operator.

Then we reduce the rows of that matrix using an operation that treats those two rows as columns and reduces the rows of this derived matrix with an operation that gives the (unexplored cell + the minumum of the other cells) followed by (the cell adjacent to the previously unexplored cell.

Example use:

<lang j> 'kitten' levenshtein 'sitting' 3

  'kitten' lev 'sitting'

3</lang>

Time and space use:

<lang j> ts=: 6!:2,7!:2

  ts kitten levenshtein sitting

0.00153132 12800

  ts kitten lev sitting

0.000132101 5376</lang>

(The J flavored variant winds up being about 10 times faster, in J, for this test case, than the explicit version.)

See the Levenshtein distance essay on the Jwiki for additional solutions.

Java

Based on the algorithm outlined in the Wikipedia article:

<lang java>public class LevenshteinDistance {

 public static int computeDistance(String s1, String s2)
 {
   s1 = s1.toLowerCase();
   s2 = s2.toLowerCase();
   
   int[] costs = new int[s2.length() + 1];
   for (int i = 0; i <= s1.length(); i++)
   {
     int lastValue = i;
     for (int j = 0; j <= s2.length(); j++)
     {
       if (i == 0)
         costs[j] = j;
       else
       {
         if (j > 0)
         {
           int newValue = costs[j - 1];
           if (s1.charAt(i - 1) != s2.charAt(j - 1))
             newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1;
           costs[j - 1] = lastValue;
           lastValue = newValue;
         }
       }
     }
     if (i > 0)
       costs[s2.length()] = lastValue;
   }
   return costs[s2.length()];
 }
 public static void printDistance(String s1, String s2)
 {
   System.out.println(s1 + "-->" + s2 + ": " + computeDistance(s1, s2));
 }
 
 public static void main(String[] args)
 {
   printDistance("kitten", "sitting");
   printDistance("rosettacode", "raisethysword");
   printDistance(new StringBuilder("rosettacode").reverse().toString(), new StringBuilder("raisethysword").reverse().toString());
   for (int i = 1; i < args.length; i += 2)
     printDistance(args[i - 1], args[i]);
 }

} </lang>

Output:

kitten-->sitting: 3
rosettacode-->raisethysword: 8
edocattesor-->drowsyhtesiar: 8

OCaml

Translation of the pseudo-code of the Wikipedia article:

<lang ocaml>let minimum a b c =

 min a (min b c)

let levenshtein_distance s t =

 let m = String.length s
 and n = String.length t in
 (* for all i and j, d.(i).(j) will hold the Levenshtein distance between
    the first i characters of s and the first j characters of t *)
 let d = Array.make_matrix (m+1) (n+1) 0 in
 for i = 0 to m do
   d.(i).(0) <- i  (* the distance of any first string to an empty second string *)
 done;
 for j = 0 to n do
   d.(0).(j) <- j  (* the distance of any second string to an empty first string *)
 done;
 for j = 1 to n do
   for i = 1 to m do
     if s.[i-1] = t.[j-1] then
       d.(i).(j) <- d.(i-1).(j-1)  (* no operation required *)
     else
       d.(i).(j) <- minimum
                      (d.(i-1).(j) + 1)   (* a deletion *)
                      (d.(i).(j-1) + 1)   (* an insertion *)
                      (d.(i-1).(j-1) + 1) (* a substitution *)
   done;
 done;
 d.(m).(n)

let test s t =

 Printf.printf " %s -> %s = %d\n" s t (levenshtein_distance s t);

let () =

 test "kitten" "sitting";
 test "rosettacode" "raisethysword";
</lang>

PicoLisp

Translation of the pseudo-code in the Wikipedia article: <lang PicoLisp>(de levenshtein (A B)

  (let D
     (cons
        (range 0 (length A))
        (mapcar
           '((I) (cons I (copy A)))
           (range 1 (length B)) ) )
     (for (J . Y) B
        (for (I . X) A
           (set
              (nth D (inc J) (inc I))
              (if (= X Y)
                 (get D J I)
                 (inc
                    (min
                       (get D J (inc I))
                       (get D (inc J) I)
                       (get D J I) ) ) ) ) ) ) ) )</lang>

or, using 'map' to avoid list indexing: <lang PicoLisp>(de levenshtein (A B)

  (let D
     (cons
        (range 0 (length A))
        (mapcar
           '((I) (cons I (copy A)))
           (range 1 (length B)) ) )
     (map
        '((B Y)
           (map
              '((A X P)
                 (set (cdr P)
                    (if (= (car A) (car B))
                       (car X)
                       (inc (min (cadr X) (car P) (car X))) ) ) )
              A
              (car Y)
              (cadr Y) ) )
        B
        D ) ) )</lang>

Output in both cases:

: (levenshtein (chop "kitten") (chop "sitting"))
-> 3

PureBasic

Based on Wikipedia's pseudocode. <lang PureBasic>Procedure LevenshteinDistance(A_string$, B_String$)

 Protected m, n, i, j, min, k, l
 m = Len(A_string$)
 n = Len(B_String$)
 Dim D(m, n)
 
 For i=0 To m: D(i,0)=i: Next
 For j=0 To n: D(0,j)=j: Next
 
 For j=1 To n
   For i=1 To m
     If Mid(A_string$,i,1) = Mid(B_String$,j,1)
       D(i,j) = D(i-1, j-1); no operation required
     Else
       min = D(i-1, j)+1   ; a deletion
       k   = D(i, j-1)+1   ; an insertion
       l   = D(i-1, j-1)+1 ; a substitution
       If k < min: min=k: EndIf
       If l < min: min=l: EndIf
       D(i,j) = min
     EndIf
   Next
 Next
 ProcedureReturn D(m,n)

EndProcedure

- Testing

n = LevenshteinDistance("kitten", "sitting") MessageRequester("Info","Levenshtein Distance= "+Str(n))</lang>

Python

Implementation of the wikipedia algorithm, optimized for memory <lang python> def levenshteinDistance(s1,s2): if len(s1) > len(s2): s1,s2 = s2,s1 distances = range(len(s1) + 1) for index2,char2 in enumerate(s2): newDistances = [index2+1] for index1,char1 in enumerate(s1): if char1 == char2: newDistances.append(distances[index1]) else: newDistances.append(1 + min((distances[index1],distances[index1+1],newDistances[-1]))) distances = newDistances return distances[-1]

print levenshteinDistance("kitten","sitting"),levenshteinDistance("rosettacode","raisethysword") </lang>

Output:

3 8

Ruby

Implementation of the wikipedia algorithm <lang ruby> def levenshtein_distance(s, t)

 m = s.length
 n = t.length
 return m if n == 0
 return n if m == 0
 d = Array.new(m+1) {Array.new(n+1)}
 (0..m).each {|i| d[i][0] = i}
 (0..n).each {|j| d[0][j] = j}
 (1..n).each do |j|
   (1..m).each do |i|
     d[i][j] = if s[i-1] == t[j-1]  # adjust index into string
                 d[i-1][j-1]       # no operation required
               else
                 [ d[i-1][j]+1,    # deletion
                   d[i][j-1]+1,    # insertion
                   d[i-1][j-1]+1,  # substitution
                 ].min
               end
   end
 end
 d[m][n]

end

[ ['kitten','sitting'], ['saturday','sunday'], ["rosettacode", "raisethysword"] ].each do |s,t|

 puts "levenshtein_distance('#{s}', '#{t}') = #{levenshtein_distance(s, t)}"

end</lang>

output

levenshtein_distance('kitten', 'sitting') = 3
levenshtein_distance('saturday', 'sunday') = 3
levenshtein_distance('rosettacode', 'raisethysword') = 8

Tcl

<lang tcl>proc levenshteinDistance {s t} {

   # Edge cases
   if {![set n [string length $t]]} {

return [string length $s]

   } elseif {![set m [string length $s]]} {

return $n

   }
   # Fastest way to initialize
   for {set i 0} {$i <= $m} {incr i} {

lappend d 0 lappend p $i

   }
   # Loop, computing the distance table (well, a moving section)
   for {set j 0} {$j < $n} {} {

set tj [string index $t $j] lset d 0 [incr j] for {set i 0} {$i < $m} {} { set a [expr {[lindex $d $i]+1}] set b [expr {[lindex $p $i]+([string index $s $i] ne $tj)}] set c [expr {[lindex $p [incr i]]+1}] # Faster than min($a,$b,$c) lset d $i [expr {$a<$b ? $c<$a ? $c : $a : $c<$b ? $c : $b}] } # Swap set nd $p; set p $d; set d $nd

   }
   # The score is at the end of the last-computed row
   return [lindex $p end]

}</lang> Usage: <lang tcl>puts [levenshteinDistance "kitten" "sitting"]; # Prints 3</lang>