Levenshtein distance/Alignment

Revision as of 19:53, 19 August 2016 by rosettacode>Gerard Schildberger (added whitespace before the TOC (table of contents), added a ;Task: (bold) header.)

The Levenshtein distance algorithm returns the number of atomic operations (insertion, deletion or edition) that must be performed on a string in order to obtain an other one, but it does not say anything about the actual operations used or their order.

Levenshtein distance/Alignment is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

An alignment is a notation used to describe the operations used to turn a string into an other. At some point in the strings, the minus character ('-') is placed in order to signify that a character must be added at this very place. For instance, an alignment between the words 'place' and 'palace' is:

P-LACE
PALACE


Task

Write a function that shows the alignment of two strings for the corresponding levenshtein distance.

As an example, use the words "rosettacode" and "raisethysword".

You can either implement an algorithm, or use a dedicated library (thus showing us how it is named in your language).

C

<lang c>#include <stdio.h>

  1. include <stdlib.h>
  2. include <string.h>

typedef struct edit_s edit_t, *edit; struct edit_s { char c1, c2; int n; edit next; };

void leven(char *a, char *b) { int i, j, la = strlen(a), lb = strlen(b); edit *tbl = malloc(sizeof(edit) * (1 + la)); tbl[0] = calloc((1 + la) * (1 + lb), sizeof(edit_t)); for (i = 1; i <= la; i++) tbl[i] = tbl[i-1] + (1+lb);

for (i = la; i >= 0; i--) { char *aa = a + i; for (j = lb; j >= 0; j--) { char *bb = b + j; if (!*aa && !*bb) continue;

edit e = &tbl[i][j]; edit repl = &tbl[i+1][j+1]; edit dela = &tbl[i+1][j]; edit delb = &tbl[i][j+1];

e->c1 = *aa; e->c2 = *bb; if (!*aa) { e->next = delb; e->n = e->next->n + 1; continue; } if (!*bb) { e->next = dela; e->n = e->next->n + 1; continue; }

e->next = repl; if (*aa == *bb) { e->n = e->next->n; continue; }

if (e->next->n > delb->n) { e->next = delb; e->c1 = 0; } if (e->next->n > dela->n) { e->next = dela; e->c1 = *aa; e->c2 = 0; } e->n = e->next->n + 1; } }

edit p = tbl[0]; printf("%s -> %s: %d edits\n", a, b, p->n);

while (p->next) { if (p->c1 == p->c2) printf("%c", p->c1); else { putchar('('); if (p->c1) putchar(p->c1); putchar(','); if (p->c2) putchar(p->c2); putchar(')'); }

p = p->next; } putchar('\n');

free(tbl[0]); free(tbl); }

int main(void) { leven("raisethysword", "rosettacode"); return 0; }</lang>

Output:
raisethysword -> rosettacode: 8 edits
r(a,o)(i,)set(h,t)(y,a)(s,c)(w,)o(r,d)(d,e)

D

Using the standard library. <lang d>void main() {

   import std.stdio, std.algorithm;
   immutable s1 = "rosettacode";
   immutable s2 = "raisethysword";
   string s1b, s2b;
   size_t pos1, pos2;
   foreach (immutable c; levenshteinDistanceAndPath(s1, s2)[1]) {
       final switch (c) with (EditOp) {
           case none, substitute:
               s1b ~= s1[pos1++];
               s2b ~= s2[pos2++];
               break;
           case insert:
               s1b ~= "_";
               s2b ~= s2[pos2++];
               break;
           case remove:
               s1b ~= s1[pos1++];
               s2b ~= "_";
               break;
       }
   }
   writeln(s1b, "\n", s2b);

}</lang>

Output:
r_oset_tacode
raisethysword

Go

Library: biogo

Alignment computed by the Needleman-Wunch algorithm, which is used in bioinformatics. <lang go>package main

import (

   "fmt"
   "github.com/biogo/biogo/align"
   ab "github.com/biogo/biogo/alphabet"
   "github.com/biogo/biogo/feat"
   "github.com/biogo/biogo/seq/linear"

)

func main() {

   // Alphabets for things like DNA are predefined in biogo, but we
   // define our own here.
   lc := ab.Must(ab.NewAlphabet("-abcdefghijklmnopqrstuvwxyz",
       feat.Undefined, '-', 0, true))
   // Construct scoring matrix for Needleman-Wunch algorithm.
   // We leave zeros on the diagonal for the Levenshtein distance of an
   // exact match and put -1s everywhere else for the Levenshtein distance
   // of an edit.
   nw := make(align.NW, lc.Len())
   for i := range nw {
       r := make([]int, lc.Len())
       nw[i] = r
       for j := range r {
           if j != i {
               r[j] = -1
           }
       }
   }
   // define input sequences
   a := &linear.Seq{Seq: ab.BytesToLetters([]byte("rosettacode"))}
   a.Alpha = lc
   b := &linear.Seq{Seq: ab.BytesToLetters([]byte("raisethysword"))}
   b.Alpha = lc
   // perform alignment
   aln, err := nw.Align(a, b)
   // format and display result
   if err != nil {
       fmt.Println(err)
       return
   }
   fa := align.Format(a, b, aln, '-')
   fmt.Printf("%s\n%s\n", fa[0], fa[1])
   aa := fmt.Sprint(fa[0])
   ba := fmt.Sprint(fa[1])
   ma := make([]byte, len(aa))
   for i := range ma {
       if aa[i] == ba[i] {
           ma[i] = ' '
       } else {
           ma[i] = '|'
       }
   }
   fmt.Println(string(ma))

}</lang>

Output:

The lines after the alignment point out the 8 edits.

r-oset-tacode
raisethysword
 ||   |||| ||

Java

<lang java>public class LevenshteinAlignment {

   public static String[] alignment(String a, String b) {
       a = a.toLowerCase();
       b = b.toLowerCase();
       // i == 0
       int[][] costs = new int[a.length()+1][b.length()+1];
       for (int j = 0; j <= b.length(); j++)
           costs[0][j] = j;
       for (int i = 1; i <= a.length(); i++) {
           costs[i][0] = i;
           for (int j = 1; j <= b.length(); j++) {
               costs[i][j] = Math.min(1 + Math.min(costs[i-1][j], costs[i][j-1]), a.charAt(i - 1) == b.charAt(j - 1) ? costs[i-1][j-1] : costs[i-1][j-1] + 1);
           }
       }

// walk back through matrix to figure out path StringBuilder aPathRev = new StringBuilder(); StringBuilder bPathRev = new StringBuilder(); for (int i = a.length(), j = b.length(); i != 0 && j != 0; ) { if (costs[i][j] == (a.charAt(i - 1) == b.charAt(j - 1) ? costs[i-1][j-1] : costs[i-1][j-1] + 1)) { aPathRev.append(a.charAt(--i)); bPathRev.append(b.charAt(--j)); } else if (costs[i][j] == 1 + costs[i-1][j]) { aPathRev.append(a.charAt(--i)); bPathRev.append('-'); } else if (costs[i][j] == 1 + costs[i][j-1]) { aPathRev.append('-'); bPathRev.append(b.charAt(--j)); } }

       return new String[]{aPathRev.reverse().toString(), bPathRev.reverse().toString()};
   }
   public static void main(String[] args) {

String[] result = alignment("rosettacode", "raisethysword"); System.out.println(result[0]); System.out.println(result[1]);

   }

}</lang>

Output:
r-oset-tacode
raisethysword

Mathematica / Wolfram Language

<lang Mathematica>DamerauLevenshteinDistance["rosettacode", "raisethysword"]</lang>

Output:
8

Perl

<lang perl>use strict; use warnings;

use List::Util qw(min);

sub levenshtein_distance_alignment {

   my @s = ('^', split //, shift);
   my @t = ('^', split //, shift);

   my @A;
   @{$A[$_][0]}{qw(d s t)} = ($_, join(, @s[1 .. $_]), ('~' x $_)) for 0 .. $#s;
   @{$A[0][$_]}{qw(d s t)} = ($_, ('-' x $_), join , @t[1 .. $_])  for 0 .. $#t;
   for my $i (1 .. $#s) {
       for my $j (1 .. $#t) {

if ($s[$i] ne $t[$j]) { $A[$i][$j]{d} = 1 + ( my $min = min $A[$i-1][$j]{d}, $A[$i][$j-1]{d}, $A[$i-1][$j-1]{d} ); @{$A[$i][$j]}{qw(s t)} = $A[$i-1][$j]{d} == $min ? ($A[$i-1][$j]{s}.$s[$i], $A[$i-1][$j]{t}.'-') : $A[$i][$j-1]{d} == $min ? ($A[$i][$j-1]{s}.'-', $A[$i][$j-1]{t}.$t[$j]) : ($A[$i-1][$j-1]{s}.$s[$i], $A[$i-1][$j-1]{t}.$t[$j]); }

           else {

@{$A[$i][$j]}{qw(d s t)} = ( $A[$i-1][$j-1]{d}, $A[$i-1][$j-1]{s}.$s[$i], $A[$i-1][$j-1]{t}.$t[$j] );

           }
       }
   }
   return @{$A[-1][-1]}{'s', 't'};

}

print join "\n", levenshtein_distance_alignment "rosettacode", "raisethysword";</lang>

Output:
ro-settac-o-de
raisethysword-

Perl 6

This example is incorrect. It does not accomplish the given task. Please fix the code and remove this message.
Translation of: Perl

<lang Perl 6>sub align ( Str $σ, Str $t ) {

   my @s = *, $σ.comb;
   my @t = *, $t.comb;
    
   my @A;
   @A[$_][ 0]<d s t> = $_, @s[1..$_].join, '-' x $_ for ^@s;
   @A[ 0][$_]<d s t> = $_, '-' x $_, @t[1..$_].join for ^@t;
    
   for 1 ..^ @s X 1..^ @t -> (\i, \j) {

if @s[i] ne @t[j] { @A[i][j]<d> = 1 + my $min = min @A[i-1][j]<d>, @A[i][j-1]<d>, @A[i-1][j-1]<d>; @A[i][j] = @A[i-1][j]<d> == $min ?? (@A[i-1][j] Z~ @s[i], '-') !! @A[i][j-1]<d> == $min ?? (@A[i][j-1] Z~ '-', @t[j]) !! (@A[i-1][j-1] Z~ @s[i], @t[j]); } else { @A[i][j]<d s t> = @A[i-1][j-1]<d s t> Z~ , @s[i], @t[j]; }

   }
    
   return @A[*-1][*-1];

}

.say for align |<rosettacode raisethysword>;</lang>

Output:
ro-settac-o-de
raisethysword-

Racket

Simple version (no aligment)

First we will analyze this solution that only computes the distance. See http://blog.racket-lang.org/2012/08/dynamic-programming-versus-memoization.html for a discussion of the code.

<lang racket>#lang racket

(define (memoize f)

 (local ([define table (make-hash)])
   (lambda args
     (dict-ref! table args (λ () (apply f args))))))

(define levenshtein

 (memoize
  (lambda (s t)
    (cond
      [(and (empty? s) (empty? t)) 0]
      [(empty? s) (length t)]
      [(empty? t) (length s)]
      [else
       (if (equal? (first s) (first t))
           (levenshtein (rest s) (rest t))
           (min (add1 (levenshtein (rest s) t))
                (add1 (levenshtein s (rest t)))
                (add1 (levenshtein (rest s) (rest t)))))]))))</lang>

Demonstration: <lang racket>(levenshtein (string->list "rosettacode")

            (string->list "raisethysword"))</lang>
Output:
8

Complete version

Now we extend the code from http://blog.racket-lang.org/2012/08/dynamic-programming-versus-memoization.html to show also the alignment. The code is very similar, but it stores the partial results (number of edits and alignment of each substring) in a lev structure. <lang Racket>#lang racket

(struct lev (n s t))

(define (lev-add old n sx tx)

 (lev (+ n (lev-n old))
      (cons sx (lev-s old))
      (cons tx (lev-t old))))

(define (list-repeat n v)

 (build-list n (lambda (_) v)))

(define (memoize f)

 (local ([define table (make-hash)])
   (lambda args
     (dict-ref! table args (λ () (apply f args))))))

(define levenshtein/list

 (memoize
  (lambda (s t)
    (cond
      [(and (empty? s) (empty? t)) 
       (lev 0 '() '())]
      [(empty? s) 
       (lev (length t) (list-repeat (length t) #\-) t)]
      [(empty? t) 
       (lev (length s) s (list-repeat (length s) #\-))]
      [else
       (if (equal? (first s) (first t))
           (lev-add (levenshtein/list (rest s) (rest t))
                    0 (first s) (first t))
           (argmin lev-n (list (lev-add (levenshtein/list (rest s) t)
                                        1 (first s) #\-)
                               (lev-add (levenshtein/list s (rest t))
                                        1 #\- (first t))
                               (lev-add (levenshtein/list (rest s) (rest t))
                                        1 (first s) (first t)))))]))))

(define (levenshtein s t)

 (let ([result (levenshtein/list (string->list s) 
                                 (string->list t))])
   (values (lev-n result)
           (list->string (lev-s result)) 
           (list->string (lev-t result)))))</lang>

Demonstration: <lang racket>(let-values ([(dist exp-s exp-t)

             (levenshtein "rosettacode" "raisethysword")])
 (printf "levenshtein: ~a edits\n" dist)
 (displayln exp-s)
 (displayln exp-t))</lang>
Output:
levenshtein: 8 edits
r-oset-taco-de
raisethysword-

Ruby

Translation of: Tcl

uses "lcs" from here <lang ruby>require 'lcs'

def levenshtein_align(a, b)

 apos, bpos = LCS.new(a, b).backtrack2
 
 c = ""
 d = ""
 x0 = y0 = -1
 dx = dy = 0
 apos.zip(bpos) do |x,y|
   diff = x + dx - y - dy
   if diff < 0
     dx -= diff
     c += "-" * (-diff)
   elsif diff > 0
     dy += diff
     d += "-" * diff
   end
   c += a[x0+1..x]
   x0 = x
   d += b[y0+1..y]
   y0 = y
 end
 
 c += a[x0+1..-1]
 d += b[y0+1..-1]
 diff = a.length + y0 - b.length - x0
 if diff < 0
   c += "-" * (-diff)
 elsif diff > 0
   d += "-" * diff
 end
 [c, d]

end

puts levenshtein_align("rosettacode", "raisethysword")</lang>

Output:
r-oset-taco-de
raisethysword-

Sidef

Translation of: Perl

<lang ruby>func align(s, t) {

   s.chars!.prepend!('^')
   t.chars!.prepend!('^')
   var A = []
   for i in ^s { A[i][0]{<d s t>} = (i, s.ft(1, i).join, '~' * i) }
   for i in ^t { A[0][i]{<d s t>} = (i, '-' * i, t.ft(1, i).join) }
   s.end.times { |i|
     t.end.times { |j|
       if (s[i] != t[j]) {
         A[i][j]{:d} = 1+(
           var min = Math.min(A[i-1][j]{:d}, A[i][j-1]{:d}, A[i-1][j-1]{:d})
         )
         A[i][j]{} = @|(A[i-1][j]{:d} == min
             ? [A[i-1][j]{:s}+s[i], A[i-1][j]{:t}+'-']
             : (A[i][j-1]{:d} == min
             ? [A[i][j-1]{:s}+'-', A[i][j-1]{:t}+t[j]]
             : [A[i-1][j-1]{:s}+s[i], A[i-1][j-1]{:t}+t[j]]))
       }
       else {
         A[i][j]{<d s t>} = (
             A[i-1][j-1]{:d},
             A[i-1][j-1]{:s}+s[i],
             A[i-1][j-1]{:t}+t[j],
         )
       }
     }
   }
   return [A[-1][-1]{}]

}

align("rosettacode", "raisethysword").each { .say }</lang>

Output:
ro-settac-o-de
raisethysword-

Tcl

Library: Tcllib (Package: struct::list)

<lang tcl>package require struct::list proc levenshtein/align {a b} {

   lassign [struct::list longestCommonSubsequence [split $a ""] [split $b ""]]\

apos bpos

   set c ""
   set d ""
   set x0 [set y0 -1]
   set dx [set dy 0]
   foreach x $apos y $bpos {

if {$x+$dx < $y+$dy} { set n [expr {($y+$dy)-($x+$dx)}] incr dx $n append c [string repeat "-" $n] } elseif {$x+$dx > $y+$dy} { set n [expr {($x+$dx)-($y+$dy)}] incr dy $n append d [string repeat "-" $n] } append c [string range $a $x0+1 $x] set x0 $x append d [string range $b $y0+1 $y] set y0 $y

   }
   append c [string range $a $x0+1 end]
   append d [string range $b $y0+1 end]
   set al [string length $a]
   set bl [string length $b]
   if {$al+$y0 < $bl+$x0} {

append c [string repeat "-" [expr {$bl+$x0-$y0-$al}]]

   } elseif {$bl+$x0 < $al+$y0} {

append d [string repeat "-" [expr {$al+$y0-$x0-$bl}]]

   }
   return $c\n$d

}

puts [levenshtein/align "rosettacode" "raisethysword"]</lang>

Output:
r-oset-taco-de
raisethysword-

zkl

Translation of: Java

<lang zkl>fcn alignment(a,b){

  a,b = a.toLower(), b.toLower();
  costs := (a.len()+1).pump(List(),'wrap(a){ [1..b.len()].pump(List(a)) });
  foreach i,j in (a.len()+1, [1..b.len()]){
     costs[i][j] = ( 1 + costs[i-1][j].min(costs[i][j-1]) ) 
        .min(if(a[i-1] == b[j-1]) costs[i-1][j-1] else costs[i-1][j-1] + 1);
  }
  // walk back through matrix to figure out path
  aPathRev,bPathRev := Data(),Data();  // byte buckets
  i,j := a.len(), b.len(); 
  while(i!=0 and j!= 0){
     if (costs[i][j] == 
         ( if(a[i-1]==b[j-1]) costs[i-1][j-1] else costs[i-1][j-1]+1 )){
        aPathRev.append(a[i-=1]);

bPathRev.append(b[j-=1]);

     } else if(costs[i][j] == 1+costs[i-1][j]){

aPathRev.append(a[i-=1]); bPathRev.append("-");

     } else if (costs[i][j] == 1+costs[i][j-1]){

aPathRev.append("-"); bPathRev.append(b[j-=1]);

     }
  }
  return(aPathRev.text.reverse(), bPathRev.text.reverse())

}</lang> <lang zkl>result := alignment("rosettacode", "raisethysword"); println(result[0]); println(result[1]);</lang>

Output:
r-oset-tacode
raisethysword