Levenshtein distance/Alignment: Difference between revisions
m (→{{header|Go}}: fix imports) |
(→{{header|Go}}: Additional output to point out edits.) |
||
Line 189: | Line 189: | ||
fa := align.Format(a, b, aln, '-') |
fa := align.Format(a, b, aln, '-') |
||
fmt.Printf("%s\n%s\n", fa[0], fa[1]) |
fmt.Printf("%s\n%s\n", fa[0], fa[1]) |
||
aa := fmt.Sprint(fa[0]) |
|||
ba := fmt.Sprint(fa[1]) |
|||
ma := make([]byte, len(aa)) |
|||
for i := range ma { |
|||
if aa[i] == ba[i] { |
|||
ma[i] = ' ' |
|||
} else { |
|||
ma[i] = '|' |
|||
} |
|||
} |
|||
fmt.Println(string(ma)) |
|||
}</lang> |
}</lang> |
||
{{out}} |
{{out}} |
||
The lines after the alignment point out the 8 edits. |
|||
<pre> |
<pre> |
||
r-oset-tacode |
r-oset-tacode |
||
raisethysword |
raisethysword |
||
|| |||| || |
|||
</pre> |
</pre> |
||
Revision as of 02:36, 30 March 2015
The Levenshtein distance algorithm returns the number of atomic operations (insertion, deletion or edition) that must be performed on a string in order to obtain an other one, but it does not say anything about the actual operations used or their order.
An alignment is a notation used to describe the operations used to turn a string into an other. At some point in the strings, the minus character ('-') is placed in order to signify that a character must be added at this very place. For instance, an alignment between the words 'place' and 'palace' is:
P-LACE PALACE
For this task, write a function that shows the alignment of two strings for the corresponding levenshtein distance. As an example, use the words "rosettacode" and "raisethysword".
You can either implement an algorithm, or use a dedicated library (thus showing us how it is named in your language).
C
<lang c>#include <stdio.h>
- include <stdlib.h>
- include <string.h>
typedef struct edit_s edit_t, *edit; struct edit_s { char c1, c2; int n; edit next; };
void leven(char *a, char *b) { int i, j, la = strlen(a), lb = strlen(b); edit *tbl = malloc(sizeof(edit) * (1 + la)); tbl[0] = calloc((1 + la) * (1 + lb), sizeof(edit_t)); for (i = 1; i <= la; i++) tbl[i] = tbl[i-1] + (1+lb);
for (i = la; i >= 0; i--) { char *aa = a + i; for (j = lb; j >= 0; j--) { char *bb = b + j; if (!*aa && !*bb) continue;
edit e = &tbl[i][j]; edit repl = &tbl[i+1][j+1]; edit dela = &tbl[i+1][j]; edit delb = &tbl[i][j+1];
e->c1 = *aa; e->c2 = *bb; if (!*aa) { e->next = delb; e->n = e->next->n + 1; continue; } if (!*bb) { e->next = dela; e->n = e->next->n + 1; continue; }
e->next = repl; if (*aa == *bb) { e->n = e->next->n; continue; }
if (e->next->n > delb->n) { e->next = delb; e->c1 = 0; } if (e->next->n > dela->n) { e->next = dela; e->c1 = *aa; e->c2 = 0; } e->n = e->next->n + 1; } }
edit p = tbl[0]; printf("%s -> %s: %d edits\n", a, b, p->n);
while (p->next) { if (p->c1 == p->c2) printf("%c", p->c1); else { putchar('('); if (p->c1) putchar(p->c1); putchar(','); if (p->c2) putchar(p->c2); putchar(')'); }
p = p->next; } putchar('\n');
free(tbl[0]); free(tbl); }
int main(void) { leven("raisethysword", "rosettacode"); return 0; }</lang>
- Output:
raisethysword -> rosettacode: 8 edits r(a,o)(i,)set(h,t)(y,a)(s,c)(w,)o(r,d)(d,e)
D
Using the standard library. <lang d>void main() {
import std.stdio, std.algorithm;
immutable s1 = "rosettacode"; immutable s2 = "raisethysword";
string s1b, s2b; size_t pos1, pos2;
foreach (immutable c; levenshteinDistanceAndPath(s1, s2)[1]) { final switch (c) with (EditOp) { case none, substitute: s1b ~= s1[pos1++]; s2b ~= s2[pos2++]; break; case insert: s1b ~= "_"; s2b ~= s2[pos2++]; break; case remove: s1b ~= s1[pos1++]; s2b ~= "_"; break; } }
writeln(s1b, "\n", s2b);
}</lang>
- Output:
r_oset_tacode raisethysword
Go
Alignment computed by the Needleman-Wunch algorithm, which is used in bioinformatics. <lang go>package main
import (
"fmt"
"github.com/biogo/biogo/align" ab "github.com/biogo/biogo/alphabet" "github.com/biogo/biogo/feat" "github.com/biogo/biogo/seq/linear"
)
func main() {
// Alphabets for things like DNA are predefined in biogo, but we // define our own here. lc := ab.Must(ab.NewAlphabet("-abcdefghijklmnopqrstuvwxyz", feat.Undefined, '-', 0, true)) // Construct scoring matrix for Needleman-Wunch algorithm. // We leave zeros on the diagonal for the Levenshtein distance of an // exact match and put -1s everywhere else for the Levenshtein distance // of an edit. nw := make(align.NW, lc.Len()) for i := range nw { r := make([]int, lc.Len()) nw[i] = r for j := range r { if j != i { r[j] = -1 } } } // define input sequences a := &linear.Seq{Seq: ab.BytesToLetters([]byte("rosettacode"))} a.Alpha = lc b := &linear.Seq{Seq: ab.BytesToLetters([]byte("raisethysword"))} b.Alpha = lc // perform alignment aln, err := nw.Align(a, b) // format and display result if err != nil { fmt.Println(err) return } fa := align.Format(a, b, aln, '-') fmt.Printf("%s\n%s\n", fa[0], fa[1]) aa := fmt.Sprint(fa[0]) ba := fmt.Sprint(fa[1]) ma := make([]byte, len(aa)) for i := range ma { if aa[i] == ba[i] { ma[i] = ' ' } else { ma[i] = '|' } } fmt.Println(string(ma))
}</lang>
- Output:
The lines after the alignment point out the 8 edits.
r-oset-tacode raisethysword || |||| ||
Java
<lang java>public class LevenshteinAlignment {
public static String[] alignment(String a, String b) { a = a.toLowerCase(); b = b.toLowerCase(); // i == 0 int[][] costs = new int[a.length()+1][b.length()+1]; for (int j = 0; j <= b.length(); j++) costs[0][j] = j; for (int i = 1; i <= a.length(); i++) { costs[i][0] = i; for (int j = 1; j <= b.length(); j++) { costs[i][j] = Math.min(1 + Math.min(costs[i-1][j], costs[i][j-1]), a.charAt(i - 1) == b.charAt(j - 1) ? costs[i-1][j-1] : costs[i-1][j-1] + 1); } }
// walk back through matrix to figure out path StringBuilder aPathRev = new StringBuilder(); StringBuilder bPathRev = new StringBuilder(); for (int i = a.length(), j = b.length(); i != 0 && j != 0; ) { if (costs[i][j] == (a.charAt(i - 1) == b.charAt(j - 1) ? costs[i-1][j-1] : costs[i-1][j-1] + 1)) { aPathRev.append(a.charAt(--i)); bPathRev.append(b.charAt(--j)); } else if (costs[i][j] == 1 + costs[i-1][j]) { aPathRev.append(a.charAt(--i)); bPathRev.append('-'); } else if (costs[i][j] == 1 + costs[i][j-1]) { aPathRev.append('-'); bPathRev.append(b.charAt(--j)); } }
return new String[]{aPathRev.reverse().toString(), bPathRev.reverse().toString()}; }
public static void main(String[] args) {
String[] result = alignment("rosettacode", "raisethysword"); System.out.println(result[0]); System.out.println(result[1]);
}
}</lang>
- Output:
r-oset-tacode raisethysword
Mathematica / Wolfram Language
<lang Mathematica>DamerauLevenshteinDistance["rosettacode", "raisethysword"]</lang>
- Output:
8
Perl
<lang perl>use strict; use warnings;
use List::Util qw(min);
sub levenshtein_distance_alignment {
my @s = ('^', split //, shift); my @t = ('^', split //, shift); my @A; @{$A[$_][0]}{qw(d s t)} = ($_, join(, @s[1 .. $_]), ('~' x $_)) for 0 .. $#s; @{$A[0][$_]}{qw(d s t)} = ($_, ('-' x $_), join , @t[1 .. $_]) for 0 .. $#t; for my $i (1 .. $#s) { for my $j (1 .. $#t) {
if ($s[$i] ne $t[$j]) { $A[$i][$j]{d} = 1 + ( my $min = min $A[$i-1][$j]{d}, $A[$i][$j-1]{d}, $A[$i-1][$j-1]{d} ); @{$A[$i][$j]}{qw(s t)} = $A[$i-1][$j]{d} == $min ? ($A[$i-1][$j]{s}.$s[$i], $A[$i-1][$j]{t}.'-') : $A[$i][$j-1]{d} == $min ? ($A[$i][$j-1]{s}.'-', $A[$i][$j-1]{t}.$t[$j]) : ($A[$i-1][$j-1]{s}.$s[$i], $A[$i-1][$j-1]{t}.$t[$j]); }
else {
@{$A[$i][$j]}{qw(d s t)} = ( $A[$i-1][$j-1]{d}, $A[$i-1][$j-1]{s}.$s[$i], $A[$i-1][$j-1]{t}.$t[$j] );
} } } return @{$A[-1][-1]}{'s', 't'};
}
print join "\n", levenshtein_distance_alignment "rosettacode", "raisethysword";</lang>
- Output:
ro-settac-o-de raisethysword-
Perl 6
<lang Perl 6>sub align ( Str $σ, Str $t ) {
my @s = *, $σ.comb; my @t = *, $t.comb; my @A; @A[$_][ 0]<d s t> = $_, @s[1..$_].join, '-' x $_ for ^@s; @A[ 0][$_]<d s t> = $_, '-' x $_, @t[1..$_].join for ^@t; for 1 ..^ @s X 1..^ @t -> \i, \j {
if @s[i] ne @t[j] {
@A[i][j]<d> = 1 + my $min =
min @A[i-1][j]<d>, @A[i][j-1]<d>, @A[i-1][j-1]<d>;
@A[i][j] =
@A[i-1][j]<d> == $min ?? (@A[i-1][j] Z~ @s[i], '-') !!
@A[i][j-1]<d> == $min ?? (@A[i][j-1] Z~ '-', @t[j]) !!
(@A[i-1][j-1] Z~ @s[i], @t[j]);
} else {
@A[i][j]<d s t> = @A[i-1][j-1]<d s t> Z~ , @s[i], @t[j];
}
} return @A[*-1][*-1];
}
.say for align |<rosettacode raisethysword>;</lang>
- Output:
ro-settac-o-de raisethysword-
Racket
Simple version (no aligment)
First we will analyze this solution that only computes the distance. See http://blog.racket-lang.org/2012/08/dynamic-programming-versus-memoization.html for a discussion of the code.
<lang racket>#lang racket
(define (memoize f)
(local ([define table (make-hash)]) (lambda args (dict-ref! table args (λ () (apply f args))))))
(define levenshtein
(memoize (lambda (s t) (cond [(and (empty? s) (empty? t)) 0] [(empty? s) (length t)] [(empty? t) (length s)] [else (if (equal? (first s) (first t)) (levenshtein (rest s) (rest t)) (min (add1 (levenshtein (rest s) t)) (add1 (levenshtein s (rest t))) (add1 (levenshtein (rest s) (rest t)))))]))))</lang>
Demonstration: <lang racket>(levenshtein (string->list "rosettacode")
(string->list "raisethysword"))</lang>
- Output:
8
Complete version
Now we extend the code from http://blog.racket-lang.org/2012/08/dynamic-programming-versus-memoization.html to show also the alignment. The code is very similar, but it stores the partial results (number of edits and alignment of each substring) in a lev structure. <lang Racket>#lang racket
(struct lev (n s t))
(define (lev-add old n sx tx)
(lev (+ n (lev-n old)) (cons sx (lev-s old)) (cons tx (lev-t old))))
(define (list-repeat n v)
(build-list n (lambda (_) v)))
(define (memoize f)
(local ([define table (make-hash)]) (lambda args (dict-ref! table args (λ () (apply f args))))))
(define levenshtein/list
(memoize (lambda (s t) (cond [(and (empty? s) (empty? t)) (lev 0 '() '())] [(empty? s) (lev (length t) (list-repeat (length t) #\-) t)] [(empty? t) (lev (length s) s (list-repeat (length s) #\-))] [else (if (equal? (first s) (first t)) (lev-add (levenshtein/list (rest s) (rest t)) 0 (first s) (first t)) (argmin lev-n (list (lev-add (levenshtein/list (rest s) t) 1 (first s) #\-) (lev-add (levenshtein/list s (rest t)) 1 #\- (first t)) (lev-add (levenshtein/list (rest s) (rest t)) 1 (first s) (first t)))))]))))
(define (levenshtein s t)
(let ([result (levenshtein/list (string->list s) (string->list t))]) (values (lev-n result) (list->string (lev-s result)) (list->string (lev-t result)))))</lang>
Demonstration: <lang racket>(let-values ([(dist exp-s exp-t)
(levenshtein "rosettacode" "raisethysword")]) (printf "levenshtein: ~a edits\n" dist) (displayln exp-s) (displayln exp-t))</lang>
- Output:
levenshtein: 8 edits r-oset-taco-de raisethysword-
Ruby
uses "lcs" from here <lang ruby>require 'lcs'
def levenshtein_align(a, b)
apos, bpos = LCS.new(a, b).backtrack2 c = "" d = "" x0 = y0 = -1 dx = dy = 0 apos.zip(bpos) do |x,y| diff = x + dx - y - dy if diff < 0 dx -= diff c += "-" * (-diff) elsif diff > 0 dy += diff d += "-" * diff end c += a[x0+1..x] x0 = x d += b[y0+1..y] y0 = y end c += a[x0+1..-1] d += b[y0+1..-1] diff = a.length + y0 - b.length - x0 if diff < 0 c += "-" * (-diff) elsif diff > 0 d += "-" * diff end [c, d]
end
puts levenshtein_align("rosettacode", "raisethysword")</lang>
- Output:
r-oset-taco-de raisethysword-
Tcl
<lang tcl>package require struct::list proc levenshtein/align {a b} {
lassign [struct::list longestCommonSubsequence [split $a ""] [split $b ""]]\
apos bpos
set c "" set d "" set x0 [set y0 -1] set dx [set dy 0] foreach x $apos y $bpos {
if {$x+$dx < $y+$dy} { set n [expr {($y+$dy)-($x+$dx)}] incr dx $n append c [string repeat "-" $n] } elseif {$x+$dx > $y+$dy} { set n [expr {($x+$dx)-($y+$dy)}] incr dy $n append d [string repeat "-" $n] } append c [string range $a $x0+1 $x] set x0 $x append d [string range $b $y0+1 $y] set y0 $y
} append c [string range $a $x0+1 end] append d [string range $b $y0+1 end] set al [string length $a] set bl [string length $b] if {$al+$y0 < $bl+$x0} {
append c [string repeat "-" [expr {$bl+$x0-$y0-$al}]]
} elseif {$bl+$x0 < $al+$y0} {
append d [string repeat "-" [expr {$al+$y0-$x0-$bl}]]
} return $c\n$d
}
puts [levenshtein/align "rosettacode" "raisethysword"]</lang>
- Output:
r-oset-taco-de raisethysword-