Burrows–Wheeler transform

From Rosetta Code
Burrows–Wheeler transform is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.
This page uses content from Wikipedia. The original article was at Burrows–Wheeler_transform. The list of authors can be seen in the page history. As with Rosetta Code, the text of Wikipedia is available under the GNU FDL. (See links for details on variance)


The Burrows–Wheeler transform (BWT, also called block-sorting compression) rearranges a character string into runs of similar characters.

This is useful for compression, since it tends to be easy to compress a string that has runs of repeated characters by techniques such as move-to-front transform and run-length encoding.

More importantly, the transformation is reversible, without needing to store any additional data.

The BWT is thus a "free" method of improving the efficiency of text compression algorithms, costing only some extra computation.


Source: Burrows–Wheeler transform

C[edit]

Translation of: Python
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
 
const char STX = '\002', ETX = '\003';
 
int compareStrings(const void *a, const void *b) {
char *aa = *(char **)a;
char *bb = *(char **)b;
return strcmp(aa, bb);
}
 
int bwt(const char *s, char r[]) {
int i, len = strlen(s) + 2;
char *ss, *str;
char **table;
if (strchr(s, STX) || strchr(s, ETX)) return 1;
ss = calloc(len + 1, sizeof(char));
sprintf(ss, "%c%s%c", STX, s, ETX);
table = malloc(len * sizeof(const char *));
for (i = 0; i < len; ++i) {
str = calloc(len + 1, sizeof(char));
strcpy(str, ss + i);
if (i > 0) strncat(str, ss, i);
table[i] = str;
}
qsort(table, len, sizeof(const char *), compareStrings);
for(i = 0; i < len; ++i) {
r[i] = table[i][len - 1];
free(table[i]);
}
free(table);
free(ss);
return 0;
}
 
void ibwt(const char *r, char s[]) {
int i, j, len = strlen(r);
char **table = malloc(len * sizeof(const char *));
for (i = 0; i < len; ++i) table[i] = calloc(len + 1, sizeof(char));
for (i = 0; i < len; ++i) {
for (j = 0; j < len; ++j) {
memmove(table[j] + 1, table[j], len);
table[j][0] = r[j];
}
qsort(table, len, sizeof(const char *), compareStrings);
}
for (i = 0; i < len; ++i) {
if (table[i][len - 1] == ETX) {
strncpy(s, table[i] + 1, len - 2);
break;
}
}
for (i = 0; i < len; ++i) free(table[i]);
free(table);
}
 
void makePrintable(const char *s, char t[]) {
strcpy(t, s);
for ( ; *t != '\0'; ++t) {
if (*t == STX) *t = '^';
else if (*t == ETX) *t = '|';
}
}
 
int main() {
int i, res, len;
char *tests[6], *t, *r, *s;
tests[0] = "banana";
tests[1] = "appellee";
tests[2] = "dogwood";
tests[3] = "TO BE OR NOT TO BE OR WANT TO BE OR NOT?";
tests[4] = "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",
tests[5] = "\002ABC\003";
for (i = 0; i < 6; ++i) {
len = strlen(tests[i]);
t = calloc(len + 1, sizeof(char));
makePrintable(tests[i], t);
printf("%s\n", t);
printf(" --> ");
r = calloc(len + 3, sizeof(char));
res = bwt(tests[i], r);
if (res == 1) {
printf("ERROR: String can't contain STX or ETX\n");
}
else {
makePrintable(r, t);
printf("%s\n", t);
}
s = calloc(len + 1, sizeof(char));
ibwt(r, s);
makePrintable(s, t);
printf(" --> %s\n\n", t);
free(t);
free(r);
free(s);
}
return 0;
}
Output:
banana
 --> |annb^aa
 --> banana

appellee
 --> |e^elplepa
 --> appellee

dogwood
 --> |do^oodwg
 --> dogwood

TO BE OR NOT TO BE OR WANT TO BE OR NOT?
 --> |?OOORREEETTRTW   BBB  ATTT   NNOOONOO^   
 --> TO BE OR NOT TO BE OR WANT TO BE OR NOT?

SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES
 --> |STEXYDST.E.IXXIIXXSSMPPS.B..EE.^.USFXDIIOIIIT
 --> SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES

^ABC|
 --> ERROR: String can't contain STX or ETX
 --> 

Go[edit]

Translation of: Python
package main
 
import (
"fmt"
"sort"
"strings"
)
 
const stx = "\002"
const etx = "\003"
 
func bwt(s string) (string, error) {
if strings.Index(s, stx) >= 0 || strings.Index(s, etx) >= 0 {
return "", fmt.Errorf("String can't contain STX or ETX")
}
s = stx + s + etx
le := len(s)
table := make([]string, le)
table[0] = s
for i := 1; i < le; i++ {
table[i] = s[i:] + s[:i]
}
sort.Strings(table)
lastBytes := make([]byte, le)
for i := 0; i < le; i++ {
lastBytes[i] = table[i][le-1]
}
return string(lastBytes), nil
}
 
func ibwt(r string) string {
le := len(r)
table := make([]string, le)
for range table {
for i := 0; i < le; i++ {
table[i] = r[i:i+1] + table[i]
}
sort.Strings(table)
}
for _, row := range table {
if strings.HasSuffix(row, etx) {
return row[1 : le-1]
}
}
return ""
}
 
func makePrintable(s string) string {
// substitute ^ for STX and | for ETX to print results
t := strings.Replace(s, stx, "^", 1)
return strings.Replace(t, etx, "|", 1)
}
 
func main() {
tests := []string{
"banana",
"appellee",
"dogwood",
"TO BE OR NOT TO BE OR WANT TO BE OR NOT?",
"SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",
"\002ABC\003",
}
for _, test := range tests {
fmt.Println(makePrintable(test))
fmt.Print(" --> ")
t, err := bwt(test)
if err != nil {
fmt.Println("ERROR:", err)
} else {
fmt.Println(makePrintable(t))
}
r := ibwt(t)
fmt.Println(" -->", r, "\n")
}
}
Output:
banana
 --> |annb^aa
 --> banana 

appellee
 --> |e^elplepa
 --> appellee 

dogwood
 --> |do^oodwg
 --> dogwood 

TO BE OR NOT TO BE OR WANT TO BE OR NOT?
 --> |?OOORREEETTRTW   BBB  ATTT   NNOOONOO^   
 --> TO BE OR NOT TO BE OR WANT TO BE OR NOT? 

SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES
 --> |STEXYDST.E.IXXIIXXSSMPPS.B..EE.^.USFXDIIOIIIT
 --> SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES 

^ABC|
 --> ERROR: String can't contain STX or ETX
 -->  

Kotlin[edit]

Translation of: Python
// Version 1.2.60
 
const val STX = "\u0002"
const val ETX = "\u0003"
 
fun bwt(s: String): String {
if (s.contains(STX) || s.contains(ETX)) {
throw RuntimeException("String can't contain STX or ETX")
}
val ss = STX + s + ETX
val table = Array<String>(ss.length) { ss.substring(it) + ss.substring(0, it) }
table.sort()
return String(table.map { it[it.lastIndex] }.toCharArray())
}
 
fun ibwt(r: String): String {
val len = r.length
val table = Array<String>(len) { "" }
repeat(len) {
for (i in 0 until len) {
table[i] = r[i].toString() + table[i]
}
table.sort()
}
for (row in table) {
if (row.endsWith(ETX)) {
return row.substring(1, len - 1)
}
}
return ""
}
 
fun makePrintable(s: String): String {
// substitute ^ for STX and | for ETX to print results
return s.replace(STX, "^").replace(ETX, "|")
}
 
fun main(args: Array<String>) {
val tests = listOf(
"banana",
"appellee",
"dogwood",
"TO BE OR NOT TO BE OR WANT TO BE OR NOT?",
"SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",
"\u0002ABC\u0003"
)
for (test in tests) {
println(makePrintable(test))
print(" --> ")
var t = ""
try {
t = bwt(test)
println(makePrintable(t))
}
catch (ex: RuntimeException) {
println("ERROR: " + ex.message)
}
val r = ibwt(t)
println(" --> $r\n")
}
}
Output:
banana
 --> |annb^aa
 --> banana

appellee
 --> |e^elplepa
 --> appellee

dogwood
 --> |do^oodwg
 --> dogwood

TO BE OR NOT TO BE OR WANT TO BE OR NOT?
 --> |?OOORREEETTRTW   BBB  ATTT   NNOOONOO^   
 --> TO BE OR NOT TO BE OR WANT TO BE OR NOT?

SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES
 --> |STEXYDST.E.IXXIIXXSSMPPS.B..EE.^.USFXDIIOIIIT
 --> SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES

^ABC|
 --> ERROR: String can't contain STX or ETX
 --> 

Perl 6[edit]

Works with: Rakudo version 2018.06
# STX can be any character that doesn't appear in the text.
# Using a visible character here for ease of viewing.
 
constant \STX = '👍';
 
# Burrows-Wheeler transform
sub transform (Str $s is copy) {
note "String can't contain STX character." and exit if $s.contains: STX;
$s = STX ~ $s;
(^$s.chars).map({ $s.comb.list.rotate: $_ }).sort[*;*-1].join
}
 
# Burrows-Wheeler inverse transform
sub ɯɹoɟsuɐɹʇ (Str $s) {
my @t = $s.comb.sort;
@t = ($s.comb Z~ @t).sort for 1..^$s.chars;
@t.first( *.ends-with: STX ).chop
}
 
# TESTING
for |<BANANA dogwood SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES>,
'TO BE OR NOT TO BE OR WANT TO BE OR NOT?', "Oops{STX}"
-> $phrase {
say 'Original: ', $phrase;
say 'Transformed: ', transform $phrase;
say 'Inverse transformed: ', ɯɹoɟsuɐɹʇ transform $phrase;
say '';
}
Output:
Original:            BANANA
Transformed:         BNN👍AAA
Inverse transformed: BANANA

Original:            dogwood
Transformed:         👍ooodwgd
Inverse transformed: dogwood

Original:            SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES
Transformed:         TEXYDST.E.IXIXIXXSSMPPS.B..E.👍.UESFXDIIOIIITS
Inverse transformed: SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES

Original:            TO BE OR NOT TO BE OR WANT TO BE OR NOT?
Transformed:         OOORREEETTRTW   BBB  ATTT   NNOOONOO👍   ?
Inverse transformed: TO BE OR NOT TO BE OR WANT TO BE OR NOT?

Original:            Oops👍
String can't contain STX character.

Python[edit]

This Python implementation sacrifices speed for simplicity: the program is short, but takes more than the linear time that would be desired in a practical implementation.

Using the STX/ETX control codes to mark the start and end of the text, and using s[i:] + s[:i] to construct the ith rotation of s, the forward transform takes the last character of each of the sorted rows. The inverse transform repeatedly inserts r as the left column of the table and sorts the table. After the whole table is built, it returns the row that ends with ETX, minus the STX and ETX.

 
def bwt(s):
"""Apply Burrows-Wheeler transform to input string."""
assert "\002" not in s and "\003" not in s, "Input string cannot contain STX and ETX characters"
s = "\002" + s + "\003" # Add start and end of text marker
table = sorted(s[i:] + s[:i] for i in range(len(s))) # Table of rotations of string
last_column = [row[-1:] for row in table] # Last characters of each row
return "".join(last_column) # Convert list of characters into string
 
 
def ibwt(r):
"""Apply inverse Burrows-Wheeler transform."""
table = [""] * len(r) # Make empty table
for i in range(len(r)):
table = sorted(r[i] + table[i] for i in range(len(r))) # Add a column of r
s = [row for row in table if row.endswith("\003")][0] # Find the correct row (ending in ETX)
return s.rstrip("\003").strip("\002") # Get rid of start and end markers
 

REXX[edit]

Programming note:   a little bit of code was added to support more (legal) characters in the input string for the BWT
function.   The error recovery and error messages are rudimentary when an illegal character in the input is detected.

/*REXX program performs a  Burrows─Wheeler transform  (BWT)  on a character string(s).  */
$.= /*the default text for (all) the inputs*/
parse arg $.1 /*obtain optional arguments from the CL*/
if $.1='' then do; $.1= "banana" /*Not specified? Then use the defaults*/
$.2= "BANANA"
$.3= "appellee"
$.4= "dogwood"
$.5= "TO BE OR NOT TO BE OR WANT TO BE OR NOT?"
$.6= "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES"
$.7= "^ABC|"
$.7= "bad─bad thingy"'fd'x /* ◄─── this string can't be processed.*/
end
 
do t=1 while $.t\='' /*process each of the inputs (or input)*/
if t\==1 then say /*insert a blank line between outputs. */
out= BWT($.t) /*invoke the BWT function, get result*/
ori= iBWT(out) /* " " iBWT " " " */
say ' input ───► ' $.t /*display input string to term.*/
say ' output ───► ' out /* " output " " " */
say 'original ───► ' ori /* " reconstituted " " " */
end /*t*/
exit /*stick a fork in it, we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
BWT: procedure expose ?.; parse arg y,,$ /*obtain the input; nullify $ string. */
 ?.1= 'fd'x;  ?.2="fc"x /*assign the STX and ETX strings. */
do i=1 for 2 /* [↓] check for invalid input string.*/
loc= verify(y, ?.i, 'M') /*look for invalid character in input. */
if loc\==0 then call BWTer /*there an " " " "  ? */
end /*i*/ /* [↑] if error, perform a hard exit.*/
y= ?.1 || y || ?.2 /*obtain the input; add a fence to it.*/
L= length(y); m= L-1 /*get the length of new text; get L-1.*/
@.1= y /*define the first element of the table*/
do j=2 for m; _= j-1 /*now, define the rest of the elements.*/
@.j= right(@._,1)left(@._,m) /*construct a table from the Y input.*/
end /*j*/ /* [↑] each element: left & right part*/
call cSort L /*invoke lexicographical sort for array*/
do k=1 for L /* [↓] construct the answer from array*/
$=$ || right(@.k, 1) /*build the answer from each of ··· */
end /*k*/ /* ··· the array's right─most character*/
return $ /*return the constructed answer. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
BWTer: er= '***error*** BWT: '; say er "invalid input: " y
say er 'The input string contains an invalid character at position' loc"."; exit 13
/*──────────────────────────────────────────────────────────────────────────────────────*/
iBWT: procedure expose ?.; parse arg y,,@. /*obtain the input; nullify @. string.*/
L= length(y) /*compute the length of the input str. */
do j=1 for L /* [↓] step through each input letters*/
do k=1 for L /* [↓] step through each row of table.*/
@.k= substr(y, k, 1) || @.k /*construct a row of the table of chars*/
end /*k*/ /* [↑] order of table row is inverted.*/
call cSort L /*invoke lexicographical sort for array*/
end /*j*/ /* [↑] answer is the penultimate entry*/
do #=1
if right(@.#, 1)==?.2 then return substr(@.#, 2, L-2) /*return correct result*/
end /*#*/
/*──────────────────────────────────────────────────────────────────────────────────────*/
cSort: procedure expose @.; parse arg n; m=n-1 /*N: is the number of @ array elements.*/
do m=m for m by -1 until ok; ok=1 /*keep sorting the @ array until done.*/
do j=1 for m; k=j+1; if @.j<<[email protected].k then iterate /*elements in order? */
[email protected].j; @.[email protected].k; @.k=_; ok=0 /*swap two elements; flag as not done.*/
end /*j*/
end /*m*/; return
output   when using the default inputs:
   input ───►  banana
  output ───►  bnn²aaaⁿ
original ───►  banana

   input ───►  BANANA
  output ───►  BNN²AAAⁿ
original ───►  BANANA

   input ───►  appellee
  output ───►  ²lpelepaeⁿ
original ───►  appellee

   input ───►  dogwood
  output ───►  ²ooodwgdⁿ
original ───►  dogwood

   input ───►  TO BE OR NOT TO BE OR WANT TO BE OR NOT?
  output ───►  OOORREEETTRTW   BBB  ATTT   NNOOONOO²   ?ⁿ
original ───►  TO BE OR NOT TO BE OR WANT TO BE OR NOT?

   input ───►  SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES
  output ───►  TEXYDST.E.IXIXIXXSSMPPS.B..E.².UESFXDIIOIIITSⁿ
original ───►  SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES

***error***  BWT:  invalid input:  bad─bad thingy²
***error***  BWT:  The input string contains an invalid character at position 15.

Sidef[edit]

Translation of: Python
class BurrowsWheelerTransform (String L = "\002") {
 
method encode(String s) {
assert(!s.contains(L), "String cannot contain `#{L.dump}`")
s = (L + s)
s.len.of{|i| s.substr(i) + s.substr(0, i) }.sort.map{.last}.join
}
 
method decode(String s) {
var t = s.len.of("")
var c = s.chars
{ t = (c »+« t).sort } * s.len
t.first { .begins_with(L) }.substr(L.len)
}
}
 
var tests = [
"banana", "appellee", "dogwood", "TOBEORNOTTOBEORTOBEORNOT"
"SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",
]
 
var bwt = BurrowsWheelerTransform(L: '$')
 
tests.each { |str|
var enc = bwt.encode(str)
var dec = bwt.decode(enc)
say "BWT(#{dec.dump}) = #{enc.dump}"
assert_eq(str, dec)
}
Output:
BWT("banana") = "annb$aa"
BWT("appellee") = "e$elplepa"
BWT("dogwood") = "do$oodwg"
BWT("TOBEORNOTTOBEORTOBEORNOT") = "TOOOBBBRRTTTEEENNOOOOR$TO"
BWT("SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES") = "STEXYDST.E.IXXIIXXSSMPPS.B..EE.$.USFXDIIOIIIT"

zkl[edit]

class BurrowsWheelerTransform{
fcn init(chr="$"){ var special=chr; }
fcn encode(str){
_assert_(not str.holds(special), "String cannot contain char \"%s\"".fmt(special) );
str=str.append(special);
str.len().pump(List().merge,'wrap(n){ String(str[n,*],str[0,n]) })
.pump(String,T("get",-1)); // last char of each "permutation"
}
fcn decode(str){
table:=List.createLong(str.len(),""); // ("",""..), mutable
do(str.len()){
foreach n in (str.len()){ table[n]=str[n] + table[n] }
table.sort();
} // --> ("$dogwood","d$dogwoo","dogwood$",...)
table.filter1("%s*".fmt(special).glob)[1,*]; // str[0]==$, often first element
}
}
BWT:=BurrowsWheelerTransform();
//BWT.encode("$"); // --> assert(bbb.zkl:25): String cannot contain char "$"
 
tests:=T(
"banana", "appellee", "dogwood", "TO BE OR NOT TO BE OR WANT TO BE OR NOT?",
"SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",);
foreach test in (tests){
enc:=BWT.encode(test);
println("%s\n -->%s\n -->%s".fmt(test,enc,BWT.decode(enc)));
}
Output:
banana
  -->annb$aa
  -->banana
appellee
  -->e$elplepa
  -->appellee
dogwood
  -->do$oodwg
  -->dogwood
TO BE OR NOT TO BE OR WANT TO BE OR NOT?
  -->OOORREEETTR?TW   BBB  ATTT   NNOOONOO$   
  -->TO BE OR NOT TO BE OR WANT TO BE OR NOT?
SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES
  -->STEXYDST.E.IXXIIXXSSMPPS.B..EE.$.USFXDIIOIIIT
  -->SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES