Burrows–Wheeler transform
This page uses content from Wikipedia. The original article was at Burrows–Wheeler_transform. The list of authors can be seen in the page history. As with Rosetta Code, the text of Wikipedia is available under the GNU FDL. (See links for details on variance) |
The Burrows–Wheeler transform (BWT, also called block-sorting compression) rearranges a character string into runs of similar characters. This is useful for compression, since it tends to be easy to compress a string that has runs of repeated characters by techniques such as move-to-front transform and run-length encoding. More importantly, the transformation is reversible, without needing to store any additional data. The BWT is thus a "free" method of improving the efficiency of text compression algorithms, costing only some extra computation.
Source: Burrows–Wheeler transform
C
<lang c>#include <string.h>
- include <stdio.h>
- include <stdlib.h>
const char STX = '\002', ETX = '\003';
int compareStrings(const void *a, const void *b) {
char *aa = *(char **)a; char *bb = *(char **)b; return strcmp(aa, bb);
}
int bwt(const char *s, char r[]) {
int i, len = strlen(s) + 2; char *ss, *str; char **table; if (strchr(s, STX) || strchr(s, ETX)) return 1; ss = calloc(len + 1, sizeof(char)); sprintf(ss, "%c%s%c", STX, s, ETX); table = malloc(len * sizeof(const char *)); for (i = 0; i < len; ++i) { str = calloc(len + 1, sizeof(char)); strcpy(str, ss + i); if (i > 0) strncat(str, ss, i); table[i] = str; } qsort(table, len, sizeof(const char *), compareStrings); for(i = 0; i < len; ++i) { r[i] = table[i][len - 1]; free(table[i]); } free(table); free(ss); return 0;
}
void ibwt(const char *r, char s[]) {
int i, j, len = strlen(r); char **table = malloc(len * sizeof(const char *)); for (i = 0; i < len; ++i) table[i] = calloc(len + 1, sizeof(char)); for (i = 0; i < len; ++i) { for (j = 0; j < len; ++j) { memmove(table[j] + 1, table[j], len); table[j][0] = r[j]; } qsort(table, len, sizeof(const char *), compareStrings); } for (i = 0; i < len; ++i) { if (table[i][len - 1] == ETX) { strncpy(s, table[i] + 1, len - 2); break; } } for (i = 0; i < len; ++i) free(table[i]); free(table);
}
void makePrintable(const char *s, char t[]) {
strcpy(t, s); for ( ; *t != '\0'; ++t) { if (*t == STX) *t = '^'; else if (*t == ETX) *t = '|'; }
}
int main() {
int i, res, len; char *tests[6], *t, *r, *s; tests[0] = "banana"; tests[1] = "appellee"; tests[2] = "dogwood"; tests[3] = "TO BE OR NOT TO BE OR WANT TO BE OR NOT?"; tests[4] = "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES", tests[5] = "\002ABC\003"; for (i = 0; i < 6; ++i) { len = strlen(tests[i]); t = calloc(len + 1, sizeof(char)); makePrintable(tests[i], t); printf("%s\n", t); printf(" --> "); r = calloc(len + 3, sizeof(char)); res = bwt(tests[i], r); if (res == 1) { printf("ERROR: String can't contain STX or ETX\n"); } else { makePrintable(r, t); printf("%s\n", t); } s = calloc(len + 1, sizeof(char)); ibwt(r, s); makePrintable(s, t); printf(" --> %s\n\n", t); free(t); free(r); free(s); } return 0;
}</lang>
- Output:
banana --> |annb^aa --> banana appellee --> |e^elplepa --> appellee dogwood --> |do^oodwg --> dogwood TO BE OR NOT TO BE OR WANT TO BE OR NOT? --> |?OOORREEETTRTW BBB ATTT NNOOONOO^ --> TO BE OR NOT TO BE OR WANT TO BE OR NOT? SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES --> |STEXYDST.E.IXXIIXXSSMPPS.B..EE.^.USFXDIIOIIIT --> SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES ^ABC| --> ERROR: String can't contain STX or ETX -->
Go
<lang go>package main
import (
"fmt" "sort" "strings"
)
const stx = "\002" const etx = "\003"
func bwt(s string) (string, error) {
if strings.Index(s, stx) >= 0 || strings.Index(s, etx) >= 0 { return "", fmt.Errorf("String can't contain STX or ETX") } s = stx + s + etx le := len(s) table := make([]string, le) table[0] = s for i := 1; i < le; i++ { table[i] = s[i:] + s[:i] } sort.Strings(table) lastBytes := make([]byte, le) for i := 0; i < le; i++ { lastBytes[i] = table[i][le-1] } return string(lastBytes), nil
}
func ibwt(r string) string {
le := len(r) table := make([]string, le) for range table { for i := 0; i < le; i++ { table[i] = r[i:i+1] + table[i] } sort.Strings(table) } for _, row := range table { if strings.HasSuffix(row, etx) { return row[1 : le-1] } } return ""
}
func makePrintable(s string) string {
// substitute ^ for STX and | for ETX to print results t := strings.Replace(s, stx, "^", 1) return strings.Replace(t, etx, "|", 1)
}
func main() {
tests := []string{ "banana", "appellee", "dogwood", "TO BE OR NOT TO BE OR WANT TO BE OR NOT?", "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES", "\002ABC\003", } for _, test := range tests { fmt.Println(makePrintable(test)) fmt.Print(" --> ") t, err := bwt(test) if err != nil { fmt.Println("ERROR:", err) } else { fmt.Println(makePrintable(t)) } r := ibwt(t) fmt.Println(" -->", r, "\n") }
}</lang>
- Output:
banana --> |annb^aa --> banana appellee --> |e^elplepa --> appellee dogwood --> |do^oodwg --> dogwood TO BE OR NOT TO BE OR WANT TO BE OR NOT? --> |?OOORREEETTRTW BBB ATTT NNOOONOO^ --> TO BE OR NOT TO BE OR WANT TO BE OR NOT? SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES --> |STEXYDST.E.IXXIIXXSSMPPS.B..EE.^.USFXDIIOIIIT --> SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES ^ABC| --> ERROR: String can't contain STX or ETX -->
Kotlin
<lang scala>// Version 1.2.60
const val STX = "\u0002" const val ETX = "\u0003"
fun bwt(s: String): String {
if (s.contains(STX) || s.contains(ETX)) { throw RuntimeException("String can't contain STX or ETX") } val ss = STX + s + ETX val table = Array<String>(ss.length) { ss.substring(it) + ss.substring(0, it) } table.sort() return String(table.map { it[it.lastIndex] }.toCharArray())
}
fun ibwt(r: String): String {
val len = r.length val table = Array<String>(len) { "" } repeat(len) { for (i in 0 until len) { table[i] = r[i].toString() + table[i] } table.sort() } for (row in table) { if (row.endsWith(ETX)) { return row.substring(1, len - 1) } } return ""
}
fun makePrintable(s: String): String {
// substitute ^ for STX and | for ETX to print results return s.replace(STX, "^").replace(ETX, "|")
}
fun main(args: Array<String>) {
val tests = listOf( "banana", "appellee", "dogwood", "TO BE OR NOT TO BE OR WANT TO BE OR NOT?", "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES", "\u0002ABC\u0003" ) for (test in tests) { println(makePrintable(test)) print(" --> ") var t = "" try { t = bwt(test) println(makePrintable(t)) } catch (ex: RuntimeException) { println("ERROR: " + ex.message) } val r = ibwt(t) println(" --> $r\n") }
}</lang>
- Output:
banana --> |annb^aa --> banana appellee --> |e^elplepa --> appellee dogwood --> |do^oodwg --> dogwood TO BE OR NOT TO BE OR WANT TO BE OR NOT? --> |?OOORREEETTRTW BBB ATTT NNOOONOO^ --> TO BE OR NOT TO BE OR WANT TO BE OR NOT? SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES --> |STEXYDST.E.IXXIIXXSSMPPS.B..EE.^.USFXDIIOIIIT --> SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES ^ABC| --> ERROR: String can't contain STX or ETX -->
Perl 6
<lang perl6># STX can be any character that doesn't appear in the text.
- Using a visible character here for ease of viewing.
constant \STX = '👍';
- Burrows-Wheeler transform
sub transform (Str $s is copy) {
note "String can't contain STX character." and exit if $s.contains: STX; $s = STX ~ $s; (^$s.chars).map({ $s.comb.list.rotate: $_ }).sort[*;*-1].join
}
- Burrows-Wheeler inverse transform
sub ɯɹoɟsuɐɹʇ (Str $s) {
my @t = $s.comb.sort; @t = ($s.comb Z~ @t).sort for 1..^$s.chars; @t.first( *.ends-with: STX ).chop
}
- TESTING
for |<BANANA dogwood SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES>,
'TO BE OR NOT TO BE OR WANT TO BE OR NOT?', "Oops{STX}" -> $phrase { say 'Original: ', $phrase; say 'Transformed: ', transform $phrase; say 'Inverse transformed: ', ɯɹoɟsuɐɹʇ transform $phrase; say ;
}</lang>
- Output:
Original: BANANA Transformed: BNN👍AAA Inverse transformed: BANANA Original: dogwood Transformed: 👍ooodwgd Inverse transformed: dogwood Original: SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES Transformed: TEXYDST.E.IXIXIXXSSMPPS.B..E.👍.UESFXDIIOIIITS Inverse transformed: SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES Original: TO BE OR NOT TO BE OR WANT TO BE OR NOT? Transformed: OOORREEETTRTW BBB ATTT NNOOONOO👍 ? Inverse transformed: TO BE OR NOT TO BE OR WANT TO BE OR NOT? Original: Oops👍 String can't contain STX character.
Python
This Python implementation sacrifices speed for simplicity: the program is short, but takes more than the linear time that would be desired in a practical implementation.
Using the STX/ETX control codes to mark the start and end of the text, and using s[i:] + s[:i] to construct the ith rotation of s, the forward transform takes the last character of each of the sorted rows. The inverse transform repeatedly inserts r as the left column of the table and sorts the table. After the whole table is built, it returns the row that ends with ETX, minus the STX and ETX.
<lang Python> def bwt(s):
"""Apply Burrows-Wheeler transform to input string.""" assert "\002" not in s and "\003" not in s, "Input string cannot contain STX and ETX characters" s = "\002" + s + "\003" # Add start and end of text marker table = sorted(s[i:] + s[:i] for i in range(len(s))) # Table of rotations of string last_column = [row[-1:] for row in table] # Last characters of each row return "".join(last_column) # Convert list of characters into string
def ibwt(r):
"""Apply inverse Burrows-Wheeler transform.""" table = [""] * len(r) # Make empty table for i in range(len(r)): table = sorted(r[i] + table[i] for i in range(len(r))) # Add a column of r s = [row for row in table if row.endswith("\003")][0] # Find the correct row (ending in ETX) return s.rstrip("\003").strip("\002") # Get rid of start and end markers
</lang>
Sidef
<lang ruby>class BurrowsWheelerTransform (String L = "\002") {
method encode(String s) { assert(!s.contains(L), "String cannot contain `#{L.dump}`") s = (L + s) s.len.of{|i| s.substr(i) + s.substr(0, i) }.sort.map{.last}.join }
method decode(String s) { var t = s.len.of("") var c = s.chars { t = (c »+« t).sort } * s.len t.first { .begins_with(L) }.substr(L.len) }
}
var tests = [
"banana", "appellee", "dogwood", "TOBEORNOTTOBEORTOBEORNOT" "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",
]
var bwt = BurrowsWheelerTransform(L: '$')
tests.each { |str|
var enc = bwt.encode(str) var dec = bwt.decode(enc) say "BWT(#{dec.dump}) = #{enc.dump}" assert_eq(str, dec)
}</lang>
- Output:
BWT("banana") = "annb$aa" BWT("appellee") = "e$elplepa" BWT("dogwood") = "do$oodwg" BWT("TOBEORNOTTOBEORTOBEORNOT") = "TOOOBBBRRTTTEEENNOOOOR$TO" BWT("SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES") = "STEXYDST.E.IXXIIXXSSMPPS.B..EE.$.USFXDIIOIIIT"
zkl
<lang zkl>class BurrowsWheelerTransform{
fcn init(chr="$"){ var special=chr; } fcn encode(str){ _assert_(not str.holds(special), "String cannot contain char \"%s\"".fmt(special) ); str=str.append(special); str.len().pump(List().merge,'wrap(n){ String(str[n,*],str[0,n]) }) .pump(String,T("get",-1)); // last char of each "permutation" } fcn decode(str){ table:=List.createLong(str.len(),""); // ("",""..), mutable do(str.len()){
foreach n in (str.len()){ table[n]=str[n] + table[n] } table.sort();
} // --> ("$dogwood","d$dogwoo","dogwood$",...) table.filter1("%s*".fmt(special).glob)[1,*]; // str[0]==$, often first element }
}</lang> <lang zkl>BWT:=BurrowsWheelerTransform(); //BWT.encode("$"); // --> assert(bbb.zkl:25): String cannot contain char "$"
tests:=T(
"banana", "appellee", "dogwood", "TO BE OR NOT TO BE OR WANT TO BE OR NOT?", "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",);
foreach test in (tests){
enc:=BWT.encode(test); println("%s\n -->%s\n -->%s".fmt(test,enc,BWT.decode(enc)));
}</lang>
- Output:
banana -->annb$aa -->banana appellee -->e$elplepa -->appellee dogwood -->do$oodwg -->dogwood TO BE OR NOT TO BE OR WANT TO BE OR NOT? -->OOORREEETTR?TW BBB ATTT NNOOONOO$ -->TO BE OR NOT TO BE OR WANT TO BE OR NOT? SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES -->STEXYDST.E.IXXIIXXSSMPPS.B..EE.$.USFXDIIOIIIT -->SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES