Burrows–Wheeler transform: Difference between revisions
Content added Content deleted
m (→{{header|REXX}}: changed some comments and whitespace.) |
(Added solution for Pascal (Lazarus)) |
||
Line 1,119: | Line 1,119: | ||
--> ERROR: String can't contain STX or ETX |
--> ERROR: String can't contain STX or ETX |
||
--> |
--> |
||
</pre> |
|||
=={{header|Pascal}}== |
|||
A console program in Free Pascal, created with the Lazarus IDE. It doesn't use special characters, but records the encoded string along with an index to enable decoding (as in the original paper by Burrows and Wheeler). |
|||
The first character in a Pascal string has index 1, but it's more convenient to describe the algorithm in terms of zero-based indices. The constant STR_BASE = 1 is used to make it clear where Pascal usage has been taken into account. |
|||
<lang pascal> |
|||
program BurrowsWheeler; |
|||
{$mode objfpc}{$H+} // Lazarus default mode; long strings |
|||
uses SysUtils; // only for console output |
|||
const STR_BASE = 1; // first character in a Pascal string has index [1]. |
|||
type TComparison = -1..1; |
|||
procedure Encode( const input : string; |
|||
out encoded : string; |
|||
out index : integer); |
|||
var |
|||
n : integer; |
|||
perm : array of integer; |
|||
i, j, k : integer; |
|||
incr, v : integer; |
|||
// Subroutine to compare rotations whose *last* letters have zero-based |
|||
// indices a, b. Returns 1, 0, -1 according as the rotation ending at a |
|||
// is >, =, < the rotation ending at b. |
|||
function CompareRotations( a, b : integer) : TComparison; |
|||
var |
|||
p, q, nrNotTested : integer; |
|||
begin |
|||
result := 0; |
|||
p := a; |
|||
q := b; |
|||
nrNotTested := n; |
|||
repeat |
|||
inc(p); if (p = n) then p := 0; |
|||
inc(q); if (q = n) then q := 0; |
|||
if (input[p + STR_BASE] = input[q + STR_BASE]) then dec( nrNotTested) |
|||
else if (input[p + STR_BASE] > input[q + STR_BASE]) then result := 1 |
|||
else result := -1 |
|||
until (result <> 0) or (nrNotTested = 0); |
|||
end; |
|||
begin |
|||
n := Length( input); |
|||
SetLength( perm, n); |
|||
for j := 0 to n - 1 do perm[j] := j; |
|||
// Sort string indices by comparing the associated rotations, as above. |
|||
// This is a Shell sort from Press et al., Numerical Recipes, 3rd edn, pp 422-3. |
|||
// Other sorting algorithms might be used. |
|||
incr := 1; |
|||
repeat |
|||
incr := 3*incr + 1 |
|||
until (incr >= n); |
|||
repeat |
|||
incr := incr div 3; |
|||
for i := incr to n - 1 do begin |
|||
v := perm[i]; |
|||
j := i; |
|||
while (j >= incr) and (CompareRotations( perm[j - incr], v) = 1) do begin |
|||
perm[j] := perm[j - incr]; |
|||
dec( j, incr); |
|||
end; |
|||
perm[j] := v; |
|||
end; // for |
|||
until (incr = 1); |
|||
// Apply the sorted array to create the output. |
|||
SetLength( encoded, n); |
|||
for j := 0 to n - 1 do begin |
|||
k := perm[j]; |
|||
encoded[j + STR_BASE] := input[k + STR_BASE]; |
|||
if (k = n - 1) then index := j; |
|||
end; |
|||
end; |
|||
{------------------------------------------------------------------------------ |
|||
Given an encoded string and the associated index, one way to rebuild |
|||
the original string is to do the following, or its equivalent: |
|||
Given Make an array Sort the array Rebuild the original string |
|||
'NNBAAA' [0] = ('N', 0) [0] = ('A', 3) Start with given index 3 |
|||
index = 3 [1] = ('N', 1) [1] = ('A', 4) [3] gives 'B', next index = 2 |
|||
[2] = ('B', 2) [2] = ('A', 5) [2] gives 'A', next index = 5 |
|||
[3] = ('A', 3) [3] = ('B', 2) [5] gives 'N', next index = 1 |
|||
[4] = ('A', 4) [4] = ('N', 0) [1] gives 'A', next index = 4 |
|||
[5] = ('A', 5) [5] = ('N', 1) [4] gives 'N', next index = 0 |
|||
[0] gives 'A', next index = 3 |
|||
3 = start index, so stop |
|||
Result = 'BANANA' |
|||
If the original string consists of two or more repetitions of a substring, |
|||
the above method will stop when that substring has been built, e.g. |
|||
'CANCAN' will stop at 'CAN'. |
|||
We therefore need to test for the rebuilt string being too short, and if so |
|||
make enough copies of the decoded part to fill the required length. |
|||
It's possible to take the above description literally, and write a decoding |
|||
routine that uses a record type consisting of a character and an integer. |
|||
A more efficient way is to create an integer array containing only the indices, |
|||
in the above example (3, 4, 5, 2, 0, 1). A first pass counts the occurrences |
|||
of each character in the encoded string. If the character set is ['A'..'Z'] |
|||
then the indices associated with 'A' are stored from [0]. If 'A' occurs a times, |
|||
the indices associated with 'B' are stored from [a]; if 'B' occurs b times, |
|||
the indices associated with 'C' are stored from [a + b]; and so on. |
|||
} |
|||
function Decode( encoded : string; |
|||
index : integer) : string; |
|||
var |
|||
charInfo : array [char] of integer; |
|||
perm : array of integer; |
|||
n, j, k : integer; |
|||
c : char; |
|||
total, prev : integer; |
|||
begin |
|||
n := Length( encoded); |
|||
// An empty encoded string will crash the code below, so trap it here. |
|||
if (n = 0) then begin |
|||
result := ''; |
|||
exit; |
|||
end; |
|||
// Count the occurrences of each possible character. |
|||
for c := Low(char) to High(char) do charInfo[c] := 0; |
|||
for j := 0 to n - 1 do begin |
|||
c := encoded[j + STR_BASE]; |
|||
inc( charInfo[c]); |
|||
end; |
|||
// Cumulate, i.e. charInfo[k] := sum of old charInfo from 0 to k - 1 |
|||
total := 0; |
|||
prev := 0; |
|||
for c := Low(char) to High(char) do begin |
|||
inc( total, prev); |
|||
prev := CharInfo[c]; |
|||
charInfo[c] := total; |
|||
end; |
|||
// Make the array "perm" |
|||
SetLength( perm, n); |
|||
for j := 0 to n - 1 do begin |
|||
c := encoded[j + STR_BASE]; |
|||
k := charInfo[c]; |
|||
perm[k] := j; |
|||
inc( charInfo[c]); |
|||
end; |
|||
// Apply the array "perm" to re-create the original string. |
|||
SetLength( result, n); |
|||
k := 0; // index into result |
|||
j := index; |
|||
repeat |
|||
j := perm[j]; |
|||
result[k + STR_BASE] := encoded[j + STR_BASE]; |
|||
inc(k); |
|||
until (j = index); |
|||
// If the original consisted of M repetitions of the same string, then |
|||
// at this point exactly 1/M of the result has been filled in. |
|||
// For M > 1 (shown by k < n), complete the result by copying the first part. |
|||
if (k < n) then begin |
|||
Assert( n mod k = 0); // we should have n = M*k |
|||
for j := k to n - 1 do result[j + STR_BASE] := result[j - k + STR_BASE]; |
|||
end; |
|||
end; |
|||
procedure Test( const s : string); |
|||
var |
|||
encoded, decoded : string; |
|||
index : integer; |
|||
begin |
|||
WriteLn( ''); |
|||
WriteLn( ' ' + s); |
|||
Encode( s, {out} encoded, index); |
|||
WriteLn( '---> ' + encoded); |
|||
WriteLn( ' index = ' + SysUtils.IntToStr( index)); |
|||
decoded := Decode( encoded, index); |
|||
WriteLn( '---> ' + decoded); |
|||
end; |
|||
begin |
|||
Test( 'BANANA'); |
|||
Test( 'CANAAN'); |
|||
Test( 'CANCAN'); |
|||
Test( 'appellee'); |
|||
Test( 'dogwood'); |
|||
Test( 'TO BE OR NOT TO BE OR WANT TO BE OR NOT?'); |
|||
Test( 'SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES'); |
|||
end. |
|||
</lang> |
|||
{{out}} |
|||
<pre> |
|||
BANANA |
|||
---> NNBAAA |
|||
index = 3 |
|||
---> BANANA |
|||
CANAAN |
|||
---> NCANAA |
|||
index = 3 |
|||
---> CANAAN |
|||
CANCAN |
|||
---> CCNNAA |
|||
index = 2 |
|||
---> CANCAN |
|||
appellee |
|||
---> eelplepa |
|||
index = 0 |
|||
---> appellee |
|||
dogwood |
|||
---> odoodwg |
|||
index = 1 |
|||
---> dogwood |
|||
TO BE OR NOT TO BE OR WANT TO BE OR NOT? |
|||
---> OOORREEETTRTW BBB ATTT NNOOONOO? |
|||
index = 36 |
|||
---> TO BE OR NOT TO BE OR WANT TO BE OR NOT? |
|||
SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES |
|||
---> TEXYDST.E.IXIXIXXSSMPPS.B..E.S.EUSFXDIIOIIIT |
|||
index = 29 |
|||
---> SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES |
|||
</pre> |
</pre> |
||