Burrows–Wheeler transform: Difference between revisions

Added solution for Pascal (Lazarus)
m (→‎{{header|REXX}}: changed some comments and whitespace.)
(Added solution for Pascal (Lazarus))
Line 1,119:
--> ERROR: String can't contain STX or ETX
-->
</pre>
 
=={{header|Pascal}}==
A console program in Free Pascal, created with the Lazarus IDE. It doesn't use special characters, but records the encoded string along with an index to enable decoding (as in the original paper by Burrows and Wheeler).
 
The first character in a Pascal string has index 1, but it's more convenient to describe the algorithm in terms of zero-based indices. The constant STR_BASE = 1 is used to make it clear where Pascal usage has been taken into account.
<lang pascal>
program BurrowsWheeler;
 
{$mode objfpc}{$H+} // Lazarus default mode; long strings
uses SysUtils; // only for console output
const STR_BASE = 1; // first character in a Pascal string has index [1].
type TComparison = -1..1;
 
procedure Encode( const input : string;
out encoded : string;
out index : integer);
var
n : integer;
perm : array of integer;
i, j, k : integer;
incr, v : integer;
 
// Subroutine to compare rotations whose *last* letters have zero-based
// indices a, b. Returns 1, 0, -1 according as the rotation ending at a
// is >, =, < the rotation ending at b.
function CompareRotations( a, b : integer) : TComparison;
var
p, q, nrNotTested : integer;
begin
result := 0;
p := a;
q := b;
nrNotTested := n;
repeat
inc(p); if (p = n) then p := 0;
inc(q); if (q = n) then q := 0;
if (input[p + STR_BASE] = input[q + STR_BASE]) then dec( nrNotTested)
else if (input[p + STR_BASE] > input[q + STR_BASE]) then result := 1
else result := -1
until (result <> 0) or (nrNotTested = 0);
end;
begin
n := Length( input);
SetLength( perm, n);
for j := 0 to n - 1 do perm[j] := j;
 
// Sort string indices by comparing the associated rotations, as above.
// This is a Shell sort from Press et al., Numerical Recipes, 3rd edn, pp 422-3.
// Other sorting algorithms might be used.
incr := 1;
repeat
incr := 3*incr + 1
until (incr >= n);
repeat
incr := incr div 3;
for i := incr to n - 1 do begin
v := perm[i];
j := i;
while (j >= incr) and (CompareRotations( perm[j - incr], v) = 1) do begin
perm[j] := perm[j - incr];
dec( j, incr);
end;
perm[j] := v;
end; // for
until (incr = 1);
 
// Apply the sorted array to create the output.
SetLength( encoded, n);
for j := 0 to n - 1 do begin
k := perm[j];
encoded[j + STR_BASE] := input[k + STR_BASE];
if (k = n - 1) then index := j;
end;
end;
 
{------------------------------------------------------------------------------
Given an encoded string and the associated index, one way to rebuild
the original string is to do the following, or its equivalent:
 
Given Make an array Sort the array Rebuild the original string
'NNBAAA' [0] = ('N', 0) [0] = ('A', 3) Start with given index 3
index = 3 [1] = ('N', 1) [1] = ('A', 4) [3] gives 'B', next index = 2
[2] = ('B', 2) [2] = ('A', 5) [2] gives 'A', next index = 5
[3] = ('A', 3) [3] = ('B', 2) [5] gives 'N', next index = 1
[4] = ('A', 4) [4] = ('N', 0) [1] gives 'A', next index = 4
[5] = ('A', 5) [5] = ('N', 1) [4] gives 'N', next index = 0
[0] gives 'A', next index = 3
3 = start index, so stop
Result = 'BANANA'
 
If the original string consists of two or more repetitions of a substring,
the above method will stop when that substring has been built, e.g.
'CANCAN' will stop at 'CAN'.
We therefore need to test for the rebuilt string being too short, and if so
make enough copies of the decoded part to fill the required length.
 
It's possible to take the above description literally, and write a decoding
routine that uses a record type consisting of a character and an integer.
A more efficient way is to create an integer array containing only the indices,
in the above example (3, 4, 5, 2, 0, 1). A first pass counts the occurrences
of each character in the encoded string. If the character set is ['A'..'Z']
then the indices associated with 'A' are stored from [0]. If 'A' occurs a times,
the indices associated with 'B' are stored from [a]; if 'B' occurs b times,
the indices associated with 'C' are stored from [a + b]; and so on.
}
function Decode( encoded : string;
index : integer) : string;
var
charInfo : array [char] of integer;
perm : array of integer;
n, j, k : integer;
c : char;
total, prev : integer;
 
begin
n := Length( encoded);
// An empty encoded string will crash the code below, so trap it here.
if (n = 0) then begin
result := '';
exit;
end;
 
// Count the occurrences of each possible character.
for c := Low(char) to High(char) do charInfo[c] := 0;
for j := 0 to n - 1 do begin
c := encoded[j + STR_BASE];
inc( charInfo[c]);
end;
 
// Cumulate, i.e. charInfo[k] := sum of old charInfo from 0 to k - 1
total := 0;
prev := 0;
for c := Low(char) to High(char) do begin
inc( total, prev);
prev := CharInfo[c];
charInfo[c] := total;
end;
 
// Make the array "perm"
SetLength( perm, n);
for j := 0 to n - 1 do begin
c := encoded[j + STR_BASE];
k := charInfo[c];
perm[k] := j;
inc( charInfo[c]);
end;
 
// Apply the array "perm" to re-create the original string.
SetLength( result, n);
k := 0; // index into result
j := index;
repeat
j := perm[j];
result[k + STR_BASE] := encoded[j + STR_BASE];
inc(k);
until (j = index);
 
// If the original consisted of M repetitions of the same string, then
// at this point exactly 1/M of the result has been filled in.
// For M > 1 (shown by k < n), complete the result by copying the first part.
if (k < n) then begin
Assert( n mod k = 0); // we should have n = M*k
for j := k to n - 1 do result[j + STR_BASE] := result[j - k + STR_BASE];
end;
end;
 
procedure Test( const s : string);
var
encoded, decoded : string;
index : integer;
begin
WriteLn( '');
WriteLn( ' ' + s);
Encode( s, {out} encoded, index);
WriteLn( '---> ' + encoded);
WriteLn( ' index = ' + SysUtils.IntToStr( index));
decoded := Decode( encoded, index);
WriteLn( '---> ' + decoded);
end;
 
begin
Test( 'BANANA');
Test( 'CANAAN');
Test( 'CANCAN');
Test( 'appellee');
Test( 'dogwood');
Test( 'TO BE OR NOT TO BE OR WANT TO BE OR NOT?');
Test( 'SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES');
end.
</lang>
{{out}}
<pre>
BANANA
---> NNBAAA
index = 3
---> BANANA
 
CANAAN
---> NCANAA
index = 3
---> CANAAN
 
CANCAN
---> CCNNAA
index = 2
---> CANCAN
 
appellee
---> eelplepa
index = 0
---> appellee
 
dogwood
---> odoodwg
index = 1
---> dogwood
 
TO BE OR NOT TO BE OR WANT TO BE OR NOT?
---> OOORREEETTRTW BBB ATTT NNOOONOO?
index = 36
---> TO BE OR NOT TO BE OR WANT TO BE OR NOT?
 
SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES
---> TEXYDST.E.IXIXIXXSSMPPS.B..E.S.EUSFXDIIOIIIT
index = 29
---> SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES
</pre>
 
113

edits