LZW compression
The Lempel-Ziv-Welch (LZW) algorithm provides lossless data compression. You can read a complete description of it in the Wikipedia article on the subject. It was patented, but it fell in the public domain in 2004.
You are encouraged to solve this task according to the task description, using any language you may know.
D
D 1, with Phobos, from the Python version (the final writefln works only on 7-bit ASCII strings): <d> import std.stdio: writefln;
/// Compress a string to a list of output symbols. int[] compress(string uncompressed) {
// build the dictionary int dict_size = 256; int[string] dictionary; for (int i; i < dict_size; i++) dictionary["" ~ cast(char)i] = i;
string w; int[] result; foreach (c; uncompressed) { string wc = w ~ c; if (wc in dictionary) w = wc; else { result ~= dictionary[w]; // add wc to the dictionary dictionary[wc] = dict_size; dict_size++; w = "" ~ c; } }
// output the code for w if (w.length) result ~= dictionary[w]; return result;
}
/// Decompress a list of output ks to a string.
string decompress(int[] compressed) {
// build the dictionary int dict_size = 256; string[int] dictionary; for (int i; i < dict_size; i++) dictionary[i] = "" ~ cast(char)i;
string w = "" ~ cast(char)compressed[0]; string result = w; foreach (k; compressed[1 .. $]) { string entry; if (k in dictionary) entry = dictionary[k]; else if (k == dict_size) entry = w ~ w[0]; else throw new Exception("Bad compressed k"); result ~= entry;
// add w+entry[0] to the dictionary dictionary[dict_size] = w ~ entry[0]; dict_size++;
w = entry; } return result;
}
void main() {
auto compressed = compress("TOBEORNOTTOBEORTOBEORNOT"); writefln(compressed);
auto decompressed = decompress(compressed); writefln(decompressed);
} </d> The output: <d> [84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263] TOBEORNOTTOBEORTOBEORNOT </d>
Forth
256 value next-symbol \ current string fragment create w 256 allot \ counted string : w=c ( c -- ) w 1+ c! 1 w c! ; : w+c ( c -- ) w count + c! w c@ 1+ w c! ;
\ Compression \ dictionary of strings to symbols 0 value dict : init-dict table to dict 256 to next-symbol dict set-current ; : free-dict forth-wordlist set-current ; : in-dict? ( key len -- ? ) \ can assume len > 1 dict search-wordlist dup if nip then ; : lookup-dict ( key len -- symbol ) dup 1 = if drop c@ exit then dict search-wordlist if >body @ else abort" bad-dict!" then ; : put-dict ( data key len -- ) nextname create , ; \ output buffer of symbols \ in real life, these symbols would be packed into octets variable out-size create out 256 cells allot : output ( symbol -- ) dup out out-size @ cells + ! 1 out-size +! dup 256 < if emit space else . then ; : compress ( addr len -- ) init-dict 0 out-size ! over c@ w=c 1 /string bounds do i c@ w+c w count in-dict? 0= if w count 1- lookup-dict output next-symbol dup w count put-dict 1+ to next-symbol i c@ w=c then loop w count lookup-dict output free-dict ;
\ Decompression \ array of symbols to strings (in real code this would need to be growable) \ next-symbol is reused for the size of this table create symtab 256 cells allot 0 value start : init-symtab 256 to next-symbol here to start ; : free-symtab start here - allot ; : get-symbol ( symbol -- addr len ) dup 256 < if pad c! pad 1 exit then 256 - cells symtab + @ count ; : add-symbol ( addr len -- ) here symtab next-symbol 256 - cells + ! s, next-symbol 1+ to next-symbol ; create entry 256 allot : decompress ( addr len -- ) init-symtab over @ dup emit w=c cells bounds cell+ do i @ next-symbol < if i @ get-symbol entry place else i @ next-symbol = if w 1+ c@ w count + c! w count 1+ entry place else abort" bad symbol!" then then entry count type \ output entry 1+ c@ w+c w count add-symbol entry count w place 1 cells +loop free-symtab ;
\ Testing s" TOBEORNOTTOBEORTOBEORNOT" compress cr \ T O B E O R N O T 256 258 260 265 259 261 263 out out-size @ decompress cr \ TOBEORNOTTOBEORTOBEORNOT
Java
<java>import java.util.*;
public class LZW {
/** Compress a string to a list of output symbols. */ public static List<Integer> compress(String uncompressed) { // Build the dictionary. int dictSize = 256; Map<String,Integer> dictionary = new HashMap<String,Integer>(); for (int i = 0; i < 256; i++) dictionary.put("" + (char)i, i); String w = ""; List<Integer> result = new ArrayList<Integer>(); for (char c : uncompressed.toCharArray()) { String wc = w + c; if (dictionary.containsKey(wc)) w = wc; else { result.add(dictionary.get(w)); // Add wc to the dictionary. dictionary.put(wc, dictSize++); w = "" + c; } } // Output the code for w. if (!w.equals("")) result.add(dictionary.get(w)); return result; } /** Decompress a list of output ks to a string. */ public static String decompress(List<Integer> compressed) { // Build the dictionary. int dictSize = 256; Map<Integer,String> dictionary = new HashMap<Integer,String>(); for (int i = 0; i < 256; i++) dictionary.put(i, "" + (char)i); String w = "" + (char)(int)compressed.remove(0); String result = w; for (int k : compressed) { String entry; if (dictionary.containsKey(k)) entry = dictionary.get(k); else if (k == dictSize) entry = w + w.charAt(0); else throw new IllegalArgumentException("Bad compressed k: " + k); result += entry; // Add w+entry[0] to the dictionary. dictionary.put(dictSize++, w + entry.charAt(0)); w = entry; } return result; }
public static void main(String[] args) { List<Integer> compressed = compress("TOBEORNOTTOBEORTOBEORNOT"); System.out.println(compressed); String decompressed = decompress(compressed); System.out.println(decompressed); }
}</java>
JavaScript
<javascript> //LZW Compression/Decompression for Strings var LZW = {
"compress" : function(uncompressed) { // Build the dictionary. var dictSize = 256; var dictionary = []; for (var i = 0; i < 256; i++) { dictionary[String.fromCharCode(i)] = i; } var w = ""; var result = []; for (var i = 0; i < uncompressed.length; i++) { var c = uncompressed.charAt(i); var wc = w + c; if (dictionary[wc]) w = wc; else { result.push(dictionary[w]); // Add wc to the dictionary. dictionary[wc] = dictSize++; w = "" + c; } } // Output the code for w. if (w != "") result.push(dictionary[w]); return result; },
"decompress" : function(compressed) { // Build the dictionary. var dictSize = 256; var dictionary = []; for (var i = 0; i < 256; i++) { dictionary[i] = String.fromCharCode(i); } var w = String.fromCharCode(compressed[0]); var result = w; for (var i = 1; i < compressed.length; i++) { var entry = ""; var k = compressed[i]; if (dictionary[k]) entry = dictionary[k]; else if (k == dictSize) entry = w + w.charAt(0); else return null; result += entry; // Add w+entry[0] to the dictionary. dictionary[dictSize++] = w + entry.charAt(0); w = entry; } return result; }
}
// For Test Purposes
var comp = LZW.compress("TOBEORNOTTOBEORTOBEORNOT");
var decomp = LZW.decompress(comp);
document.write(comp+'
'+decomp);
</javascript>
Output: <javascript> 84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263 TOBEORNOTTOBEORTOBEORNOT </javascript>
Objective-C
The class for the LZW compression algorithm:
<objc>#import <Foundation/Foundation.h>
- import <stdio.h>
@interface LZWCompressor : NSObject {
@private NSMutableArray *iostream; NSMutableDictionary *dict; NSUInteger codemark;
}
-(LZWCompressor *) init; -(LZWCompressor *) initWithArray: (NSMutableArray *) stream; -(BOOL) compressData: (NSData *) string; -(void) setArray: (NSMutableArray *) stream; -(NSArray *) getArray; @end
@implementation LZWCompressor : NSObject
-(LZWCompressor *) init {
self = [super init]; if ( self ) { iostream = nil; codemark = 256; dict = [NSMutableDictionary dictionaryWithCapacity: 512]; } return self;
}
-(LZWCompressor *) initWithArray: (NSMutableArray *) stream {
self = [self init]; if ( self ) { [self setArray: stream]; } return self;
}
-(void) setArray: (NSMutableArray *) stream {
iostream = stream;
}
-(BOOL) compressData: (NSData *) string; {
NSUInteger i; unsigned char j; // prepare dict for(i=0; i < 256; i++) { j = i; NSData *s = [NSData dataWithBytes: &j length: 1]; [dict setObject: [NSNumber numberWithUnsignedInt: i] forKey: s]; } NSMutableData *w = [NSMutableData data]; NSMutableData *wc = [NSMutableData data]; for(i=0; i < [string length]; i++) { [wc setData: w]; [wc appendData: [string subdataWithRange: NSMakeRange(i, 1)]]; if ( [dict objectForKey: wc] != nil ) { [w setData: wc]; } else { [iostream addObject: [dict objectForKey: w]]; [dict setObject: [NSNumber numberWithUnsignedInt: codemark] forKey: wc]; codemark++; [w setData: [string subdataWithRange: NSMakeRange(i, 1)]]; } } if ( [w length] != 0 ) { [iostream addObject: [dict objectForKey: w]]; } return YES;
}
-(NSArray *) getArray {
return iostream;
}
@end</objc>
Usage example:
<objc>char *text = "TOBEORNOTTOBEORTOBEORNOT";
int main() {
NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init]; LZWCompressor *lzw = [[LZWCompressor alloc] initWithArray: [[NSMutableArray alloc] init] ]; if ( lzw ) { [lzw compressData: [NSData dataWithBytes: text length: strlen(text)]]; NSEnumerator *en = [[lzw getArray] objectEnumerator]; id obj; while( (obj = [en nextObject]) ) { printf("%u\n", [obj unsignedIntValue]); } } [pool release]; return EXIT_SUCCESS;
}</objc>
Output (reformatted by hand):
84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263
OCaml
<ocaml>#directory "+site-lib/extlib/"
- load "extLib.cma"
open ExtString
(** compress a string to a list of output symbols *) let compress ~uncompressed =
(* build the dictionary *) let dict_size = 256 in let dictionary = Hashtbl.create 397 in for i=0 to 255 do let str = String.make 1 (char_of_int i) in Hashtbl.add dictionary str i done; let f = (fun (w, dict_size, result) c -> let c = String.make 1 c in let wc = w ^ c in if Hashtbl.mem dictionary wc then (wc, dict_size, result) else begin (* add wc to the dictionary *) Hashtbl.add dictionary wc dict_size; let this = Hashtbl.find dictionary w in (c, dict_size + 1, this::result) end ) in let w, _, result = String.fold_left f ("", dict_size, []) uncompressed in
(* output the code for w *) let result = if w = "" then result else (Hashtbl.find dictionary w) :: result in
(List.rev result)
exception ValueError of string
(** decompress a list of output symbols to a string *) let decompress ~compressed =
(* build the dictionary *) let dict_size = 256 in let dictionary = Hashtbl.create 397 in for i=0 to pred dict_size do let str = String.make 1 (char_of_int i) in Hashtbl.add dictionary i str done;
let w, compressed = match compressed with | hd::tl -> (String.make 1 (char_of_int hd)), tl | [] -> failwith "empty input" in let result = w::[] in
let result, _, _ = List.fold_left (fun (result, w, dict_size) k -> let entry = if Hashtbl.mem dictionary k then Hashtbl.find dictionary k else if k = Hashtbl.length dictionary then w ^ (String.make 1 w.[0]) else raise(ValueError(Printf.sprintf "Bad compressed k: %d" k)) in let result = entry :: result in (* add (w ^ entry.[0]) to the dictionary *) Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0])); (result, entry, dict_size + 1) ) (result, w, dict_size) compressed in (List.rev result)
- </ocaml>
here is the interface: <ocaml>val compress : uncompressed:string -> int list val decompress : compressed:int list -> string list</ocaml>
How to use:
The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved.
So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.
<ocaml>let greatest = List.fold_left (fun m x -> max m x) 0 ;;
(** number of bits needed to encode the integer m *) let n_bits m =
let m = float m in let rec aux n = let max = (2. ** n) -. 1. in if max >= m then int_of_float n else aux (n +. 1.0) in aux 1.0
let write_compressed ~filename ~compressed =
let nbits = n_bits(greatest compressed) in let oc = open_out filename in output_byte oc nbits; let ob = IO.output_bits(IO.output_channel oc) in List.iter (fun code -> IO.write_bits ob nbits code ) compressed; IO.flush_bits ob; close_out oc;
let read_compressed ~filename =
let ic = open_in filename in let nbits = input_byte ic in let ib = IO.input_bits(IO.input_channel ic) in let rec loop acc = try let code = IO.read_bits ib nbits in loop (code::acc) with _ -> List.rev acc in let compressed = loop [] in let result = decompress ~compressed in let buf = Buffer.create 2048 in List.iter (Buffer.add_string buf) result; (Buffer.contents buf)
- </ocaml>
Perl
In this version the hashes contain mixed typed data: <perl># Compress a string to a list of output symbols. sub compress {
my $uncompressed = shift;
# Build the dictionary. my $dict_size = 256; my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
my $w = ""; my @result; foreach my $c (split , $uncompressed) { my $wc = $w . $c; if (exists $dictionary{$wc}) { $w = $wc; } else { push @result, $dictionary{$w}; # Add wc to the dictionary. $dictionary{$wc} = $dict_size; $dict_size++; $w = $c; } }
# Output the code for w. if ($w) { push @result, $dictionary{$w}; } return @result;
}
- Decompress a list of output ks to a string.
sub decompress {
my @compressed = @_;
# Build the dictionary. my $dict_size = 256; my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
my $w = shift @compressed; my $result = $w; foreach my $k (@compressed) { my $entry; if (exists $dictionary{$k}) { $entry = $dictionary{$k}; } elsif ($k == $dict_size) { $entry = $w . substr(w,0,1); } else { die "Bad compressed k: $k"; } $result .= $entry;
# Add w+entry[0] to the dictionary. $dictionary{$dict_size} = $w . substr($entry,0,1); $dict_size++;
$w = $entry; } return $result;
}
- How to use:
my @compressed = compress('TOBEORNOTTOBEORTOBEORNOT'); print "@compressed\n"; my $decompressed = decompress(@compressed); print "$decompressed\n";</perl>
Output:
T O B E O R N O T 256 258 260 265 259 261 263 TOBEORNOTTOBEORTOBEORNOT
Python
In this version the dicts contain mixed typed data: <python>def compress(uncompressed):
"""Compress a string to a list of output symbols."""
# Build the dictionary. dict_size = 256 dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
w = "" result = [] for c in uncompressed: wc = w + c if wc in dictionary: w = wc else: result.append(dictionary[w]) # Add wc to the dictionary. dictionary[wc] = dict_size dict_size += 1 w = c
# Output the code for w. if w: result.append(dictionary[w]) return result
def decompress(compressed):
"""Decompress a list of output ks to a string."""
# Build the dictionary. dict_size = 256 dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
w = result = compressed.pop(0) for k in compressed: if k in dictionary: entry = dictionary[k] elif k == dict_size: entry = w + w[0] else: raise ValueError('Bad compressed k: %s' % k) result += entry
# Add w+entry[0] to the dictionary. dictionary[dict_size] = w + entry[0] dict_size += 1
w = entry return result
- How to use:
compressed = compress('TOBEORNOTTOBEORTOBEORNOT') print compressed decompressed = decompress(compressed) print decompressed</python>
Output:
['T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T', 256, 258, 260, 265, 259, 261, 263] TOBEORNOTTOBEORTOBEORNOT
Ruby
In this version the hashes contain mixed typed data: # Compress a string to a list of output symbols. def compress(uncompressed)
# Build the dictionary. dict_size = 256 dictionary = Hash[* (0...dict_size).map {|i| [i.chr, i.chr]}.flatten ]
w = "" result = [] for c in uncompressed.split() wc = w + c if dictionary.has_key?(wc) w = wc else result << dictionary[w] # Add wc to the dictionary. dictionary[wc] = dict_size dict_size += 1 w = c end end
# Output the code for w. if w result << dictionary[w] end result
end
- Decompress a list of output ks to a string.
def decompress(compressed)
# Build the dictionary. dict_size = 256 dictionary = Hash[* (0...dict_size).map {|i| [i.chr, i.chr]}.flatten ]
w = result = compressed.shift for k in compressed if dictionary.has_key?(k) entry = dictionary[k] elsif k == dict_size entry = w + w[0,1] else raise 'Bad compressed k: %s' % k end result += entry
# Add w+entry[0] to the dictionary. dictionary[dict_size] = w + entry[0,1] dict_size += 1
w = entry end result
end
- How to use:
compressed = compress('TOBEORNOTTOBEORTOBEORNOT') p compressed decompressed = decompress(compressed) puts decompressed
Output:
["T", "O", "B", "E", "O", "R", "N", "O", "T", 256, 258, 260, 265, 259, 261, 263] TOBEORNOTTOBEORTOBEORNOT