LZW compression

From Rosetta Code
Revision as of 05:22, 27 February 2009 by 75.63.5.131 (talk) (Add Clojure compression example)
Task
LZW compression
You are encouraged to solve this task according to the task description, using any language you may know.

The Lempel-Ziv-Welch (LZW) algorithm provides lossless data compression. You can read a complete description of it in the Wikipedia article on the subject. It was patented, but it fell in the public domain in 2004.

C++

<lang cpp>#include <string>

  1. include <map>

// Compress a string to a list of output symbols. // The result will be written to the output iterator // starting at "result"; the final iterator is returned. template <typename Iterator> Iterator compress(const std::string &uncompressed, Iterator result) {

 // Build the dictionary.
 int dictSize = 256;
 std::map<std::string,int> dictionary;
 for (int i = 0; i < 256; i++)
   dictionary[std::string(1, i)] = i;
 
 std::string w;
 for (std::string::const_iterator it = uncompressed.begin();
      it != uncompressed.end(); ++it) {
   char c = *it;
   std::string wc = w + c;
   if (dictionary.count(wc))
     w = wc;
   else {
     *result++ = dictionary[w];
     // Add wc to the dictionary.
     dictionary[wc] = dictSize++;
     w = std::string(1, c);
   }
 }
 
 // Output the code for w.
 if (!w.empty())
   *result++ = dictionary[w];
 return result;

}

// Decompress a list of output ks to a string. // "begin" and "end" must form a valid range of ints template <typename Iterator> std::string decompress(Iterator begin, Iterator end) {

 // Build the dictionary.
 int dictSize = 256;
 std::map<int,std::string> dictionary;
 for (int i = 0; i < 256; i++)
   dictionary[i] = std::string(1, i);
 
 std::string w(1, *begin++);
 std::string result = w;
 std::string entry;
 for ( ; begin != end; begin++) {
   int k = *begin;
   if (dictionary.count(k))
     entry = dictionary[k];
   else if (k == dictSize)
     entry = w + w[0];
   else
     throw "Bad compressed k";
   
   result += entry;
   
   // Add w+entry[0] to the dictionary.
   dictionary[dictSize++] = w + entry[0];
   
   w = entry;
 }
 return result;

}

  1. include <iostream>
  2. include <iterator>
  3. include <vector>

int main() {

 std::vector<int> compressed;
 compress("TOBEORNOTTOBEORTOBEORNOT", std::back_inserter(compressed));
 copy(compressed.begin(), compressed.end(), std::ostream_iterator<int>(std::cout, ", "));
 std::cout << std::endl;
 std::string decompressed = decompress(compressed.begin(), compressed.end());
 std::cout << decompressed << std::endl;
 
 return 0;

}</lang>

Clojure

<lang Clojure> (defn make-dict []

 (let [vals (range 0 256)]
   (zipmap (map (comp #'list #'char) vals) vals)))

(defn compress [#^String text]

 (loop [t (seq text)
        r '()
        w '()
        dict (make-dict)
        s 256]
   (let [c (first t)]
     (if c
       (let [wc (cons c w)]
         (if (get dict wc)
           (recur (rest t) r wc dict s)
           (recur (rest t) (cons (get dict w) r) (list c) (assoc dict wc s) (inc s))))
       (reverse (if w (cons (get dict w) r) r))))))

(compress "TOBEORNOTTOBEORTOBEORNOT") </lang> The output: <lang Clojure> (84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263) </lang>

D

D 1, with Phobos, from the Python version (the final writefln works only on 7-bit ASCII strings): <lang d> import std.stdio: writefln;

/// Compress a string to a list of output symbols. int[] compress(string uncompressed) {

   // build the dictionary
   int dict_size = 256;
   int[string] dictionary;
   for (int i; i < dict_size; i++)
       dictionary["" ~ cast(char)i] = i;
   string w;
   int[] result;
   foreach (c; uncompressed) {
       string wc = w ~ c;
       if (wc in dictionary)
           w = wc;
       else {
           result ~= dictionary[w];
           // add wc to the dictionary
           dictionary[wc] = dict_size;
           dict_size++;
           w = "" ~ c;
       }
   }
   // output the code for w
   if (w.length)
       result ~= dictionary[w];
   return result;

}


/// Decompress a list of output ks to a string. string decompress(int[] compressed) {

   // build the dictionary
   int dict_size = 256;
   string[int] dictionary;
   for (int i; i < dict_size; i++)
       dictionary[i] = "" ~ cast(char)i;
   string w = "" ~ cast(char)compressed[0];
   string result = w;
   foreach (k; compressed[1 .. $]) {
       string entry;
       if (k in dictionary)
           entry = dictionary[k];
       else if (k == dict_size)
           entry = w ~ w[0];
       else
           throw new Exception("Bad compressed k");
       result ~= entry;
       // add w+entry[0] to the dictionary
       dictionary[dict_size] = w ~ entry[0];
       dict_size++;
       w = entry;
   }
   return result;

}

void main() {

   auto compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
   writefln(compressed);
   auto decompressed = decompress(compressed);
   writefln(decompressed);

} </lang> The output: <lang d> [84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263] TOBEORNOTTOBEORTOBEORNOT </lang>

Forth

Works with: GNU Forth version 0.6.2
256 value next-symbol

\ current string fragment

create w 256 allot     \ counted string

: w=c ( c -- )       w 1+ c!        1 w c! ;
: w+c ( c -- )  w count + c!  w c@ 1+ w c! ;
\ Compression

\ dictionary of strings to symbols
0 value dict

: init-dict  table to dict  256 to next-symbol  dict set-current ;

: free-dict                           forth-wordlist set-current ;

: in-dict? ( key len -- ? )		\ can assume len > 1
  dict search-wordlist dup if nip then ;

: lookup-dict ( key len -- symbol )
  dup 1 = if drop c@ exit then
  dict search-wordlist if >body @ else abort" bad-dict!" then ;

: put-dict ( data key len -- )
  nextname create , ;

\ output buffer of symbols
\  in real life, these symbols would be packed into octets
variable out-size
create out 256 cells allot

: output ( symbol -- )
  dup out out-size @ cells + !  1 out-size +!
  dup 256 < if emit space else . then ;

: compress ( addr len -- )
  init-dict  0 out-size !
  over c@ w=c  1 /string
  bounds do
    i c@ w+c
    w count in-dict? 0= if
      w count 1- lookup-dict output
      next-symbol dup w count put-dict
      1+ to next-symbol
      i c@ w=c
    then
  loop
  w count lookup-dict output
  free-dict ;
\ Decompression

\ array of symbols to strings (in real code this would need to be growable)
\  next-symbol is reused for the size of this table
create symtab 256 cells allot
0 value start

: init-symtab  256 to next-symbol  here to start ;

: free-symtab  start here - allot ;

: get-symbol ( symbol -- addr len )
  dup 256 < if pad c! pad 1 exit then
  256 - cells symtab + @ count ;

: add-symbol ( addr len -- )
  here symtab next-symbol 256 - cells + !
  s,
  next-symbol 1+ to next-symbol ;

create entry 256 allot

: decompress ( addr len -- )
  init-symtab
  over @ dup emit w=c
  cells bounds cell+ do
    i @ next-symbol < if
      i @ get-symbol entry place
    else i @ next-symbol = if
      w 1+ c@ w count + c!  w count 1+ entry place
    else
      abort" bad symbol!"
    then then
    entry count type	\ output
    entry 1+ c@ w+c
    w count add-symbol
    entry count w place
  1 cells +loop
  free-symtab ;
\ Testing

s" TOBEORNOTTOBEORTOBEORNOT" compress cr
\ T O B E O R N O T 256 258 260 265 259 261 263

out out-size @ decompress cr
\ TOBEORNOTTOBEORTOBEORNOT

Java

<lang java>import java.util.*;

public class LZW {

   /** Compress a string to a list of output symbols. */
   public static List<Integer> compress(String uncompressed) {
       // Build the dictionary.
       int dictSize = 256;
       Map<String,Integer> dictionary = new HashMap<String,Integer>();
       for (int i = 0; i < 256; i++)
           dictionary.put("" + (char)i, i);
       
       String w = "";
       List<Integer> result = new ArrayList<Integer>();
       for (char c : uncompressed.toCharArray()) {
           String wc = w + c;
           if (dictionary.containsKey(wc))
               w = wc;
           else {
               result.add(dictionary.get(w));
               // Add wc to the dictionary.
               dictionary.put(wc, dictSize++);
               w = "" + c;
           }
       }

       // Output the code for w.
       if (!w.equals(""))
           result.add(dictionary.get(w));
       return result;
   }
   
   /** Decompress a list of output ks to a string. */
   public static String decompress(List<Integer> compressed) {
       // Build the dictionary.
       int dictSize = 256;
       Map<Integer,String> dictionary = new HashMap<Integer,String>();
       for (int i = 0; i < 256; i++)
           dictionary.put(i, "" + (char)i);
       
       String w = "" + (char)(int)compressed.remove(0);
       String result = w;
       for (int k : compressed) {
           String entry;
           if (dictionary.containsKey(k))
               entry = dictionary.get(k);
           else if (k == dictSize)
               entry = w + w.charAt(0);
           else
               throw new IllegalArgumentException("Bad compressed k: " + k);
           
           result += entry;
           
           // Add w+entry[0] to the dictionary.
           dictionary.put(dictSize++, w + entry.charAt(0));
           
           w = entry;
       }
       return result;
   }
   public static void main(String[] args) {
       List<Integer> compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
       System.out.println(compressed);
       String decompressed = decompress(compressed);
       System.out.println(decompressed);
   }

}</lang>

JavaScript

<lang javascript> //LZW Compression/Decompression for Strings var LZW = {

   "compress" : function(uncompressed) 
   {
       // Build the dictionary.
       var dictSize = 256;
       var dictionary = [];
       for (var i = 0; i < 256; i++)
       {
           dictionary[String.fromCharCode(i)] = i;
       }
		
       var w = "";
       var result = [];
       for (var i = 0; i < uncompressed.length; i++) 
       {
       	var c = uncompressed.charAt(i);
           var wc = w + c;
           if (dictionary[wc])
               w = wc;
           else {
               result.push(dictionary[w]);
               // Add wc to the dictionary.
               dictionary[wc] = dictSize++;
               w = "" + c;
           }
       }

       // Output the code for w.
       if (w != "")
           result.push(dictionary[w]);
       return result;
   },

   "decompress" : function(compressed) {
       // Build the dictionary.
       var dictSize = 256;
       var dictionary = [];
       for (var i = 0; i < 256; i++)
       {
           dictionary[i] = String.fromCharCode(i);
	}

       var w = String.fromCharCode(compressed[0]);
       var result = w;
       for (var i = 1; i < compressed.length; i++) {
           var entry = "";
           var k = compressed[i];
           if (dictionary[k])
               entry = dictionary[k];
           else if (k == dictSize)
               entry = w + w.charAt(0);
           else
               return null;

           result += entry;

           // Add w+entry[0] to the dictionary.
           dictionary[dictSize++] = w + entry.charAt(0);

           w = entry;
       }
       return result;
   }

}

// For Test Purposes var comp = LZW.compress("TOBEORNOTTOBEORTOBEORNOT"); var decomp = LZW.decompress(comp); document.write(comp+'
'+decomp); </lang>

Output: <lang javascript> 84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263 TOBEORNOTTOBEORTOBEORNOT </lang>

Objective-C

Works with: GNUstep

The class for the LZW compression algorithm:

<lang objc>#import <Foundation/Foundation.h>

  1. import <stdio.h>

@interface LZWCompressor : NSObject {

 @private
   NSMutableArray *iostream;
   NSMutableDictionary *dict;
   NSUInteger codemark;

}

-(LZWCompressor *) init; -(LZWCompressor *) initWithArray: (NSMutableArray *) stream; -(BOOL) compressData: (NSData *) string; -(void) setArray: (NSMutableArray *) stream; -(NSArray *) getArray; @end

@implementation LZWCompressor : NSObject

-(LZWCompressor *) init {

  self = [super init];
  if ( self )
  {
     iostream = nil;
     codemark = 256;
     dict = [NSMutableDictionary dictionaryWithCapacity: 512];
  }
  return self;

}

-(LZWCompressor *) initWithArray: (NSMutableArray *) stream {

  self = [self init];
  if ( self )
  {
     [self setArray: stream];
  }
  return self;

}

-(void) setArray: (NSMutableArray *) stream {

  iostream = stream;

}

-(BOOL) compressData: (NSData *) string; {

   NSUInteger i;
   unsigned char j;
   
   // prepare dict
   for(i=0; i < 256; i++)
   {
      j = i;
      NSData *s = [NSData dataWithBytes: &j length: 1];
      [dict setObject: [NSNumber numberWithUnsignedInt: i] forKey: s];
   }
   
   NSMutableData *w = [NSMutableData data];
   NSMutableData *wc = [NSMutableData data];
   
   for(i=0; i < [string length]; i++)
   {
      [wc setData: w];
      [wc appendData: [string subdataWithRange: NSMakeRange(i, 1)]];
      if ( [dict objectForKey: wc] != nil )
      {
         [w setData: wc];
      } else {
         [iostream addObject: [dict objectForKey: w]];
         [dict setObject: [NSNumber numberWithUnsignedInt: codemark] forKey: wc];
         codemark++;
         [w setData: [string subdataWithRange: NSMakeRange(i, 1)]];
      }
   }
   if ( [w length] != 0 )
   {
      [iostream addObject: [dict objectForKey: w]];
   }
   return YES;

}

-(NSArray *) getArray {

 return iostream;

}

@end</lang>

Usage example:

<lang objc>char *text = "TOBEORNOTTOBEORTOBEORNOT";

int main() {

 NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
 
 LZWCompressor *lzw = [[LZWCompressor alloc]
                       initWithArray: [[NSMutableArray alloc] init] ];
 if ( lzw )
 {
    [lzw compressData: [NSData dataWithBytes: text
                        length: strlen(text)]];
    NSEnumerator *en = [[lzw getArray] objectEnumerator];
    id obj;
    while( (obj = [en nextObject]) )
    {
       printf("%u\n", [obj unsignedIntValue]);
    } 
 }
 
 [pool release];
 return EXIT_SUCCESS;

}</lang>

Output (reformatted by hand):

 84  79  66  69  79  82  78  79
 84 256 258 260 265 259 261 263

OCaml

<lang ocaml>#directory "+site-lib/extlib/"

  1. load "extLib.cma"

open ExtString

(** compress a string to a list of output symbols *) let compress ~uncompressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to 255 do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary str i
 done;

 let f = (fun (w, dict_size, result) c ->
   let c = String.make 1 c in
   let wc = w ^ c in
   if Hashtbl.mem dictionary wc then
     (wc, dict_size, result)
   else
     begin
       (* add wc to the dictionary *)
       Hashtbl.add dictionary wc dict_size;
       let this = Hashtbl.find dictionary w in
       (c, dict_size + 1, this::result)
     end
 ) in
 let w, _, result =
   String.fold_left f ("", dict_size, []) uncompressed
 in
 (* output the code for w *)
 let result =
   if w = ""
   then result
   else (Hashtbl.find dictionary w) :: result
 in
 (List.rev result)

exception ValueError of string

(** decompress a list of output symbols to a string *) let decompress ~compressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to pred dict_size do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary i str
 done;
 let w, compressed =
   match compressed with
   | hd::tl -> (String.make 1 (char_of_int hd)), tl
   | [] -> failwith "empty input"
 in

 let result = w::[] in
 let result, _, _ =
   List.fold_left (fun (result, w, dict_size) k ->
     let entry =
       if Hashtbl.mem dictionary k then
         Hashtbl.find dictionary k
       else if k = Hashtbl.length dictionary then
         w ^ (String.make 1 w.[0])
       else
         raise(ValueError(Printf.sprintf "Bad compressed k: %d" k))
     in
     let result = entry :: result in
  
     (* add (w ^ entry.[0]) to the dictionary *)
     Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0]));
     (result, entry, dict_size + 1)
   ) (result, w, dict_size) compressed
 in
 (List.rev result)
</lang>

here is the interface: <lang ocaml>val compress : uncompressed:string -> int list val decompress : compressed:int list -> string list</lang>

How to use:
The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved. So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.

<lang ocaml>let greatest = List.fold_left (fun m x -> max m x) 0 ;;

(** number of bits needed to encode the integer m *) let n_bits m =

 let m = float m in
 let rec aux n =
   let max = (2. ** n) -. 1. in
   if max >= m then int_of_float n
   else aux (n +. 1.0)
 in
 aux 1.0

let write_compressed ~filename ~compressed =

 let nbits = n_bits(greatest compressed) in
 let oc = open_out filename in
 output_byte oc nbits;
 let ob = IO.output_bits(IO.output_channel oc) in
 List.iter (fun code ->
     IO.write_bits ob nbits code
   ) compressed;
 IO.flush_bits ob;
 close_out oc;

let read_compressed ~filename =

 let ic = open_in filename in
 let nbits = input_byte ic in
 let ib = IO.input_bits(IO.input_channel ic) in
 let rec loop acc =
   try
     let code = IO.read_bits ib nbits in
     loop (code::acc)
   with _ -> List.rev acc
 in
 let compressed = loop [] in
 let result = decompress ~compressed in
 let buf = Buffer.create 2048 in
 List.iter (Buffer.add_string buf) result;
 (Buffer.contents buf)
</lang>

Perl

In this version the hashes contain mixed typed data: <lang perl># Compress a string to a list of output symbols. sub compress {

   my $uncompressed = shift;
   # Build the dictionary.
   my $dict_size = 256;
   my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
   my $w = "";
   my @result;
   foreach my $c (split , $uncompressed) {
       my $wc = $w . $c;
       if (exists $dictionary{$wc}) {
           $w = $wc;
       } else {
           push @result, $dictionary{$w};
           # Add wc to the dictionary.
           $dictionary{$wc} = $dict_size;
           $dict_size++;
           $w = $c;
       }
   }
   # Output the code for w.
   if ($w) {
       push @result, $dictionary{$w};
   }
   return @result;

}

  1. Decompress a list of output ks to a string.

sub decompress {

   my @compressed = @_;
   # Build the dictionary.
   my $dict_size = 256;
   my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
   my $w = shift @compressed;
   my $result = $w;
   foreach my $k (@compressed) {
       my $entry;
       if (exists $dictionary{$k}) {
           $entry = $dictionary{$k};
       } elsif ($k == $dict_size) {
           $entry = $w . substr(w,0,1);
       } else {
           die "Bad compressed k: $k";
       }
       $result .= $entry;
       # Add w+entry[0] to the dictionary.
       $dictionary{$dict_size} = $w . substr($entry,0,1);
       $dict_size++;
       $w = $entry;
   }
   return $result;

}

  1. How to use:

my @compressed = compress('TOBEORNOTTOBEORTOBEORNOT'); print "@compressed\n"; my $decompressed = decompress(@compressed); print "$decompressed\n";</lang>

Output:

T O B E O R N O T 256 258 260 265 259 261 263
TOBEORNOTTOBEORTOBEORNOT

Python

In this version the dicts contain mixed typed data: <lang python>def compress(uncompressed):

   """Compress a string to a list of output symbols."""
   # Build the dictionary.
   dict_size = 256
   dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
   w = ""
   result = []
   for c in uncompressed:
       wc = w + c
       if wc in dictionary:
           w = wc
       else:
           result.append(dictionary[w])
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1
           w = c
   # Output the code for w.
   if w:
       result.append(dictionary[w])
   return result


def decompress(compressed):

   """Decompress a list of output ks to a string."""
   # Build the dictionary.
   dict_size = 256
   dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
   w = result = compressed.pop(0)
   for k in compressed:
       if k in dictionary:
           entry = dictionary[k]
       elif k == dict_size:
           entry = w + w[0]
       else:
           raise ValueError('Bad compressed k: %s' % k)
       result += entry
       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w + entry[0]
       dict_size += 1
       w = entry
   return result


  1. How to use:

compressed = compress('TOBEORNOTTOBEORTOBEORNOT') print compressed decompressed = decompress(compressed) print decompressed</lang>

Output:

['T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T', 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT

Ruby

In this version the hashes contain mixed typed data: <lang ruby># Compress a string to a list of output symbols. def compress(uncompressed)

   # Build the dictionary.
   dict_size = 256
   dictionary = Hash[* Array.new(dict_size) {|i| [i.chr, i.chr]}.flatten ]
   w = ""
   result = []
   for c in uncompressed.split()
       wc = w + c
       if dictionary.has_key?(wc)
           w = wc
       else
           result << dictionary[w]
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1
           w = c
       end
   end
   # Output the code for w.
   if w
       result << dictionary[w]
   end
   result

end

  1. Decompress a list of output ks to a string.

def decompress(compressed)

   # Build the dictionary.
   dict_size = 256
   dictionary = Hash[* Array.new(dict_size) {|i| [i.chr, i.chr]}.flatten ]
   w = result = compressed.shift
   for k in compressed
       if dictionary.has_key?(k)
           entry = dictionary[k]
       elsif k == dict_size
           entry = w + w[0,1]
       else
           raise 'Bad compressed k: %s' % k
       end
       result += entry
       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w + entry[0,1]
       dict_size += 1
       w = entry
   end
   result

end

  1. How to use:

compressed = compress('TOBEORNOTTOBEORTOBEORNOT') p compressed decompressed = decompress(compressed) puts decompressed</lang>

Output:

["T", "O", "B", "E", "O", "R", "N", "O", "T", 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT