LZW compression

The Lempel-Ziv-Welch (LZW) algorithm provides lossless data compression. You can read a complete description of it in the Wikipedia article on the subject. It was patented, but it fell in the public domain in 2004.

D

D 1, with Phobos, from the Python version (the final writefln works only on 7-bit ASCII strings): <d> import std.stdio: writefln;

/// Compress a string to a list of output symbols. int[] compress(string uncompressed) {

   // build the dictionary
   int dict_size = 256;
   int[string] dictionary;
   for (int i; i < dict_size; i++)
       dictionary["" ~ cast(char)i] = i;

   string w;
   int[] result;
   foreach (c; uncompressed) {
       string wc = w ~ c;
       if (wc in dictionary)
           w = wc;
       else {
           result ~= dictionary[w];
           // add wc to the dictionary
           dictionary[wc] = dict_size;
           dict_size++;
           w = "" ~ c;
       }
   }

   // output the code for w
   if (w.length)
       result ~= dictionary[w];
   return result;

}

/// Decompress a list of output ks to a string. string decompress(int[] compressed) {

   // build the dictionary
   int dict_size = 256;
   string[int] dictionary;
   for (int i; i < dict_size; i++)
       dictionary[i] = "" ~ cast(char)i;

   string w = "" ~ cast(char)compressed[0];
   string result = w;
   foreach (k; compressed[1 .. $]) {
       string entry;
       if (k in dictionary)
           entry = dictionary[k];
       else if (k == dict_size)
           entry = w ~ w[0];
       else
           throw new Exception("Bad compressed k");
       result ~= entry;

       // add w+entry[0] to the dictionary
       dictionary[dict_size] = w ~ entry[0];
       dict_size++;

       w = entry;
   }
   return result;

}

void main() {

   auto compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
   writefln(compressed);

   auto decompressed = decompress(compressed);
   writefln(decompressed);

} </d> The output: <d> [84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263] TOBEORNOTTOBEORTOBEORNOT </d>

Forth

Works with: GNU Forth version 0.6.2

256 value next-symbol

\ current string fragment

create w 256 allot     \ counted string

: w=c ( c -- )       w 1+ c!        1 w c! ;
: w+c ( c -- )  w count + c!  w c@ 1+ w c! ;

\ Compression

\ dictionary of strings to symbols
0 value dict

: init-dict  table to dict  256 to next-symbol  dict set-current ;

: free-dict                           forth-wordlist set-current ;

: in-dict? ( key len -- ? )		\ can assume len > 1
  dict search-wordlist dup if nip then ;

: lookup-dict ( key len -- symbol )
  dup 1 = if drop c@ exit then
  dict search-wordlist if >body @ else abort" bad-dict!" then ;

: put-dict ( data key len -- )
  nextname create , ;

\ output buffer of symbols
\  in real life, these symbols would be packed into octets
variable out-size
create out 256 cells allot

: output ( symbol -- )
  dup out out-size @ cells + !  1 out-size +!
  dup 256 < if emit space else . then ;

: compress ( addr len -- )
  init-dict  0 out-size !
  over c@ w=c  1 /string
  bounds do
    i c@ w+c
    w count in-dict? 0= if
      w count 1- lookup-dict output
      next-symbol dup w count put-dict
      1+ to next-symbol
      i c@ w=c
    then
  loop
  w count lookup-dict output
  free-dict ;

\ Decompression

\ array of symbols to strings (in real code this would need to be growable)
\  next-symbol is reused for the size of this table
create symtab 256 cells allot
0 value start

: init-symtab  256 to next-symbol  here to start ;

: free-symtab  start here - allot ;

: get-symbol ( symbol -- addr len )
  dup 256 < if pad c! pad 1 exit then
  256 - cells symtab + @ count ;

: add-symbol ( addr len -- )
  here symtab next-symbol 256 - cells + !
  s,
  next-symbol 1+ to next-symbol ;

create entry 256 allot

: decompress ( addr len -- )
  init-symtab
  over @ dup emit w=c
  cells bounds cell+ do
    i @ next-symbol < if
      i @ get-symbol entry place
    else i @ next-symbol = if
      w 1+ c@ w count + c!  w count 1+ entry place
    else
      abort" bad symbol!"
    then then
    entry count type	\ output
    entry 1+ c@ w+c
    w count add-symbol
    entry count w place
  1 cells +loop
  free-symtab ;

\ Testing

s" TOBEORNOTTOBEORTOBEORNOT" compress cr
\ T O B E O R N O T 256 258 260 265 259 261 263

out out-size @ decompress cr
\ TOBEORNOTTOBEORTOBEORNOT

Java

<java>import java.util.*;

public class LZW {

   /** Compress a string to a list of output symbols. */
   public static List<Integer> compress(String uncompressed) {
       // Build the dictionary.
       int dictSize = 256;
       Map<String,Integer> dictionary = new HashMap<String,Integer>();
       for (int i = 0; i < 256; i++)
           dictionary.put("" + (char)i, i);
       
       String w = "";
       List<Integer> result = new ArrayList<Integer>();
       for (char c : uncompressed.toCharArray()) {
           String wc = w + c;
           if (dictionary.containsKey(wc))
               w = wc;
           else {
               result.add(dictionary.get(w));
               // Add wc to the dictionary.
               dictionary.put(wc, dictSize++);
               w = "" + c;
           }
       }

       // Output the code for w.
       if (!w.equals(""))
           result.add(dictionary.get(w));
       return result;
   }
   
   /** Decompress a list of output ks to a string. */
   public static String decompress(List<Integer> compressed) {
       // Build the dictionary.
       int dictSize = 256;
       Map<Integer,String> dictionary = new HashMap<Integer,String>();
       for (int i = 0; i < 256; i++)
           dictionary.put(i, "" + (char)i);
       
       String w = "" + (char)(int)compressed.remove(0);
       String result = w;
       for (int k : compressed) {
           String entry;
           if (dictionary.containsKey(k))
               entry = dictionary.get(k);
           else if (k == dictSize)
               entry = w + w.charAt(0);
           else
               throw new IllegalArgumentException("Bad compressed k: " + k);
           
           result += entry;
           
           // Add w+entry[0] to the dictionary.
           dictionary.put(dictSize++, w + entry.charAt(0));
           
           w = entry;
       }
       return result;
   }

   public static void main(String[] args) {
       List<Integer> compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
       System.out.println(compressed);
       String decompressed = decompress(compressed);
       System.out.println(decompressed);
   }

}</java>

OCaml

<ocaml>#directory "+site-lib/extlib/"

load "extLib.cma"

open ExtString

(** compress a string to a list of output symbols *) let compress ~uncompressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to 255 do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary str i
 done;

 let f = (fun (w, dict_size, result) c ->
   let c = String.make 1 c in
   let wc = w ^ c in
   if Hashtbl.mem dictionary wc then
     (wc, dict_size, result)
   else
     begin
       (* add wc to the dictionary *)
       Hashtbl.add dictionary wc dict_size;
       let this = Hashtbl.find dictionary w in
       (c, dict_size + 1, this::result)
     end
 ) in
 let w, _, result =
   String.fold_left f ("", dict_size, []) uncompressed
 in

 (* output the code for w *)
 let result =
   if w = ""
   then result
   else (Hashtbl.find dictionary w) :: result
 in

 (List.rev result)

exception ValueError of string

(** decompress a list of output symbols to a string *) let decompress ~compressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to pred dict_size do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary i str
 done;

 let w, compressed =
   match compressed with
   | hd::tl -> (String.make 1 (char_of_int hd)), tl
   | [] -> failwith "empty input"
 in

 let result = w::[] in

 let result, _, _ =
   List.fold_left (fun (result, w, dict_size) k ->
     let entry =
       if Hashtbl.mem dictionary k then
         Hashtbl.find dictionary k
       else if k = Hashtbl.length dictionary then
         w ^ (String.make 1 w.[0])
       else
         raise(ValueError(Printf.sprintf "Bad compressed k: %d" k))
     in
     let result = entry :: result in
  
     (* add (w ^ entry.[0]) to the dictionary *)
     Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0]));
     (result, entry, dict_size + 1)
   ) (result, w, dict_size) compressed
 in
 (List.rev result)

</ocaml>

here is the interface: <ocaml>val compress : uncompressed:string -> int list val decompress : compressed:int list -> string list</ocaml>

How to use:
The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved. So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.

<ocaml>let greatest = List.fold_left (fun m x -> max m x) 0 ;;

(** number of bits needed to encode the integer m *) let n_bits m =

 let m = float m in
 let rec aux n =
   let max = (2. ** n) -. 1. in
   if max >= m then int_of_float n
   else aux (n +. 1.0)
 in
 aux 1.0

let write_compressed ~filename ~compressed =

 let nbits = n_bits(greatest compressed) in
 let oc = open_out filename in
 output_byte oc nbits;
 let ob = IO.output_bits(IO.output_channel oc) in
 List.iter (fun code ->
     IO.write_bits ob nbits code
   ) compressed;
 IO.flush_bits ob;
 close_out oc;

let read_compressed ~filename =

 let ic = open_in filename in
 let nbits = input_byte ic in
 let ib = IO.input_bits(IO.input_channel ic) in
 let rec loop acc =
   try
     let code = IO.read_bits ib nbits in
     loop (code::acc)
   with _ -> List.rev acc
 in
 let compressed = loop [] in
 let result = decompress ~compressed in
 let buf = Buffer.create 2048 in
 List.iter (Buffer.add_string buf) result;
 (Buffer.contents buf)

</ocaml>

Python

In this version the dicts contain mixed typed data: <python>def compress(uncompressed):

   """Compress a string to a list of output symbols."""

   # Build the dictionary.
   dict_size = 256
   dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))

   w = ""
   result = []
   for c in uncompressed:
       wc = w + c
       if wc in dictionary:
           w = wc
       else:
           result.append(dictionary[w])
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1
           w = c

   # Output the code for w.
   if w:
       result.append(dictionary[w])
   return result

def decompress(compressed):

   """Decompress a list of output ks to a string."""

   # Build the dictionary.
   dict_size = 256
   dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))

   w = result = compressed.pop(0)
   for k in compressed:
       if k in dictionary:
           entry = dictionary[k]
       elif k == dict_size:
           entry = w + w[0]
       else:
           raise ValueError('Bad compressed k: %s' % k)
       result += entry

       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w + entry[0]
       dict_size += 1

       w = entry
   return result

How to use:

compressed = compress('TOBEORNOTTOBEORTOBEORNOT') print compressed decompressed = decompress(compressed) print decompressed</python>

Output:

['T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T', 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT

Ruby

In this version the hashes contain mixed typed data: # Compress a string to a list of output symbols. def compress(uncompressed)

   # Build the dictionary.
   dict_size = 256
   dictionary = Hash[* (0..dict_size-1).map {|i| [i.chr, i.chr]}.flatten ]

   w = ""
   result = []
   for c in uncompressed.split()
       wc = w + c
       if dictionary.has_key?(wc)
           w = wc
       else
           result << dictionary[w]
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1
           w = c
       end
   end

   # Output the code for w.
   if w
       result << dictionary[w]
   end
   result

end

Decompress a list of output ks to a string.

def decompress(compressed)

   # Build the dictionary.
   dict_size = 256
   dictionary = Hash[* (0..dict_size-1).map {|i| [i.chr, i.chr]}.flatten ]

   w = result = compressed.shift
   for k in compressed
       if dictionary.has_key?(k)
           entry = dictionary[k]
       elsif k == dict_size
           entry = w + w[0,1]
       else
           raise 'Bad compressed k: %s' % k
       end
       result += entry

       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w + entry[0,1]
       dict_size += 1

       w = entry
   end
   result

end

How to use:

compressed = compress('TOBEORNOTTOBEORTOBEORNOT') p compressed decompressed = decompress(compressed) puts decompressed

Output:

["T", "O", "B", "E", "O", "R", "N", "O", "T", 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT