LZW compression: Difference between revisions

From Rosetta Code
Content added Content deleted
(CL code)
No edit summary
Line 283: Line 283:


=={{header|Common Lisp}}==
=={{header|Common Lisp}}==

{{libheader|Babel}}


This version is based upon the Perl one. It doesn't contain mixed type data at the cost of being more consy. It includes vector operation routines, since using VECTOR-PUSH-APPEND reallocates the whole vector with each call.
This version is based upon the Perl one. It doesn't contain mixed type data at the cost of being more consy. It includes vector operation routines, since using VECTOR-PUSH-APPEND reallocates the whole vector with each call.

The Babel library is required to convert octet vectors to strings. Lisp strings can contain characters out of the ASCII/latin1 character set, including the whole Unicode range in them. The exact encoding used depending upon user's locale (<code>LC_CTYPE</code> on Unix).


<lang lisp>(declaim (ftype (function (vector vector &optional fixnum fixnum) vector)
<lang lisp>(declaim (ftype (function (vector vector &optional fixnum fixnum) vector)

Revision as of 19:36, 7 August 2009

Task
LZW compression
You are encouraged to solve this task according to the task description, using any language you may know.

The Lempel-Ziv-Welch (LZW) algorithm provides lossless data compression. You can read a complete description of it in the Wikipedia article on the subject. It was patented, but it fell in the public domain in 2004.

C

See Bit oriented IO for bitio.h and Basic string manipulation functions for estrings.h.

<lang c>#include <stdio.h>

  1. include <stdlib.h>
  2. include <stdbool.h>
  3. include <string.h>
  4. include "bitio.h"
  5. include "estrings.h"
  1. define MIN(A,B) ( ((A)<(B)) ? (A) : (B) )
  2. define MAX(A,B) ( ((A)>(B)) ? (A) : (B) )
  1. define DEBUG
  1. if defined(DEBUG)
  2. define Debug(...) fprintf(stderr, __VA_ARGS__)
  3. else
  4. define Debug(...)
  5. endif</lang>

We need a dictionary and so we implement one according to our need.

<lang c>struct DictionaryNode {

  String string;
  unsigned int code;
  struct DictionaryNode *next;

}; typedef struct DictionaryNode DN;

  1. define NODES_PER_ALLOC 512

struct DictionaryStruct {

  DN *first;
  DN **nodes;
  size_t num_of_nodes;
  size_t num_of_alloc;
  bool sorted;

}; typedef struct DictionaryStruct *Dictionary;

Dictionary newDictionary() {

   Dictionary t;
   
   t = malloc(sizeof(struct DictionaryStruct));
   if ( t==NULL ) return NULL;
   t->num_of_nodes = 0;
   t->num_of_alloc = 1;
   t->nodes = malloc(t->num_of_alloc * NODES_PER_ALLOC * sizeof(void *));
   t->first = NULL;
   t->sorted = false;
   return t;

}

bool _expandDictionary(Dictionary d) {

  d->nodes = realloc(d->nodes, (d->num_of_alloc + 1) * NODES_PER_ALLOC * sizeof(void *));
  if ( d->nodes == NULL ) return false;
  d->num_of_alloc++;
  return true;

}

bool dictionaryPut_ns(Dictionary d, String s, unsigned int c) {

  DN *newdw;
  
  if ( (d->num_of_nodes + 1) > (d->num_of_alloc * NODES_PER_ALLOC) )
  {
     if ( !_expandDictionary(d) ) return false;
  }
  newdw = malloc(sizeof(DN));
  if ( newdw == NULL ) return false;
  newdw->string = cloneString(s);
  newdw->code = c;
  newdw->next = d->first;
  d->first = newdw;
  d->nodes[d->num_of_nodes] = newdw;
  d->num_of_nodes++;
  d->sorted = false;
  return true;

}

int _dict_strcompare(const void *av, const void *bv) {

  DN *a = *((DN **)av);
  DN *b = *((DN **)bv);
  return compareStrings(a->string, b->string);

}

void dictionarySort(Dictionary d) {

   if ( ! (d->sorted) ) {
     qsort(d->nodes, d->num_of_nodes, sizeof(void *), _dict_strcompare);
     d->sorted = true;
   }

}

bool dictionaryPut(Dictionary d, String s, unsigned int c) {

  bool added = dictionaryPut_ns(d, s, c);
  if ( added )
  {
      dictionarySort(d);
  }
  return added;

}


DN *dictionaryLookUp(Dictionary d, String s) {

  if ( d->num_of_nodes == 0 ) return NULL;
  int low, high, med, p;
  /* perform a "binary" search ... */
  for(low=-1, high = d->num_of_nodes-1; (high-low)>1 ; )
  {
     med = (high+low) >> 1;
     p = compareStrings(s, d->nodes[med]->string);
     if ( p <= 0 )
     {
        high = med;
     } else {
        low = med;
     }
  }
  if ( compareStrings(s, d->nodes[high]->string) == 0 ) return d->nodes[high];
  return NULL;

}

void destroyDictionary(Dictionary d) {

  int i;
  if ( d==NULL ) return;
  if ( d->nodes != NULL )
  {
    for(i=0; i < d->num_of_nodes; i++)
    {
       destroyString(d->nodes[i]->string);
       free(d->nodes[i]);
    }
    free(d->nodes);
  }
  free(d);

}</lang>

LZW code:

<lang c>#define BITS_PER_WORD 9 unsigned int codemark = 256; unsigned char num_of_bits = BITS_PER_WORD;

struct LZWHandleStruct {

  unsigned int codemark;
  unsigned char num_of_bits;
  Dictionary dict;
  size_t bytes_written;
  FILE *iostream;

}; typedef struct LZWHandleStruct LZWHandle;

LZWHandle *lzw_begin(FILE *o) {

  LZWHandle *lh;
  
  lh = malloc(sizeof(LZWHandle));
  if ( lh != NULL )
  {
    lh->bytes_written = 0;
    lh->codemark = 256;
    lh->num_of_bits = BITS_PER_WORD;
    lh->dict = newDictionary();
    lh->iostream = o;
    if ( lh->dict == NULL ) { free(lh); return NULL; }
  }
  return lh;

}

bool lzw_compress(LZWHandle *lh, const char *stream, size_t len) {

  unsigned int i;
  char tc;
  Dictionary d = lh->dict;
  DN *oc;
  String w = NULL;
  String wc = NULL;
  String t = NULL;
  size_t wbits = 0;
  
  t = newString();
  for(i=0; i < 256; i++)
  {
     tc = i;
     setString(t, &tc, 1);
     dictionaryPut_ns(d, t, i);
  }
  destroyString(t); t = NULL;
  
  /* a trick: we have put values in order, so it is sorted*/
  d->sorted = true;
  
  w = newString();
  wc = newString();
  
  for(i=0; i < len; i++)
  {
     copyString(wc, w);
     appendChar(wc, stream[i]);
     if ( (dictionaryLookUp(d, wc)) != NULL )
     {
        copyString(w, wc);
     } else {
        oc = dictionaryLookUp(d, w);
        if ( oc != NULL )
        { /* guardring for bugs :D */
          wbits += bits_write(oc->code, lh->num_of_bits, lh->iostream);
          Debug("%u ; %d ; %u\n", oc->code, lh->num_of_bits, lh->codemark);
        }
        dictionaryPut(d, wc, lh->codemark);
        lh->codemark++;
        if ( lh->codemark >= (1 << lh->num_of_bits) )
        {
           lh->num_of_bits++;
        }
        setString(w, &stream[i], 1);
     }
  }
  if ( ! stringIsEmpty(w) )
  {
      oc = dictionaryLookUp(d, w);
      /* no guard ring for bugs here :) */
      wbits += bits_write(oc->code, lh->num_of_bits, lh->iostream);
      Debug("%u ; %d ; %u\n", oc->code, lh->num_of_bits, lh->codemark);
  }
  lh->bytes_written = wbits >> 3;
  destroyString(w); destroyString(wc);
  return true;

}

int lzw_end(LZWHandle *lh) {

  int oby=0;
  if ( lh != NULL )
  {
    oby=bits_flush(lh->iostream);
    destroyDictionary(lh->dict);
    free(lh);
  }
  return oby;

}</lang>

Testing:

<lang c>const char *text = "TOBEORNOTTOBEORTOBEORNOT"; int main() {

    LZWHandle *lzw;
    size_t writtenbytecount = 0;
    
    lzw = lzw_begin(stdout);
    if ( lzw != NULL )
    {
      lzw_compress(lzw, text, strlen(text));
      writtenbytecount += lzw->bytes_written;
      Debug("bits on exit: %u\n", lzw->num_of_bits);
      lzw_end(lzw);
      fprintf(stderr, "n. of input bytes: %u\n"
                      "n. of output bytes: %u\n",
                      strlen(text), writtenbytecount
             );
    } else return 1;
    return 0;

}</lang>

This test code really emits the compressed bytes stream. The decimal values of the codes (as other solutions do) are sent as debug informations.

Common Lisp

Library: Babel

This version is based upon the Perl one. It doesn't contain mixed type data at the cost of being more consy. It includes vector operation routines, since using VECTOR-PUSH-APPEND reallocates the whole vector with each call.

The Babel library is required to convert octet vectors to strings. Lisp strings can contain characters out of the ASCII/latin1 character set, including the whole Unicode range in them. The exact encoding used depending upon user's locale (LC_CTYPE on Unix).

<lang lisp>(declaim (ftype (function (vector vector &optional fixnum fixnum) vector)

               vector-append))

(defun vector-append (old new &optional (start2 0) end2)

 (declare (optimize (speed 3) (safety 0) (debug 0)))
 (prog1 old                     
   (let* ((old-fill (fill-pointer old))
          (new-fill (+ old-fill (length new))))
     (when (> new-fill (array-dimension old 0))
       (adjust-array old (* 4 new-fill)))
     (setf (fill-pointer old) new-fill)
     (replace old new :start1 old-fill :start2 start2 :end2 end2))))

(declaim (ftype (function (vector t) vector) vector-append1)) (defun vector-append1 (old new)

 (prog1 old                     
   (let* ((old-fill (fill-pointer old))
          (new-fill (1+ old-fill)))
     (when (> new-fill (array-dimension old 0))
       (adjust-array old (* 4 new-fill)))
     (setf (fill-pointer old) new-fill)
     (setf (aref old old-fill) new))))

(declaim (ftype (function (&optional t) vector) make-empty-vector)) (defun make-empty-vector (&optional (element-type t))

 (make-array 0 :element-type element-type :fill-pointer 0 :adjustable t))


(declaim (ftype (function (t &optional t) vector) make-vector-with-elt)) (defun make-vector-with-elt (elt &optional (element-type t))

 (make-array 1 :element-type element-type
               :fill-pointer 1
               :adjustable t
               :initial-element elt))

(declaim (ftype (function (vector t) vector) vector-append1-new)) (defun vector-append1-new (old new)

 (vector-append1 (vector-append (make-empty-vector 'octet) old)
                 new))

(declaim (ftype (function (vector vector) vector) vector-append-new)) (defun vector-append-new (old new)

 (vector-append (vector-append (make-empty-vector 'octet) old)
                new))

(deftype octet () '(unsigned-byte 8))

(declaim (ftype (function () hash-table) build-dictionary)) (defun build-dictionary ()

 (let ((dictionary (make-hash-table :test #'equalp)))
   (loop for i below 256
         do (let ((vec (make-vector-with-elt i 'octet)))
              (setf (gethash vec dictionary) vec)))
   dictionary))

(declaim (ftype (function ((vector octet)) (vector octet))

               lzw-compress-octets))

(defun lzw-compress-octets (octets)

 (declare (optimize (speed 3) (safety 0) (debug 0)))
 (loop with dictionary-size of-type fixnum = 256
       with w = (make-empty-vector 'octet)
       with result = (make-empty-vector 't)
       with dictionary = (build-dictionary)
       for c across octets
       for wc = (vector-append1-new w c)
       if (gethash wc dictionary) do (setq w wc)
       else do
         (vector-append result (gethash w dictionary))
         (setf (gethash wc dictionary)
               (make-vector-with-elt dictionary-size)) 
         (incf dictionary-size)
         (setq w (make-vector-with-elt c 'octet))
       finally (unless (zerop (length (the (vector octet) w)))
                 (vector-append result (gethash w dictionary)))
               (return result)))

(declaim (ftype (function (vector) (vector octet)) lzw-decompress)) (defun #1=lzw-decompress (octets)

 (declare (optimize (speed 3) (safety 0) (debug 0)))
 (when (zerop (length octets))
   (return-from #1# (make-empty-vector 'octet)))
 (loop with dictionary-size = 256
       with dictionary = (build-dictionary)
       with result = (make-vector-with-elt (aref octets 0) 'octet)
       with w = (copy-seq result)
       for i from 1 below (length octets)
       for k = (make-vector-with-elt (aref octets i) 't)
       for entry = (or (gethash k dictionary)
                       (if (equalp k dictionary-size)
                           (coerce (list w (aref w 0)) '(vector octet))
                           (error "bad compresed entry at pos ~S" i)))
       do (vector-append result entry)
          (setf (gethash (make-vector-with-elt dictionary-size) dictionary)
                (vector-append1-new w (aref entry 0)))
          (incf dictionary-size)
          (setq w entry)
       finally (return result)))

(defgeneric lzw-compress (datum)

 (:method ((string string))
   (lzw-compress (babel:string-to-octets string)))
 (:method ((octets vector))
   (lzw-compress-octets octets)))

(defun lzw-decompress-to-string (octets)

 (babel:octets-to-string (lzw-decompress octets)))

(defun test (string)

 (assert (equal #2=(lzw-decompress-to-string (lzw-compress string)) string) ()
         "Can't compress ~S properly, got ~S instead" string #2#)
 t)

</lang>

And the format used:

<lang lisp>CL-USER> (test "TOBEORNOTTOBEORTOBEORNOT") T CL-USER> (lzw-compress "TOBEORNOTTOBEORTOBEORNOT")

  1. (84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263)

CL-USER> (lzw-decompress-to-string *) "TOBEORNOTTOBEORTOBEORNOT"</lang>

C++

<lang cpp>#include <string>

  1. include <map>

// Compress a string to a list of output symbols. // The result will be written to the output iterator // starting at "result"; the final iterator is returned. template <typename Iterator> Iterator compress(const std::string &uncompressed, Iterator result) {

 // Build the dictionary.
 int dictSize = 256;
 std::map<std::string,int> dictionary;
 for (int i = 0; i < 256; i++)
   dictionary[std::string(1, i)] = i;
 
 std::string w;
 for (std::string::const_iterator it = uncompressed.begin();
      it != uncompressed.end(); ++it) {
   char c = *it;
   std::string wc = w + c;
   if (dictionary.count(wc))
     w = wc;
   else {
     *result++ = dictionary[w];
     // Add wc to the dictionary.
     dictionary[wc] = dictSize++;
     w = std::string(1, c);
   }
 }
 
 // Output the code for w.
 if (!w.empty())
   *result++ = dictionary[w];
 return result;

}

// Decompress a list of output ks to a string. // "begin" and "end" must form a valid range of ints template <typename Iterator> std::string decompress(Iterator begin, Iterator end) {

 // Build the dictionary.
 int dictSize = 256;
 std::map<int,std::string> dictionary;
 for (int i = 0; i < 256; i++)
   dictionary[i] = std::string(1, i);
 
 std::string w(1, *begin++);
 std::string result = w;
 std::string entry;
 for ( ; begin != end; begin++) {
   int k = *begin;
   if (dictionary.count(k))
     entry = dictionary[k];
   else if (k == dictSize)
     entry = w + w[0];
   else
     throw "Bad compressed k";
   
   result += entry;
   
   // Add w+entry[0] to the dictionary.
   dictionary[dictSize++] = w + entry[0];
   
   w = entry;
 }
 return result;

}

  1. include <iostream>
  2. include <iterator>
  3. include <vector>

int main() {

 std::vector<int> compressed;
 compress("TOBEORNOTTOBEORTOBEORNOT", std::back_inserter(compressed));
 copy(compressed.begin(), compressed.end(), std::ostream_iterator<int>(std::cout, ", "));
 std::cout << std::endl;
 std::string decompressed = decompress(compressed.begin(), compressed.end());
 std::cout << decompressed << std::endl;
 
 return 0;

}</lang>

Clojure

<lang Clojure> (defn make-dict []

 (let [vals (range 0 256)]
   (zipmap (map (comp #'list #'char) vals) vals)))

(defn compress [#^String text]

 (loop [t (seq text)
        r '()
        w '()
        dict (make-dict)
        s 256]
   (let [c (first t)]
     (if c
       (let [wc (cons c w)]
         (if (get dict wc)
           (recur (rest t) r wc dict s)
           (recur (rest t) (cons (get dict w) r) (list c) (assoc dict wc s) (inc s))))
       (reverse (if w (cons (get dict w) r) r))))))

(compress "TOBEORNOTTOBEORTOBEORNOT") </lang> The output: <lang Clojure> (84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263) </lang>

D

D 1, with Phobos, from the Python version (the final writefln works only on 7-bit ASCII strings): <lang d> import std.stdio: writefln;

/// Compress a string to a list of output symbols. int[] compress(string uncompressed) {

   // build the dictionary
   int dict_size = 256;
   int[string] dictionary;
   for (int i; i < dict_size; i++)
       dictionary["" ~ cast(char)i] = i;
   string w;
   int[] result;
   foreach (c; uncompressed) {
       string wc = w ~ c;
       if (wc in dictionary)
           w = wc;
       else {
           result ~= dictionary[w];
           // add wc to the dictionary
           dictionary[wc] = dict_size;
           dict_size++;
           w = "" ~ c;
       }
   }
   // output the code for w
   if (w.length)
       result ~= dictionary[w];
   return result;

}


/// Decompress a list of output ks to a string. string decompress(int[] compressed) {

   // build the dictionary
   int dict_size = 256;
   string[int] dictionary;
   for (int i; i < dict_size; i++)
       dictionary[i] = "" ~ cast(char)i;
   string w = "" ~ cast(char)compressed[0];
   string result = w;
   foreach (k; compressed[1 .. $]) {
       string entry;
       if (k in dictionary)
           entry = dictionary[k];
       else if (k == dict_size)
           entry = w ~ w[0];
       else
           throw new Exception("Bad compressed k");
       result ~= entry;
       // add w+entry[0] to the dictionary
       dictionary[dict_size] = w ~ entry[0];
       dict_size++;
       w = entry;
   }
   return result;

}

void main() {

   auto compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
   writefln(compressed);
   auto decompressed = decompress(compressed);
   writefln(decompressed);

} </lang> The output: <lang d> [84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263] TOBEORNOTTOBEORTOBEORNOT </lang>

Forth

Works with: GNU Forth version 0.6.2
256 value next-symbol

\ current string fragment

create w 256 allot     \ counted string

: w=c ( c -- )       w 1+ c!        1 w c! ;
: w+c ( c -- )  w count + c!  w c@ 1+ w c! ;
\ Compression

\ dictionary of strings to symbols
0 value dict

: init-dict  table to dict  256 to next-symbol  dict set-current ;

: free-dict                           forth-wordlist set-current ;

: in-dict? ( key len -- ? )		\ can assume len > 1
  dict search-wordlist dup if nip then ;

: lookup-dict ( key len -- symbol )
  dup 1 = if drop c@ exit then
  dict search-wordlist if >body @ else abort" bad-dict!" then ;

: put-dict ( data key len -- )
  nextname create , ;

\ output buffer of symbols
\  in real life, these symbols would be packed into octets
variable out-size
create out 256 cells allot

: output ( symbol -- )
  dup out out-size @ cells + !  1 out-size +!
  dup 256 < if emit space else . then ;

: compress ( addr len -- )
  init-dict  0 out-size !
  over c@ w=c  1 /string
  bounds do
    i c@ w+c
    w count in-dict? 0= if
      w count 1- lookup-dict output
      next-symbol dup w count put-dict
      1+ to next-symbol
      i c@ w=c
    then
  loop
  w count lookup-dict output
  free-dict ;
\ Decompression

\ array of symbols to strings (in real code this would need to be growable)
\  next-symbol is reused for the size of this table
create symtab 256 cells allot
0 value start

: init-symtab  256 to next-symbol  here to start ;

: free-symtab  start here - allot ;

: get-symbol ( symbol -- addr len )
  dup 256 < if pad c! pad 1 exit then
  256 - cells symtab + @ count ;

: add-symbol ( addr len -- )
  here symtab next-symbol 256 - cells + !
  s,
  next-symbol 1+ to next-symbol ;

create entry 256 allot

: decompress ( addr len -- )
  init-symtab
  over @ dup emit w=c
  cells bounds cell+ do
    i @ next-symbol < if
      i @ get-symbol entry place
    else i @ next-symbol = if
      w 1+ c@ w count + c!  w count 1+ entry place
    else
      abort" bad symbol!"
    then then
    entry count type	\ output
    entry 1+ c@ w+c
    w count add-symbol
    entry count w place
  1 cells +loop
  free-symtab ;
\ Testing

s" TOBEORNOTTOBEORTOBEORNOT" compress cr
\ T O B E O R N O T 256 258 260 265 259 261 263

out out-size @ decompress cr
\ TOBEORNOTTOBEORTOBEORNOT

Java

<lang java>import java.util.*;

public class LZW {

   /** Compress a string to a list of output symbols. */
   public static List<Integer> compress(String uncompressed) {
       // Build the dictionary.
       int dictSize = 256;
       Map<String,Integer> dictionary = new HashMap<String,Integer>();
       for (int i = 0; i < 256; i++)
           dictionary.put("" + (char)i, i);
       
       String w = "";
       List<Integer> result = new ArrayList<Integer>();
       for (char c : uncompressed.toCharArray()) {
           String wc = w + c;
           if (dictionary.containsKey(wc))
               w = wc;
           else {
               result.add(dictionary.get(w));
               // Add wc to the dictionary.
               dictionary.put(wc, dictSize++);
               w = "" + c;
           }
       }

       // Output the code for w.
       if (!w.equals(""))
           result.add(dictionary.get(w));
       return result;
   }
   
   /** Decompress a list of output ks to a string. */
   public static String decompress(List<Integer> compressed) {
       // Build the dictionary.
       int dictSize = 256;
       Map<Integer,String> dictionary = new HashMap<Integer,String>();
       for (int i = 0; i < 256; i++)
           dictionary.put(i, "" + (char)i);
       
       String w = "" + (char)(int)compressed.remove(0);
       String result = w;
       for (int k : compressed) {
           String entry;
           if (dictionary.containsKey(k))
               entry = dictionary.get(k);
           else if (k == dictSize)
               entry = w + w.charAt(0);
           else
               throw new IllegalArgumentException("Bad compressed k: " + k);
           
           result += entry;
           
           // Add w+entry[0] to the dictionary.
           dictionary.put(dictSize++, w + entry.charAt(0));
           
           w = entry;
       }
       return result;
   }
   public static void main(String[] args) {
       List<Integer> compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
       System.out.println(compressed);
       String decompressed = decompress(compressed);
       System.out.println(decompressed);
   }

}</lang>

JavaScript

<lang javascript> //LZW Compression/Decompression for Strings var LZW = {

   "compress" : function(uncompressed) 
   {
       // Build the dictionary.
       var dictSize = 256;
       var dictionary = {};
       for (var i = 0; i < 256; i++)
       {
           dictionary[String.fromCharCode(i)] = i;
       }
		
       var w = "";
       var result = [];
       for (var i = 0; i < uncompressed.length; i++) 
       {
       	var c = uncompressed.charAt(i);
           var wc = w + c;
           if (dictionary[wc])
               w = wc;
           else {
               result.push(dictionary[w]);
               // Add wc to the dictionary.
               dictionary[wc] = dictSize++;
               w = "" + c;
           }
       }

       // Output the code for w.
       if (w != "")
           result.push(dictionary[w]);
       return result;
   },

   "decompress" : function(compressed) {
       // Build the dictionary.
       var dictSize = 256;
       var dictionary = [];
       for (var i = 0; i < 256; i++)
       {
           dictionary[i] = String.fromCharCode(i);
	}

       var w = String.fromCharCode(compressed[0]);
       var result = w;
       for (var i = 1; i < compressed.length; i++) {
           var entry = "";
           var k = compressed[i];
           if (dictionary[k])
               entry = dictionary[k];
           else if (k == dictSize)
               entry = w + w.charAt(0);
           else
               return null;

           result += entry;

           // Add w+entry[0] to the dictionary.
           dictionary[dictSize++] = w + entry.charAt(0);

           w = entry;
       }
       return result;
   }

}

// For Test Purposes var comp = LZW.compress("TOBEORNOTTOBEORTOBEORNOT"); var decomp = LZW.decompress(comp); document.write(comp+'
'+decomp); </lang>

Output: <lang javascript> 84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263 TOBEORNOTTOBEORTOBEORNOT </lang>

Objective-C

Works with: GNUstep

The class for the LZW compression algorithm:

<lang objc>#import <Foundation/Foundation.h>

  1. import <stdio.h>

@interface LZWCompressor : NSObject {

 @private
   NSMutableArray *iostream;
   NSMutableDictionary *dict;
   NSUInteger codemark;

}

-(LZWCompressor *) init; -(LZWCompressor *) initWithArray: (NSMutableArray *) stream; -(BOOL) compressData: (NSData *) string; -(void) setArray: (NSMutableArray *) stream; -(NSArray *) getArray; @end

@implementation LZWCompressor : NSObject

-(LZWCompressor *) init {

  self = [super init];
  if ( self )
  {
     iostream = nil;
     codemark = 256;
     dict = [NSMutableDictionary dictionaryWithCapacity: 512];
  }
  return self;

}

-(LZWCompressor *) initWithArray: (NSMutableArray *) stream {

  self = [self init];
  if ( self )
  {
     [self setArray: stream];
  }
  return self;

}

-(void) setArray: (NSMutableArray *) stream {

  iostream = stream;

}

-(BOOL) compressData: (NSData *) string; {

   NSUInteger i;
   unsigned char j;
   
   // prepare dict
   for(i=0; i < 256; i++)
   {
      j = i;
      NSData *s = [NSData dataWithBytes: &j length: 1];
      [dict setObject: [NSNumber numberWithUnsignedInt: i] forKey: s];
   }
   
   NSMutableData *w = [NSMutableData data];
   NSMutableData *wc = [NSMutableData data];
   
   for(i=0; i < [string length]; i++)
   {
      [wc setData: w];
      [wc appendData: [string subdataWithRange: NSMakeRange(i, 1)]];
      if ( [dict objectForKey: wc] != nil )
      {
         [w setData: wc];
      } else {
         [iostream addObject: [dict objectForKey: w]];
         [dict setObject: [NSNumber numberWithUnsignedInt: codemark] forKey: wc];
         codemark++;
         [w setData: [string subdataWithRange: NSMakeRange(i, 1)]];
      }
   }
   if ( [w length] != 0 )
   {
      [iostream addObject: [dict objectForKey: w]];
   }
   return YES;

}

-(NSArray *) getArray {

 return iostream;

}

@end</lang>

Usage example:

<lang objc>const char *text = "TOBEORNOTTOBEORTOBEORNOT";

int main() {

 NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
 
 LZWCompressor *lzw = [[LZWCompressor alloc]
                       initWithArray: [[NSMutableArray alloc] init] ];
 if ( lzw )
 {
    [lzw compressData: [NSData dataWithBytes: text
                        length: strlen(text)]];
    NSEnumerator *en = [[lzw getArray] objectEnumerator];
    id obj;
    while( (obj = [en nextObject]) )
    {
       printf("%u\n", [obj unsignedIntValue]);
    } 
 }
 
 [pool release];
 return EXIT_SUCCESS;

}</lang>

Output (reformatted by hand):

 84  79  66  69  79  82  78  79
 84 256 258 260 265 259 261 263

OCaml

<lang ocaml>#directory "+site-lib/extlib/"

  1. load "extLib.cma"

open ExtString

(** compress a string to a list of output symbols *) let compress ~uncompressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to 255 do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary str i
 done;

 let f = (fun (w, dict_size, result) c ->
   let c = String.make 1 c in
   let wc = w ^ c in
   if Hashtbl.mem dictionary wc then
     (wc, dict_size, result)
   else
     begin
       (* add wc to the dictionary *)
       Hashtbl.add dictionary wc dict_size;
       let this = Hashtbl.find dictionary w in
       (c, dict_size + 1, this::result)
     end
 ) in
 let w, _, result =
   String.fold_left f ("", dict_size, []) uncompressed
 in
 (* output the code for w *)
 let result =
   if w = ""
   then result
   else (Hashtbl.find dictionary w) :: result
 in
 (List.rev result)

exception ValueError of string

(** decompress a list of output symbols to a string *) let decompress ~compressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to pred dict_size do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary i str
 done;
 let w, compressed =
   match compressed with
   | hd::tl -> (String.make 1 (char_of_int hd)), tl
   | [] -> failwith "empty input"
 in

 let result = w::[] in
 let result, _, _ =
   List.fold_left (fun (result, w, dict_size) k ->
     let entry =
       if Hashtbl.mem dictionary k then
         Hashtbl.find dictionary k
       else if k = Hashtbl.length dictionary then
         w ^ (String.make 1 w.[0])
       else
         raise(ValueError(Printf.sprintf "Bad compressed k: %d" k))
     in
     let result = entry :: result in
  
     (* add (w ^ entry.[0]) to the dictionary *)
     Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0]));
     (result, entry, dict_size + 1)
   ) (result, w, dict_size) compressed
 in
 (List.rev result)
</lang>

here is the interface: <lang ocaml>val compress : uncompressed:string -> int list val decompress : compressed:int list -> string list</lang>

How to use:
The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved. So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.

<lang ocaml>let greatest = List.fold_left max 0 ;;

(** number of bits needed to encode the integer m *) let n_bits m =

 let m = float m in
 let rec aux n =
   let max = (2. ** n) -. 1. in
   if max >= m then int_of_float n
   else aux (n +. 1.0)
 in
 aux 1.0

let write_compressed ~filename ~compressed =

 let nbits = n_bits(greatest compressed) in
 let oc = open_out filename in
 output_byte oc nbits;
 let ob = IO.output_bits(IO.output_channel oc) in
 List.iter (IO.write_bits ob nbits) compressed;
 IO.flush_bits ob;
 close_out oc;

let read_compressed ~filename =

 let ic = open_in filename in
 let nbits = input_byte ic in
 let ib = IO.input_bits(IO.input_channel ic) in
 let rec loop acc =
   try
     let code = IO.read_bits ib nbits in
     loop (code::acc)
   with _ -> List.rev acc
 in
 let compressed = loop [] in
 let result = decompress ~compressed in
 let buf = Buffer.create 2048 in
 List.iter (Buffer.add_string buf) result;
 (Buffer.contents buf)
</lang>

Perl

In this version the hashes contain mixed typed data: <lang perl># Compress a string to a list of output symbols. sub compress {

   my $uncompressed = shift;
   # Build the dictionary.
   my $dict_size = 256;
   my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
   my $w = "";
   my @result;
   foreach my $c (split , $uncompressed) {
       my $wc = $w . $c;
       if (exists $dictionary{$wc}) {
           $w = $wc;
       } else {
           push @result, $dictionary{$w};
           # Add wc to the dictionary.
           $dictionary{$wc} = $dict_size;
           $dict_size++;
           $w = $c;
       }
   }
   # Output the code for w.
   if ($w) {
       push @result, $dictionary{$w};
   }
   return @result;

}

  1. Decompress a list of output ks to a string.

sub decompress {

   my @compressed = @_;
   # Build the dictionary.
   my $dict_size = 256;
   my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
   my $w = shift @compressed;
   my $result = $w;
   foreach my $k (@compressed) {
       my $entry;
       if (exists $dictionary{$k}) {
           $entry = $dictionary{$k};
       } elsif ($k == $dict_size) {
           $entry = $w . substr(w,0,1);
       } else {
           die "Bad compressed k: $k";
       }
       $result .= $entry;
       # Add w+entry[0] to the dictionary.
       $dictionary{$dict_size} = $w . substr($entry,0,1);
       $dict_size++;
       $w = $entry;
   }
   return $result;

}

  1. How to use:

my @compressed = compress('TOBEORNOTTOBEORTOBEORNOT'); print "@compressed\n"; my $decompressed = decompress(@compressed); print "$decompressed\n";</lang>

Output:

T O B E O R N O T 256 258 260 265 259 261 263
TOBEORNOTTOBEORTOBEORNOT

Python

In this version the dicts contain mixed typed data: <lang python>def compress(uncompressed):

   """Compress a string to a list of output symbols."""
   # Build the dictionary.
   dict_size = 256
   dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
   # in Python 3: dictionary = {chr(i): chr(i) for i in range(dict_size)}
   w = ""
   result = []
   for c in uncompressed:
       wc = w + c
       if wc in dictionary:
           w = wc
       else:
           result.append(dictionary[w])
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1
           w = c
   # Output the code for w.
   if w:
       result.append(dictionary[w])
   return result


def decompress(compressed):

   """Decompress a list of output ks to a string."""
   # Build the dictionary.
   dict_size = 256
   dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
   # in Python 3: dictionary = {chr(i): chr(i) for i in range(dict_size)}
   w = result = compressed.pop(0)
   for k in compressed:
       if k in dictionary:
           entry = dictionary[k]
       elif k == dict_size:
           entry = w + w[0]
       else:
           raise ValueError('Bad compressed k: %s' % k)
       result += entry
       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w + entry[0]
       dict_size += 1
       w = entry
   return result


  1. How to use:

compressed = compress('TOBEORNOTTOBEORTOBEORNOT') print (compressed) decompressed = decompress(compressed) print (decompressed)</lang>

Output:

['T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T', 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT

Ruby

In this version the hashes contain mixed typed data: <lang ruby># Compress a string to a list of output symbols. def compress(uncompressed)

   # Build the dictionary.
   dict_size = 256
   dictionary = Hash[ Array.new(dict_size) {|i| [i.chr, i.chr]} ]
   w = ""
   result = []
   for c in uncompressed.split()
       wc = w + c
       if dictionary.has_key?(wc)
           w = wc
       else
           result << dictionary[w]
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1
           w = c
       end
   end
   # Output the code for w.
   result << dictionary[w] unless w.empty?
   result

end

  1. Decompress a list of output ks to a string.

def decompress(compressed)

   # Build the dictionary.
   dict_size = 256
   dictionary = Hash[ Array.new(dict_size) {|i| [i.chr, i.chr]} ]
   w = result = compressed.shift
   for k in compressed
       if dictionary.has_key?(k)
           entry = dictionary[k]
       elsif k == dict_size
           entry = w + w[0,1]
       else
           raise 'Bad compressed k: %s' % k
       end
       result += entry
       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w + entry[0,1]
       dict_size += 1
       w = entry
   end
   result

end

  1. How to use:

compressed = compress('TOBEORNOTTOBEORTOBEORNOT') p compressed decompressed = decompress(compressed) puts decompressed</lang>

Output:

["T", "O", "B", "E", "O", "R", "N", "O", "T", 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT

Seed7

<lang seed7>$ include "seed7_05.s7i";

const func string: lzwCompress (in string: uncompressed) is func

 result
   var string: result is "";
 local
   var char: ch is ' ';
   var hash [string] char: mydict is (hash [string] char).value;
   var string: buffer is "";
   var string: xstr is "";
 begin
   for ch range chr(0) to chr(255) do
     mydict @:= [str(ch)] ch;
   end for;
   for ch range uncompressed do
     xstr := buffer & str(ch);
     if xstr in mydict then
       buffer &:= str(ch)
     else
       result &:= str(mydict[buffer]);
       mydict @:= [xstr] chr(length(mydict));
       buffer := str(ch);
     end if;
   end for;
   if buffer <> "" then
     result &:= str(mydict[buffer]);
   end if;
 end func;

const func string: lzwDecompress (in string: compressed) is func

 result
   var string: result is "";
 local
   var char: ch is ' ';
   var hash [char] string: mydict is (hash [char] string).value;
   var string: buffer is "";
   var string: current is "";
   var string: chain is "";
 begin
   for ch range chr(0) to chr(255) do
     mydict @:= [ch] str(ch);
   end for;
   for ch range compressed do
     if buffer = "" then
       buffer := mydict[ch];
       result &:= buffer;
     elsif ch <= chr(255) then
       current := mydict[ch];
       result &:= current;
       chain := buffer & current;
       mydict @:= [chr(length(mydict))] chain;
       buffer := current;
     else
       if ch in mydict then
         chain := mydict[ch];
       else
         chain := buffer & str(buffer[1]);
       end if;
       result &:= chain;
       mydict @:= [chr(length(mydict))] buffer & str(chain[1]);
       buffer := chain;
     end if;
   end for;
 end func;

const proc: main is func

 local
   var string: compressed is "";
   var string: uncompressed is "";
 begin
   compressed := lzwCompress("TOBEORNOTTOBEORTOBEORNOT");
   writeln(literal(compressed));
   uncompressed := lzwDecompress(compressed);
   writeln(uncompressed);
 end func;</lang>

Output:

"TOBEORNOT\256\\258\\260\\265\\259\\261\\263\"
TOBEORNOTTOBEORTOBEORNOT

Original source: [1] and [2]

Tcl

<lang tcl>namespace eval LZW {

   variable char2int
   variable chars
   for {set i 0} {$i < 256} {incr i} {
       set char [binary format c $i]
       set char2int($char) $i
       lappend chars $char
   }

}

proc LZW::encode {data} {

   variable char2int
   array set dict [array get char2int]
   set w ""
   set result [list]
   foreach c [split $data ""] {
       set wc $w$c
       if {[info exists dict($wc)]} {
           set w $wc
       } else {
           lappend result $dict($w)
           set dict($wc) [array size dict]
           set w $c
       }
   }
   lappend result $dict($w)

}

proc LZW::decode {cdata} {

   variable chars
   set dict $chars
   set k [lindex $cdata 0]
   set w [lindex $dict $k]
   set result $w
   foreach k [lrange $cdata 1 end] {
       set currSizeDict [llength $dict]
       if {$k < $currSizeDict} {
           set entry [lindex $dict $k]
       } elseif {$k == $currSizeDict} {
           set entry $w[string index $w 0]
       } else {
           error "invalid code ($k) in ($cdata)"
       }
       append result $entry
       lappend dict $w[string index $entry 0]
       set w $entry
   }
   return $result

}

set s TOBEORNOTTOBEORTOBEORNOT# set e [LZW::encode $s] ;# ==> 84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263 35 set d [LZW::decode $e] ;# ==> TOBEORNOTTOBEORTOBEORNOT#

  1. or

if {$s eq [LZW::decode [LZW::encode $s]]} then {puts success} else {puts fail} ;# ==> success