LZW compression

From Rosetta Code
Revision as of 04:11, 18 February 2012 by 79.37.200.27 (talk) (Fixed bad char interval bug in D code)
Task
LZW compression
You are encouraged to solve this task according to the task description, using any language you may know.

The Lempel-Ziv-Welch (LZW) algorithm provides lossless data compression. You can read a complete description of it in the Wikipedia article on the subject. It was patented, but it fell in the public domain in 2004.

Ada

Works with: Ada 2005

lzw.ads: <lang Ada>package LZW is

  MAX_CODE : constant := 4095;
  type Codes is new Natural range 0 .. MAX_CODE;
  type Compressed_Data is array (Positive range <>) of Codes;
  function Compress (Cleartext : in String) return Compressed_Data;
  function Decompress (Data : in Compressed_Data) return String;

end LZW;</lang>

lzw.adb: <lang Ada>with Ada.Containers.Ordered_Maps; with Ada.Strings.Unbounded;

package body LZW is

  package UStrings renames Ada.Strings.Unbounded;
  use type UStrings.Unbounded_String;
  --------------
  -- Compress --
  --------------
  function Compress (Cleartext : in String) return Compressed_Data is
     -- translate String to Code-ID
     package String_To_Code is new Ada.Containers.Ordered_Maps (
        Key_Type => UStrings.Unbounded_String,
        Element_Type => Codes);
     Dictionary : String_To_Code.Map;
     -- Next unused Code-ID
     Next_Entry : Codes := 256;
     -- maximum same length as input, compression ratio always >=1.0
     Result : Compressed_Data (1 .. Cleartext'Length);
     -- position for next Code-ID
     Result_Index : Natural := 1;
     -- current and next input string
     Current_Word : UStrings.Unbounded_String :=
       UStrings.Null_Unbounded_String;
     Next_Word    : UStrings.Unbounded_String :=
       UStrings.Null_Unbounded_String;
  begin
     -- initialize Dictionary
     for C in Character loop
        String_To_Code.Insert
          (Dictionary,
           UStrings.Null_Unbounded_String & C,
           Character'Pos (C));
     end loop;
     for Index in Cleartext'Range loop
        -- add character to current word
        Next_Word := Current_Word & Cleartext (Index);
        if String_To_Code.Contains (Dictionary, Next_Word) then
           -- already in dictionary, continue with next character
           Current_Word := Next_Word;
        else
           -- insert code for current word to result
           Result (Result_Index) :=
              String_To_Code.Element (Dictionary, Current_Word);
           Result_Index          := Result_Index + 1;
           -- add new Code to Dictionary
           String_To_Code.Insert (Dictionary, Next_Word, Next_Entry);
           Next_Entry := Next_Entry + 1;
           -- reset current word to one character
           Current_Word := UStrings.Null_Unbounded_String &
                           Cleartext (Index);
        end if;
     end loop;
     -- Last word was not entered
     Result (Result_Index) :=
        String_To_Code.Element (Dictionary, Current_Word);
     -- return correct array size
     return Result (1 .. Result_Index);
  end Compress;
  ----------------
  -- Decompress --
  ----------------
  function Decompress (Data : in Compressed_Data) return String is
     -- translate Code-ID to String
     type Code_To_String is array (Codes) of UStrings.Unbounded_String;
     Dictionary : Code_To_String;
     -- next unused Code-ID
     Next_Entry : Codes := 256;
     -- initialize resulting string as empty string
     Result : UStrings.Unbounded_String := UStrings.Null_Unbounded_String;
     Next_Code : Codes;
     -- first code has to be in dictionary
     Last_Code : Codes := Data (1);
     -- suffix appended to last string for new dictionary entry
     Suffix : Character;
  begin
     -- initialize Dictionary
     for C in Character loop
        Dictionary (Codes (Character'Pos (C)))   :=
          UStrings.Null_Unbounded_String & C;
     end loop;
     -- output first Code-ID
     UStrings.Append (Result, Dictionary (Last_Code));
     for Index in 2 .. Data'Last loop
        Next_Code := Data (Index);
        if Next_Code <= Next_Entry then
           -- next Code-ID already in dictionary -> append first char
           Suffix := UStrings.Element (Dictionary (Next_Code), 1);
        else
           -- next Code-ID not in dictionary -> use char from last ID
           Suffix := UStrings.Element (Dictionary (Last_Code), 1);
        end if;
        -- expand the dictionary
        Dictionary (Next_Entry) := Dictionary (Last_Code) & Suffix;
        Next_Entry              := Next_Entry + 1;
        -- output the current Code-ID to result
        UStrings.Append (Result, Dictionary (Next_Code));
        Last_Code := Next_Code;
     end loop;
     -- return String
     return UStrings.To_String (Result);
  end Decompress;

end LZW;</lang>

test.adb: <lang Ada>with LZW; with Ada.Text_IO;

procedure Test is

  package Text_IO renames Ada.Text_IO;
  package Code_IO is new Ada.Text_IO.Integer_IO (LZW.Codes);
  Test_Data : constant LZW.Compressed_Data :=
     LZW.Compress ("TOBEORNOTTOBEORTOBEORNOT");

begin

  for Index in Test_Data'Range loop
     Code_IO.Put (Test_Data (Index), 0);
     Text_IO.Put (" ");
  end loop;
  Text_IO.New_Line;
  declare
     Cleartext : constant String := LZW.Decompress (Test_Data);
  begin
     Text_IO.Put_Line (Cleartext);
  end;

end Test;</lang>

C

LZW encoder/decoder. Using variable bit length from 9 to up to 16. Encoder needs to know max allow bits, decoder doesn't. Code 256 for clear table, 257 for end of data, everything are either byte values (<256) or code values. <lang c>#include <stdio.h>

  1. include <stdlib.h>
  2. include <string.h>
  3. include <stdint.h>
  4. include <unistd.h>
  5. include <fcntl.h>
  6. include <sys/types.h>
  7. include <sys/stat.h>

/* -------- aux stuff ---------- */ void* mem_alloc(size_t item_size, size_t n_item) { size_t *x = calloc(1, sizeof(size_t)*2 + n_item * item_size); x[0] = item_size; x[1] = n_item; return x + 2; }

void* mem_extend(void *m, size_t new_n) { size_t *x = (size_t*)m - 2; x = realloc(x, sizeof(size_t) * 2 + *x * new_n); if (new_n > x[1]) memset((char*)(x + 2) + x[0] * x[1], 0, x[0] * (new_n - x[1])); x[1] = new_n; return x + 2; }

inline void _clear(void *m) { size_t *x = (size_t*)m - 2; memset(m, 0, x[0] * x[1]); }

  1. define _new(type, n) mem_alloc(sizeof(type), n)
  2. define _del(m) { free((size_t*)(m) - 2); m = 0; }
  3. define _len(m) *((size_t*)m - 1)
  4. define _setsize(m, n) m = mem_extend(m, n)
  5. define _extend(m) m = mem_extend(m, _len(m) * 2)


/* ----------- LZW stuff -------------- */ typedef uint8_t byte; typedef uint16_t ushort;

  1. define M_CLR 256 /* clear table marker */
  2. define M_EOD 257 /* end-of-data marker */
  3. define M_NEW 258 /* new code index */

/* encode and decode dictionary structures.

  for encoding, entry at code index is a list of indices that follow current one,
  i.e. if code 97 is 'a', code 387 is 'ab', and code 1022 is 'abc',
  then dict[97].next['b'] = 387, dict[387].next['c'] = 1022, etc. */

typedef struct { ushort next[256]; } lzw_enc_t;

/* for decoding, dictionary contains index of whatever prefix index plus trailing

  byte.  i.e. like previous example,
  	dict[1022] = { c: 'c', prev: 387 },
  	dict[387]  = { c: 'b', prev: 97 },
  	dict[97]   = { c: 'a', prev: 0 }
  the "back" element is used for temporarily chaining indices when resolving
  a code to bytes
*/

typedef struct { ushort prev, back; byte c; } lzw_dec_t;

byte* lzw_encode(byte *in, int max_bits) { int len = _len(in), bits = 9, next_shift = 512; ushort code, c, nc, next_code = M_NEW; lzw_enc_t *d = _new(lzw_enc_t, 512);

if (max_bits > 16) max_bits = 16; if (max_bits < 9 ) max_bits = 12;

byte *out = _new(ushort, 4); int out_len = 0, o_bits = 0; uint32_t tmp = 0;

inline void write_bits(ushort x) { tmp = (tmp << bits) | x; o_bits += bits; if (_len(out) <= out_len) _extend(out); while (o_bits >= 8) { o_bits -= 8; out[out_len++] = tmp >> o_bits; tmp &= (1 << o_bits) - 1; } }

//write_bits(M_CLR); for (code = *(in++); --len; ) { c = *(in++); if ((nc = d[code].next[c])) code = nc; else { write_bits(code); nc = d[code].next[c] = next_code++; code = c; }

/* next new code would be too long for current table */ if (next_code == next_shift) { /* either reset table back to 9 bits */ if (++bits > max_bits) { /* table clear marker must occur before bit reset */ write_bits(M_CLR);

bits = 9; next_shift = 512; next_code = M_NEW; _clear(d); } else /* or extend table */ _setsize(d, next_shift *= 2); } }

write_bits(code); write_bits(M_EOD); if (tmp) write_bits(tmp);

_del(d);

_setsize(out, out_len); return out; }

byte* lzw_decode(byte *in) { byte *out = _new(byte, 4); int out_len = 0;

inline void write_out(byte c) { while (out_len >= _len(out)) _extend(out); out[out_len++] = c; }

lzw_dec_t *d = _new(lzw_dec_t, 512); int len, j, next_shift = 512, bits = 9, n_bits = 0; ushort code, c, t, next_code = M_NEW;

uint32_t tmp = 0; inline void get_code() { while(n_bits < bits) { if (len > 0) { len --; tmp = (tmp << 8) | *(in++); n_bits += 8; } else { tmp = tmp << (bits - n_bits); n_bits = bits; } } n_bits -= bits; code = tmp >> n_bits; tmp &= (1 << n_bits) - 1; }

inline void clear_table() { _clear(d); for (j = 0; j < 256; j++) d[j].c = j; next_code = M_NEW; next_shift = 512; bits = 9; };

clear_table(); /* in case encoded bits didn't start with M_CLR */ for (len = _len(in); len;) { get_code(); if (code == M_EOD) break; if (code == M_CLR) { clear_table(); continue; }

if (code >= next_code) { fprintf(stderr, "Bad sequence\n"); _del(out); goto bail; }

d[next_code].prev = c = code; while (c > 255) { t = d[c].prev; d[t].back = c; c = t; }

d[next_code - 1].c = c;

while (d[c].back) { write_out(d[c].c); t = d[c].back; d[c].back = 0; c = t; } write_out(d[c].c);

if (++next_code >= next_shift) { if (++bits > 16) { /* if input was correct, we'd have hit M_CLR before this */ fprintf(stderr, "Too many bits\n"); _del(out); goto bail; } _setsize(d, next_shift *= 2); } }

/* might be ok, so just whine, don't be drastic */ if (code != M_EOD) fputs("Bits did not end in EOD\n", stderr);

_setsize(out, out_len); bail: _del(d); return out; }

int main() { int i, fd = open("unixdict.txt", O_RDONLY);

if (fd == -1) { fprintf(stderr, "Can't read file\n"); return 1; };

struct stat st; fstat(fd, &st);

byte *in = _new(char, st.st_size); read(fd, in, st.st_size); _setsize(in, st.st_size); close(fd);

printf("input size: %d\n", _len(in));

byte *enc = lzw_encode(in, 9); printf("encoded size: %d\n", _len(enc));

byte *dec = lzw_decode(enc); printf("decoded size: %d\n", _len(dec));

for (i = 0; i < _len(dec); i++) if (dec[i] != in[i]) { printf("bad decode at %d\n", i); break; }

if (i == _len(dec)) printf("Decoded ok\n");


_del(in); _del(enc); _del(dec);

return 0; }</lang>

CoffeeScript

This only does the encoding step for now.

<lang coffeescript> lzw = (s) ->

 dct = {} # map substrings to codes between 256 and 4096
 stream = [] # array of compression results
 
 # initialize basic ASCII characters
 for code_num in [0..255]
   c = String.fromCharCode(code_num)
   dct[c] = code_num
 code_num = 256
     
 i = 0
 while i < s.length
   # Find word and new_word
   #   word = longest substr already encountered, or next character
   #   new_word = word plus next character, a new substr to encode
   word = 
   j = i
   while j < s.length
     new_word = word + s[j]
     break if !dct[new_word]
     word = new_word
     j += 1
   # stream out the code for the substring
   stream.push dct[word]
     
   # build up our encoding dictionary
   if code_num < 4096
     dct[new_word] = code_num
     code_num += 1
   
   # advance thru the string
   i += word.length
 stream
   

console.log lzw "TOBEORNOTTOBEORTOBEORNOT" </lang> output <lang> > coffee lzw.coffee [ 84,

 79,
 66,
 69,
 79,
 82,
 78,
 79,
 84,
 256,
 258,
 260,
 265,
 259,
 261,
 263 ]

</lang>


Common Lisp

Library: Babel

This version is based upon the Perl one. It doesn't contain mixed type data at the cost of being more consy. It includes vector operation routines, since using VECTOR-PUSH-APPEND reallocates the whole vector with each call.

The Babel library is required to convert octet vectors to strings. Lisp strings can contain characters out of the ASCII/latin1 character set, including the whole Unicode range in them. The exact encoding used is dependent upon the user's locale (LC_CTYPE on Unix).

<lang lisp>(declaim (ftype (function (vector vector &optional fixnum fixnum) vector)

               vector-append))

(defun vector-append (old new &optional (start2 0) end2)

 (declare (optimize (speed 3) (safety 0) (debug 0)))
 (prog1 old                     
   (let* ((old-fill (fill-pointer old))
          (new-fill (+ old-fill (length new))))
     (when (> new-fill (array-dimension old 0))
       (adjust-array old (* 4 new-fill)))
     (setf (fill-pointer old) new-fill)
     (replace old new :start1 old-fill :start2 start2 :end2 end2))))

(declaim (ftype (function (vector t) vector) vector-append1)) (defun vector-append1 (old new)

 (prog1 old                     
   (let* ((old-fill (fill-pointer old))
          (new-fill (1+ old-fill)))
     (when (> new-fill (array-dimension old 0))
       (adjust-array old (* 4 new-fill)))
     (setf (fill-pointer old) new-fill)
     (setf (aref old old-fill) new))))

(declaim (ftype (function (&optional t) vector) make-empty-vector)) (defun make-empty-vector (&optional (element-type t))

 (make-array 0 :element-type element-type :fill-pointer 0 :adjustable t))


(declaim (ftype (function (t &optional t) vector) make-vector-with-elt)) (defun make-vector-with-elt (elt &optional (element-type t))

 (make-array 1 :element-type element-type
               :fill-pointer 1
               :adjustable t
               :initial-element elt))

(declaim (ftype (function (vector t) vector) vector-append1-new)) (defun vector-append1-new (old new)

 (vector-append1 (vector-append (make-empty-vector 'octet) old)
                 new))

(declaim (ftype (function (vector vector) vector) vector-append-new)) (defun vector-append-new (old new)

 (vector-append (vector-append (make-empty-vector 'octet) old)
                new))

(deftype octet () '(unsigned-byte 8))

(declaim (ftype (function () hash-table) build-dictionary)) (defun build-dictionary ()

 (let ((dictionary (make-hash-table :test #'equalp)))
   (loop for i below 256
         do (let ((vec (make-vector-with-elt i 'octet)))
              (setf (gethash vec dictionary) vec)))
   dictionary))

(declaim (ftype (function ((vector octet)) (vector octet))

               lzw-compress-octets))

(defun lzw-compress-octets (octets)

 (declare (optimize (speed 3) (safety 0) (debug 0)))
 (loop with dictionary-size of-type fixnum = 256
       with w = (make-empty-vector 'octet)
       with result = (make-empty-vector 't)
       with dictionary = (build-dictionary)
       for c across octets
       for wc = (vector-append1-new w c)
       if (gethash wc dictionary) do (setq w wc)
       else do
         (vector-append result (gethash w dictionary))
         (setf (gethash wc dictionary)
               (make-vector-with-elt dictionary-size)) 
         (incf dictionary-size)
         (setq w (make-vector-with-elt c 'octet))
       finally (unless (zerop (length (the (vector octet) w)))
                 (vector-append result (gethash w dictionary)))
               (return result)))

(declaim (ftype (function (vector) (vector octet)) lzw-decompress)) (defun #1=lzw-decompress (octets)

 (declare (optimize (speed 3) (safety 0) (debug 0)))
 (when (zerop (length octets))
   (return-from #1# (make-empty-vector 'octet)))
 (loop with dictionary-size = 256
       with dictionary = (build-dictionary)
       with result = (make-vector-with-elt (aref octets 0) 'octet)
       with w = (copy-seq result)
       for i from 1 below (length octets)
       for k = (make-vector-with-elt (aref octets i) 't)
       for entry = (or (gethash k dictionary)
                       (if (equalp k dictionary-size)
                           (coerce (list w (aref w 0)) '(vector octet))
                           (error "bad compresed entry at pos ~S" i)))
       do (vector-append result entry)
          (setf (gethash (make-vector-with-elt dictionary-size) dictionary)
                (vector-append1-new w (aref entry 0)))
          (incf dictionary-size)
          (setq w entry)
       finally (return result)))

(defgeneric lzw-compress (datum)

 (:method ((string string))
   (lzw-compress (babel:string-to-octets string)))
 (:method ((octets vector))
   (lzw-compress-octets octets)))

(defun lzw-decompress-to-string (octets)

 (babel:octets-to-string (lzw-decompress octets)))

(defun test (string)

 (assert (equal #2=(lzw-decompress-to-string (lzw-compress string)) string) ()
         "Can't compress ~S properly, got ~S instead" string #2#)
 t)</lang>

And the format used:

<lang lisp>CL-USER> (test "TOBEORNOTTOBEORTOBEORNOT") T CL-USER> (lzw-compress "TOBEORNOTTOBEORTOBEORNOT")

  1. (84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263)

CL-USER> (lzw-decompress-to-string *) "TOBEORNOTTOBEORTOBEORNOT"</lang>

C++

<lang cpp>#include <string>

  1. include <map>

// Compress a string to a list of output symbols. // The result will be written to the output iterator // starting at "result"; the final iterator is returned. template <typename Iterator> Iterator compress(const std::string &uncompressed, Iterator result) {

 // Build the dictionary.
 int dictSize = 256;
 std::map<std::string,int> dictionary;
 for (int i = 0; i < 256; i++)
   dictionary[std::string(1, i)] = i;
 
 std::string w;
 for (std::string::const_iterator it = uncompressed.begin();
      it != uncompressed.end(); ++it) {
   char c = *it;
   std::string wc = w + c;
   if (dictionary.count(wc))
     w = wc;
   else {
     *result++ = dictionary[w];
     // Add wc to the dictionary.
     dictionary[wc] = dictSize++;
     w = std::string(1, c);
   }
 }
 
 // Output the code for w.
 if (!w.empty())
   *result++ = dictionary[w];
 return result;

}

// Decompress a list of output ks to a string. // "begin" and "end" must form a valid range of ints template <typename Iterator> std::string decompress(Iterator begin, Iterator end) {

 // Build the dictionary.
 int dictSize = 256;
 std::map<int,std::string> dictionary;
 for (int i = 0; i < 256; i++)
   dictionary[i] = std::string(1, i);
 
 std::string w(1, *begin++);
 std::string result = w;
 std::string entry;
 for ( ; begin != end; begin++) {
   int k = *begin;
   if (dictionary.count(k))
     entry = dictionary[k];
   else if (k == dictSize)
     entry = w + w[0];
   else
     throw "Bad compressed k";
   
   result += entry;
   
   // Add w+entry[0] to the dictionary.
   dictionary[dictSize++] = w + entry[0];
   
   w = entry;
 }
 return result;

}

  1. include <iostream>
  2. include <iterator>
  3. include <vector>

int main() {

 std::vector<int> compressed;
 compress("TOBEORNOTTOBEORTOBEORNOT", std::back_inserter(compressed));
 copy(compressed.begin(), compressed.end(), std::ostream_iterator<int>(std::cout, ", "));
 std::cout << std::endl;
 std::string decompressed = decompress(compressed.begin(), compressed.end());
 std::cout << decompressed << std::endl;
 
 return 0;

}</lang>

Clojure

<lang lisp>(defn make-dict []

 (let [vals (range 0 256)]
   (zipmap (map (comp #'list #'char) vals) vals)))

(defn compress [#^String text]

 (loop [t (seq text)
        r '()
        w '()
        dict (make-dict)
        s 256]
   (let [c (first t)]
     (if c
       (let [wc (cons c w)]
         (if (get dict wc)
           (recur (rest t) r wc dict s)
           (recur (rest t) (cons (get dict w) r) (list c) (assoc dict wc s) (inc s))))
       (reverse (if w (cons (get dict w) r) r))))))

(compress "TOBEORNOTTOBEORTOBEORNOT")</lang> The output: <lang lisp>(84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263)</lang>

D

<lang d>import std.stdio;

uint[] compress(in string uncompressed) pure nothrow {

   uint dictSize = 256;
   int[string] dictionary;
   foreach (c; 0 .. dictSize)
       dictionary["" ~ cast(char)c] = cast(char)c;
   string w;
   uint[] result;
   foreach (c; uncompressed) {
       string wc = w ~ c;
       if (wc in dictionary)
           w = wc;
       else {
           result ~= dictionary[w];
           dictionary[wc] = dictSize++;
           w = "" ~ c;
       }
   }
   if (w.length)
       result ~= dictionary[w];
   return result;

}

string decompress(in uint[] compressed) pure {

   uint dictSize = 256;
   string[int] dictionary;
   foreach (c; 0 .. dictSize)
       dictionary[cast(char)c] = "" ~ cast(char)c;
   assert(compressed[0] < dictSize);
   string w = "" ~ cast(char)compressed[0];
   string result = w;
   foreach (k; compressed[1 .. $]) {
       string entry;
       if (k in dictionary)
           entry = dictionary[k];
       else if (k == dictSize)
           entry = w ~ w[0];
       else
           throw new Exception("Bad compressed k");
       result ~= entry;
       dictionary[dictSize++] = w ~ entry[0];
       w = entry;
   }
   return result;

}

void main() {

   const compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
   const decompressed = decompress(compressed);
   writeln(compressed, "\n", decompressed);

}</lang> Output: <lang d>[84, 79, 66, 69, 79, 82, 78, 79, 84, 256, 258, 260, 265, 259, 261, 263] TOBEORNOTTOBEORTOBEORNOT</lang>

Dylan

<lang dylan>Module: LZW Synopsis: LZW implementation for Rosetta code

define method output(n :: <integer>)

 format-out("%d ", n);

end;

define method contains?(dict, var)

 let x = element(dict, var, default: #f);
 x ~= #f;

end;

define method byte->string(c)

 add("", as(<character>, c));

end;

define method compress(input :: <string>) => <vector>;

 let result = make(<vector>);
 let dict = make(<string-table>);
 for (x from 0 to 255) 
   dict[byte->string(x)] := x;
 end;
 let next-code = 256; 
 let cur-seq = "";
 for (c in input)
   let wc = add(cur-seq, c);
   if (contains?(dict, wc))
     cur-seq := wc;
   else
     result := add(result, dict[cur-seq]);
     dict[wc] := next-code;
     next-code := next-code + 1;
     cur-seq := add("", c);
   end
 end;
 unless (empty?(cur-seq)) 
   result := add(result, dict[cur-seq]);
 end;
 result

end;

format-out("%=\n", compress("TOBEORNOTTOBEORTOBEORNOT"))</lang>

Erlang

<lang erlang>-module(lzw).

-export([test/0, encode/1, decode/1]).

-import(lists, [reverse/1, reverse/2]).

test() ->

   Str = "TOBEORNOTTOBEORTOBEORNOT",
   [84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263] =

encode(Str),

   Str = decode(encode(Str)),
   ok.

encode(Str) ->

   D = init(dict:new()),
   encode(Str, D, 256, []).

encode([H], D, _, Out) ->

   Val = dict:fetch([H], D),
   reverse([Val|Out]);

encode([H|T], D, Free, Out) ->

   Val = dict:fetch([H], D),
   find_match(T, [H], Val, D, Free, Out).

find_match([H|T], L, LastVal, D, Free, Out) ->

   case dict:find([H|L], D) of

{ok, Val} -> find_match(T, [H|L], Val, D, Free, Out); error -> D1 = dict:store([H|L], Free, D), encode([H|T], D1, Free+1, [LastVal|Out])

   end;

find_match([], _, LastVal, _, _, Out) ->

   reverse([LastVal|Out]).

decode([H|T]) ->

   D   = init1(dict:new()),
   Val = dict:fetch(H, D),
   decode(T, Val, 256, D, Val).

decode([], _, _, _, L) ->

   reverse(L);

decode([H|T], Old, Free, D, L) ->

   Val = dict:fetch(H, D),
   Add = [lists:last(Val)|Old],
   D1  = dict:store(Free, Add, D),
   decode(T, Val, Free+1, D1, Val ++ L).

init(D) -> init(255, D).

init(0, D) -> D; init(N, D) -> D1 = dict:store([N],N,D), init(N-1, D1).

init1(D) -> init1(255, D).

init1(0, D) -> D; init1(N, D) -> D1 = dict:store(N,[N],D), init1(N-1, D1).</lang>

Forth

Works with: GNU Forth version 0.6.2

<lang forth>256 value next-symbol

\ current string fragment

create w 256 allot \ counted string

w=c ( c -- ) w 1+ c! 1 w c! ;
w+c ( c -- ) w count + c! w c@ 1+ w c! ;

\ Compression

\ dictionary of strings to symbols 0 value dict

init-dict table to dict 256 to next-symbol dict set-current ;
free-dict forth-wordlist set-current ;
in-dict? ( key len -- ? ) \ can assume len > 1
 dict search-wordlist dup if nip then ;
lookup-dict ( key len -- symbol )
 dup 1 = if drop c@ exit then
 dict search-wordlist if >body @ else abort" bad-dict!" then ;
put-dict ( data key len -- )
 nextname create , ;

\ output buffer of symbols \ in real life, these symbols would be packed into octets variable out-size create out 256 cells allot

output ( symbol -- )
 dup out out-size @ cells + !  1 out-size +!
 dup 256 < if emit space else . then ;
compress ( addr len -- )
 init-dict  0 out-size !
 over c@ w=c  1 /string
 bounds do
   i c@ w+c
   w count in-dict? 0= if
     w count 1- lookup-dict output
     next-symbol dup w count put-dict
     1+ to next-symbol
     i c@ w=c
   then
 loop
 w count lookup-dict output
 free-dict ;

\ Decompression

\ array of symbols to strings (in real code this would need to be growable) \ next-symbol is reused for the size of this table create symtab 256 cells allot 0 value start

init-symtab 256 to next-symbol here to start ;
free-symtab start here - allot ;
get-symbol ( symbol -- addr len )
 dup 256 < if pad c! pad 1 exit then
 256 - cells symtab + @ count ;
add-symbol ( addr len -- )
 here symtab next-symbol 256 - cells + !
 s,
 next-symbol 1+ to next-symbol ;

create entry 256 allot

decompress ( addr len -- )
 init-symtab
 over @ dup emit w=c
 cells bounds cell+ do
   i @ next-symbol < if
     i @ get-symbol entry place
   else i @ next-symbol = if
     w 1+ c@ w count + c!  w count 1+ entry place
   else
     abort" bad symbol!"
   then then
   entry count type	\ output
   entry 1+ c@ w+c
   w count add-symbol
   entry count w place
 1 cells +loop
 free-symtab ;

\ Testing

s" TOBEORNOTTOBEORTOBEORNOT" compress cr \ T O B E O R N O T 256 258 260 265 259 261 263

out out-size @ decompress cr \ TOBEORNOTTOBEORTOBEORNOT</lang>

Go

Go also has an LZW package in the standard library.

Translation of: Java

<lang go>package main import "fmt"

// Compress a string to a list of output symbols. func compress(uncompressed string) []int {

   // Build the dictionary.
   dictSize := 256
   dictionary := make(map[string]int)
   for i := 0; i < 256; i++ {
       dictionary[string(i)] = i
   }
   
   w := ""
   result := make([]int, 0)
   for _, c := range []byte(uncompressed) {
       wc := w + string(c)
       if _, ok := dictionary[wc]; ok {
           w = wc
       } else {
           result = append(result, dictionary[w])
           // Add wc to the dictionary.
           dictionary[wc] = dictSize
           dictSize++
           w = string(c)
       }
   }
   // Output the code for w.
   if w != "" {
       result = append(result, dictionary[w])
   }
   return result

}

// Decompress a list of output ks to a string. func decompress(compressed []int) string {

   // Build the dictionary.
   dictSize := 256
   dictionary := make(map[int]string)
   for i := 0; i < 256; i++ {
       dictionary[i] = string(i)
   }
   
   w := string(compressed[0])
   result := w
   for _, k := range compressed[1:] {
       var entry string
       if x, ok := dictionary[k]; ok {
           entry = x
       } else if k == dictSize {
           entry = w + w[:1]
       } else {
           panic(fmt.Sprintf("Bad compressed k: %d", k))
       }
       
       result += entry
       
       // Add w+entry[0] to the dictionary.
       dictionary[dictSize] = w + entry[:1]
       dictSize++
       
       w = entry
   }
   return result

}

func main() {

   compressed := compress("TOBEORNOTTOBEORTOBEORNOT")
   fmt.Println(compressed)
   decompressed := decompress(compressed)
   fmt.Println(decompressed)

}</lang>

Haskell

<lang Haskell>import Data.List import Data.Char import Data.Maybe import Control.Monad import Control.Arrow

take2 = filter((==2).length). map (take 2). tails

doLZW _ [] = [] doLZW as (x:xs) = lzw (map return as) [x] xs

  where lzw a w [] = [fromJust $ elemIndex w a]
        lzw a w (x:xs)  | w' `elem` a = lzw a w' xs
                        | otherwise   = fromJust (elemIndex w a) : lzw (a++[w']) [x] xs
             where w' = w++[x]

undoLZW _ [] = [] undoLZW a cs =

 ((cs >>=).(!!)) $
 foldl (liftM2 (.) (++) (((return. liftM2 (++) head (take 1. last)).). map. (!!)))
 (map return a) (take2 cs)</lang>

Testing: <lang Haskell>*Main> doLZW ['\0'..'\255'] "TOBEORNOTTOBEORTOBEORNOT" [84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263]

  • Main> undoLZW ['\0'..'\255'] [84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263]

"TOBEORNOTTOBEORTOBEORNOT"</lang> Encode --> decode --> compare with original text. <lang Haskell>*Main> (ap (==) . liftM2 (.) undoLZW doLZW) ['\0'..'\255'] "TOBEORNOTTOBEORTOBEORNOT" True</lang>

Other (elegant) code can be found at Haskell wiki Toy compression

J

Straightforward implementations of encoding and decoding: <lang J>encodeLZW =: 4 : 0

d=. ;/x
r=.0$0
wc=.w=.{.y
for_c. }.y do.
  wc=.w,c
  if. d e.~ <wc do. w=.wc else.
    r=. r, d i.<w
    d=.d,<wc
    w=.c
  end.
end.
r, d i.<w

)</lang> Test:

   a. encodeLZW 'TOBEORNOTTOBEORTOBEORNOT'
84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263

Decoding: <lang J>decodeLZW =: 4 : 0

d=.;/x
w=.r=. >d{~{.y
ds=. #d
for_c. }.y do.
  select. * c-ds
   case. _1 do. r=.r,e=.>c{d  
   case.  0 do. r=.r,e=.w,{.w
   case.    do. 'error' return.
  end.
  d=.d,< w,{.e
  w=.e
  ds=.ds+1
end.
;r

)</lang> Test:

   a. decodeLZW 84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263
TOBEORNOTTOBEORTOBEORNOT

encode --> decode --> compare with original:

   a. (] -: [ decodeLZW encodeLZW) 'TOBEORNOTTOBEORTOBEORNOT'
1

Error test:

   a. decodeLZW 84 79 66 69 79 82 78 79 84 256 258 456 260 265 259 261 263
error

Tacit J expression for decoding:

decodeLZW=:[:;]{[:;[:(],<@(>@{.,{.@>@{:)@:{)&.>/<@(;/@[),~|.@(2<\])
   a. decodeLZW 84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263
TOBEORNOTTOBEORTOBEORNOT

Java

Works with: Java version 1.5+

<lang java5>import java.util.*;

public class LZW {

   /** Compress a string to a list of output symbols. */
   public static List<Integer> compress(String uncompressed) {
       // Build the dictionary.
       int dictSize = 256;
       Map<String,Integer> dictionary = new HashMap<String,Integer>();
       for (int i = 0; i < 256; i++)
           dictionary.put("" + (char)i, i);
       
       String w = "";
       List<Integer> result = new ArrayList<Integer>();
       for (char c : uncompressed.toCharArray()) {
           String wc = w + c;
           if (dictionary.containsKey(wc))
               w = wc;
           else {
               result.add(dictionary.get(w));
               // Add wc to the dictionary.
               dictionary.put(wc, dictSize++);
               w = "" + c;
           }
       }

       // Output the code for w.
       if (!w.equals(""))
           result.add(dictionary.get(w));
       return result;
   }
   
   /** Decompress a list of output ks to a string. */
   public static String decompress(List<Integer> compressed) {
       // Build the dictionary.
       int dictSize = 256;
       Map<Integer,String> dictionary = new HashMap<Integer,String>();
       for (int i = 0; i < 256; i++)
           dictionary.put(i, "" + (char)i);
       
       String w = "" + (char)(int)compressed.remove(0);
       String result = w;
       for (int k : compressed) {
           String entry;
           if (dictionary.containsKey(k))
               entry = dictionary.get(k);
           else if (k == dictSize)
               entry = w + w.charAt(0);
           else
               throw new IllegalArgumentException("Bad compressed k: " + k);
           
           result += entry;
           
           // Add w+entry[0] to the dictionary.
           dictionary.put(dictSize++, w + entry.charAt(0));
           
           w = entry;
       }
       return result;
   }
   public static void main(String[] args) {
       List<Integer> compressed = compress("TOBEORNOTTOBEORTOBEORNOT");
       System.out.println(compressed);
       String decompressed = decompress(compressed);
       System.out.println(decompressed);
   }

}</lang>

Output (Command Line direct output): <lang java5>[84, 79, 66, 69, 79, 82, 78, 79, 84, 256, 258, 260, 265, 259, 261, 263] TOBEORNOTTOBEORTOBEORNOT</lang>

JavaScript

<lang javascript>//LZW Compression/Decompression for Strings var LZW = {

   compress: function (uncompressed) {
       "use strict";
       // Build the dictionary.
       var i,
           dictionary = {},
           c,
           wc,
           w = "",
           result = [],
           dictSize = 256;
       for (i = 0; i < 256; i += 1) {
           dictionary[String.fromCharCode(i)] = i;
       }
       for (i = 0; i < uncompressed.length; i += 1) {
           c = uncompressed.charAt(i);
           wc = w + c;
           if (dictionary[wc]) {
               w = wc;
           } else {
               result.push(dictionary[w]);
               // Add wc to the dictionary.
               dictionary[wc] = dictSize++;
               w = String(c);
           }
       }
       // Output the code for w.
       if (w !== "") {
           result.push(dictionary[w]);
       }
       return result;
   },


   decompress: function (compressed) {
       "use strict";
       // Build the dictionary.
       var i,
           dictionary = [],
           w,
           result,
           k,
           entry = "",
           dictSize = 256;
       for (i = 0; i < 256; i += 1) {
           dictionary[i] = String.fromCharCode(i);
       }
       w = String.fromCharCode(compressed[0]);
       result = w;
       for (i = 1; i < compressed.length; i += 1) {
           k = compressed[i];
           if (dictionary[k]) {
               entry = dictionary[k];
           } else {
               if (k === dictSize) {
                   entry = w + w.charAt(0);
               } else {
                   return null;
               }
           }
           result += entry;
           // Add w+entry[0] to the dictionary.
           dictionary[dictSize++] = w + entry.charAt(0);
           w = entry;
       }
       return result;
   }

}, // For Test Purposes

   comp = LZW.compress("TOBEORNOTTOBEORTOBEORNOT"),
   decomp = LZW.decompress(comp);

document.write(comp + '
' + decomp);</lang>

Output: <lang javascript>84,79,66,69,79,82,78,79,84,256,258,260,265,259,261,263 TOBEORNOTTOBEORTOBEORNOT</lang>

Objective-C

Works with: GNUstep

The class for the LZW compression algorithm:

<lang objc>#import <Foundation/Foundation.h>

  1. import <stdio.h>

@interface LZWCompressor : NSObject {

 @private
   NSMutableArray *iostream;
   NSMutableDictionary *dict;
   NSUInteger codemark;

}

-(LZWCompressor *) init; -(LZWCompressor *) initWithArray: (NSMutableArray *) stream; -(BOOL) compressData: (NSData *) string; -(void) setArray: (NSMutableArray *) stream; -(NSArray *) getArray; @end

@implementation LZWCompressor : NSObject

-(LZWCompressor *) init {

  self = [super init];
  if ( self )
  {
     iostream = nil;
     codemark = 256;
     dict = [[NSMutableDictionary alloc] initWithCapacity: 512];
  }
  return self;

}

-(LZWCompressor *) initWithArray: (NSMutableArray *) stream {

  self = [self init];
  if ( self )
  {
     [self setArray: stream];
  }
  return self;

}

-(void) dealloc {

  [dict release];
  [iostream release];
  [super dealloc];

}

-(void) setArray: (NSMutableArray *) stream {

  iostream = [stream retain];

}

-(BOOL) compressData: (NSData *) string; {

   NSUInteger i;
   unsigned char j;
   
   // prepare dict
   for(i=0; i < 256; i++)
   {
      j = i;
      NSData *s = [NSData dataWithBytes: &j length: 1];
      [dict setObject: [NSNumber numberWithUnsignedInt: i] forKey: s];
   }
   
   NSMutableData *w = [NSMutableData data];
   NSMutableData *wc = [NSMutableData data];
   
   for(i=0; i < [string length]; i++)
   {
      [wc setData: w];
      [wc appendData: [string subdataWithRange: NSMakeRange(i, 1)]];
      if ( [dict objectForKey: wc] != nil )
      {
         [w setData: wc];
      } else {
         [iostream addObject: [dict objectForKey: w]];
         [dict setObject: [NSNumber numberWithUnsignedInt: codemark] forKey: wc];
         codemark++;
         [w setData: [string subdataWithRange: NSMakeRange(i, 1)]];
      }
   }
   if ( [w length] != 0 )
   {
      [iostream addObject: [dict objectForKey: w]];
   }
   return YES;

}

-(NSArray *) getArray {

 return iostream;

}

@end</lang>

Usage example:

<lang objc>const char *text = "TOBEORNOTTOBEORTOBEORNOT";

int main() {

 NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
 
 NSMutableArray *array = [[NSMutableArray alloc] init];
 LZWCompressor *lzw = [[LZWCompressor alloc]
                       initWithArray: array ];
 if ( lzw )
 {
    [lzw compressData: [NSData dataWithBytes: text
                        length: strlen(text)]];
    NSEnumerator *en = [array objectEnumerator];
    id obj;
    while( (obj = [en nextObject]) )
    {
       printf("%u\n", [obj unsignedIntValue]);
    }
    [lzw release];
 }
 [array release];
 
 [pool release];
 return EXIT_SUCCESS;

}</lang>

Output (reformatted by hand):

 84  79  66  69  79  82  78  79
 84 256 258 260 265 259 261 263

OCaml

<lang ocaml>#directory "+extlib" (* or maybe "+site-lib/extlib/" *)

  1. load "extLib.cma"

open ExtString

(** compress a string to a list of output symbols *) let compress ~uncompressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to 255 do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary str i
 done;

 let f = (fun (w, dict_size, result) c ->
   let c = String.make 1 c in
   let wc = w ^ c in
   if Hashtbl.mem dictionary wc then
     (wc, dict_size, result)
   else
     begin
       (* add wc to the dictionary *)
       Hashtbl.add dictionary wc dict_size;
       let this = Hashtbl.find dictionary w in
       (c, dict_size + 1, this::result)
     end
 ) in
 let w, _, result =
   String.fold_left f ("", dict_size, []) uncompressed
 in
 (* output the code for w *)
 let result =
   if w = ""
   then result
   else (Hashtbl.find dictionary w) :: result
 in
 (List.rev result)

exception ValueError of string

(** decompress a list of output symbols to a string *) let decompress ~compressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to pred dict_size do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary i str
 done;
 let w, compressed =
   match compressed with
   | hd::tl -> (String.make 1 (char_of_int hd)), tl
   | [] -> failwith "empty input"
 in

 let result = [w] in
 let result, _, _ =
   List.fold_left (fun (result, w, dict_size) k ->
     let entry =
       if Hashtbl.mem dictionary k then
         Hashtbl.find dictionary k
       else if k = Hashtbl.length dictionary then
         w ^ (String.make 1 w.[0])
       else
         raise(ValueError(Printf.sprintf "Bad compressed k: %d" k))
     in
     let result = entry :: result in
  
     (* add (w ^ entry.[0]) to the dictionary *)
     Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0]));
     (result, entry, dict_size + 1)
   ) (result, w, dict_size) compressed
 in
 (List.rev result)
</lang>

here is the interface: <lang ocaml>val compress : uncompressed:string -> int list val decompress : compressed:int list -> string list</lang>

How to use:
The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved. So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.

<lang ocaml>let greatest = List.fold_left max 0 ;;

(** number of bits needed to encode the integer m *) let n_bits m =

 let m = float m in
 let rec aux n =
   let max = (2. ** n) -. 1. in
   if max >= m then int_of_float n
   else aux (n +. 1.0)
 in
 aux 1.0

let write_compressed ~filename ~compressed =

 let nbits = n_bits(greatest compressed) in
 let oc = open_out filename in
 output_byte oc nbits;
 let ob = IO.output_bits(IO.output_channel oc) in
 List.iter (IO.write_bits ob nbits) compressed;
 IO.flush_bits ob;
 close_out oc;

let read_compressed ~filename =

 let ic = open_in filename in
 let nbits = input_byte ic in
 let ib = IO.input_bits(IO.input_channel ic) in
 let rec loop acc =
   try
     let code = IO.read_bits ib nbits in
     loop (code::acc)
   with _ -> List.rev acc
 in
 let compressed = loop [] in
 let result = decompress ~compressed in
 let buf = Buffer.create 2048 in
 List.iter (Buffer.add_string buf) result;
 (Buffer.contents buf)
</lang>

Perl

In this version the hashes contain mixed typed data: <lang perl># Compress a string to a list of output symbols. sub compress {

   my $uncompressed = shift;
   # Build the dictionary.
   my $dict_size = 256;
   my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
   my $w = "";
   my @result;
   foreach my $c (split , $uncompressed) {
       my $wc = $w . $c;
       if (exists $dictionary{$wc}) {
           $w = $wc;
       } else {
           push @result, $dictionary{$w};
           # Add wc to the dictionary.
           $dictionary{$wc} = $dict_size;
           $dict_size++;
           $w = $c;
       }
   }
   # Output the code for w.
   if ($w) {
       push @result, $dictionary{$w};
   }
   return @result;

}

  1. Decompress a list of output ks to a string.

sub decompress {

   my @compressed = @_;
   # Build the dictionary.
   my $dict_size = 256;
   my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
   my $w = shift @compressed;
   my $result = $w;
   foreach my $k (@compressed) {
       my $entry;
       if (exists $dictionary{$k}) {
           $entry = $dictionary{$k};
       } elsif ($k == $dict_size) {
           $entry = $w . substr($w,0,1);
       } else {
           die "Bad compressed k: $k";
       }
       $result .= $entry;
       # Add w+entry[0] to the dictionary.
       $dictionary{$dict_size} = $w . substr($entry,0,1);
       $dict_size++;
       $w = $entry;
   }
   return $result;

}

  1. How to use:

my @compressed = compress('TOBEORNOTTOBEORTOBEORNOT'); print "@compressed\n"; my $decompressed = decompress(@compressed); print "$decompressed\n";</lang>

Output:

T O B E O R N O T 256 258 260 265 259 261 263
TOBEORNOTTOBEORTOBEORNOT

PicoLisp

<lang PicoLisp>(de lzwCompress (Lst)

  (let (Codes 255  Dict)
     (balance 'Dict
        (make
           (for C Codes
              (link (cons (char C) C)) ) ) )
     (make
        (let W (pop 'Lst)
           (for C Lst
              (let WC (pack W C)
                 (if (lup Dict WC)
                    (setq W WC)
                    (link (cdr (lup Dict W)))
                    (idx 'Dict (cons WC (inc 'Codes)) T)
                    (setq W C) ) ) )
           (and W (link (cdr (lup Dict W)))) ) ) ) )</lang>

Output:

: (lzwCompress (chop "TOBEORNOTTOBEORTOBEORNOT"))
-> (84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263)

PureBasic

This version encodes character sequences as 16-bit values. Because this version only encodes an input string it won't handle Null values. This is because PureBasic uses these to terminate strings. Only slight modifications are necessary to handle Null values that would be present for a more generic routine that could be used with a buffer containing any data type. <lang PureBasic>Procedure compress(uncompressed.s, List result.u())

 ;Compress a string to a list of output symbols
 
 ;Build the dictionary.
 Protected  dict_size = 255, i
 newmap dict.u()
 For i = 0 To 254
   dict(Chr(i + 1)) = i
 Next 
 Protected w.s, wc.s, *c.Character = @uncompressed
 w = ""
 LastElement(result())
 While *c\c <> #Null
   wc = w + Chr(*c\c)
   If FindMapElement(dict(), wc)
     w = wc
   Else
     AddElement(result())
     result() = dict(w)
     ;Add wc to the dictionary
     dict(wc) = dict_size
     dict_size + 1 ;no check is performed for overfilling the dictionary.
     w = Chr(*c\c)
   EndIf 
   *c + 1
 Wend
 
 ;Output the code for w
 If w
   AddElement(result())
   result() = dict(w)
 EndIf 

EndProcedure

Procedure.s decompress(List compressed.u())

 ;Decompress a list of encoded values to a string
 If ListSize(compressed()) = 0: ProcedureReturn "": EndIf 
 
 ;Build the dictionary.
 Protected  dict_size = 255, i
 
 Dim dict.s(255)
 For i = 1 To 255
   dict(i - 1) = Chr(i)
 Next 
 Protected w.s, entry.s, result.s
 FirstElement(compressed())
 w = dict(compressed())
 result = w
 
 i = 0
 While NextElement(compressed())
   i + 1
   If compressed() < dict_size
   entry = dict(compressed())
   ElseIf i = dict_size
     entry = w + Left(w, 1)
   Else
     MessageRequester("Error","Bad compression at [" + Str(i) + "]")
     ProcedureReturn result;abort
   EndIf 
   result + entry
   ;Add w + Left(entry, 1) to the dictionary
   If ArraySize(dict()) <= dict_size
     Redim dict(dict_size + 256)
   EndIf 
   dict(dict_size) = w + Left(entry, 1)
   dict_size + 1 ;no check is performed for overfilling the dictionary.
   w = entry
 Wend 
 ProcedureReturn result

EndProcedure

If OpenConsole()

 ;How to use:
 
 Define initial.s, decompressed.s
 
 Print("Type something: ")
 initial = Input()
 NewList compressed.u()
 compress(initial, compressed())
 ForEach compressed()
   Print(Str(compressed()) + " ")
 Next 
 PrintN("")
 
 decompressed = decompress(compressed())
 PrintN(decompressed)
  
 Print(#CRLF$ + #CRLF$ + "Press ENTER to exit")
 Input()
 CloseConsole()

EndIf</lang> Sample output:

Type something: TOBEORNOTTOBEORTOBEORNOT
83 78 65 68 78 81 77 78 83 255 257 259 264 258 260 262
TOBEORNOTTOBEORTOBEORNOT

Python

In this version the dicts contain mixed typed data: <lang python>def compress(uncompressed):

   """Compress a string to a list of output symbols."""
   # Build the dictionary.
   dict_size = 256
   dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
   # in Python 3: dictionary = {chr(i): chr(i) for i in range(dict_size)}
   w = ""
   result = []
   for c in uncompressed:
       wc = w + c
       if wc in dictionary:
           w = wc
       else:
           result.append(dictionary[w])
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1
           w = c
   # Output the code for w.
   if w:
       result.append(dictionary[w])
   return result


def decompress(compressed):

   """Decompress a list of output ks to a string."""
   # Build the dictionary.
   dict_size = 256
   dictionary = dict((chr(i), chr(i)) for i in xrange(dict_size))
   # in Python 3: dictionary = {chr(i): chr(i) for i in range(dict_size)}
   w = result = compressed.pop(0)
   for k in compressed:
       if k in dictionary:
           entry = dictionary[k]
       elif k == dict_size:
           entry = w + w[0]
       else:
           raise ValueError('Bad compressed k: %s' % k)
       result += entry
       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w + entry[0]
       dict_size += 1
       w = entry
   return result


  1. How to use:

compressed = compress('TOBEORNOTTOBEORTOBEORNOT') print (compressed) decompressed = decompress(compressed) print (decompressed)</lang>

Output:

['T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T', 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT

Ruby

In this version the hashes contain mixed typed data: <lang ruby># Compress a string to a list of output symbols. def compress(uncompressed)

   # Build the dictionary.
   dict_size = 256
   dictionary = Hash[ Array.new(dict_size) {|i| [i.chr, i.chr]} ]
   w = ""
   result = []
   for c in uncompressed.split()
       wc = w + c
       if dictionary.has_key?(wc)
           w = wc
       else
           result << dictionary[w]
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1
           w = c
       end
   end
   # Output the code for w.
   result << dictionary[w] unless w.empty?
   result

end

  1. Decompress a list of output ks to a string.

def decompress(compressed)

   # Build the dictionary.
   dict_size = 256
   dictionary = Hash[ Array.new(dict_size) {|i| [i.chr, i.chr]} ]
   w = result = compressed.shift
   for k in compressed
       if dictionary.has_key?(k)
           entry = dictionary[k]
       elsif k == dict_size
           entry = w + w[0,1]
       else
           raise 'Bad compressed k: %s' % k
       end
       result += entry
       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w + entry[0,1]
       dict_size += 1
       w = entry
   end
   result

end

  1. How to use:

compressed = compress('TOBEORNOTTOBEORTOBEORNOT') p compressed decompressed = decompress(compressed) puts decompressed</lang>

Output:

["T", "O", "B", "E", "O", "R", "N", "O", "T", 256, 258, 260, 265, 259, 261, 263]
TOBEORNOTTOBEORTOBEORNOT

Scheme

<lang scheme>; Get the list reference number for a member or #f if not found (define (member-string-ref m l)

 (define r #f)
 (let loop ((i 0))
   (if (< i (length l))
       (if (not (string=? (list-ref l i) m))
           (loop (+ i 1))
           (set! r i))))
 r)
Compress a string with LZW

(define (lzw-compress uncompressed)

 (define dictionary '())
 (define n 0)
 (define result '())
 (set! uncompressed (string->list uncompressed))
 ;; Setup Dictionary
 (let dict-setup ((c 0))
   (if (> 256 c)
       (begin
         (set! dictionary (append dictionary
                                  (list (string (integer->char c)))))
         (set! n (+ n 1))
         (dict-setup (+ c 1)))))
       
 ;; Compress the string
 (let compress ((w "") (ci 0))
   (define c (string (list-ref uncompressed ci)))
   (define wc "")
   (set! wc (string-append w c))
   (if (member-string-ref wc dictionary)
       (set! w wc)
       (begin
         (set! result (append result
                              (list (member-string-ref w dictionary))))
         (set! dictionary (append dictionary (list wc)))
         (set! n (+ n 1))
         (set! w c)))
   (if (eqv? ci (- (length uncompressed) 1))
       (set! result (append result
                            (list (member-string-ref w dictionary))))
       (compress w (+ ci 1))))
 result)
Decompress a LZW compressed string (input should be a list of integers)

(define (lzw-decompress compressed)

 (define dictionary '())
 (define n 0)
 (define result "")
 ;; Setup Dictionary
 (let dict-setup ((c 0))
   (if (> 256 c)
       (begin
         (set! dictionary (append dictionary
                                  (list (string (integer->char c)))))
         (set! n (+ n 1))
         (dict-setup (+ c 1)))))
 ;; Decompress the list
 (let decompress ((k (list-ref compressed 0)) (ci 0))
   (define kn #f)
   ;; Add to dictionary
   (if (> (length compressed) (+ ci 1))
       (begin
         (set! kn (list-ref compressed (+ ci 1)))
         (if (< kn (length dictionary))
             (set! dictionary
                   (append dictionary
                           (list (string-append

(list-ref dictionary k) (string (string-ref (list-ref dictionary kn) 0)))))))))

   ;; Build the resulting string
   (set! result (string-append result (list-ref dictionary k)))
       
   (if (not (eqv? ci (- (length compressed) 1)))
       (decompress kn (+ ci 1))))
 result)

(define compressed (lzw-compress "TOBEORNOTTOBEORTOBEORNOT")) (display compressed) (newline) (define decompressed (lzw-decompress compressed)) (display decompressed) (newline)</lang>

Output:

(84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263)
TOBEORNOTTOBEORTOBEORNOT

Seed7

<lang seed7>$ include "seed7_05.s7i";

const func string: lzwCompress (in string: uncompressed) is func

 result
   var string: result is "";
 local
   var char: ch is ' ';
   var hash [string] char: mydict is (hash [string] char).value;
   var string: buffer is "";
   var string: xstr is "";
 begin
   for ch range chr(0) to chr(255) do
     mydict @:= [str(ch)] ch;
   end for;
   for ch range uncompressed do
     xstr := buffer & str(ch);
     if xstr in mydict then
       buffer &:= str(ch)
     else
       result &:= str(mydict[buffer]);
       mydict @:= [xstr] chr(length(mydict));
       buffer := str(ch);
     end if;
   end for;
   if buffer <> "" then
     result &:= str(mydict[buffer]);
   end if;
 end func;

const func string: lzwDecompress (in string: compressed) is func

 result
   var string: result is "";
 local
   var char: ch is ' ';
   var hash [char] string: mydict is (hash [char] string).value;
   var string: buffer is "";
   var string: current is "";
   var string: chain is "";
 begin
   for ch range chr(0) to chr(255) do
     mydict @:= [ch] str(ch);
   end for;
   for ch range compressed do
     if buffer = "" then
       buffer := mydict[ch];
       result &:= buffer;
     elsif ch <= chr(255) then
       current := mydict[ch];
       result &:= current;
       chain := buffer & current;
       mydict @:= [chr(length(mydict))] chain;
       buffer := current;
     else
       if ch in mydict then
         chain := mydict[ch];
       else
         chain := buffer & str(buffer[1]);
       end if;
       result &:= chain;
       mydict @:= [chr(length(mydict))] buffer & str(chain[1]);
       buffer := chain;
     end if;
   end for;
 end func;

const proc: main is func

 local
   var string: compressed is "";
   var string: uncompressed is "";
 begin
   compressed := lzwCompress("TOBEORNOTTOBEORTOBEORNOT");
   writeln(literal(compressed));
   uncompressed := lzwDecompress(compressed);
   writeln(uncompressed);
 end func;</lang>

Output:

"TOBEORNOT\256\\258\\260\\265\\259\\261\\263\"
TOBEORNOTTOBEORTOBEORNOT

Original source: [1] and [2]

Tcl

<lang tcl>namespace eval LZW {

   variable char2int
   variable chars
   for {set i 0} {$i < 256} {incr i} {
       set char [binary format c $i]
       set char2int($char) $i
       lappend chars $char
   }

}

proc LZW::encode {data} {

   variable char2int
   array set dict [array get char2int]
   set w ""
   set result [list]
   foreach c [split $data ""] {
       set wc $w$c
       if {[info exists dict($wc)]} {
           set w $wc
       } else {
           lappend result $dict($w)
           set dict($wc) [array size dict]
           set w $c
       }
   }
   lappend result $dict($w)

}

proc LZW::decode {cdata} {

   variable chars
   set dict $chars
   set k [lindex $cdata 0]
   set w [lindex $dict $k]
   set result $w
   foreach k [lrange $cdata 1 end] {
       set currSizeDict [llength $dict]
       if {$k < $currSizeDict} {
           set entry [lindex $dict $k]
       } elseif {$k == $currSizeDict} {
           set entry $w[string index $w 0]
       } else {
           error "invalid code ($k) in ($cdata)"
       }
       append result $entry
       lappend dict $w[string index $entry 0]
       set w $entry
   }
   return $result

}

set s TOBEORNOTTOBEORTOBEORNOT# set e [LZW::encode $s] ;# ==> 84 79 66 69 79 82 78 79 84 256 258 260 265 259 261 263 35 set d [LZW::decode $e] ;# ==> TOBEORNOTTOBEORTOBEORNOT#

  1. or

if {$s eq [LZW::decode [LZW::encode $s]]} then {puts success} else {puts fail} ;# ==> success