LZW compression: Difference between revisions

From Rosetta Code
Content added Content deleted
(OCaml version)
 
(Python version)
Line 136: Line 136:
(Buffer.contents buf)
(Buffer.contents buf)
;;</ocaml>
;;</ocaml>


=={{header|Python}}==

<python>def compress(uncompressed):
"""Compress a string to a list of output symbols."""
# Build the dictionary.
dict_size = 256
dictionary = {}
for i in range(dict_size):
dictionary[chr(i)] = chr(i)
w = ''
result = []
for c in uncompressed:
wc = w + c
if wc in dictionary:
w = wc
else:
result.append(dictionary[w])
# Add wc to the dictionary.
dictionary[wc] = dict_size
dict_size += 1
w = c
# Output the code for w.
if w:
result += [char for char in w]
return result
def decompress(compressed):
"""Decompress a list of output ks to a string."""
# Build the dictionary.
dict_size = 256
dictionary = {}
for i in range(dict_size):
dictionary[chr(i)] = chr(i)
w = result = compressed.pop(0)
for k in compressed:
if k in dictionary:
entry = dictionary[k]
elif k == len(dictionary):
entry = w + w[0]
else:
raise ValueError, 'Bad compressed k: %s' % k
result += entry
# Add w+entry[0] to the dictionary.
dictionary[dict_size] = w+entry[0]
dict_size += 1
w = entry
return result</python>

How to use:

<python>compressed = compress('TOBEORNOTTOBEORTOBEORNOT')
print compressed
decompressed = decompress(compressed)
print decompressed</python>

Revision as of 22:16, 8 August 2008

The Lempel-Ziv-Welch (LZW) algorithm provides lossless data compression. You can read a complete description of it in the Wikipedia article on the subject. It was patented, but it fell in the public domain in 2004.


OCaml

<ocaml>#directory "+site-lib/extlib/"

  1. load "extLib.cma"

open ExtString

(** compress a string to a list of output symbols *) let compress ~uncompressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to 255 do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary str i
 done;

 let f = (fun (w, dict_size, result) c ->
   let c = String.make 1 c in
   let wc = w ^ c in
   if Hashtbl.mem dictionary wc then
     (wc, dict_size, result)
   else
     begin
       (* add wc to the dictionary *)
       Hashtbl.add dictionary wc dict_size;
       let this = Hashtbl.find dictionary w in
       (c, dict_size + 1, this::result)
     end
 ) in
 let w, _, result =
   String.fold_left f ("", dict_size, []) uncompressed
 in
 (* output the code for w *)
 let result =
   if w = ""
   then result
   else (Hashtbl.find dictionary w) :: result
 in
 (List.rev result)

exception ValueError of string

(** decompress a list of output symbols to a string *) let decompress ~compressed =

 (* build the dictionary *)
 let dict_size = 256 in
 let dictionary = Hashtbl.create 397 in
 for i=0 to pred dict_size do
   let str = String.make 1 (char_of_int i) in
   Hashtbl.add dictionary i str
 done;
 let w, compressed =
   match compressed with
   | hd::tl -> (String.make 1 (char_of_int hd)), tl
   | [] -> failwith "empty input"
 in

 let result = w::[] in
 let result, _, _ =
   List.fold_left (fun (result, w, dict_size) k ->
     let entry =
       if Hashtbl.mem dictionary k then
         Hashtbl.find dictionary k
       else if k = Hashtbl.length dictionary then
         w ^ (String.make 1 w.[0])
       else
         raise(ValueError(Printf.sprintf "Bad compressed k: %d" k))
     in
     let result = entry :: result in
  
     (* add (w ^ entry.[0]) to the dictionary *)
     Hashtbl.add dictionary dict_size (w ^ (String.make 1 entry.[0]));
     (result, entry, dict_size + 1)
   ) (result, w, dict_size) compressed
 in
 (List.rev result)
</ocaml>

here is the interface: <ocaml>val compress : uncompressed:string -> int list val decompress : compressed:int list -> string list</ocaml>

How to use:
The compressed datas are a list of symbols (of type int) that will require more than 8 bits to be saved. So to know how many bits are required, you need to know how many bits are required for the greatest symbol in the list.

<ocaml>let greatest = List.fold_left (fun m x -> max m x) 0 ;;

(** number of bits needed to encode the integer m *) let n_bits m =

 let m = float m in
 let rec aux n =
   let max = (2. ** n) -. 1. in
   if max >= m then int_of_float n
   else aux (n +. 1.0)
 in
 aux 1.0

let write_compressed ~filename ~compressed =

 let nbits = n_bits(greatest compressed) in
 let oc = open_out filename in
 output_byte oc nbits;
 let ob = IO.output_bits(IO.output_channel oc) in
 List.iter (fun code ->
     IO.write_bits ob nbits code
   ) compressed;
 IO.flush_bits ob;
 close_out oc;

let read_compressed ~filename =

 let ic = open_in filename in
 let nbits = input_byte ic in
 let ib = IO.input_bits(IO.input_channel ic) in
 let rec loop acc =
   try
     let code = IO.read_bits ib nbits in
     loop (code::acc)
   with _ -> List.rev acc
 in
 let compressed = loop [] in
 let result = decompress ~compressed in
 let buf = Buffer.create 2048 in
 List.iter (Buffer.add_string buf) result;
 (Buffer.contents buf)
</ocaml>


Python

<python>def compress(uncompressed):

   """Compress a string to a list of output symbols."""

   # Build the dictionary.
   dict_size = 256
   dictionary = {}
   for i in range(dict_size):
       dictionary[chr(i)] = chr(i)

   w = 
   result = []
   for c in uncompressed:
       wc = w + c
       if wc in dictionary:
           w = wc
       else:
           result.append(dictionary[w])
           # Add wc to the dictionary.
           dictionary[wc] = dict_size
           dict_size += 1

           w = c
   # Output the code for w.
   if w:
       result += [char for char in w]
   return result

def decompress(compressed):

   """Decompress a list of output ks to a string."""

   # Build the dictionary.
   dict_size = 256
   dictionary = {}
   for i in range(dict_size):
       dictionary[chr(i)] = chr(i)

   w = result = compressed.pop(0)
   for k in compressed:
       if k in dictionary:
           entry = dictionary[k]
       elif k == len(dictionary):
           entry = w + w[0]
       else:
           raise ValueError, 'Bad compressed k: %s' % k
       result += entry

       # Add w+entry[0] to the dictionary.
       dictionary[dict_size] = w+entry[0]
       dict_size += 1

       w = entry
   return result</python>

How to use:

<python>compressed = compress('TOBEORNOTTOBEORTOBEORNOT') print compressed decompressed = decompress(compressed) print decompressed</python>