Huffman coding
From Rosetta Code
You are encouraged to solve this task according to the task description, using any language you may know.
Huffman encoding is a way to assign binary codes to symbols that reduces the overall number of bits used to encode a typical string of those symbols.
For example, if you use letters as symbols and have details of the frequency of occurence of those letters in typical strings, then you could just encode each letter with a fixed number of bits, such as in ASCII codes. You can do better than this by encoding more frequently occurring letters such as e and a, with smaller bit strings; and less frequently occurring letters such as q and x with longer bit strings.
Any string of letters will be encoded as a string of bits that are no-longer of the same length per letter. To successfully decode such as string, the smaller codes assigned to letters such as 'e' cannot occur as a prefix in the larger codes such as that for 'x'.
- If you were to assign a code 01 for 'e' and code 011 for 'x', then if the bits to decode started as 011... then you would not know if you should decode an 'e' or an 'x'.
The Huffman coding scheme takes each symbol and its weight (or frequency of occurrence), and generates proper encodings for each symbol taking account of the weights of each symbol, so that higher weighted symbols have less bits in their encoding. (See the WP article for more information).
A Huffman encoding can be computed by first creating a tree of nodes:
- Create a leaf node for each symbol and add it to the priority queue.
- While there is more than one node in the queue:
- Remove the node of highest priority (lowest probability) twice to get two nodes.
- Create a new internal node with these two nodes as children and with probability equal to the sum of the two nodes' probabilities.
- Add the new node to the queue.
- The remaining node is the root node and the tree is complete.
Traverse the constructed binary tree from root to leaves assigning and accumulating a '0' for one branch and a '1' for the other at each node. The accumulated zeroes and ones at each leaf constitute a Huffman encoding for those symbols and weights:
Using the characters and their frequency from the string "this is an example for huffman encoding", create a program to generate a Huffman encoding for each character as a table.
Contents |
[edit] C
This code lacks a lot of needed checkings, expecially for memory allocation.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BYTES 256
struct huffcode {
int nbits;
int code;
};
typedef struct huffcode huffcode_t;
struct huffheap {
int *h;
int n, s, cs;
long *f;
};
typedef struct huffheap heap_t;
/* heap handling funcs */
static heap_t *_heap_create(int s, long *f)
{
heap_t *h;
h = malloc(sizeof(heap_t));
h->h = malloc(sizeof(int)*s);
h->s = h->cs = s;
h->n = 0;
h->f = f;
return h;
}
static void _heap_destroy(heap_t *heap)
{
free(heap->h);
free(heap);
}
#define swap_(I,J) do { int t_; t_ = a[(I)]; \
a[(I)] = a[(J)]; a[(J)] = t_; } while(0)
static void _heap_sort(heap_t *heap)
{
int i=1, j=2; /* gnome sort */
int *a = heap->h;
while(i < heap->n) { /* smaller values are kept at the end */
if ( heap->f[a[i-1]] >= heap->f[a[i]] ) {
i = j; j++;
} else {
swap_(i-1, i);
i--;
i = (i==0) ? j++ : i;
}
}
}
#undef swap_
static void _heap_add(heap_t *heap, int c)
{
if ( (heap->n + 1) > heap->s ) {
heap->h = realloc(heap->h, heap->s + heap->cs);
heap->s += heap->cs;
}
heap->h[heap->n] = c;
heap->n++;
_heap_sort(heap);
}
static int _heap_remove(heap_t *heap)
{
if ( heap->n > 0 ) {
heap->n--;
return heap->h[heap->n];
}
return -1;
}
/* huffmann code generator */
huffcode_t **create_huffman_codes(long *freqs)
{
huffcode_t **codes;
heap_t *heap;
long efreqs[BYTES*2];
int preds[BYTES*2];
int i, extf=BYTES;
int r1, r2;
memcpy(efreqs, freqs, sizeof(long)*BYTES);
memset(&efreqs[BYTES], 0, sizeof(long)*BYTES);
heap = _heap_create(BYTES*2, efreqs);
if ( heap == NULL ) return NULL;
for(i=0; i < BYTES; i++) if ( efreqs[i] > 0 ) _heap_add(heap, i);
while( heap->n > 1 )
{
r1 = _heap_remove(heap);
r2 = _heap_remove(heap);
efreqs[extf] = efreqs[r1] + efreqs[r2];
_heap_add(heap, extf);
preds[r1] = extf;
preds[r2] = -extf;
extf++;
}
r1 = _heap_remove(heap);
preds[r1] = r1;
_heap_destroy(heap);
codes = malloc(sizeof(huffcode_t *)*BYTES);
int bc, bn, ix;
for(i=0; i < BYTES; i++) {
bc=0; bn=0;
if ( efreqs[i] == 0 ) { codes[i] = NULL; continue; }
ix = i;
while( abs(preds[ix]) != ix ) {
bc |= ((preds[ix] >= 0) ? 1 : 0 ) << bn;
ix = abs(preds[ix]);
bn++;
}
codes[i] = malloc(sizeof(huffcode_t));
codes[i]->nbits = bn;
codes[i]->code = bc;
}
return codes;
}
void free_huffman_codes(huffcode_t **c)
{
int i;
for(i=0; i < BYTES; i++) if (c[i] != NULL) free(c[i]);
free(c);
}
#define MAXBITSPERCODE 100
void inttobits(int c, int n, char *s)
{
s[n] = 0;
while(n > 0) {
s[n-1] = (c%2) + '0';
c >>= 1; n--;
}
}
const char *test = "this is an example for huffman encoding";
int main()
{
huffcode_t **r;
int i;
char strbit[MAXBITSPERCODE];
const char *p;
long freqs[BYTES];
memset(freqs, 0, sizeof freqs);
p = test;
while(*p != '\0') freqs[*p++]++;
r = create_huffman_codes(freqs);
for(i=0; i < BYTES; i++) {
if ( r[i] != NULL ) {
inttobits(r[i]->code, r[i]->nbits, strbit);
printf("%c (%d) %s\n", i, r[i]->code, strbit);
}
}
free_huffman_codes(r);
return 0;
}
[edit] C++
This code builds a tree to generate huffman codes, then prints the codes.
#include <iostream>
#include <queue>
#include <map>
#include <boost/dynamic_bitset.hpp>
const int UniqueSymbols = 1 << CHAR_BIT;
const char* SampleString = "this is an example for huffman encoding";
typedef boost::dynamic_bitset<> HuffCode;
typedef std::map<char, HuffCode> HuffCodeMap;
class INode
{
public:
INode* parent;
int f;
virtual ~INode() {}
protected:
INode(INode* parent, int f) : parent(parent), f(f) {}
};
class InternalNode : public INode
{
public:
INode* children[2];
InternalNode(INode* parent, INode* c0, INode* c1) : INode(parent, c0->f + c1->f)
{
children[0] = c0;
children[1] = c1;
c0->parent = this;
c1->parent = this;
}
~InternalNode()
{
delete children[0];
delete children[1];
}
};
class LeafNode : public INode
{
public:
char c;
LeafNode(INode* parent, int f, char c) : INode(parent, f), c(c)
{}
};
struct NodeCmp
{
bool operator()(const INode* lhs, const INode* rhs) { return (lhs->f > rhs->f); }
};
InternalNode* BuildTree(const int (&frequencies)[UniqueSymbols])
{
typedef std::priority_queue<INode*, std::vector<INode*>, NodeCmp> TSymbolContainer;
TSymbolContainer trees = TSymbolContainer(NodeCmp());
for (int i = 0; i < UniqueSymbols; ++i)
{
if(frequencies[i] != 0)
trees.push(new LeafNode(NULL, frequencies[i], (char)i));
}
while (trees.size() > 1)
{
INode* childR = trees.top();
trees.pop();
INode* childL = trees.top();
trees.pop();
INode* parent = new InternalNode(NULL, childR, childL);
trees.push(parent);
}
return static_cast<InternalNode*>(trees.top());
}
void GenerateCodes(const INode* node, const HuffCode& prefix, HuffCodeMap& outCodes)
{
if (typeid(*node) == typeid(LeafNode))
{
const LeafNode* lf = static_cast<const LeafNode*>(node);
outCodes[lf->c] = prefix;
}
else
{
HuffCode temp = prefix;
temp.resize(prefix.size() + 1);
temp <<= 1;
HuffCode leftPrefix = temp;
leftPrefix[0] = false;
GenerateCodes(static_cast<const InternalNode*>(node)->children[0], leftPrefix, outCodes);
HuffCode rightPrefix = temp;
rightPrefix[0] = true;
GenerateCodes(static_cast<const InternalNode*>(node)->children[1], rightPrefix, outCodes);
}
}
int main()
{
// Build frequency table
int frequencies[UniqueSymbols] = {0};
const char* ptr = SampleString;
while (*ptr != '\0')
++frequencies[*ptr++];
InternalNode* root = BuildTree(frequencies);
HuffCodeMap codes;
GenerateCodes(root, HuffCode(), codes);
for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end(); ++it)
{
std::cout << it->first << " " << it->second << std::endl;
}
}
Sample output:
110 a 1001 c 101010 d 10001 e 1111 f 1011 g 101011 h 0101 i 1110 l 01110 m 0011 n 000 o 0010 p 01000 r 01001 s 0110 t 01111 u 10100 x 10000
[edit] Clojure
(use 'clojure.contrib.seq-utils)
(defn probs [items]
(let [freqs (frequencies items) sum (reduce + (vals freqs))]
(into {} (map (fn [[k v]] [k (/ v sum)]) freqs))))
(defn init-pq [weighted-items]
(let [comp (proxy [java.util.Comparator] []
(compare [a b] (compare (:priority a) (:priority b))))
pq (java.util.PriorityQueue. (count weighted-items) comp)]
(doseq [[item prob] weighted-items] (.add pq { :symbol item, :priority prob }))
pq))
(defn huffman-tree [pq]
(while (> (.size pq) 1)
(let [a (.poll pq) b (.poll pq) new-node { :priority (+ (:priority a) (:priority b)), :left a, :right b }]
(.add pq new-node)))
(.poll pq))
(defn symbol-map
([t] (into {} (symbol-map t [])))
([{:keys [symbol,left,right] :as t} code]
(if symbol [[symbol code]]
(concat (symbol-map left (conj code 0))
(symbol-map right (conj code 1))))))
(defn huffman-encode [items]
(-> items probs init-pq huffman-tree symbol-map))
(defn display-huffman-encode [s]
(println "SYMBOL\tWEIGHT\tHUFFMAN CODE")
(let [probs (probs (seq s))]
(doseq [[char code] (huffman-encode (seq s))]
(printf "%s:\t\t%s\t\t%s\n" char (probs char) (apply str code)))))
(display-huffman-encode "this is an example for huffman encoding")
Program Output:
SYMBOL WEIGHT HUFFMAN CODE : 2/13 111 a: 1/13 1001 c: 1/39 00110 d: 1/39 00000 e: 1/13 1011 f: 1/13 1101 g: 1/39 101010 h: 2/39 0001 i: 1/13 1100 l: 1/39 10001 m: 2/39 0100 n: 4/39 011 o: 2/39 0101 p: 1/39 101011 r: 1/39 10100 s: 2/39 0010 t: 1/39 00001 u: 1/39 10000 x: 1/39 00111
[edit] D
From the Python version, D V.2.
import std.stdio, std.algorithm, std.typecons;
/// Huffman encode the given dict mapping symbols to weights
auto encode(TFreq, TSymb)(TFreq[TSymb] symb2freq) {
alias Tuple!(TSymb, "symb", string, "code") Pair;
alias Tuple!(TFreq, "freq", Pair[], "pairs") Block;
Block[] blocks;
foreach (symb, freq; symb2freq)
blocks ~= Block(freq, [Pair(symb, "")]);
auto heap = BinaryHeap!(Block[], "b < a")(blocks);
while (heap.length > 1) {
auto lo = heap.pop(1)[0];
auto hi = heap.pop(1)[0];
foreach (ref pair; lo.pairs)
pair.code = '0' ~ pair.code;
foreach (ref pair; hi.pairs)
pair.code = '1' ~ pair.code;
heap.push(Block(lo.freq + hi.freq, lo.pairs ~ hi.pairs));
}
auto r = heap.pop(1)[0].pairs;
schwartzSort!((x){ return x.code.length; })(r);
return r;
}
void main() {
auto txt = "this is an example for huffman encoding";
int[char] symb2freq;
foreach (c; txt)
symb2freq[c]++;
auto huff = encode(symb2freq);
writeln("Symbol\tWeight\tHuffman Code");
foreach (p; huff)
writefln("%s\t%s\t%s", p.symb, symb2freq[p.symb], p.code);
}
Example output:
Symbol Weight Huffman Code
n 4 010
6 101
a 3 1001
e 3 1100
o 2 0011
i 3 1110
h 2 0001
f 3 1101
s 2 0111
m 2 0010
t 1 10000
g 1 00000
r 1 01101
p 1 01100
l 1 00001
u 1 10001
x 1 11110
d 1 111111
c 1 111110
[edit] Common Lisp
This implementation uses a tree built of huffman-nodes, and a hash table mapping from elements of the input sequence to huffman-nodes. The priority queue is implemented as a sorted list. (For a more efficient implementation of a priority queue, see the Heapsort task.)
(defstruct huffman-node
(weight 0 :type number)
(element nil :type t)
(encoding nil :type (or null bit-vector))
(left nil :type (or null huffman-node))
(right nil :type (or null huffman-node)))
(defun initial-huffman-nodes (sequence &key (test 'eql))
(let* ((length (length sequence))
(increment (/ 1 length))
(nodes (make-hash-table :size length :test test))
(queue '()))
(map nil #'(lambda (element)
(multiple-value-bind (node presentp) (gethash element nodes)
(if presentp
(incf (huffman-node-weight node) increment)
(let ((node (make-huffman-node :weight increment
:element element)))
(setf (gethash element nodes) node
queue (list* node queue))))))
sequence)
(values nodes (sort queue '< :key 'huffman-node-weight))))
(defun huffman-tree (sequence &key (test 'eql))
(multiple-value-bind (nodes queue)
(initial-huffman-nodes sequence :test test)
(do () ((endp (rest queue)) (values nodes (first queue)))
(destructuring-bind (n1 n2 &rest queue-rest) queue
(let ((n3 (make-huffman-node
:left n1
:right n2
:weight (+ (huffman-node-weight n1)
(huffman-node-weight n2)))))
(setf queue (merge 'list (list n3) queue-rest '<
:key 'huffman-node-weight)))))))1
(defun huffman-codes (sequence &key (test 'eql))
(multiple-value-bind (nodes tree)
(huffman-tree sequence :test test)
(labels ((hc (node length bits)
(let ((left (huffman-node-left node))
(right (huffman-node-right node)))
(cond
((and (null left) (null right))
(setf (huffman-node-encoding node)
(make-array length :element-type 'bit
:initial-contents (reverse bits))))
(t (hc left (1+ length) (list* 0 bits))
(hc right (1+ length) (list* 1 bits)))))))
(hc tree 0 '())
nodes)))
(defun print-huffman-code-table (nodes &optional (out *standard-output*))
(format out "~&Element~10tWeight~20tCode")
(loop for node being each hash-value of nodes
do (format out "~&~s~10t~s~20t~s"
(huffman-node-element node)
(huffman-node-weight node)
(huffman-node-encoding node))))
Example:
> (print-huffman-code-table (huffman-codes "this is an example for huffman encoding")) Element Weight Code #\t 1/39 #*10010 #\d 1/39 #*01101 #\m 2/39 #*0100 #\f 1/13 #*1100 #\o 2/39 #*0111 #\x 1/39 #*100111 #\h 2/39 #*1000 #\a 1/13 #*1010 #\s 2/39 #*0101 #\c 1/39 #*00010 #\l 1/39 #*00001 #\u 1/39 #*00011 #\e 1/13 #*1101 #\n 4/39 #*001 #\g 1/39 #*01100 #\p 1/39 #*100110 #\i 1/13 #*1011 #\r 1/39 #*00000 #\Space 2/13 #*111
[edit] Haskell
Credits go to huffman where you'll also find a non-tree solution.
import Data.List
import Control.Arrow
import Control.Monad
import Data.Ord
data HTree a = Leaf a | Branch (HTree a) (HTree a)
deriving (Show, Eq, Ord)
freq :: (Eq a) => [a] -> [(Int, a)]
freq = map(length &&& head). takeWhile(not.null). unfoldr (Just. (partition =<< (==). head))
serialize :: HTree d -> [(d, String)]
serialize (Branch l r) = map (second('0':)) (serialize l) ++ map (second('1':)) (serialize r)
serialize (Leaf x) = [(x, "")]
htree :: (Ord t, Num t) => [(t, HTree a)] -> HTree a
htree [(_, t)] = t
htree ((w1,t1):(w2,t2):wts) =
htree $ insertBy (comparing fst) (w1 + w2, Branch t1 t2) wts
huffman :: (Ord w, Num w) => [(w, a)] -> [(a, String)]
huffman = serialize. htree. map (second Leaf)
Example output:
*Main> mapM_ (putStrLn.uncurry((.(' ':)).(:))). huffman $ freq "this is an example for huffman encoding"
s 000
t 00100
h 00101
i 0011
010
a 011
r 10000
u 10001
g 10010
c 100110
d 100111
f 1010
o 1011
p 11000
l 11001
x 11010
m 11011
n 1110
e 1111
[edit] Icon and Unicon
[edit] Icon
record huffnode(l,r,n,c) # internal and leaf nodes
record huffcode(c,n,b,i) # encoding table char, freq, bitstring, bits (int)
procedure main()
s := "this is an example for huffman encoding"
Count := huffcount(s) # frequency count
Tree := huffTree(Count) # heap and tree
Code := [] # extract encodings
CodeT := table()
every x := huffBits(Tree) do
put( Code, CodeT[c] := huffcode( c := x[-1], Count[c].n, b := x[1:-1], integer("2r"||b) ) )
Code := sortf( Code, 1 ) # show table in char order
write("Input String : ",image(s))
write(right("char",5), right("freq",5), " encoding" )
every write(right(image((x := !Code).c),5), right(x.n,5), " ", x.b )
end
procedure huffBits(N) # generates huffman bitcodes with trailing character
if \N.c then return N.c # . append leaf char code
suspend "0" || huffBits(N.l) # . left
suspend "1" || huffBits(N.r) # . right
end
procedure huffTree(T) # two queue huffman tree method
local Q1,Q2,x,n1,n2
Q1 := [] # queue of characters and weights
every x := !T do # ensure all are huffnodes
if type(x) == "huffnode" then put(Q1,x) else runerr(205,x)
Q1 := sortf(Q1,3) # sort by weight ( 3 means by .n )
if *Q1 > 1 then Q2 := []
while *Q1+*\Q2 > 1 do { # While there is more than one node ...
n1 := if Q1[1] & ( ( Q1[1].n <= Q2[1].n ) | not Q2[1] ) then get(Q1) else get(Q2) # lowest weight from Q1 or Q2
n2 := if Q1[1] & ( ( Q1[1].n <= Q2[1].n ) | not Q2[1] ) then get(Q1) else get(Q2) # lowest weight from Q1 or Q2
put( Q2, huffnode( n1, n2, n1.n + n2.n ) ) # new weighted node to end of Q2
}
return (\Q2 | Q1)[1] # return the root node
end
procedure huffcount(s) # return characters and frequencies in a table of huffnodes by char
local c,T
T := table()
every c := !s do {
/T[c] := huffnode(,,0,c)
T[c].n +:= 1
}
return T
end
Sample output:
Input String : "this is an example for huffman encoding" char freq encoding " " 6 101 "a" 3 1100 "c" 1 10000 "d" 1 10001 "e" 3 1101 "f" 3 1110 "g" 1 11110 "h" 2 11111 "i" 3 1001 "l" 1 01101 "m" 2 0011 "n" 4 000 "o" 2 0100 "p" 1 01100 "r" 1 01110 "s" 2 0010 "t" 1 01010 "u" 1 01111 "x" 1 01011
[edit] Unicon
This Icon solution works in Unicon. Library: Icon Programming Library HuffStuff provides huffman encoding routines
[edit] J
Implementation from http://www.jsoftware.com/jwiki/Essays/Huffman%20Coding
hc=: 4 : 0
if. 1=#x do. y
else. ((i{x),+/j{x) hc (i{y),<j{y [ i=. (i.#x) -. j=. 2{./:x end.
)
hcodes=: 4 : 0
assert. x -:&$ y NB. weights and words have same shape
assert. (0<:x) *. 1=#$x NB. weights are non-negative
assert. 1 >: L.y NB. words are boxed not more than once
w=. ,&.> y NB. standardized words
assert. w -: ~.w NB. words are unique
t=. 0 {:: x hc w NB. minimal weight binary tree
((< S: 0 t) i. w) { <@(1&=)@; S: 1 {:: t
)
;"1":L:0(#/.~ (],.(<' '),.hcodes) ,&.>@~.)'this is an example for huffman encoding'
t 0 1 0 1 0
h 1 1 1 1 1
i 1 0 0 1
s 0 0 1 0
1 0 1
a 1 1 0 0
n 0 0 0
e 1 1 0 1
x 0 1 0 1 1
m 0 0 1 1
p 0 1 1 0 0
l 0 1 1 0 1
f 1 1 1 0
o 0 1 0 0
r 0 1 1 1 0
u 0 1 1 1 1
c 1 0 0 0 0
d 1 0 0 0 1
g 1 1 1 1 0
[edit] Java
This implementation creates an actual tree structure, and then traverses the tree to recover the code.
import java.util.*;
abstract class HuffmanTree implements Comparable<HuffmanTree> {
public int frequency; // the frequency of this tree
public HuffmanTree(int freq) { frequency = freq; }
// compares on the frequency
public int compareTo(HuffmanTree tree) {
return frequency - tree.frequency;
}
}
class HuffmanLeaf extends HuffmanTree {
public char value; // the character this leaf represents
public HuffmanLeaf(int freq, char val) {
super(freq);
value = val;
}
}
class HuffmanNode extends HuffmanTree {
public HuffmanTree left, right; // subtrees
public HuffmanNode(HuffmanTree l, HuffmanTree r) {
super(l.frequency + r.frequency);
left = l;
right = r;
}
}
public class HuffmanCode {
// input is an array of frequencies, indexed by character code
public static HuffmanTree buildTree(int[] charFreqs) {
PriorityQueue<HuffmanTree> trees = new PriorityQueue<HuffmanTree>();
// initially, we have a forest of leaves
// one for each non-empty character
for (int i = 0; i < charFreqs.length; i++)
if (charFreqs[i] > 0)
trees.offer(new HuffmanLeaf(charFreqs[i], (char)i));
assert trees.size() > 0;
// loop until there is only one tree left
while (trees.size() > 1) {
// two trees with least frequency
HuffmanTree a = trees.poll();
HuffmanTree b = trees.poll();
// put into new node and re-insert into queue
trees.offer(new HuffmanNode(a, b));
}
return trees.poll();
}
public static void printCodes(HuffmanTree tree, Stack<Character> prefix) {
assert tree != null;
if (tree instanceof HuffmanLeaf) {
HuffmanLeaf leaf = (HuffmanLeaf)tree;
// print out character and frequency
System.out.print(leaf.value + "\t" + leaf.frequency + "\t");
// print out code for this leaf, which is just the prefix
for (char bit : prefix)
System.out.print(bit);
System.out.println();
} else if (tree instanceof HuffmanNode) {
HuffmanNode node = (HuffmanNode)tree;
// traverse left
prefix.push('0');
printCodes(node.left, prefix);
prefix.pop();
// traverse right
prefix.push('1');
printCodes(node.right, prefix);
prefix.pop();
}
}
public static void main(String[] args) {
String test = "this is an example for huffman encoding";
// we will assume that all our characters will have
// code less than 256, for simplicity
int[] charFreqs = new int[256];
// read each character and record the frequencies
for (char c : test.toCharArray())
charFreqs[c]++;
// build tree
HuffmanTree tree = buildTree(charFreqs);
// print out results
System.out.println("SYMBOL\tWEIGHT\tHUFFMAN CODE");
printCodes(tree, new Stack<Character>());
}
}
Example output:
SYMBOL WEIGHT HUFFMAN CODE d 1 00000 t 1 00001 h 2 0001 s 2 0010 c 1 00110 x 1 00111 m 2 0100 o 2 0101 n 4 011 u 1 10000 l 1 10001 a 3 1001 r 1 10100 g 1 101010 p 1 101011 e 3 1011 i 3 1100 f 3 1101 6 111
[edit] JavaScript
Translation of: Ruby
Works with: SpiderMonkey for the print() function.
First, implement a Priority Queue
// first argument should be a boolean: sort order
// true if minimum priority first
// default is false
function PriorityQueue() {
if (arguments.length == 1 && arguments[0] == true)
this.sort = this.sort_ascending;
else
this.sort = this.sort_descending;
this.queue = [];
}
PriorityQueue.prototype.enqueue = function(priority, data) {
this.queue.push([priority, data]);
this.sort();
}
// returns an array: [priority, data]
PriorityQueue.prototype.dequeue = function() {
return this.queue.shift();
}
PriorityQueue.prototype.length = function() {
return this.queue.length;
}
PriorityQueue.prototype.sort_ascending = function() {
this.queue.sort(function(a,b) {
return a[0] - b[0];
})
}
PriorityQueue.prototype.sort_descending = function() {
this.queue.sort(function(a,b) {
return b[0] - a[0];
})
}
PriorityQueue.prototype.peek = function() {
return this.queue[0];
}
PriorityQueue.prototype.inspect = function() {
var out = [];
for (var i = 0; i < this.queue.length; i++) {
out.push("[" + this.queue[i].join(",") + "]");
}
return out.join(",");
}
Then, a utility function to query the existance of a property of an object
function has_property(obj, propname) {
return typeof(obj[propname]) == "undefined" ? false : true;
}
The Huffman encoder
function HuffmanEncoding(str) {
this.str = str;
var count_chars = {};
for (var i = 0; i < str.length; i++)
if (has_property(count_chars, str[i]))
count_chars[str[i]] ++;
else
count_chars[str[i]] = 1;
var pq = new PriorityQueue(true);
for (var ch in count_chars)
pq.enqueue(count_chars[ch], ch);
while (pq.length() > 1) {
var pair1 = pq.dequeue();
var pair2 = pq.dequeue();
pq.enqueue(pair1[0]+pair2[0], [pair1[1], pair2[1]]);
}
var tree = pq.peek();
this.encoding = {};
this._generate_encoding(tree[1], "");
this.encoded_string = ""
for (var i = 0; i < this.str.length; i++) {
this.encoded_string += this.encoding[str[i]];
}
}
HuffmanEncoding.prototype._generate_encoding = function(ary, prefix) {
if (ary instanceof Array) {
this._generate_encoding(ary[0], prefix + "0");
this._generate_encoding(ary[1], prefix + "1");
}
else {
this.encoding[ary] = prefix;
}
}
HuffmanEncoding.prototype.inspect_encoding = function() {
for (var ch in this.encoding) {
print("'" + ch + "': " + this.encoding[ch])
}
}
HuffmanEncoding.prototype.decode = function(encoded) {
var rev_enc = {};
for (var ch in this.encoding)
rev_enc[this.encoding[ch]] = ch;
var decoded = "";
var pos = 0;
while (pos < encoded.length) {
var key = ""
while (has_property(rev_enc, key) == false) {
key += encoded[pos];
pos++;
}
decoded += rev_enc[key];
}
return decoded;
}
And, using the Huffman encoder
var s = "this is an example for huffman encoding";
print(s);
var huff = new HuffmanEncoding(s);
huff.inspect_encoding();
var e = huff.encoded_string;
print(e);
var t = huff.decode(e);
print(t);
print("is decoded string same as original? " + (s==t));
output
this is an example for huffman encoding 'n': 000 's': 0010 'm': 0011 'o': 0100 't': 01010 'x': 01011 'p': 01100 'l': 01101 'r': 01110 'u': 01111 'c': 10000 'd': 10001 'i': 1001 ' ': 101 'a': 1100 'e': 1101 'f': 1110 'g': 11110 'h': 11111 0101011111100100101011001001010111000001011101010111100001101100011011101101111001000111010111111011111110111000111100000101110100010000010010001100100011110 this is an example for huffman encoding is decoded string same as original? true
[edit] OCaml
Translation of: Standard ML
type 'a huffman_tree =
| Leaf of int * 'a
| Node of int * 'a huffman_tree * 'a huffman_tree
let freq = function Leaf (f, _) | Node (f, _, _) -> f
let freq_compare a b = compare (freq a) (freq b)
let build_tree charFreqs =
let leaves = List.map (fun (c,f) -> Leaf (f,c)) charFreqs in
let freq_sort = List.sort freq_compare in
let rec aux = function
| [] -> assert false
| [a] -> (a)
| a::b::tl ->
let node = Node(freq a + freq b, a, b) in
aux (freq_sort(node::tl))
in
aux (freq_sort leaves)
let rec print_tree = function
| code, Leaf (f, c) ->
Printf.printf "%c\t%d\t%s\n" c f (String.concat "" (List.rev code));
| code, Node (_, l, r) ->
print_tree ("0"::code, l);
print_tree ("1"::code, r)
let () =
let str = "this is an example for huffman encoding" in
let charFreqs = Hashtbl.create 42 in
String.iter (fun c ->
let old =
try Hashtbl.find charFreqs c
with Not_found -> 0 in
Hashtbl.replace charFreqs c (old+1)
) str;
let charFreqs = Hashtbl.fold (fun c f acc -> (c,f)::acc) charFreqs [] in
let tree = build_tree charFreqs in
print_string "Symbol\tWeight\tHuffman code\n";
print_tree ([], tree)
[edit] Python
A slight modification of the method outlined in the task description allows the code to be accumulated as the heap is manipulated.
The output is sorted first on length of the code, then on the symbols.
from heapq import heappush, heappop, heapify
from collections import defaultdict
def encode(symb2freq):
"""Huffman encode the given dict mapping symbols to weights"""
heap = [[wt, [sym, ""]] for sym, wt in symb2freq.iteritems()]
heapify(heap)
while len(heap) > 1:
lo = heappop(heap)
hi = heappop(heap)
for pair in lo[1:]:
pair[1] = '0' + pair[1]
for pair in hi[1:]:
pair[1] = '1' + pair[1]
heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])
return sorted(heappop(heap)[1:], key=lambda p: (len(p[-1]), p))
txt = "this is an example for huffman encoding"
symb2freq = defaultdict(int)
for ch in txt:
symb2freq[ch] += 1
# in Python 3.1+:
# symb2freq = collections.Counter(txt)
huff = encode(symb2freq)
print "Symbol\tWeight\tHuffman Code"
for p in huff:
print "%s\t%s\t%s" % (p[0], symb2freq[p[0]], p[1])
Example output:
Symbol Weight Huffman Code
6 101
n 4 010
a 3 1001
e 3 1100
f 3 1101
h 2 0001
i 3 1110
m 2 0010
o 2 0011
s 2 0111
g 1 00000
l 1 00001
p 1 01100
r 1 01101
t 1 10000
u 1 10001
x 1 11110
c 1 111110
d 1 111111
An extension to the method outlined above is given here.
[edit] Ruby
Uses a Library: RubyGems package PriorityQueue
require 'priority_queue'
def huffman_encoding(str)
char_count = Hash.new(0)
str.each_char {|c| char_count[c] += 1}
pq = CPriorityQueue.new
# chars with fewest count have highest priority
char_count.each {|char, count| pq.push(char, count)}
while pq.length > 1
key1, prio1 = pq.delete_min
key2, prio2 = pq.delete_min
pq.push([key1, key2], prio1 + prio2)
end
Hash[*generate_encoding(pq.min_key)]
end
def generate_encoding(ary, prefix="")
case ary
when Array
generate_encoding(ary[0], "#{prefix}0") + generate_encoding(ary[1], "#{prefix}1")
else
[ary, prefix]
end
end
def encode(str, encoding)
str.each_char.inject("") {|encoded, char| encoded << encoding[char]}
end
def decode(encoded, encoding)
rev_enc = Hash[*encoding.to_a.flatten.reverse]
decoded = ""
pos = 0
while pos < encoded.length
key = ""
while rev_enc[key].nil?
key << encoded[pos]
pos += 1
end
decoded << rev_enc[key]
end
decoded
end
str = "this is an example for huffman encoding"
encoding = huffman_encoding(str)
encoding.to_a.sort.each {|x| p x}
enc = encode(str, encoding)
dec = decode(enc, encoding)
puts "success!" if str == dec
[" ", "111"] ["a", "1011"] ["c", "00001"] ["d", "00000"] ["e", "1101"] ["f", "1100"] ["g", "00100"] ["h", "1000"] ["i", "1001"] ["l", "01110"] ["m", "10101"] ["n", "010"] ["o", "0001"] ["p", "00101"] ["r", "00111"] ["s", "0110"] ["t", "00110"] ["u", "01111"] ["x", "10100"] success!
[edit] Scala
Works with: scala version 2.8
object Huffman {
import scala.collection.mutable.{Map, PriorityQueue}
sealed abstract class Tree
case class Node(left: Tree, right: Tree) extends Tree
case class Leaf(c: Char) extends Tree
def treeOrdering(m: Map[Tree, Int]) = new Ordering[Tree] {
def compare(x: Tree, y: Tree) = m(y).compare(m(x))
}
def stringMap(text: String) = text groupBy (x => Leaf(x) : Tree) mapValues (_.length)
def buildNode(queue: PriorityQueue[Tree], map: Map[Tree,Int]) {
val right = queue.dequeue
val left = queue.dequeue
val node = Node(left, right)
map(node) = map(left) + map(right)
queue.enqueue(node)
}
def codify(tree: Tree, map: Map[Tree, Int]) = {
def recurse(tree: Tree, prefix: String): List[(Char, (Int, String))] = tree match {
case Node(left, right) => recurse(left, prefix+"0") ::: recurse(right, prefix+"1")
case leaf @ Leaf(c) => c -> ((map(leaf), prefix)) :: Nil
}
recurse(tree, "")
}
def encode(text: String) = {
val map = Map.empty[Tree,Int] ++= stringMap(text)
val queue = new PriorityQueue[Tree]()(treeOrdering(map)) ++= map.keysIterator
while(queue.size > 1) {
buildNode(queue, map)
}
codify(queue.dequeue, map)
}
def main(args: Array[String]) {
val text = "this is an example for huffman encoding"
val code = encode(text)
println("Char\tWeight\t\tEncoding")
code sortBy (_._2._1) foreach {
case (c, (weight, encoding)) => println("%c:\t%3d/%-3d\t\t%s" format (c, weight, text.length, encoding))
}
}
}
Output:
Char Weight Encoding t: 1/39 011000 p: 1/39 011001 r: 1/39 01101 c: 1/39 01110 x: 1/39 01111 g: 1/39 10110 l: 1/39 10111 u: 1/39 11000 d: 1/39 11001 o: 2/39 1010 s: 2/39 1101 m: 2/39 1110 h: 2/39 1111 f: 3/39 0000 a: 3/39 0001 e: 3/39 0010 i: 3/39 0011 n: 4/39 100 : 6/39 010
[edit] SETL
var forest := {}, encTab := {};
plaintext := 'this is an example for huffman encoding';
ft := {};
(for c in plaintext)
ft(c) +:= 1;
end;
forest := {[f, c]: [c, f] in ft};
(while 1 < #forest)
[f1, n1] := getLFN();
[f2, n2] := getLFN();
forest with:= [f1+f2, [n1,n2]];
end;
addToTable('', arb range forest);
(for e = encTab(c))
print(c, ft(c), e);
end;
print(+/ [encTab(c): c in plaintext]);
proc addToTable(prefix, node);
if is_tuple node then
addToTable(prefix + '0', node(1));
addToTable(prefix + '1', node(2));
else
encTab(node) := prefix;
end;
end proc;
proc getLFN();
f := min/ domain forest;
n := arb forest{f};
forest less:= [f, n];
return [f, n];
end proc;
[edit] Standard ML
Works with: SML/NJ
datatype 'a huffman_tree = Leaf of int * 'a
| Node of int * 'a huffman_tree * 'a huffman_tree
fun freq (Leaf (f, _)) = f
| freq (Node (f, _, _)) = f
structure HuffmanPriority = struct
type priority = int
(* reverse comparision to achieve min-heap *)
fun compare (a, b) = Int.compare (b, a)
type item = char huffman_tree
val priority = freq
end
structure HPQueue = LeftPriorityQFn (HuffmanPriority)
fun buildTree charFreqs = let
fun aux trees = let
val (a, trees) = HPQueue.remove trees
in
if HPQueue.isEmpty trees then
a
else let
val (b, trees) = HPQueue.remove trees
val trees = HPQueue.insert (Node (freq a + freq b, a, b),
trees)
in
aux trees
end
end
val trees = HPQueue.fromList (map (fn (c,f) => Leaf (f,c)) charFreqs)
in
aux trees
end
fun printCodes (revPrefix, Leaf (f, c)) =
print (String.str c ^ "\t" ^ Int.toString f ^ "\t" ^
implode (rev revPrefix) ^ "\n")
| printCodes (revPrefix, Node (_, l, r)) = (
printCodes (#"0"::revPrefix, l);
printCodes (#"1"::revPrefix, r)
);
let
val test = "this is an example for huffman encoding"
val charFreqs = HashTable.mkTable
(HashString.hashString o String.str, op=)
(42, Empty)
val () =
app (fn c =>
let val old = getOpt (HashTable.find charFreqs c, 0)
in HashTable.insert charFreqs (c, old+1)
end)
(explode test)
val tree = buildTree (HashTable.listItemsi charFreqs)
in
print "SYMBOL\tWEIGHT\tHUFFMAN CODE\n";
printCodes ([], tree)
end
[edit] Tcl
uses package struct::prioqueue from Library: tcllib
package require Tcl 8.5
package require struct::prioqueue
proc huffmanEncode {str args} {
array set opts [concat -dump false $args]
set charcount [dict create]
foreach char [split $str ""] {
dict incr charcount $char
}
set pq [struct::prioqueue -dictionary] ;# want lower values to have higher priority
dict for {char count} $charcount {
$pq put $char $count
}
while {[$pq size] > 1} {
lassign [$pq peekpriority 2] p1 p2
$pq put [$pq get 2] [expr {$p1 + $p2}]
}
set encoding [walkTree [$pq get]]
set map [dict create {*}[lreverse $encoding]]
if {$opts(-dump)} {
foreach key [lsort -command compare [dict keys $map]] {
set char [dict get $map $key]
puts "$char\t[dict get $charcount $char]\t$key"
}
}
return $encoding
}
proc walkTree {tree {prefix ""}} {
if {[llength $tree] < 2} {
return [list $tree $prefix]
}
lassign $tree left right
return [concat [walkTree $left "${prefix}0"] [walkTree $right "${prefix}1"]]
}
proc compare {a b} {
if {[string length $a] < [string length $b]} {return -1}
if {[string length $a] > [string length $b]} {return 1}
return [string compare $a $b]
}
set str "this is an example for huffman encoding"
set encoding [huffmanEncode $str -dump true]
puts $str
puts [string map $encoding $str]
outputs
n 4 000 6 101 s 2 0010 m 2 0011 o 2 0100 i 3 1001 a 3 1100 e 3 1101 f 3 1110 t 1 01010 x 1 01011 p 1 01100 l 1 01101 r 1 01110 u 1 01111 c 1 10000 d 1 10001 g 1 11110 h 2 11111 this is an example for huffman encoding 0101011111100100101011001001010111000001011101010111100001101100011011101101111001000111010111111011111110111000111100000101110100010000010010001100100011110
[edit] Ursala
following the algorithm given above
#import std
#import nat
#import flo
code_table = # takes a training dataset to a table <char: code...>
-+
*^ ~&v?\~&iNC @v ~&t?\~&h ~&plrDSLrnPlrmPCAS/'01',
~&itB->h fleq-<&d; ^C\~&tt @hthPX ^V\~&lrNCC plus@bd,
^V(div@rrPlX,~&rlNVNC)^*D(plus:-0.@rS,~&)+ *K2 ^/~&h float+ length+-
#cast %csAL
table = code_table 'this is an example for huffman encoding'
a quick walk through the code starting from the bottom:
-
*K2 ^/~&h float+ lengthcompute character frequencies by partitioning the input list of characters by equality, and transforming each equivalence class to a pair containing its member and its cardinality represented as a floating point number -
^V(div@rrPlX,~&rlNVNC)^*D(plus:-0.@rS,~&)construct a list of unary trees, one for each character class, with its normalized frequency in the root, and the character in the leaf -
~&itB->hwhile the list contains more than one tree, do the following, and when done take the head of the list -
fleq-<&d;sort the trees in increasing order by their roots -
^C\~&tt @hthPX ^V\~&lrNCC plus@bdchange the first two trees in the sorted list to a single binary tree whose root is the sum of their roots -
*^visit the following function on each node of the tree obtained from the loop and propagate the results upward from the leaves -
~&v?\~&iNCif the node is a leaf, construct a singleton list containing the pair of its root (a character) and the empty string (of bits) -
@v ~&t?\~&hif there is only a single subtree, propagate the result already obtained for it -
~&plrDSLrnPlrmPCAS/'01'otherwise there are two subtrees, hence two lists previously computed results propagating upward, so insert a zero into all of the bit strings in the results on the left, and a one into all the ones on the right, concatenate the left and right results, and propagate the contatenation upward
output:
< `r: '00000', `l: '00001', `c: '00010', `u: '00011', `n: '001', `m: '0100', `h: '0101', `g: '01100', `d: '01101', `o: '0111', `s: '1000', `t: '10010', `p: '100110', `x: '100111', `a: '1010', `f: '1011', `i: '1100', `e: '1101', ` : '111'>

