Arithmetic coding/As a generalized change of radix
Arithmetic coding is a form of entropy encoding used in lossless data compression. Normally, a string of characters such as the words "hello there" is represented using a fixed number of bits per character, as in the ASCII code. When a string is converted to arithmetic encoding, frequently used characters will be stored with fewer bits and not-so-frequently occurring characters will be stored with more bits, resulting in fewer bits used in total. Arithmetic coding differs from other forms of entropy encoding, such as Huffman coding, in that rather than separating the input into component symbols and replacing each with a code, arithmetic coding encodes the entire message into a single number.
- Task
Create a program which implements the arithmetic coding as a generalized change of radix.
Show the results, in base 10, for all the following strings:
- "DABDDB"
- "DABDDBBDDBA"
- "ABRACADABRA"
- "TOBEORNOTTOBEORTOBEORNOT"
Verify the implementation by decoding the results back into strings and checking for equality with the given strings.
Go
<lang go>package main
import (
"fmt" "math/big"
)
func cumulative_freq(freq map[byte]int64) map[byte]int64 {
total := int64(0) cf := make(map[byte]int64) for i := 0; i < 256; i++ { b := byte(i) if v, ok := freq[b]; ok { cf[b] = total total += v } } return cf
}
func arithmethic_coding(str string, radix int64) (*big.Int,
*big.Int, map[byte]int64) {
// Convert the string into a slice of bytes chars := []byte(str)
// The frequency characters freq := make(map[byte]int64) for _, c := range chars { freq[c] += 1 }
// The cumulative frequency cf := cumulative_freq(freq)
// Base base := len(chars)
// Lower bound L := big.NewInt(0)
// Product of all frequencies pf := big.NewInt(1)
// Each term is multiplied by the product of the // frequencies of all previously occurring symbols bigBase := big.NewInt(int64(base)) bigLim := big.NewInt(int64(base - 1))
for i := 0; i < base; i++ {
bigI := big.NewInt(int64(i))
diff := big.NewInt(0) diff.Sub(bigLim, bigI)
pow := big.NewInt(0) pow.Exp(bigBase, diff, nil)
x := big.NewInt(1) x.Mul(big.NewInt(cf[chars[i]]), pow)
L.Add(L, x.Mul(x, pf)) pf.Mul(pf, big.NewInt(freq[chars[i]])) }
// Upper bound U := big.NewInt(0) U.Set(L) U.Add(U, pf)
bigOne := big.NewInt(1) bigZero := big.NewInt(0) bigRadix := big.NewInt(radix)
tmp := big.NewInt(0).Set(pf) powr := big.NewInt(0)
for { tmp.Div(tmp, bigRadix) if tmp.Cmp(bigZero) == 0 { break } powr.Add(powr, bigOne) }
diff := big.NewInt(0) diff.Sub(U, bigOne) diff.Div(diff, big.NewInt(0).Exp(bigRadix, powr, nil))
return diff, powr, freq
}
func arithmethic_decoding(num *big.Int, radix int64,
pow *big.Int, freq map[byte]int64) string {
powr := big.NewInt(radix)
enc := big.NewInt(0).Set(num) enc.Mul(enc, powr.Exp(powr, pow, nil))
base := int64(0) for _, v := range freq { base += v }
// Create the cumulative frequency table cf := cumulative_freq(freq)
// Create the dictionary dict := make(map[int64]byte) for k, v := range cf { dict[v] = k }
// Fill the gaps in the dictionary lchar := -1 for i := int64(0); i < base; i++ { if v, ok := dict[i]; ok { lchar = int(v) } else if lchar != -1 { dict[i] = byte(lchar) } }
// Decode the input number decoded := make([]byte, base) bigBase := big.NewInt(base)
for i := base - 1; i >= 0; i-- {
pow := big.NewInt(0) pow.Exp(bigBase, big.NewInt(i), nil)
div := big.NewInt(0) div.Div(enc, pow)
c := dict[div.Int64()] fv := freq[c] cv := cf[c]
prod := big.NewInt(0).Mul(pow, big.NewInt(cv)) diff := big.NewInt(0).Sub(enc, prod) enc.Div(diff, big.NewInt(fv))
decoded[base-i-1] = c }
// Return the decoded output return string(decoded)
}
func main() {
var radix = int64(10)
strSlice := []string{ `DABDDB`, `DABDDBBDDBA`, `ABRACADABRA`, `TOBEORNOTTOBEORTOBEORNOT`, }
for _, str := range strSlice { enc, pow, freq := arithmethic_coding(str, radix) dec := arithmethic_decoding(enc, radix, pow, freq) fmt.Printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
if str != dec { panic("\tHowever that is incorrect!") } }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Perl
<lang perl>use Math::BigInt (try => 'GMP');
sub cumulative_freq {
my ($freq) = @_;
my %cf; my $total = Math::BigInt->new(0); foreach my $c (map { chr } 0 .. 255) { if (exists $freq->{$c}) { $cf{$c} = $total; $total += $freq->{$c}; } }
return %cf;
}
sub arithmethic_coding {
my ($str, $radix) = @_; my @chars = split(//, $str);
# The frequency characters my %freq; $freq{$_}++ for @chars;
# The cumulative frequency table my %cf = cumulative_freq(\%freq);
# Limit and base my $lim = Math::BigInt->new($#chars); my $base = $lim + 1;
# Lower bound my $L = Math::BigInt->new(0);
# Product of all frequencies my $pf = Math::BigInt->new(1);
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols for (my $i = 0 ; $i < $base ; $i++) { my $x = $cf{$chars[$i]} * $base**($lim - $i); $L->badd($x * $pf); $pf->bmul($freq{$chars[$i]}); }
# Upper bound my $U = $L + $pf;
my $pow = Math::BigInt->new($pf)->blog($radix); my $enc = ($U - 1)->bdiv(Math::BigInt->new($radix)->bpow($pow));
return ($enc, $pow, \%freq);
}
sub arithmethic_decoding {
my ($enc, $radix, $pow, $freq) = @_;
# Multiply enc by 10^pow $enc *= $radix**$pow;
my $base = Math::BigInt->new(0); $base += $_ for values %{$freq};
# Create the cumulative frequency table my %cf = cumulative_freq($freq);
# Create the dictionary my %dict; while (my ($k, $v) = each %cf) { $dict{$v} = $k; }
# Fill the gaps in the dictionary my $lchar; foreach my $i (0 .. $base - 1) { if (exists $dict{$i}) { $lchar = $dict{$i}; } elsif (defined $lchar) { $dict{$i} = $lchar; } }
# Decode the input number my $decoded = ; for (my $i = $base - 1 ; $i >= 0 ; $i--) {
my $pow = $base**$i; my $div = ($enc / $pow);
my $c = $dict{$div}; my $fv = $freq->{$c}; my $cv = $cf{$c};
my $rem = ($enc - $pow * $cv) / $fv;
$enc = $rem; $decoded .= $c; }
# Return the decoded output return $decoded;
}
my $radix = 10; # can be any integer greater or equal with 2
foreach my $str (qw(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT)) {
my ($enc, $pow, $freq) = arithmethic_coding($str, $radix); my $dec = arithmethic_decoding($enc, $radix, $pow, $freq);
printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
if ($str ne $dec) { die "\tHowever that is incorrect!"; }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Sidef
<lang ruby>func cumulative_freq(freq) {
var cf = Hash() var total = 0 256.range.each { |b| if (freq.contains(b)) { cf{b} = total total += freq{b} } } return cf
}
func arithmethic_coding(bytes, radix=10) {
# The frequency characters var freq = Hash() bytes.each { |c| freq{c} := 0 ++ }
# The cumulative frequency table var cf = cumulative_freq(freq)
# Limit and base var lim = bytes.end var base = lim+1
# Lower bound var L = 0
# Product of all frequencies var pf = 1
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols base.range.each { |i| var x = (cf{bytes[i]} * base**(lim - i)) L += x*pf pf *= freq{bytes[i]} }
# Upper bound var U = L+pf
var pow = pf.log(radix).int var enc = ((U-1) // radix**pow)
return (enc, pow, freq)
}
func arithmethic_decoding(enc, radix, pow, freq) {
# Multiply enc by 10^pow enc *= radix**pow;
var base = 0 freq.each_value { |v| base += v }
# Create the cumulative frequency table var cf = cumulative_freq(freq);
# Create the dictionary var dict = Hash() cf.each_kv { |k,v| dict{v} = k }
# Fill the gaps in the dictionary var lchar = base.range.each { |i| if (dict.contains(i)) { lchar = dict{i} } elsif (!lchar.is_empty) { dict{i} = lchar } }
# Decode the input number var decoded = [] base.range.reverse.each { |i|
var pow = base**i; var div = enc//pow
var c = dict{div} var fv = freq{c} var cv = cf{c}
var rem = ((enc - pow*cv) // fv)
enc = rem decoded << c }
# Return the decoded output return decoded
}
var radix = 10; # can be any integer greater or equal with 2
%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each { |str|
var (enc, pow, freq) = arithmethic_coding(str.bytes, radix) var dec = arithmethic_decoding(enc, radix, pow, freq).join_bytes('UTF-8')
printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow);
if (str != dec) { die "\tHowever that is incorrect!" }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15