Arithmetic coding/As a generalized change of radix
Arithmetic coding is a form of entropy encoding used in lossless data compression. Normally, a string of characters such as the words "hello there" is represented using a fixed number of bits per character, as in the ASCII code. When a string is converted to arithmetic encoding, frequently used characters will be stored with fewer bits and not-so-frequently occurring characters will be stored with more bits, resulting in fewer bits used in total. Arithmetic coding differs from other forms of entropy encoding, such as Huffman coding, in that rather than separating the input into component symbols and replacing each with a code, arithmetic coding encodes the entire message into a single number.
- Task
Create a program which implements the arithmetic coding as a generalized change of radix.
Show the results, in base 10, for all the following strings:
- "DABDDB"
- "DABDDBBDDBA"
- "ABRACADABRA"
- "TOBEORNOTTOBEORTOBEORNOT"
Verify the implementation by decoding the results back into strings and checking for equality with the given strings.
Go
<lang go>package main
import (
"fmt" "math/big"
)
func cumulative_freq(freq map[byte]int64) map[byte]int64 {
total := int64(0) cf := make(map[byte]int64) for i := 0; i < 256; i++ { b := byte(i) if v, ok := freq[b]; ok { cf[b] = total total += v } } return cf
}
func arithmethic_coding(str string, radix int64) (*big.Int,
*big.Int, map[byte]int64) {
// Convert the string into a slice of bytes chars := []byte(str)
// The frequency characters freq := make(map[byte]int64) for _, c := range chars { freq[c] += 1 }
// The cumulative frequency cf := cumulative_freq(freq)
// Base base := len(chars)
// Lower bound L := big.NewInt(0)
// Product of all frequencies pf := big.NewInt(1)
// Each term is multiplied by the product of the // frequencies of all previously occurring symbols bigBase := big.NewInt(int64(base))
for _, c := range chars { x := big.NewInt(cf[c])
L.Mul(L, bigBase) L.Add(L, x.Mul(x, pf)) pf.Mul(pf, big.NewInt(freq[c])) }
// Upper bound U := big.NewInt(0) U.Set(L) U.Add(U, pf)
bigOne := big.NewInt(1) bigZero := big.NewInt(0) bigRadix := big.NewInt(radix)
tmp := big.NewInt(0).Set(pf) powr := big.NewInt(0)
for { tmp.Div(tmp, bigRadix) if tmp.Cmp(bigZero) == 0 { break } powr.Add(powr, bigOne) }
diff := big.NewInt(0) diff.Sub(U, bigOne) diff.Div(diff, big.NewInt(0).Exp(bigRadix, powr, nil))
return diff, powr, freq
}
func arithmethic_decoding(num *big.Int, radix int64,
pow *big.Int, freq map[byte]int64) string {
powr := big.NewInt(radix)
enc := big.NewInt(0).Set(num) enc.Mul(enc, powr.Exp(powr, pow, nil))
base := int64(0) for _, v := range freq { base += v }
// Create the cumulative frequency table cf := cumulative_freq(freq)
// Create the dictionary dict := make(map[int64]byte) for k, v := range cf { dict[v] = k }
// Fill the gaps in the dictionary lchar := -1 for i := int64(0); i < base; i++ { if v, ok := dict[i]; ok { lchar = int(v) } else if lchar != -1 { dict[i] = byte(lchar) } }
// Decode the input number decoded := make([]byte, base) bigBase := big.NewInt(base)
for i := base - 1; i >= 0; i-- {
pow := big.NewInt(0) pow.Exp(bigBase, big.NewInt(i), nil)
div := big.NewInt(0) div.Div(enc, pow)
c := dict[div.Int64()] fv := freq[c] cv := cf[c]
prod := big.NewInt(0).Mul(pow, big.NewInt(cv)) diff := big.NewInt(0).Sub(enc, prod) enc.Div(diff, big.NewInt(fv))
decoded[base-i-1] = c }
// Return the decoded output return string(decoded)
}
func main() {
var radix = int64(10)
strSlice := []string{ `DABDDB`, `DABDDBBDDBA`, `ABRACADABRA`, `TOBEORNOTTOBEORTOBEORNOT`, }
for _, str := range strSlice { enc, pow, freq := arithmethic_coding(str, radix) dec := arithmethic_decoding(enc, radix, pow, freq) fmt.Printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
if str != dec { panic("\tHowever that is incorrect!") } }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
J
Note that decoding assumes we know the length of the string (and also that strings are assumed to contain only upper case alphanumeric letters) - but that's not a part of this task.
Implementation:
<lang J>aek=:3 :0
b=. x:#y f=. (#/.~{~~.i.]) y c=. _65+a.i.y L=. b #. c**/\1,}:f p=. */f e=. x:<.10^.p e,~<.(L+p)%10^e
)</lang>
Results are number pairs where the second number is the number of zeros to append to the first.
Example use:
<lang J> aek 'DABDDB' 251 2
aek 'DABDDBBDDBA'
83677 6
aek 'ABRACADABRA'
4860830 4
aek 'TOBEORNOTTOBEORTOBEORNOT'
1224972022395235072 15</lang>
Draft note: these results are different from those of some other implementations currently visible here. I expect that that difference occurs from those other implementations using cumulative frequency instead of frequency at the wrong place in the calculation. (If I am incorrect about this, I'd appreciate a detailed exposition.)
Perl
<lang perl>use Math::BigInt (try => 'GMP');
sub cumulative_freq {
my ($freq) = @_;
my %cf; my $total = Math::BigInt->new(0); foreach my $c (map { chr } 0 .. 255) { if (exists $freq->{$c}) { $cf{$c} = $total; $total += $freq->{$c}; } }
return %cf;
}
sub arithmethic_coding {
my ($str, $radix) = @_; my @chars = split(//, $str);
# The frequency characters my %freq; $freq{$_}++ for @chars;
# The cumulative frequency table my %cf = cumulative_freq(\%freq);
# Base my $base = Math::BigInt->new(scalar @chars);
# Lower bound my $L = Math::BigInt->new(0);
# Product of all frequencies my $pf = Math::BigInt->new(1);
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols foreach my $c (@chars) { my $x = $cf{$c}; $L->bmuladd($base, $x * $pf); $pf->bmul($freq{$c}); }
# Upper bound my $U = $L + $pf;
my $pow = Math::BigInt->new($pf)->blog($radix); my $enc = ($U - 1)->bdiv(Math::BigInt->new($radix)->bpow($pow));
return ($enc, $pow, \%freq);
}
sub arithmethic_decoding {
my ($enc, $radix, $pow, $freq) = @_;
# Multiply enc by radix^pow $enc *= $radix**$pow;
my $base = Math::BigInt->new(0); $base += $_ for values %{$freq};
# Create the cumulative frequency table my %cf = cumulative_freq($freq);
# Create the dictionary my %dict; while (my ($k, $v) = each %cf) { $dict{$v} = $k; }
# Fill the gaps in the dictionary my $lchar; foreach my $i (0 .. $base - 1) { if (exists $dict{$i}) { $lchar = $dict{$i}; } elsif (defined $lchar) { $dict{$i} = $lchar; } }
# Decode the input number my $decoded = ; for (my $i = $base - 1 ; $i >= 0 ; $i--) {
my $pow = $base**$i; my $div = ($enc / $pow);
my $c = $dict{$div}; my $fv = $freq->{$c}; my $cv = $cf{$c};
my $rem = ($enc - $pow * $cv) / $fv;
$enc = $rem; $decoded .= $c; }
# Return the decoded output return $decoded;
}
my $radix = 10; # can be any integer greater or equal with 2
foreach my $str (qw(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT)) {
my ($enc, $pow, $freq) = arithmethic_coding($str, $radix); my $dec = arithmethic_decoding($enc, $radix, $pow, $freq);
printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
if ($str ne $dec) { die "\tHowever that is incorrect!"; }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Perl 6
<lang perl6>sub cumulative_freq(%freq) {
my %cf; my $total = 0; for (0..255).map({.chr}) -> $c { if (%freq{$c}:exists) { %cf{$c} = $total; $total += %freq{$c}; } } return %cf;
}
sub arithmethic_coding($str, $radix) {
my @chars = $str.comb;
# The frequency characters my %freq; %freq{$_}++ for @chars;
# The cumulative frequency table my %cf = cumulative_freq(%freq);
# Base my $base = @chars.elems;
# Lower bound my $L = 0;
# Product of all frequencies my $pf = 1;
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols for @chars -> $c { my $x = %cf{$c}; $L = $L * $base + $x * $pf; $pf *= %freq{$c}; }
# Upper bound my $U = $L + $pf;
my $pow = 0; loop { $pf div= $radix; last if $pf == 0; ++$pow; }
my $enc = ($U - 1) div ($radix ** $pow); ($enc, $pow, %freq);
}
sub arithmethic_decoding($encoding, $radix, $pow, %freq) {
# Multiply encoding by radix^pow my $enc = $encoding * $radix**$pow;
my $base = 0; $base += $_ for %freq.values;
# Create the cumulative frequency table my %cf = cumulative_freq(%freq);
# Create the dictionary my %dict; for %cf.kv -> $k,$v { %dict{$v} = $k; }
# Fill the gaps in the dictionary my $lchar; for ^$base -> $i { if (%dict{$i}:exists) { $lchar = %dict{$i}; } elsif (defined $lchar) { %dict{$i} = $lchar; } }
# Decode the input number my $decoded = ; for reverse(^$base) -> $i {
my $pow = $base**$i; my $div = $enc div $pow;
my $c = %dict{$div}; my $fv = %freq{$c}; my $cv = %cf{$c};
my $rem = ($enc - $pow*$cv) div $fv;
$enc = $rem; $decoded ~= $c; }
# Return the decoded output return $decoded;
}
my $radix = 10; # can be any integer greater or equal with 2
for <DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT> -> $str {
my ($enc, $pow, %freq) = arithmethic_coding($str, $radix); my $dec = arithmethic_decoding($enc, $radix, $pow, %freq);
printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
if ($str ne $dec) { die "\tHowever that is incorrect!"; }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Python
<lang python>from collections import Counter
def cumulative_freq(freq):
cf = {} total = 0 for b in range(256): if b in freq: cf[b] = total total += freq[b] return cf
def arithmethic_coding(bytes, radix):
# The frequency characters freq = Counter(bytes)
# The cumulative frequency table cf = cumulative_freq(freq)
# Base base = len(bytes)
# Lower bound lower = 0
# Product of all frequencies pf = 1
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols for b in bytes: x = cf[b] lower = lower * base + x*pf pf *= freq[b]
# Upper bound upper = lower+pf
pow = 0 while True: pf //= radix if pf==0: break pow += 1
enc = (upper-1) // radix**pow return enc, pow, freq
def arithmethic_decoding(enc, radix, pow, freq):
# Multiply enc by radix^pow enc *= radix**pow;
base = sum(freq.values())
# Create the cumulative frequency table cf = cumulative_freq(freq)
# Create the dictionary dict = {} for k,v in cf.items(): dict[v] = k
# Fill the gaps in the dictionary lchar = None for i in range(base): if i in dict: lchar = dict[i] elif lchar is not None: dict[i] = lchar
# Decode the input number decoded = bytearray() for i in range(base-1, -1, -1): pow = base**i div = enc//pow
c = dict[div] fv = freq[c] cv = cf[c]
rem = (enc - pow*cv) // fv
enc = rem decoded.append(c)
# Return the decoded output return bytes(decoded)
radix = 10 # can be any integer greater or equal with 2
for str in b'DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT'.split():
enc, pow, freq = arithmethic_coding(str, radix) dec = arithmethic_decoding(enc, radix, pow, freq)
print("%-25s=> %19s * %d^%s" % (str, enc, radix, pow))
if str != dec: raise Exception("\tHowever that is incorrect!")</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Ruby
<lang ruby>def cumulative_freq(freq)
cf = {} total = 0 (0..255).each do |b| if freq.has_key?(b) cf[b] = total total += freq[b] end end return cf
end
def arithmethic_coding(bytes, radix)
# The frequency characters freq = Hash.new(0) bytes.each { |b| freq[b] += 1 }
# The cumulative frequency table cf = cumulative_freq(freq)
# Base base = bytes.size
# Lower bound lower = 0
# Product of all frequencies pf = 1
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols bytes.each do |b| x = cf[b] lower = lower * base + x*pf pf *= freq[b] end
# Upper bound upper = lower+pf
pow = 0 loop do pf /= radix break if pf==0 pow += 1 end
enc = ((upper-1) / radix**pow) [enc, pow, freq]
end
def arithmethic_decoding(enc, radix, pow, freq)
# Multiply enc by radix^pow enc *= radix**pow;
base = 0 freq.each_value { |v| base += v }
# Create the cumulative frequency table cf = cumulative_freq(freq)
# Create the dictionary dict = {} cf.each_pair do |k,v| dict[v] = k end
# Fill the gaps in the dictionary lchar = nil (0...base).each do |i| if dict.has_key?(i) lchar = dict[i] elsif lchar != nil dict[i] = lchar end end
# Decode the input number decoded = [] (0...base).reverse_each do |i| pow = base**i div = enc/pow
c = dict[div] fv = freq[c] cv = cf[c]
rem = ((enc - pow*cv) / fv)
enc = rem decoded << c end
# Return the decoded output return decoded
end
radix = 10 # can be any integer greater or equal with 2
%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each do |str|
enc, pow, freq = arithmethic_coding(str.bytes, radix) dec = arithmethic_decoding(enc, radix, pow, freq).map{|b| b.chr }.join
printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
if str != dec raise "\tHowever that is incorrect!" end
end</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Sidef
<lang ruby>func cumulative_freq(freq) {
var cf = Hash() var total = 0 256.range.each { |b| if (freq.contains(b)) { cf{b} = total total += freq{b} } } return cf
}
func arithmethic_coding(bytes, radix=10) {
# The frequency characters var freq = Hash() bytes.each { |b| freq{b} := 0 ++ }
# The cumulative frequency table var cf = cumulative_freq(freq)
# Base var base = bytes.len
# Lower bound var L = 0
# Product of all frequencies var pf = 1
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols bytes.each { |b| var x = cf{b} L *= base += x*pf pf *= freq{b} }
# Upper bound var U = L+pf
var pow = pf.log(radix).int var enc = ((U-1) // radix**pow)
return (enc, pow, freq)
}
func arithmethic_decoding(enc, radix, pow, freq) {
# Multiply enc by radix^pow enc *= radix**pow;
var base = 0 freq.each_value { |v| base += v }
# Create the cumulative frequency table var cf = cumulative_freq(freq);
# Create the dictionary var dict = Hash() cf.each_kv { |k,v| dict{v} = k }
# Fill the gaps in the dictionary var lchar = base.range.each { |i| if (dict.contains(i)) { lchar = dict{i} } elsif (!lchar.is_empty) { dict{i} = lchar } }
# Decode the input number var decoded = [] base.range.reverse.each { |i|
var pow = base**i; var div = enc//pow
var c = dict{div} var fv = freq{c} var cv = cf{c}
var rem = ((enc - pow*cv) // fv)
enc = rem decoded << c }
# Return the decoded output return decoded
}
var radix = 10; # can be any integer greater or equal with 2
%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each { |str|
var (enc, pow, freq) = arithmethic_coding(str.bytes, radix) var dec = arithmethic_decoding(enc, radix, pow, freq).join_bytes('UTF-8')
printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow);
if (str != dec) { die "\tHowever that is incorrect!" }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15