Arithmetic coding/As a generalized change of radix: Difference between revisions
Line 217: | Line 217: | ||
1150764267498783364 15</lang> |
1150764267498783364 15</lang> |
||
Note that these results are not sufficient to recover the original |
Note that these results are not sufficient to recover the original character sequences. |
||
=={{header|Perl}}== |
=={{header|Perl}}== |
Revision as of 07:47, 1 February 2016
Arithmetic coding is a form of entropy encoding used in lossless data compression. Normally, a string of characters such as the words "hello there" is represented using a fixed number of bits per character, as in the ASCII code. When a string is converted to arithmetic encoding, frequently used characters will be stored with fewer bits and not-so-frequently occurring characters will be stored with more bits, resulting in fewer bits used in total. Arithmetic coding differs from other forms of entropy encoding, such as Huffman coding, in that rather than separating the input into component symbols and replacing each with a code, arithmetic coding encodes the entire message into a single number.
- Task
Create a program which implements the arithmetic coding as a generalized change of radix.
Show the results, in base 10, for all the following strings:
- "DABDDB"
- "DABDDBBDDBA"
- "ABRACADABRA"
- "TOBEORNOTTOBEORTOBEORNOT"
Verify the implementation by decoding the results back into strings and checking for equality with the given strings.
Go
<lang go>package main
import (
"fmt" "math/big"
)
func cumulative_freq(freq map[byte]int64) map[byte]int64 {
total := int64(0) cf := make(map[byte]int64) for i := 0; i < 256; i++ { b := byte(i) if v, ok := freq[b]; ok { cf[b] = total total += v } } return cf
}
func arithmethic_coding(str string, radix int64) (*big.Int,
*big.Int, map[byte]int64) {
// Convert the string into a slice of bytes chars := []byte(str)
// The frequency characters freq := make(map[byte]int64) for _, c := range chars { freq[c] += 1 }
// The cumulative frequency cf := cumulative_freq(freq)
// Base base := len(chars)
// Lower bound L := big.NewInt(0)
// Product of all frequencies pf := big.NewInt(1)
// Each term is multiplied by the product of the // frequencies of all previously occurring symbols bigBase := big.NewInt(int64(base))
for _, c := range chars { x := big.NewInt(cf[c])
L.Mul(L, bigBase) L.Add(L, x.Mul(x, pf)) pf.Mul(pf, big.NewInt(freq[c])) }
// Upper bound U := big.NewInt(0) U.Set(L) U.Add(U, pf)
bigOne := big.NewInt(1) bigZero := big.NewInt(0) bigRadix := big.NewInt(radix)
tmp := big.NewInt(0).Set(pf) powr := big.NewInt(0)
for { tmp.Div(tmp, bigRadix) if tmp.Cmp(bigZero) == 0 { break } powr.Add(powr, bigOne) }
diff := big.NewInt(0) diff.Sub(U, bigOne) diff.Div(diff, big.NewInt(0).Exp(bigRadix, powr, nil))
return diff, powr, freq
}
func arithmethic_decoding(num *big.Int, radix int64,
pow *big.Int, freq map[byte]int64) string {
powr := big.NewInt(radix)
enc := big.NewInt(0).Set(num) enc.Mul(enc, powr.Exp(powr, pow, nil))
base := int64(0) for _, v := range freq { base += v }
// Create the cumulative frequency table cf := cumulative_freq(freq)
// Create the dictionary dict := make(map[int64]byte) for k, v := range cf { dict[v] = k }
// Fill the gaps in the dictionary lchar := -1 for i := int64(0); i < base; i++ { if v, ok := dict[i]; ok { lchar = int(v) } else if lchar != -1 { dict[i] = byte(lchar) } }
// Decode the input number decoded := make([]byte, base) bigBase := big.NewInt(base)
for i := base - 1; i >= 0; i-- {
pow := big.NewInt(0) pow.Exp(bigBase, big.NewInt(i), nil)
div := big.NewInt(0) div.Div(enc, pow)
c := dict[div.Int64()] fv := freq[c] cv := cf[c]
prod := big.NewInt(0).Mul(pow, big.NewInt(cv)) diff := big.NewInt(0).Sub(enc, prod) enc.Div(diff, big.NewInt(fv))
decoded[base-i-1] = c }
// Return the decoded output return string(decoded)
}
func main() {
var radix = int64(10)
strSlice := []string{ `DABDDB`, `DABDDBBDDBA`, `ABRACADABRA`, `TOBEORNOTTOBEORTOBEORNOT`, }
for _, str := range strSlice { enc, pow, freq := arithmethic_coding(str, radix) dec := arithmethic_decoding(enc, radix, pow, freq) fmt.Printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
if str != dec { panic("\tHowever that is incorrect!") } }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
J
Note that decoding assumes we know the length of the string (and also that strings are assumed to contain only upper case alphanumeric letters) - but that's not a part of this task.
Implementation:
<lang J>aek=:3 :0
b=. x:#y NB. numeric base u=. ~.y NB. unique list of characters i=. u i.y NB. character indices into uniques n=. #/.~y NB. frequencies of uniques o=. /:u NB. indices to sort uniques alphabetically c=. i{(+/\0,}:o{n)/:o NB. cumulative frequencies of characters f=. i{n NB. frequencies of characters L=. b #. c**/\1,}:f NB. lower bound p=. */f NB. product of frequencies of characters e=. x:<.10^.p NB. number of decimal positions to drop e,~<.(L+p)%10^e NB. convenient result with trailing zeros count
)</lang>
Example use:
<lang J> aek 'DABDDB' 251 2
aek 'DABDDBBDDBA'
167351 6
aeka 'ABRACADABRA'
7954170 4
aek 'TOBEORNOTTOBEORTOBEORNOT'
1150764267498783364 15</lang>
Note that these results are not sufficient to recover the original character sequences.
Perl
<lang perl>use Math::BigInt (try => 'GMP');
sub cumulative_freq {
my ($freq) = @_;
my %cf; my $total = Math::BigInt->new(0); foreach my $c (sort keys %$freq) { $cf{$c} = $total; $total += $freq->{$c}; }
return %cf;
}
sub arithmethic_coding {
my ($str, $radix) = @_; my @chars = split(//, $str);
# The frequency characters my %freq; $freq{$_}++ for @chars;
# The cumulative frequency table my %cf = cumulative_freq(\%freq);
# Base my $base = Math::BigInt->new(scalar @chars);
# Lower bound my $L = Math::BigInt->new(0);
# Product of all frequencies my $pf = Math::BigInt->new(1);
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols foreach my $c (@chars) { $L->bmuladd($base, $cf{$c} * $pf); $pf->bmul($freq{$c}); }
# Upper bound my $U = $L + $pf;
my $pow = Math::BigInt->new($pf)->blog($radix); my $enc = ($U - 1)->bdiv(Math::BigInt->new($radix)->bpow($pow));
return ($enc, $pow, \%freq);
}
sub arithmethic_decoding {
my ($enc, $radix, $pow, $freq) = @_;
# Multiply enc by radix^pow $enc *= $radix**$pow;
# Base my $base = Math::BigInt->new(0); $base += $_ for values %{$freq};
# Create the cumulative frequency table my %cf = cumulative_freq($freq);
# Create the dictionary my %dict; while (my ($k, $v) = each %cf) { $dict{$v} = $k; }
# Fill the gaps in the dictionary my $lchar; foreach my $i (0 .. $base - 1) { if (exists $dict{$i}) { $lchar = $dict{$i}; } elsif (defined $lchar) { $dict{$i} = $lchar; } }
# Decode the input number my $decoded = ; for (my $pow = $base**($base - 1) ; $pow > 0 ; $pow /= $base) { my $div = $enc / $pow;
my $c = $dict{$div}; my $fv = $freq->{$c}; my $cv = $cf{$c};
$enc = ($enc - $pow * $cv) / $fv; $decoded .= $c; }
# Return the decoded output return $decoded;
}
my $radix = 10; # can be any integer greater or equal with 2
foreach my $str (qw(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT)) {
my ($enc, $pow, $freq) = arithmethic_coding($str, $radix); my $dec = arithmethic_decoding($enc, $radix, $pow, $freq);
printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
if ($str ne $dec) { die "\tHowever that is incorrect!"; }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Perl 6
<lang perl6>sub cumulative_freq(%freq) {
my %cf; my $total = 0; for %freq.keys.sort -> $c { %cf{$c} = $total; $total += %freq{$c}; } return %cf;
}
sub arithmethic_coding($str, $radix) {
my @chars = $str.comb;
# The frequency characters my %freq; %freq{$_}++ for @chars;
# The cumulative frequency table my %cf = cumulative_freq(%freq);
# Base my $base = @chars.elems;
# Lower bound my $L = 0;
# Product of all frequencies my $pf = 1;
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols for @chars -> $c { $L = $L*$base + %cf{$c}*$pf; $pf *= %freq{$c}; }
# Upper bound my $U = $L + $pf;
my $pow = 0; loop { $pf div= $radix; last if $pf == 0; ++$pow; }
my $enc = ($U - 1) div ($radix ** $pow); ($enc, $pow, %freq);
}
sub arithmethic_decoding($encoding, $radix, $pow, %freq) {
# Multiply encoding by radix^pow my $enc = $encoding * $radix**$pow;
# Base my $base = [+] %freq.values;
# Create the cumulative frequency table my %cf = cumulative_freq(%freq);
# Create the dictionary my %dict; for %cf.kv -> $k,$v { %dict{$v} = $k; }
# Fill the gaps in the dictionary my $lchar; for ^$base -> $i { if (%dict{$i}:exists) { $lchar = %dict{$i}; } elsif (defined $lchar) { %dict{$i} = $lchar; } }
# Decode the input number my $decoded = ; for reverse(^$base) -> $i {
my $pow = $base**$i; my $div = $enc div $pow;
my $c = %dict{$div}; my $fv = %freq{$c}; my $cv = %cf{$c};
my $rem = ($enc - $pow*$cv) div $fv;
$enc = $rem; $decoded ~= $c; }
# Return the decoded output return $decoded;
}
my $radix = 10; # can be any integer greater or equal with 2
for <DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT> -> $str {
my ($enc, $pow, %freq) = arithmethic_coding($str, $radix); my $dec = arithmethic_decoding($enc, $radix, $pow, %freq);
printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
if ($str ne $dec) { die "\tHowever that is incorrect!"; }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Python
<lang python>from collections import Counter
def cumulative_freq(freq):
cf = {} total = 0 for b in range(256): if b in freq: cf[b] = total total += freq[b] return cf
def arithmethic_coding(bytes, radix):
# The frequency characters freq = Counter(bytes)
# The cumulative frequency table cf = cumulative_freq(freq)
# Base base = len(bytes)
# Lower bound lower = 0
# Product of all frequencies pf = 1
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols for b in bytes: lower = lower*base + cf[b]*pf pf *= freq[b]
# Upper bound upper = lower+pf
pow = 0 while True: pf //= radix if pf==0: break pow += 1
enc = (upper-1) // radix**pow return enc, pow, freq
def arithmethic_decoding(enc, radix, pow, freq):
# Multiply enc by radix^pow enc *= radix**pow;
# Base base = sum(freq.values())
# Create the cumulative frequency table cf = cumulative_freq(freq)
# Create the dictionary dict = {} for k,v in cf.items(): dict[v] = k
# Fill the gaps in the dictionary lchar = None for i in range(base): if i in dict: lchar = dict[i] elif lchar is not None: dict[i] = lchar
# Decode the input number decoded = bytearray() for i in range(base-1, -1, -1): pow = base**i div = enc//pow
c = dict[div] fv = freq[c] cv = cf[c]
rem = (enc - pow*cv) // fv
enc = rem decoded.append(c)
# Return the decoded output return bytes(decoded)
radix = 10 # can be any integer greater or equal with 2
for str in b'DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT'.split():
enc, pow, freq = arithmethic_coding(str, radix) dec = arithmethic_decoding(enc, radix, pow, freq)
print("%-25s=> %19s * %d^%s" % (str, enc, radix, pow))
if str != dec: raise Exception("\tHowever that is incorrect!")</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Ruby
<lang ruby>def cumulative_freq(freq)
cf = {} total = 0 freq.keys.sort.each do |b| cf[b] = total total += freq[b] end return cf
end
def arithmethic_coding(bytes, radix)
# The frequency characters freq = Hash.new(0) bytes.each { |b| freq[b] += 1 }
# The cumulative frequency table cf = cumulative_freq(freq)
# Base base = bytes.size
# Lower bound lower = 0
# Product of all frequencies pf = 1
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols bytes.each do |b| lower = lower*base + cf[b]*pf pf *= freq[b] end
# Upper bound upper = lower+pf
pow = 0 loop do pf /= radix break if pf==0 pow += 1 end
enc = ((upper-1) / radix**pow) [enc, pow, freq]
end
def arithmethic_decoding(enc, radix, pow, freq)
# Multiply enc by radix^pow enc *= radix**pow;
# Base base = freq.values.reduce(:+)
# Create the cumulative frequency table cf = cumulative_freq(freq)
# Create the dictionary dict = {} cf.each_pair do |k,v| dict[v] = k end
# Fill the gaps in the dictionary lchar = nil (0...base).each do |i| if dict.has_key?(i) lchar = dict[i] elsif lchar != nil dict[i] = lchar end end
# Decode the input number decoded = [] (0...base).reverse_each do |i| pow = base**i div = enc/pow
c = dict[div] fv = freq[c] cv = cf[c]
rem = ((enc - pow*cv) / fv)
enc = rem decoded << c end
# Return the decoded output return decoded
end
radix = 10 # can be any integer greater or equal with 2
%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each do |str|
enc, pow, freq = arithmethic_coding(str.bytes, radix) dec = arithmethic_decoding(enc, radix, pow, freq).map{|b| b.chr }.join
printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
if str != dec raise "\tHowever that is incorrect!" end
end</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Sidef
<lang ruby>func cumulative_freq(freq) {
var cf = Hash() var total = 0 256.range.each { |b| if (freq.contains(b)) { cf{b} = total total += freq{b} } } return cf
}
func arithmethic_coding(bytes, radix=10) {
# The frequency characters var freq = Hash() bytes.each { |b| freq{b} := 0 ++ }
# The cumulative frequency table var cf = cumulative_freq(freq)
# Base var base = bytes.len
# Lower bound var L = 0
# Product of all frequencies var pf = 1
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols bytes.each { |b| L = (L*base + cf{b}*pf) pf *= freq{b} }
# Upper bound var U = L+pf
var pow = pf.log(radix).int var enc = ((U-1) // radix**pow)
return (enc, pow, freq)
}
func arithmethic_decoding(enc, radix, pow, freq) {
# Multiply enc by radix^pow enc *= radix**pow;
# Base var base = freq.values.sum
# Create the cumulative frequency table var cf = cumulative_freq(freq);
# Create the dictionary var dict = Hash() cf.each_kv { |k,v| dict{v} = k }
# Fill the gaps in the dictionary var lchar = base.range.each { |i| if (dict.contains(i)) { lchar = dict{i} } elsif (!lchar.is_empty) { dict{i} = lchar } }
# Decode the input number var decoded = [] base.range.reverse.each { |i|
var pow = base**i; var div = enc//pow
var c = dict{div} var fv = freq{c} var cv = cf{c}
var rem = ((enc - pow*cv) // fv)
enc = rem decoded << c }
# Return the decoded output return decoded
}
var radix = 10; # can be any integer greater or equal with 2
%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each { |str|
var (enc, pow, freq) = arithmethic_coding(str.bytes, radix) var dec = arithmethic_decoding(enc, radix, pow, freq).join_bytes('UTF-8')
printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow);
if (str != dec) { die "\tHowever that is incorrect!" }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15