Arithmetic coding/As a generalized change of radix: Difference between revisions
m (Added Perl 6) |
m (Added Ruby) |
||
Line 438: | Line 438: | ||
} |
} |
||
}</lang> |
}</lang> |
||
{{out}} |
|||
<pre> |
|||
DABDDB => 251 * 10^2 |
|||
DABDDBBDDBA => 167351 * 10^6 |
|||
ABRACADABRA => 7954170 * 10^4 |
|||
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15 |
|||
</pre> |
|||
=={{header|Ruby}}== |
|||
<lang ruby>def cumulative_freq(freq) |
|||
cf = {} |
|||
total = 0 |
|||
(0..255).each do |b| |
|||
if freq.has_key?(b) |
|||
cf[b] = total |
|||
total += freq[b] |
|||
end |
|||
end |
|||
return cf |
|||
end |
|||
def arithmethic_coding(bytes, radix) |
|||
# The frequency characters |
|||
freq = Hash.new(0) |
|||
bytes.each { |b| freq[b] += 1 } |
|||
# The cumulative frequency table |
|||
cf = cumulative_freq(freq) |
|||
# Limit and base |
|||
lim = bytes.size-1 |
|||
base = lim+1 |
|||
# Lower bound |
|||
lower = 0 |
|||
# Product of all frequencies |
|||
pf = 1 |
|||
# Each term is multiplied by the product of the |
|||
# frequencies of all previously occurring symbols |
|||
(0...base).each do |i| |
|||
x = cf[bytes[i]] * base**(lim-i) |
|||
lower += x*pf |
|||
pf *= freq[bytes[i]] |
|||
end |
|||
# Upper bound |
|||
upper = lower+pf |
|||
pow = 0 |
|||
loop do |
|||
pf /= radix |
|||
break if pf==0 |
|||
pow += 1 |
|||
end |
|||
enc = ((upper-1) / radix**pow) |
|||
[enc, pow, freq] |
|||
end |
|||
def arithmethic_decoding(enc, radix, pow, freq) |
|||
# Multiply enc by radix^pow |
|||
enc *= radix**pow; |
|||
base = 0 |
|||
freq.each_value { |v| base += v } |
|||
# Create the cumulative frequency table |
|||
cf = cumulative_freq(freq) |
|||
# Create the dictionary |
|||
dict = {} |
|||
cf.each_pair do |k,v| |
|||
dict[v] = k |
|||
end |
|||
# Fill the gaps in the dictionary |
|||
lchar = nil |
|||
(0...base).each do |i| |
|||
if dict.has_key?(i) |
|||
lchar = dict[i] |
|||
elsif lchar != nil |
|||
dict[i] = lchar |
|||
end |
|||
end |
|||
# Decode the input number |
|||
decoded = [] |
|||
(0...base).reverse_each do |i| |
|||
pow = base**i |
|||
div = enc/pow |
|||
c = dict[div] |
|||
fv = freq[c] |
|||
cv = cf[c] |
|||
rem = ((enc - pow*cv) / fv) |
|||
enc = rem |
|||
decoded << c |
|||
end |
|||
# Return the decoded output |
|||
return decoded |
|||
end |
|||
radix = 10 # can be any integer greater or equal with 2 |
|||
%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each do |str| |
|||
enc, pow, freq = arithmethic_coding(str.bytes, radix) |
|||
dec = arithmethic_decoding(enc, radix, pow, freq).map{|b| b.chr }.join |
|||
printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow) |
|||
if str != dec |
|||
raise "\tHowever that is incorrect!" |
|||
end |
|||
end</lang> |
|||
{{out}} |
{{out}} |
||
<pre> |
<pre> |
Revision as of 22:14, 24 January 2016
Arithmetic coding is a form of entropy encoding used in lossless data compression. Normally, a string of characters such as the words "hello there" is represented using a fixed number of bits per character, as in the ASCII code. When a string is converted to arithmetic encoding, frequently used characters will be stored with fewer bits and not-so-frequently occurring characters will be stored with more bits, resulting in fewer bits used in total. Arithmetic coding differs from other forms of entropy encoding, such as Huffman coding, in that rather than separating the input into component symbols and replacing each with a code, arithmetic coding encodes the entire message into a single number.
- Task
Create a program which implements the arithmetic coding as a generalized change of radix.
Show the results, in base 10, for all the following strings:
- "DABDDB"
- "DABDDBBDDBA"
- "ABRACADABRA"
- "TOBEORNOTTOBEORTOBEORNOT"
Verify the implementation by decoding the results back into strings and checking for equality with the given strings.
Go
<lang go>package main
import (
"fmt" "math/big"
)
func cumulative_freq(freq map[byte]int64) map[byte]int64 {
total := int64(0) cf := make(map[byte]int64) for i := 0; i < 256; i++ { b := byte(i) if v, ok := freq[b]; ok { cf[b] = total total += v } } return cf
}
func arithmethic_coding(str string, radix int64) (*big.Int,
*big.Int, map[byte]int64) {
// Convert the string into a slice of bytes chars := []byte(str)
// The frequency characters freq := make(map[byte]int64) for _, c := range chars { freq[c] += 1 }
// The cumulative frequency cf := cumulative_freq(freq)
// Base base := len(chars)
// Lower bound L := big.NewInt(0)
// Product of all frequencies pf := big.NewInt(1)
// Each term is multiplied by the product of the // frequencies of all previously occurring symbols bigBase := big.NewInt(int64(base)) bigLim := big.NewInt(int64(base - 1))
for i := 0; i < base; i++ {
bigI := big.NewInt(int64(i))
diff := big.NewInt(0) diff.Sub(bigLim, bigI)
pow := big.NewInt(0) pow.Exp(bigBase, diff, nil)
x := big.NewInt(1) x.Mul(big.NewInt(cf[chars[i]]), pow)
L.Add(L, x.Mul(x, pf)) pf.Mul(pf, big.NewInt(freq[chars[i]])) }
// Upper bound U := big.NewInt(0) U.Set(L) U.Add(U, pf)
bigOne := big.NewInt(1) bigZero := big.NewInt(0) bigRadix := big.NewInt(radix)
tmp := big.NewInt(0).Set(pf) powr := big.NewInt(0)
for { tmp.Div(tmp, bigRadix) if tmp.Cmp(bigZero) == 0 { break } powr.Add(powr, bigOne) }
diff := big.NewInt(0) diff.Sub(U, bigOne) diff.Div(diff, big.NewInt(0).Exp(bigRadix, powr, nil))
return diff, powr, freq
}
func arithmethic_decoding(num *big.Int, radix int64,
pow *big.Int, freq map[byte]int64) string {
powr := big.NewInt(radix)
enc := big.NewInt(0).Set(num) enc.Mul(enc, powr.Exp(powr, pow, nil))
base := int64(0) for _, v := range freq { base += v }
// Create the cumulative frequency table cf := cumulative_freq(freq)
// Create the dictionary dict := make(map[int64]byte) for k, v := range cf { dict[v] = k }
// Fill the gaps in the dictionary lchar := -1 for i := int64(0); i < base; i++ { if v, ok := dict[i]; ok { lchar = int(v) } else if lchar != -1 { dict[i] = byte(lchar) } }
// Decode the input number decoded := make([]byte, base) bigBase := big.NewInt(base)
for i := base - 1; i >= 0; i-- {
pow := big.NewInt(0) pow.Exp(bigBase, big.NewInt(i), nil)
div := big.NewInt(0) div.Div(enc, pow)
c := dict[div.Int64()] fv := freq[c] cv := cf[c]
prod := big.NewInt(0).Mul(pow, big.NewInt(cv)) diff := big.NewInt(0).Sub(enc, prod) enc.Div(diff, big.NewInt(fv))
decoded[base-i-1] = c }
// Return the decoded output return string(decoded)
}
func main() {
var radix = int64(10)
strSlice := []string{ `DABDDB`, `DABDDBBDDBA`, `ABRACADABRA`, `TOBEORNOTTOBEORTOBEORNOT`, }
for _, str := range strSlice { enc, pow, freq := arithmethic_coding(str, radix) dec := arithmethic_decoding(enc, radix, pow, freq) fmt.Printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
if str != dec { panic("\tHowever that is incorrect!") } }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Perl
<lang perl>use Math::BigInt (try => 'GMP');
sub cumulative_freq {
my ($freq) = @_;
my %cf; my $total = Math::BigInt->new(0); foreach my $c (map { chr } 0 .. 255) { if (exists $freq->{$c}) { $cf{$c} = $total; $total += $freq->{$c}; } }
return %cf;
}
sub arithmethic_coding {
my ($str, $radix) = @_; my @chars = split(//, $str);
# The frequency characters my %freq; $freq{$_}++ for @chars;
# The cumulative frequency table my %cf = cumulative_freq(\%freq);
# Limit and base my $lim = Math::BigInt->new($#chars); my $base = $lim + 1;
# Lower bound my $L = Math::BigInt->new(0);
# Product of all frequencies my $pf = Math::BigInt->new(1);
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols for (my $i = 0 ; $i < $base ; $i++) { my $x = $cf{$chars[$i]} * $base**($lim - $i); $L->badd($x * $pf); $pf->bmul($freq{$chars[$i]}); }
# Upper bound my $U = $L + $pf;
my $pow = Math::BigInt->new($pf)->blog($radix); my $enc = ($U - 1)->bdiv(Math::BigInt->new($radix)->bpow($pow));
return ($enc, $pow, \%freq);
}
sub arithmethic_decoding {
my ($enc, $radix, $pow, $freq) = @_;
# Multiply enc by radix^pow $enc *= $radix**$pow;
my $base = Math::BigInt->new(0); $base += $_ for values %{$freq};
# Create the cumulative frequency table my %cf = cumulative_freq($freq);
# Create the dictionary my %dict; while (my ($k, $v) = each %cf) { $dict{$v} = $k; }
# Fill the gaps in the dictionary my $lchar; foreach my $i (0 .. $base - 1) { if (exists $dict{$i}) { $lchar = $dict{$i}; } elsif (defined $lchar) { $dict{$i} = $lchar; } }
# Decode the input number my $decoded = ; for (my $i = $base - 1 ; $i >= 0 ; $i--) {
my $pow = $base**$i; my $div = ($enc / $pow);
my $c = $dict{$div}; my $fv = $freq->{$c}; my $cv = $cf{$c};
my $rem = ($enc - $pow * $cv) / $fv;
$enc = $rem; $decoded .= $c; }
# Return the decoded output return $decoded;
}
my $radix = 10; # can be any integer greater or equal with 2
foreach my $str (qw(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT)) {
my ($enc, $pow, $freq) = arithmethic_coding($str, $radix); my $dec = arithmethic_decoding($enc, $radix, $pow, $freq);
printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
if ($str ne $dec) { die "\tHowever that is incorrect!"; }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Perl 6
<lang perl6>sub cumulative_freq(%freq) {
my %cf; my $total = 0; for (0..255).map({.chr}) -> $c { if (%freq{$c}:exists) { %cf{$c} = $total; $total += %freq{$c}; } } return %cf;
}
sub arithmethic_coding($str, $radix) {
my @chars = $str.comb;
# The frequency characters my %freq; %freq{$_}++ for @chars;
# The cumulative frequency table my %cf = cumulative_freq(%freq);
# Limit and base my $lim = @chars.end; my $base = $lim + 1;
# Lower bound my $L = 0;
# Product of all frequencies my $pf = 1;
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols for ^$base -> $i { my $x = %cf{@chars[$i]} * $base**($lim - $i); $L += $x * $pf; $pf *= %freq{@chars[$i]}; }
# Upper bound my $U = $L + $pf;
my $pow = 0; loop { $pf div= $radix; last if $pf == 0; ++$pow; }
my $enc = ($U - 1) div ($radix ** $pow); ($enc, $pow, %freq);
}
sub arithmethic_decoding($encoding, $radix, $pow, %freq) {
# Multiply encoding by radix^pow my $enc = $encoding * $radix**$pow;
my $base = 0; $base += $_ for %freq.values;
# Create the cumulative frequency table my %cf = cumulative_freq(%freq);
# Create the dictionary my %dict; for %cf.kv -> $k,$v { %dict{$v} = $k; }
# Fill the gaps in the dictionary my $lchar; for ^$base -> $i { if (%dict{$i}:exists) { $lchar = %dict{$i}; } elsif (defined $lchar) { %dict{$i} = $lchar; } }
# Decode the input number my $decoded = ; for reverse(^$base) -> $i {
my $pow = $base**$i; my $div = $enc div $pow;
my $c = %dict{$div}; my $fv = %freq{$c}; my $cv = %cf{$c};
my $rem = ($enc - $pow*$cv) div $fv;
$enc = $rem; $decoded ~= $c; }
# Return the decoded output return $decoded;
}
my $radix = 10; # can be any integer greater or equal with 2
for <DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT> -> $str {
my ($enc, $pow, %freq) = arithmethic_coding($str, $radix); my $dec = arithmethic_decoding($enc, $radix, $pow, %freq);
printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
if ($str ne $dec) { die "\tHowever that is incorrect!"; }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Ruby
<lang ruby>def cumulative_freq(freq)
cf = {} total = 0 (0..255).each do |b| if freq.has_key?(b) cf[b] = total total += freq[b] end end return cf
end
def arithmethic_coding(bytes, radix)
# The frequency characters freq = Hash.new(0) bytes.each { |b| freq[b] += 1 }
# The cumulative frequency table cf = cumulative_freq(freq)
# Limit and base lim = bytes.size-1 base = lim+1
# Lower bound lower = 0
# Product of all frequencies pf = 1
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols (0...base).each do |i| x = cf[bytes[i]] * base**(lim-i) lower += x*pf pf *= freq[bytes[i]] end
# Upper bound upper = lower+pf
pow = 0 loop do pf /= radix break if pf==0 pow += 1 end
enc = ((upper-1) / radix**pow) [enc, pow, freq]
end
def arithmethic_decoding(enc, radix, pow, freq)
# Multiply enc by radix^pow enc *= radix**pow;
base = 0 freq.each_value { |v| base += v }
# Create the cumulative frequency table cf = cumulative_freq(freq)
# Create the dictionary dict = {} cf.each_pair do |k,v| dict[v] = k end
# Fill the gaps in the dictionary lchar = nil (0...base).each do |i| if dict.has_key?(i) lchar = dict[i] elsif lchar != nil dict[i] = lchar end end
# Decode the input number decoded = [] (0...base).reverse_each do |i| pow = base**i div = enc/pow
c = dict[div] fv = freq[c] cv = cf[c]
rem = ((enc - pow*cv) / fv)
enc = rem decoded << c end
# Return the decoded output return decoded
end
radix = 10 # can be any integer greater or equal with 2
%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each do |str|
enc, pow, freq = arithmethic_coding(str.bytes, radix) dec = arithmethic_decoding(enc, radix, pow, freq).map{|b| b.chr }.join
printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
if str != dec raise "\tHowever that is incorrect!" end
end</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15
Sidef
<lang ruby>func cumulative_freq(freq) {
var cf = Hash() var total = 0 256.range.each { |b| if (freq.contains(b)) { cf{b} = total total += freq{b} } } return cf
}
func arithmethic_coding(bytes, radix=10) {
# The frequency characters var freq = Hash() bytes.each { |b| freq{b} := 0 ++ }
# The cumulative frequency table var cf = cumulative_freq(freq)
# Limit and base var lim = bytes.end var base = lim+1
# Lower bound var L = 0
# Product of all frequencies var pf = 1
# Each term is multiplied by the product of the # frequencies of all previously occurring symbols base.range.each { |i| var x = (cf{bytes[i]} * base**(lim - i)) L += x*pf pf *= freq{bytes[i]} }
# Upper bound var U = L+pf
var pow = pf.log(radix).int var enc = ((U-1) // radix**pow)
return (enc, pow, freq)
}
func arithmethic_decoding(enc, radix, pow, freq) {
# Multiply enc by radix^pow enc *= radix**pow;
var base = 0 freq.each_value { |v| base += v }
# Create the cumulative frequency table var cf = cumulative_freq(freq);
# Create the dictionary var dict = Hash() cf.each_kv { |k,v| dict{v} = k }
# Fill the gaps in the dictionary var lchar = base.range.each { |i| if (dict.contains(i)) { lchar = dict{i} } elsif (!lchar.is_empty) { dict{i} = lchar } }
# Decode the input number var decoded = [] base.range.reverse.each { |i|
var pow = base**i; var div = enc//pow
var c = dict{div} var fv = freq{c} var cv = cf{c}
var rem = ((enc - pow*cv) // fv)
enc = rem decoded << c }
# Return the decoded output return decoded
}
var radix = 10; # can be any integer greater or equal with 2
%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each { |str|
var (enc, pow, freq) = arithmethic_coding(str.bytes, radix) var dec = arithmethic_decoding(enc, radix, pow, freq).join_bytes('UTF-8')
printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow);
if (str != dec) { die "\tHowever that is incorrect!" }
}</lang>
- Output:
DABDDB => 251 * 10^2 DABDDBBDDBA => 167351 * 10^6 ABRACADABRA => 7954170 * 10^4 TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15