CloudFlare suffered a massive security issue affecting all of its customers, including Rosetta Code. All passwords not changed since February 19th 2017 have been expired, and session cookie longevity will be reduced until late March.--Michael Mol (talk) 05:15, 25 February 2017 (UTC)

Arithmetic coding/As a generalized change of radix

From Rosetta Code
Arithmetic coding/As a generalized change of radix is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.
Arithmetic coding is a form of entropy encoding used in lossless data compression. Normally, a string of characters such as the words "hello there" is represented using a fixed number of bits per character, as in the ASCII code. When a string is converted to arithmetic encoding, frequently used characters will be stored with fewer bits and not-so-frequently occurring characters will be stored with more bits, resulting in fewer bits used in total. Arithmetic coding differs from other forms of entropy encoding, such as Huffman coding, in that rather than separating the input into component symbols and replacing each with a code, arithmetic coding encodes the entire message into a single number.
Task

Create a program which implements the arithmetic coding as a generalized change of radix.

Show the results, in base 10, for all the following strings:

  • "DABDDB"
  • "DABDDBBDDBA"
  • "ABRACADABRA"
  • "TOBEORNOTTOBEORTOBEORNOT"


Verify the implementation by decoding the results back into strings and checking for equality with the given strings.

Go[edit]

package main
 
import (
"fmt"
"math/big"
)
 
func cumulative_freq(freq map[byte]int64) map[byte]int64 {
total := int64(0)
cf := make(map[byte]int64)
for i := 0; i < 256; i++ {
b := byte(i)
if v, ok := freq[b]; ok {
cf[b] = total
total += v
}
}
return cf
}
 
func arithmethic_coding(str string, radix int64) (*big.Int,
*big.Int, map[byte]int64) {
 
// Convert the string into a slice of bytes
chars := []byte(str)
 
// The frequency characters
freq := make(map[byte]int64)
for _, c := range chars {
freq[c] += 1
}
 
// The cumulative frequency
cf := cumulative_freq(freq)
 
// Base
base := len(chars)
 
// Lower bound
L := big.NewInt(0)
 
// Product of all frequencies
pf := big.NewInt(1)
 
// Each term is multiplied by the product of the
// frequencies of all previously occurring symbols
bigBase := big.NewInt(int64(base))
 
for _, c := range chars {
x := big.NewInt(cf[c])
 
L.Mul(L, bigBase)
L.Add(L, x.Mul(x, pf))
pf.Mul(pf, big.NewInt(freq[c]))
}
 
// Upper bound
U := big.NewInt(0)
U.Set(L)
U.Add(U, pf)
 
bigOne := big.NewInt(1)
bigZero := big.NewInt(0)
bigRadix := big.NewInt(radix)
 
tmp := big.NewInt(0).Set(pf)
powr := big.NewInt(0)
 
for {
tmp.Div(tmp, bigRadix)
if tmp.Cmp(bigZero) == 0 {
break
}
powr.Add(powr, bigOne)
}
 
diff := big.NewInt(0)
diff.Sub(U, bigOne)
diff.Div(diff, big.NewInt(0).Exp(bigRadix, powr, nil))
 
return diff, powr, freq
}
 
func arithmethic_decoding(num *big.Int, radix int64,
pow *big.Int, freq map[byte]int64) string {
 
powr := big.NewInt(radix)
 
enc := big.NewInt(0).Set(num)
enc.Mul(enc, powr.Exp(powr, pow, nil))
 
base := int64(0)
for _, v := range freq {
base += v
}
 
// Create the cumulative frequency table
cf := cumulative_freq(freq)
 
// Create the dictionary
dict := make(map[int64]byte)
for k, v := range cf {
dict[v] = k
}
 
// Fill the gaps in the dictionary
lchar := -1
for i := int64(0); i < base; i++ {
if v, ok := dict[i]; ok {
lchar = int(v)
} else if lchar != -1 {
dict[i] = byte(lchar)
}
}
 
// Decode the input number
decoded := make([]byte, base)
bigBase := big.NewInt(base)
 
for i := base - 1; i >= 0; i-- {
 
pow := big.NewInt(0)
pow.Exp(bigBase, big.NewInt(i), nil)
 
div := big.NewInt(0)
div.Div(enc, pow)
 
c := dict[div.Int64()]
fv := freq[c]
cv := cf[c]
 
prod := big.NewInt(0).Mul(pow, big.NewInt(cv))
diff := big.NewInt(0).Sub(enc, prod)
enc.Div(diff, big.NewInt(fv))
 
decoded[base-i-1] = c
}
 
// Return the decoded output
return string(decoded)
}
 
func main() {
 
var radix = int64(10)
 
strSlice := []string{
`DABDDB`,
`DABDDBBDDBA`,
`ABRACADABRA`,
`TOBEORNOTTOBEORTOBEORNOT`,
}
 
for _, str := range strSlice {
enc, pow, freq := arithmethic_coding(str, radix)
dec := arithmethic_decoding(enc, radix, pow, freq)
fmt.Printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
 
if str != dec {
panic("\tHowever that is incorrect!")
}
}
}
Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

J[edit]

Implementation:

NB. generate a frequency dictionary from a reference string
aekDict=:verb define
d=. ~.y NB. dictionary lists unique characters
o=. /:d NB. in canonical order
f=. (#/.~%&x:#)y NB. and their relative frequencies
(o{d);o{f
)
 
NB. encode a string against a reference dict
aekEnc=:verb define
NB. use string to generate a dict if none provided
(aekDict y) aekEnc y
:
'u F'=.x NB. unpack dictionary
b=. x:#y NB. numeric base
f=. b*F NB. absolute frequencies
i=. u i.y NB. character indices
c=. +/\0,}:f NB. cumulative frequencies
L=. b #. (i{c)**/\1,}:i{f NB. lower bound
p=. */i{f NB. product of character frequencies
e=. x:<.10^.p NB. number of decimal positions to drop
e,~<.(L+p)%10^e
)
 
aekDec=:adverb define
:
'u F'=. x NB. unpack dictionary
f=. m*F NB. frequencies of characters
c=.+/\0,}:f NB. cumulative frequencies
C=.<:}.c,m NB. id lookup table
N=. (* 10&^)/y NB. remainder being decoded
r=. '' NB. result of decode
for_d. m^x:i.-m do. NB. positional values
id=. <.N%d NB. character id
i=.C I.id NB. character index
N=.<.(N -(i{c)*d)%i{f NB. corrected remainder
r=.r,u{~i NB. accumulated result
end.
)
 
NB. task demo utility:
aek=:verb define
dict=. aekDict y
echo 'Dictionary:'
echo ' ',.(0{::dict),.' ',.":,.1{::dict
echo 'Length:'
echo ' ',":#y
echo 'Encoded:'
echo ' ',":dict aekEnc y
echo 'Decoded:'
echo ' ',":dict (#y) aekDec aekEnc y
)

Example use:

   aek 'DABDDB'
Dictionary:
A 1r6
B 1r3
D 1r2
Length:
6
Encoded:
251 2
Decoded:
DABDDB
 
aek 'DABDDBBDDBA'
Dictionary:
A 2r11
B 4r11
D 5r11
Length:
11
Encoded:
167351 6
Decoded:
DABDDBBDDBA
 
aek 'ABRACADABRA'
Dictionary:
A 5r11
B 2r11
C 1r11
D 1r11
R 2r11
Length:
11
Encoded:
7954170 4
Decoded:
ABRACADABRA
 
aek 'TOBEORNOTTOBEORTOBEORNOT'
Dictionary:
B 1r8
E 1r8
N 1r12
O 1r3
R 1r8
T 5r24
Length:
24
Encoded:
1150764267498783364 15
Decoded:
TOBEORNOTTOBEORTOBEORNOT

Note that for this task we use our plaintext to generate our dictionary for decoding. Also note that we use rational numbers, rather than floating point, for our dictionary, because floating point tends to be inexact.

Perl[edit]

use Math::BigInt (try => 'GMP');
 
sub cumulative_freq {
my ($freq) = @_;
 
my %cf;
my $total = Math::BigInt->new(0);
foreach my $c (sort keys %$freq) {
$cf{$c} = $total;
$total += $freq->{$c};
}
 
return %cf;
}
 
sub arithmethic_coding {
my ($str, $radix) = @_;
my @chars = split(//, $str);
 
# The frequency characters
my %freq;
$freq{$_}++ for @chars;
 
# The cumulative frequency table
my %cf = cumulative_freq(\%freq);
 
# Base
my $base = Math::BigInt->new(scalar @chars);
 
# Lower bound
my $L = Math::BigInt->new(0);
 
# Product of all frequencies
my $pf = Math::BigInt->new(1);
 
# Each term is multiplied by the product of the
# frequencies of all previously occurring symbols
foreach my $c (@chars) {
$L->bmuladd($base, $cf{$c} * $pf);
$pf->bmul($freq{$c});
}
 
# Upper bound
my $U = $L + $pf;
 
my $pow = Math::BigInt->new($pf)->blog($radix);
my $enc = ($U - 1)->bdiv(Math::BigInt->new($radix)->bpow($pow));
 
return ($enc, $pow, \%freq);
}
 
sub arithmethic_decoding {
my ($enc, $radix, $pow, $freq) = @_;
 
# Multiply enc by radix^pow
$enc *= $radix**$pow;
 
# Base
my $base = Math::BigInt->new(0);
$base += $_ for values %{$freq};
 
# Create the cumulative frequency table
my %cf = cumulative_freq($freq);
 
# Create the dictionary
my %dict;
while (my ($k, $v) = each %cf) {
$dict{$v} = $k;
}
 
# Fill the gaps in the dictionary
my $lchar;
foreach my $i (0 .. $base - 1) {
if (exists $dict{$i}) {
$lchar = $dict{$i};
}
elsif (defined $lchar) {
$dict{$i} = $lchar;
}
}
 
# Decode the input number
my $decoded = '';
for (my $pow = $base**($base - 1) ; $pow > 0 ; $pow /= $base) {
my $div = $enc / $pow;
 
my $c = $dict{$div};
my $fv = $freq->{$c};
my $cv = $cf{$c};
 
$enc = ($enc - $pow * $cv) / $fv;
$decoded .= $c;
}
 
# Return the decoded output
return $decoded;
}
 
my $radix = 10; # can be any integer greater or equal with 2
 
foreach my $str (qw(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT)) {
my ($enc, $pow, $freq) = arithmethic_coding($str, $radix);
my $dec = arithmethic_decoding($enc, $radix, $pow, $freq);
 
printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
 
if ($str ne $dec) {
die "\tHowever that is incorrect!";
}
}
Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

Perl 6[edit]

sub cumulative_freq(%freq) {
my %cf;
my $total = 0;
for %freq.keys.sort -> $c {
%cf{$c} = $total;
$total += %freq{$c};
}
return %cf;
}
 
sub arithmethic_coding($str, $radix) {
my @chars = $str.comb;
 
# The frequency characters
my %freq;
%freq{$_}++ for @chars;
 
# The cumulative frequency table
my %cf = cumulative_freq(%freq);
 
# Base
my $base = @chars.elems;
 
# Lower bound
my $L = 0;
 
# Product of all frequencies
my $pf = 1;
 
# Each term is multiplied by the product of the
# frequencies of all previously occurring symbols
for @chars -> $c {
$L = $L*$base + %cf{$c}*$pf;
$pf *= %freq{$c};
}
 
# Upper bound
my $U = $L + $pf;
 
my $pow = 0;
loop {
$pf div= $radix;
last if $pf == 0;
++$pow;
}
 
my $enc = ($U - 1) div ($radix ** $pow);
($enc, $pow, %freq);
}
 
sub arithmethic_decoding($encoding, $radix, $pow, %freq) {
 
# Multiply encoding by radix^pow
my $enc = $encoding * $radix**$pow;
 
# Base
my $base = [+] %freq.values;
 
# Create the cumulative frequency table
my %cf = cumulative_freq(%freq);
 
# Create the dictionary
my %dict;
for %cf.kv -> $k,$v {
%dict{$v} = $k;
}
 
# Fill the gaps in the dictionary
my $lchar;
for ^$base -> $i {
if (%dict{$i}:exists) {
$lchar = %dict{$i};
}
elsif (defined $lchar) {
%dict{$i} = $lchar;
}
}
 
# Decode the input number
my $decoded = '';
for reverse(^$base) -> $i {
 
my $pow = $base**$i;
my $div = $enc div $pow;
 
my $c = %dict{$div};
my $fv = %freq{$c};
my $cv = %cf{$c};
 
my $rem = ($enc - $pow*$cv) div $fv;
 
$enc = $rem;
$decoded ~= $c;
}
 
# Return the decoded output
return $decoded;
}
 
my $radix = 10; # can be any integer greater or equal with 2
 
for <DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT> -> $str {
my ($enc, $pow, %freq) = arithmethic_coding($str, $radix);
my $dec = arithmethic_decoding($enc, $radix, $pow, %freq);
 
printf("%-25s=> %19s * %d^%s\n", $str, $enc, $radix, $pow);
 
if ($str ne $dec) {
die "\tHowever that is incorrect!";
}
}
Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

Python[edit]

Works with: Python version 3.1+
from collections import Counter
 
def cumulative_freq(freq):
cf = {}
total = 0
for b in range(256):
if b in freq:
cf[b] = total
total += freq[b]
return cf
 
def arithmethic_coding(bytes, radix):
 
# The frequency characters
freq = Counter(bytes)
 
# The cumulative frequency table
cf = cumulative_freq(freq)
 
# Base
base = len(bytes)
 
# Lower bound
lower = 0
 
# Product of all frequencies
pf = 1
 
# Each term is multiplied by the product of the
# frequencies of all previously occurring symbols
for b in bytes:
lower = lower*base + cf[b]*pf
pf *= freq[b]
 
# Upper bound
upper = lower+pf
 
pow = 0
while True:
pf //= radix
if pf==0: break
pow += 1
 
enc = (upper-1) // radix**pow
return enc, pow, freq
 
def arithmethic_decoding(enc, radix, pow, freq):
 
# Multiply enc by radix^pow
enc *= radix**pow;
 
# Base
base = sum(freq.values())
 
# Create the cumulative frequency table
cf = cumulative_freq(freq)
 
# Create the dictionary
dict = {}
for k,v in cf.items():
dict[v] = k
 
# Fill the gaps in the dictionary
lchar = None
for i in range(base):
if i in dict:
lchar = dict[i]
elif lchar is not None:
dict[i] = lchar
 
# Decode the input number
decoded = bytearray()
for i in range(base-1, -1, -1):
pow = base**i
div = enc//pow
 
c = dict[div]
fv = freq[c]
cv = cf[c]
 
rem = (enc - pow*cv) // fv
 
enc = rem
decoded.append(c)
 
# Return the decoded output
return bytes(decoded)
 
radix = 10 # can be any integer greater or equal with 2
 
for str in b'DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT'.split():
enc, pow, freq = arithmethic_coding(str, radix)
dec = arithmethic_decoding(enc, radix, pow, freq)
 
print("%-25s=> %19s * %d^%s" % (str, enc, radix, pow))
 
if str != dec:
raise Exception("\tHowever that is incorrect!")
Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

Ruby[edit]

def cumulative_freq(freq)
cf = {}
total = 0
freq.keys.sort.each do |b|
cf[b] = total
total += freq[b]
end
return cf
end
 
def arithmethic_coding(bytes, radix)
 
# The frequency characters
freq = Hash.new(0)
bytes.each { |b| freq[b] += 1 }
 
# The cumulative frequency table
cf = cumulative_freq(freq)
 
# Base
base = bytes.size
 
# Lower bound
lower = 0
 
# Product of all frequencies
pf = 1
 
# Each term is multiplied by the product of the
# frequencies of all previously occurring symbols
bytes.each do |b|
lower = lower*base + cf[b]*pf
pf *= freq[b]
end
 
# Upper bound
upper = lower+pf
 
pow = 0
loop do
pf /= radix
break if pf==0
pow += 1
end
 
enc = ((upper-1) / radix**pow)
[enc, pow, freq]
end
 
def arithmethic_decoding(enc, radix, pow, freq)
 
# Multiply enc by radix^pow
enc *= radix**pow;
 
# Base
base = freq.values.reduce(:+)
 
# Create the cumulative frequency table
cf = cumulative_freq(freq)
 
# Create the dictionary
dict = {}
cf.each_pair do |k,v|
dict[v] = k
end
 
# Fill the gaps in the dictionary
lchar = nil
(0...base).each do |i|
if dict.has_key?(i)
lchar = dict[i]
elsif lchar != nil
dict[i] = lchar
end
end
 
# Decode the input number
decoded = []
(0...base).reverse_each do |i|
pow = base**i
div = enc/pow
 
c = dict[div]
fv = freq[c]
cv = cf[c]
 
rem = ((enc - pow*cv) / fv)
 
enc = rem
decoded << c
end
 
# Return the decoded output
return decoded
end
 
radix = 10 # can be any integer greater or equal with 2
 
%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each do |str|
 
enc, pow, freq = arithmethic_coding(str.bytes, radix)
dec = arithmethic_decoding(enc, radix, pow, freq).map{|b| b.chr }.join
 
printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow)
 
if str != dec
raise "\tHowever that is incorrect!"
end
end
Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

Sidef[edit]

func cumulative_freq(freq) {
var cf = Hash()
var total = 0
256.range.each { |b|
if (freq.contains(b)) {
cf{b} = total
total += freq{b}
}
}
return cf
}
 
func arithmethic_coding(bytes, radix=10) {
 
# The frequency characters
var freq = Hash()
bytes.each { |b| freq{b} := 0 ++ }
 
# The cumulative frequency table
var cf = cumulative_freq(freq)
 
# Base
var base = bytes.len
 
# Lower bound
var L = 0
 
# Product of all frequencies
var pf = 1
 
# Each term is multiplied by the product of the
# frequencies of all previously occurring symbols
bytes.each { |b|
L = (L*base + cf{b}*pf)
pf *= freq{b}
}
 
# Upper bound
var U = L+pf
 
var pow = pf.log(radix).int
var enc = ((U-1) // radix**pow)
 
return (enc, pow, freq)
}
 
func arithmethic_decoding(enc, radix, pow, freq) {
 
# Multiply enc by radix^pow
enc *= radix**pow;
 
# Base
var base = freq.values.sum
 
# Create the cumulative frequency table
var cf = cumulative_freq(freq);
 
# Create the dictionary
var dict = Hash()
cf.each_kv { |k,v|
dict{v} = k
}
 
# Fill the gaps in the dictionary
var lchar = ''
base.range.each { |i|
if (dict.contains(i)) {
lchar = dict{i}
}
elsif (!lchar.is_empty) {
dict{i} = lchar
}
}
 
# Decode the input number
var decoded = []
base.range.reverse.each { |i|
 
var pow = base**i;
var div = enc//pow
 
var c = dict{div}
var fv = freq{c}
var cv = cf{c}
 
var rem = ((enc - pow*cv) // fv)
 
enc = rem
decoded << c
}
 
# Return the decoded output
return decoded
}
 
var radix = 10; # can be any integer greater or equal with 2
 
%w(DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT).each { |str|
 
var (enc, pow, freq) = arithmethic_coding(str.bytes, radix)
var dec = arithmethic_decoding(enc, radix, pow, freq).join_bytes('UTF-8')
 
printf("%-25s=> %19s * %d^%s\n", str, enc, radix, pow);
 
if (str != dec) {
die "\tHowever that is incorrect!"
}
}
Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15

zkl[edit]

Uses libGMP (GNU MP Bignum Library)

var [const] BN=Import("zklBigNum");  // libGMP
 
fcn cumulativeFreq(freqHash){
total,cf := 0,Dictionary();
foreach b in (256){ if(v:=freqHash.find(b)){ cf[b]=total; total+=v; } }
cf
}
 
fcn arithmethicCoding(str, radix){
bytes  :=str.split("").apply("toAsc"); // string to bytes: "0"-->0x31
freqHash:=Dictionary(); bytes.pump(Void,freqHash.incV); // frequency chars
cf  :=cumulativeFreq(freqHash); // The cumulative frequency
 
base,lower:=bytes.len(), BN(0); // Lower bound
pf:=BN(1); // Product of all frequencies
 
// Each term is multiplied by the product of the
// frequencies of all previously occurring symbols
foreach b in (bytes){
lower.mul(base).add(pf*cf[b]); // gets quite large
pf.mul(freqHash[b]); // gets big
}
 
upper,powr := lower + pf, 0;
while(1){
pf.div(radix); // in place BigInt math, no garbage
if(pf==0) break;
powr+=1;
}
enc:=(upper - 1)/BN(radix).pow(powr);
 
return(enc,powr,freqHash);
}
fcn arithmethicDecoding(enc, radix, powr, freqHash){
enc*=radix.pow(powr);
base:=freqHash.values.sum(0);
cf  :=cumulativeFreq(freqHash); // Create the cumulative frequency table
dict:=cf.pump(Dictionary(), // Invert/transpose cumulative table, keys are strings
fcn(kv){ kv.reverse().apply("toInt")});
// Fill the gaps in the dictionary
lchar:=Void;
foreach b in (base){
if(v:=dict.find(b)) lchar=v;
else if(lchar) dict[b]=lchar;
}
 
// Decode the input number
decoded:=Data(); // byte bucket
foreach n in ([base-1..0, -1]){
pow:=BN(base).pow(n); // a big number
div:=(enc/pow).toInt(); // a small number, convert from BigInt
c,fv,cv := dict[div],freqHash[c],cf[c];
decoded.append(c.toChar());
enc.sub(pow*cv).div(fv); // in place BigInt math, no garbage
}
decoded.text // Return the decoded output
}
radix:=10;
testStrings:=T(
"DABDDB",
"DABDDBBDDBA",
"ABRACADABRA",
"TOBEORNOTTOBEORTOBEORNOT",);
 
foreach str in (testStrings){
enc,pow,freq := arithmethicCoding(str,radix);
dec:=arithmethicDecoding(enc, radix, pow, freq);
print("%-25s=> %19s * %d^%s\n".fmt(str,enc,radix,pow));
 
if(str!=dec) println("\tHowever that is incorrect!");
}
Output:
DABDDB                   =>                 251 * 10^2
DABDDBBDDBA              =>              167351 * 10^6
ABRACADABRA              =>             7954170 * 10^4
TOBEORNOTTOBEORTOBEORNOT => 1150764267498783364 * 10^15