Posit numbers/decoding

Posit is a quantization of the real projective line proposed by John Gustafson in 2015. It is claimed to be an improvement over IEEE 754.

The purpose of this task is to write a program capable of decoding a posit number. You will use the example provided by Gustafson in his paper : 0b0000110111011101, representing a 16-bit long real number with three bits for the exponent. Once decoded, you should obtain either the fraction 477/134217728 or the floating point value 3.55393E−6.

Jeff Johnson from Facebook research, described posit numbers as such:

A more efficient representation for tapered floating points is the recent posit format by Gustafson. It has no explicit size field; the exponent is encoded using a Golomb-Rice prefix-free code, with the exponent

e

encoded as a Golomb-Rice quotient and remainder

(q,r)

with

q

in unary and

r

in binary (in posit terminology,

q

is the regime). Remainder encoding size is defined by the exponent scale

s

, where

2^{s}

is the Golomb-Rice divisor. Any space not used by the exponent encoding is used by the significand, which unlike IEEE 754 always has a leading 1; gradual underflow (and overflow) is handled by tapering. A posit number system is characterized by

(N,s)

, where

N

is the word length in bits and

s

is the exponent scale. The minimum and maximum positive finite numbers in

(N,s)

are Failed to parse (syntax error): {\displaystyle f_\mathrm{min} = 2^{−(N−2)2^s}} and Failed to parse (SVG (MathML can be enabled via browser plugin): Invalid response ("Math extension cannot connect to Restbase.") from server "https://wikimedia.org/api/rest_v1/":): {\displaystyle f_\mathrm{max} = 2^{(N−2)2^s}} . The number line is represented much as the projective reals, with a single point at

\pm \infty

bounding Failed to parse (syntax error): {\displaystyle −f_\mathrm{max}} and

f_{\mathrm {max} }

.

\pm \infty

and 0 have special encodings; there is no NaN. The number system allows any choice of

N\geq 3

and Failed to parse (syntax error): {\displaystyle 0\le s\le N − 3} .

s

controls the dynamic range achievable; e.g., 8-bit (8, 5)-posit

f_{\mathrm {max} }=2^{192}

is larger than

f_{\mathrm {max} }

in float32. (8, 0) and (8, 1) are more reasonable values to choose for 8-bit floating point representations, with

f_{\mathrm {max} }

of 64 and 4096 accordingly. Precision is maximized in the range Failed to parse (syntax error): {\displaystyle \pm\left[2^{−(s+1)}, 2^{s+1}\right)} with Failed to parse (syntax error): {\displaystyle N − 3 − s} significand fraction bits, tapering to no fraction bits at

\pm f_{\mathrm {max} }

.

— Jeff Johnson, Rethinking floating point for deep learning, Facebook research.

Julia

struct PositType3{T<:Integer}
    numbits::UInt16
    es::UInt16
    bits::T
    PositType3(nb, ne, i) = new{typeof(i)}(UInt16(nb), UInt16(ne), i)
  end
  
""" From posithub.org/docs/Posits4.pdf """
function Base.Rational(p::PositType3)
    s = signbit(signed(p.bits))              # s for S signbit, is 1 if negative
    pabs = p.bits << 1                       # shift off signbit (adds a 0 to F at LSB)
    pabs == 0 && return s ? 1 // 0 : 0 // 1  # if p is 0, return 0 or if s 1 error
    expsign = signbit(signed(pabs))          # exponent sign from 2nd bit now MSB 
    r = expsign == 1 ? leading_ones(pabs) : leading_zeros(pabs) # r regime R size
    k = expsign ? r - 1 : -r                 # k for the exponent calculation
    pabs <<= (r + 1)                         # shift off unwanted R bits
    pabs >>= (r + 2)                         # shift back for E, F
    fsize = p.numbits - 1 - r - 1 - p.es     # check how many F bits explicit
    f = fsize < 1 ? 1 :
        1 + (pabs & (2^fsize-1)) // 2^fsize  # Get F value. Can be missing -> 1
    e = fsize < 1 ? pabs : pabs >> fsize     # Get E value.
    pw = 2^p.es * k + e
    return pw >= 0 ? (-1)^s * f * big"2"^pw // 1 : (-1)^s * f // big"2"^(-pw)
end
  
@show Rational(PositType3(16, 3, 0b0000110111011101)) == 477 // 134217728
const tests = [
(16,3,0b0000110111011101),
(16,3,0b1000000000000000),
(16,3,0b0000000000000000),
(16,1,0b0110110010101000),
(16,1,0b1001001101011000),
(16,2,0b0000000000000001),
(16,0,0b0111111111111111),
(16,6,0b0111111111111110),
(8,1,0b01000000),
(8,1,0b11000000),
(8,1,0b00110000),
(8,1,0b00100000),
(8,2,0b00000001),
(8,2,0b01111111),
(8,7,0b01111110),
(32,2,0b00000000000000000000000000000001),
(32,2,0b01111111111111111111111111111111),
(32,5,0b01111111111111111111111111111110),
]

for t in tests
    r = Rational(PositType3(t...))
    println(string(t[3], base=2, pad = t[1]), " => $r = ", float(r))
end

Output:

Rational(PositType3(16, 3, 0x0ddd)) == 477 // 134217728 = true
0000110111011101 => 477//134217728 = 3.553926944732666015625e-06
1000000000000000 => 1//0 = Inf
0000000000000000 => 0//1 = 0.0
0110110010101000 => 405//32 = 12.65625
1001001101011000 => -363//4096 = -0.088623046875
0000000000000001 => 1//72057594037927936 = 1.387778780781445675529539585113525390625e-17
0111111111111111 => 16384//1 = 16384.0
0111111111111110 => 28638903918474961204418783933674838490721739172170652529441449702311064005352904159345284265824628375429359509218999720074396860757073376700445026041564579620512874307979212102266801261478978776245040008231745247475930553606737583615358787106474295296//1 = 2.86389039184749612044187839336748384907217391721706525294414497023110640053529e+250
01000000 => 1//1 = 1.0
11000000 => -1//1 = -1.0
00110000 => 1//2 = 0.5
00100000 => 1//4 = 0.25
00000001 => 1//16777216 = 5.9604644775390625e-08
01111111 => 16777216//1 = 1.6777216e+07
01111110 => 4562440617622195218641171605700291324893228507248559930579192517899275167208677386505912811317371399778642309573594407310688704721375437998252661319722214188251994674360264950082874192246603776//1 = 4.562440617622195218641171605700291324893228507248559930579192517899275167208677e+192
00000000000000000000000000000001 => 1//1329227995784915872903807060280344576 = 7.523163845262640050999913838222372338039459563341360137656010920181870460510254e-37   
01111111111111111111111111111111 => 1329227995784915872903807060280344576//1 = 1.329227995784915872903807060280344576e+36
01111111111111111111111111111110 => 2269007733883335972287082669296112915239349672942191252221331572442536403137824056312817862695551072066953619064625508194663368599769448406663254670871573830845597595897613333042429214224697474472410882236254024057110212260250671521235807709272244389361641091086035023229622419456//1 = 2.269007733883335972287082669296112915239349672942191252221331572442536403137824e+279

Phix

with javascript_semantics
function twos_compliment_2_on(string bits, integer nbits)
    for i=2 to nbits do
        bits[i] = iff(bits[i]='0'?'1':'0')
    end for
    for i=nbits to 2 by -1 do
        if bits[i]='0' then
            bits[i] = '1'
            exit
        end if
        bits[i] = '0'
    end for
    return bits
end function

function posit_decode(integer nbits, es, object bits)
    --
    -- nbits: number of bits (aka n)
    -- es: exponent scale
    -- bits: (binary) integer or string of nbits 0|1
    --
    if not string(bits) then
        string fmt = sprintf("%%0%db",nbits)
        bits = sprintf(fmt,bits)
    end if
    assert(length(bits)==nbits)
    string ibits = bits
    integer s = bits[1]='1'
    if s then bits = twos_compliment_2_on(bits,nbits) end if
    integer r = find(xor_bits(bits[2],1),bits,3)-2,
            b2z = bits[2]='0', exponent = 0, fraction = 0
    atom fs = 1, useed = power(2,power(2,es))
    if r<0 then
        if b2z then
            if s then
                return {bits,"NaR",""}  -- aka inf
            end if
            return {bits,"zero",""}
        end if
        r = nbits-1
    else
        integer estart = r+3,
               efinish = min(r+2+es,nbits)
        exponent = to_integer(bits[estart..efinish],0,2)
        fraction = to_integer(bits[efinish+1..$],0,2)
        fs = power(2,nbits-efinish)
    end if
    integer k = iff(b2z?-r:r-1)
    atom res = iff(s?-1:+1)*power(useed,k)*power(2,exponent)*(1+fraction/fs)
    return {ibits,res}
end function

constant tests = {{16,3,0b0000110111011101},
                  {16,3,0b1000000000000000},
                  {16,3,0b0000000000000000},
                  {16,1,0b0110110010101000},
                  {16,1,0b1001001101011000},
                  {16,2,0b0000000000000001},
                  {16,0,0b0111111111111111},
                  {16,2,0b0111111111111111},
                  {16,6,0b0111111111111110},
                  {8,1,0b01000000},
                  {8,1,0b11000000},
                  {8,1,0b00110000},
                  {8,1,0b00100000},
                  {8,2,0b00000001},
                  {8,2,0b01111111},
                  {8,7,0b01111110},
                  {32,2,0b00000000000000000000000000000001},
                  {32,2,0b01111111111111111111111111111111},
                  {32,5,0b01111111111111111111111111111110}}
for t in tests do
    printf(1,"%s = %v\n",call_func(posit_decode,t))
end for

Output:

(Still disagrees with Julia on the -12.65625, but I think I'm right)

0000110111011101 = 3.553926944e-6
1000000000000000 = "NaR"
0000000000000000 = "zero"
0110110010101000 = 12.65625
1001001101011000 = -12.65625
0000000000000001 = 1.38777878e-17
0111111111111111 = 16384
0111111111111111 = 7.205759404e+16
0111111111111110 = 2.863890392e+250
01000000 = 1
11000000 = -1
00110000 = 0.5
00100000 = 0.25
00000001 = 5.960464478e-8
01111111 = 16777216
01111110 = 4.562440618e+192
00000000000000000000000000000001 = 7.523163846e-37
01111111111111111111111111111111 = 1.329227996e+36
01111111111111111111111111111110 = 2.269007734e+279

raku

unit role Posit[UInt $N, UInt $es];

has UInt $.UInt;
method sign { self.UInt > 2**($N - 1) ?? -1 !! +1 }

method FatRat {
  return 0   if self.UInt == 0;
  my UInt $mask = 2**($N - 1);
  return Inf if self.UInt == $mask;
  my UInt $n = self.UInt;
  my $sign = $n +& $mask ?? -1 !! +1;
  my $r = $sign;
  $n = ((2**$n - 1) +^ $n) + 1 if self.sign < 0;
  my int $count = 0;
  $mask +>= 1;
  my Bool $first-bit = ?($n +& $mask);
  repeat { $count++; $mask +>= 1;
  } while ?($n +& $mask) == $first-bit && $mask;
  my $m = $count;
  my $k = $first-bit ?? $m - 1 !! -$m;
  $r *= 2**($k*2**$es);
  return $r unless $mask > 1;
  $mask +>= 1;
  $count = 0;
  my UInt $exponent = 0;
  while $mask && $count++ < $es {
    $exponent +<= 1;
    $exponent +|= 1 if $n +& $mask;
    $mask +>= 1;
  }
  $r *= 2**$exponent;
  my $fraction = 1.FatRat;
  while $mask {
    (state $power-of-two = 1) +<= 1;
    $fraction += 1/$power-of-two if $n +& $mask;
    $mask +>= 1;
  }
  $r *= $fraction;

  return $r;
}

CHECK {
  use Test;
  # example from L<http://www.johngustafson.net/pdfs/BeatingFloatingPoint.pdf>
  is Posit[16, 3]
    .new(UInt => 0b0000110111011101)
    .FatRat, 477.FatRat/134217728;
}

Output:

ok 1 -

Wren

Library: Wren-fmt

Library: Wren-big

import "./fmt" for Conv
import "./big" for BigRat

var posit16_decode = Fn.new { |ps, maxExpSize|
    var p = ps.map { |c| c == "0" ? 0 : 1 }.toList

    // Deal with exceptional values.
    if (p[1..-1].all { |i| i == 0 }) {
        return (p[0] == 0) ? 0 : Conv.infinity
    }

    // Convert bits after sign bit to two's complement if negative.
    if (p[0] == 1) {
        for (i in 1..15) p[i] = (p[i] == 0) ? 1 : 0
        for (i in 15..1) {
            if (p[i] == 1) {
                p[i] = 0
            } else {
                p[i] = 1
                break
            }
        }
    }
    var first = p[1]
    var rs = 15  // regime size
    for (i in 2..15) {
        if (p[i] != first) {
            rs = i - 1
            break
        }
    }
    var regime = p[1..rs]
    var es = (rs == 15) ? 0 : maxExpSize.min(14-rs)  // actual exponent size
    var exponent = [0]
    if (es > 0) exponent = p[rs + 2...rs + 2 + es]
    var fs = (es == 0) ? 0 : 14 - rs - es  // function size
    var s = (p[0] == 0) ? 1 : -1  // sign
    var k = regime.all { |i| i == 0 } ? -rs : rs - 1
    var u = 2.pow(2.pow(maxExpSize))
    var e = Conv.atoi(exponent.join(""), 2)
    var f = BigRat.one
    if (fs > 0) {
        var fraction = ps[-fs..-1]
        f = Conv.atoi(fraction.join(""), 2)
        f = BigRat.one + BigRat.new(f, 2.pow(fs))
    }
    return f * BigRat.new(u, 1).pow(k) * s * 2.pow(e)
}

var ps = "0000110111011101"
System.print(posit16_decode.call(ps, 3))

Output:

477/134217728