Verify distribution uniformity/Naive: Difference between revisions

From Rosetta Code
Content added Content deleted
(Added PicoLisp)
(added Fortran)
Line 191: Line 191:
writefln(freqs);
writefln(freqs);
}</lang>
}</lang>

=={{header|Fortran}}==
{{works with|Fortran|95 and later}}
<lang fortran>subroutine distcheck(randgen, n, delta)

interface
function randgen
integer :: randgen
end function randgen
end interface
real, intent(in) :: delta
integer, intent(in) :: n
integer :: i, mval, lolim, hilim
integer, allocatable :: buckets(:)
integer, allocatable :: rnums(:)
logical :: skewed = .false.
allocate(rnums(n))
do i = 1, n
rnums(i) = randgen()
end do

mval = maxval(rnums)
allocate(buckets(mval))
buckets = 0
do i = 1, n
buckets(rnums(i)) = buckets(rnums(i)) + 1
end do

lolim = n/mval - n/mval*delta
hilim = n/mval + n/mval*delta
do i = 1, mval
if(buckets(i) < lolim .or. buckets(i) > hilim) then
write(*,"(a,i0,a,i0,a,i0)") "Distribution potentially skewed for bucket ", i, " Expected: ", &
n/mval, " Actual: ", buckets(i)
skewed = .true.
end if
end do

if (.not. skewed) write(*,"(a)") "Distribution uniform"
deallocate(rnums)
deallocate(buckets)
end subroutine</lang>


=={{header|Haskell}}==
=={{header|Haskell}}==

Revision as of 11:17, 5 October 2010

Task
Verify distribution uniformity/Naive
You are encouraged to solve this task according to the task description, using any language you may know.

This task is an adjunct to Seven-dice from Five-dice.

Create a function to check that the random integers returned from a small-integer generator function have uniform distribution.

The function should take as arguments:

  • The function (or object) producing random integers.
  • The number of times to call the integer generator.
  • A 'delta' value of some sort that indicates how close to a flat distribution is close enough.

The function should produce:

  • Some indication of the distribution achieved.
  • An 'error' if the distribution is not flat enough.

Show the distribution checker working when the produced distribution is flat enough and when it is not. (Use a generator from Seven-dice from Five-dice).

See also:

AutoHotkey

<lang AutoHotkey>MsgBox, % DistCheck("dice7",10000,3)


DistCheck(function, repetitions, delta) { Loop, % 7 ; initialize array

  {  bucket%A_Index% := 0
  }

  Loop, % repetitions ; populate buckets
  {  v := %function%()
     bucket%v% += 1
  }
  lbnd := round((repetitions/7)*(100-delta)/100)
  ubnd := round((repetitions/7)*(100+delta)/100)
  text := "Distribution check:`n`nTotal elements = " repetitions
        . "`n`nMargin = " delta "% --> Lbound = " lbnd ", Ubound = " ubnd "`n"
  Loop, % 7
  {  text := text "`nBucket " A_Index " contains " bucket%A_Index% " elements."
     If bucket%A_Index% not between %lbnd% and %ubnd%
        text := text " Skewed."
  }
  Return, text

}</lang>

Distribution check:

Total elements = 10000

Margin = 3% --> Lbound = 1386, Ubound = 1471

Bucket 1 contains 1450 elements.
Bucket 2 contains 1374 elements. Skewed.
Bucket 3 contains 1412 elements.
Bucket 4 contains 1465 elements.
Bucket 5 contains 1370 elements. Skewed.
Bucket 6 contains 1485 elements. Skewed.
Bucket 7 contains 1444 elements.

C

Library: Judy

<lang c>#include <stdio.h>

  1. include <stdlib.h>
  2. include <stdbool.h>
  3. include <math.h>
  1. include <Judy.h>

bool distcheck(int (*dist)(), int n, double D) {

 Pvoid_t h = (Pvoid_t) NULL;
 PWord_t value;
 PWord_t element;
 Word_t i;
 int h_length;
 // populate hashes
 for(i=0; i < n; i++) {
   int rn = dist();
   JLI(value, h, rn);
   ++*value;
 }
 JLC(h_length, h, 0, -1);
 double target = 1.0 * n / (double)h_length;
 i = 0;
 JLF(element, h, i);
 while(element != NULL) {
   if ( abs(*element - target) > 0.01*n*D ) {
     fprintf(stderr, "distribution potentially skewed for '%d': expected '%d', got '%d'\n",

i, (int)target, *element);

     JudyLFreeArray(&h, PJE0);
     return false; // bad distr.
   }
   JLN(element, h, i);
 }
 JudyLFreeArray(&h, PJE0);
 return true; // distr. ok

}

int main() {

 distcheck(rand, 1000000, 1);
 return 0;

}</lang>

C++

<lang cpp>#include <map>

  1. include <iostream>
  2. include <cmath>

template<typename F>

bool test_distribution(F f, int calls, double delta)

{

 typedef std::map<int, int> distmap;
 distmap dist;
 for (int i = 0; i < calls; ++i)
   ++dist[f()];
 double mean = 1.0/dist.size();
 bool good = true;
 for (distmap::iterator i = dist.begin(); i != dist.end(); ++i)
 {
   if (std::abs((1.0 * i->second)/calls - mean) > delta)
   {
     std::cout << "Relative frequency " << i->second/(1.0*calls)
               << " of result " << i->first
               << " deviates by more than " << delta
               << " from the expected value " << mean << "\n";
     good = false;
   }
 }
 return good;

}</lang>

Common Lisp

Translation of: OCaml

<lang lisp>(defun check-distribution (function n &optional (delta 1.0))

 (let ((bins (make-hash-table)))
   (loop repeat n do (incf (gethash (funcall function) bins 0)))
   (loop with target = (/ n (hash-table-count bins))
         for key being each hash-key of bins using (hash-value value)
         when (> (abs (- value target)) (* 0.01 delta n))
         do (format t "~&Distribution potentially skewed for ~w:~
                        expected around ~w got ~w." key target value)
         finally (return bins))))</lang>
> (check-distribution 'd7 1000)
Distribution potentially skewed for 1: expected around 1000/7 got 153.
Distribution potentially skewed for 2: expected around 1000/7 got 119.
Distribution potentially skewed for 3: expected around 1000/7 got 125.
Distribution potentially skewed for 7: expected around 1000/7 got 156.
T
#<EQL Hash Table{7} 200B5A53>

> (check-distribution 'd7 10000)
NIL
#<EQL Hash Table{7} 200CB5BB>

D

<lang d>import std.math: abs; import std.string: format; import std.stdio: writefln;

/** Bin the answers to fn() and check bin counts are within +/- delta % of repeats/bincount.

  • /

void distCheck(TF)(TF func, int nrepeats, double delta) {

   int[int] freqs;
   for (int i; i < nrepeats; i++)
       freqs[func()]++;
   double target = nrepeats / cast(double)freqs.length;
   int deltaCount = cast(int)(delta / 100.0 * target);
   foreach (k, count; freqs)
       if (abs(target - count) >= deltaCount)
           throw new Exception(format("distribution potentially skewed for '%s': '%d'\n",
                                      k, count));
   writefln(freqs);

}</lang>

Fortran

Works with: Fortran version 95 and later

<lang fortran>subroutine distcheck(randgen, n, delta)

 interface
   function randgen
     integer :: randgen 
   end function randgen
 end interface
 
 real, intent(in) :: delta
 integer, intent(in) :: n
 integer :: i, mval, lolim, hilim
 integer, allocatable :: buckets(:)
 integer, allocatable :: rnums(:)
 logical :: skewed = .false.
    
 allocate(rnums(n))
 
 do i = 1, n
   rnums(i) = randgen()
 end do
 mval = maxval(rnums)
 allocate(buckets(mval))
 buckets = 0
 
 do i = 1, n
   buckets(rnums(i)) = buckets(rnums(i)) + 1
 end do
 lolim = n/mval - n/mval*delta
 hilim = n/mval + n/mval*delta
 
 do i = 1, mval  
   if(buckets(i) < lolim .or. buckets(i) > hilim) then
     write(*,"(a,i0,a,i0,a,i0)") "Distribution potentially skewed for bucket ", i, "   Expected: ", &
                                  n/mval, "   Actual: ", buckets(i)
     skewed = .true.
   end if
 end do 
 if (.not. skewed) write(*,"(a)") "Distribution uniform"
 
 deallocate(rnums)
 deallocate(buckets)
   

end subroutine</lang>

Haskell

<lang haskell>import System.Random import Data.List import Control.Monad import Control.Arrow

distribCheck :: IO Int -> Int -> Int -> IO [(Int,(Int,Bool))] distribCheck f n d = do

 nrs <- replicateM n f
 let group  = takeWhile (not.null) $ unfoldr (Just. (partition =<< (==). head)) nrs
     avg    = (fromIntegral n) / fromIntegral (length group)
     ul     = round $ (100 + fromIntegral d)/100 * avg
     ll     = round $ (100 - fromIntegral d)/100 * avg
 return $ map (head &&& (id &&& liftM2 (&&) (>ll)(<ul)).length) group</lang>

Example: <lang haskell>*Main> mapM_ print .sort =<< distribCheck (randomRIO(1,6)) 100000 3 (1,(16911,True)) (2,(16599,True)) (3,(16670,True)) (4,(16624,True)) (5,(16526,True)) (6,(16670,True))</lang>

J

This solution defines the checker as an adverb. Adverbs combine with the verb immediately to their left to create a new verb. So for a verb generateDistribution and an adverb checkUniform, the new verb might be thought of as checkGeneratedDistributionisUniform.

The delta is given as an optional left argument (x), defaulting to 5%. The right argument (y) is any valid argument to the distribution generating verb. <lang j>checkUniform=: adverb define

 0.05 u checkUniform y
 :
 n=. */y
 delta=. x
 sample=. u n              NB. the "u" refers to the verb to left of adverb
 freqtable=. /:~ (~. sample) ,. #/.~ sample
 expected=. n % # freqtable
 errmsg=. 'Distribution is potentially skewed'
 errmsg assert (delta * expected) > | expected - {:"1 freqtable
 freqtable

)</lang>

It is possible to use tacit expressions within an explicit definition enabling a more functional and concise style: <lang j>checkUniformT=: adverb define

 0.05 u checkUniformT y
 :
 freqtable=. /:~ (~. ,. #/.~) u n=. */y
 errmsg=. 'Distribution is potentially skewed' 
 errmsg assert ((n % #) (x&*@[ > |@:-) {:"1) freqtable
 freqtable

)</lang>

Show usage using rollD7t given in Seven-dice from Five-dice:

<lang j> 0.05 rollD7t checkUniform 1e5 1 14082 2 14337 3 14242 4 14470 5 14067 6 14428 7 14374

  0.05 rollD7t checkUniform 1e2

|Distribution is potentially skewed: assert | errmsg assert(delta*expected)>|expected-{:"1 freqtable</lang>

JavaScript

Translation of: Tcl

<lang javascript>function distcheck(random_func, times, opts) {

   if (opts === undefined) opts = {}
   opts['delta'] = opts['delta'] || 2;
   var count = {}, vals = [];
   for (var i = 0; i < times; i++) {
       var val = random_func();
       if (! has_property(count, val)) {
           count[val] = 1;
           vals.push(val);
       }
       else
           count[val] ++;
   }
   vals.sort(function(a,b) {return a-b});
   var target = times / vals.length;
   var tolerance = target * opts['delta'] / 100; 
   for (var i = 0; i < vals.length; i++) {
       var val = vals[i];
       if (Math.abs(count[val] - target) > tolerance) 
           throw "distribution potentially skewed for " + val +
                 ": expected result around " + target + ", got " +count[val];
       else
           print(val + "\t" + count[val]);
   }

}

function has_property(obj, propname) {

   return typeof(obj[propname]) == "undefined" ? false : true;

}

try {

   distcheck(function() {return Math.floor(10 * Math.random())}, 100000);
   print();
   distcheck(function() {return (Math.random() > 0.95 ? 1 : 0)}, 100000);

} catch (e) {

   print(e);

}</lang>

output

0       9945
1       9862
2       9954
3       10104
4       9861
5       10140
6       10066
7       10001
8       10101
9       9966

distribution potentially skewed for 0: expected result around 50000, got 95040

OCaml

<lang ocaml>let distcheck fn n ?(delta=1.0) () =

 let h = Hashtbl.create 5 in
 for i = 1 to n do
   let v = fn() in
   let n = 
     try Hashtbl.find h v 
     with Not_found -> 0 
   in
   Hashtbl.replace h v (n+1)
 done;
 Hashtbl.iter (fun v n -> Printf.printf "%d => %d\n%!" v n) h;
 let target = (float n) /. float (Hashtbl.length h) in
 Hashtbl.iter (fun key value ->
   if abs_float(float value -. target) > 0.01 *. delta *. (float n)
   then (Printf.eprintf
     "distribution potentially skewed for '%d': expected around %f, got %d\n%!"
      key target value)
 ) h;
</lang>

PicoLisp

The following function takes a count, and allowed deviation in per mill (one-tenth of a percent), and a 'prg' code body (i.e. an arbitrary number of executable expressions). <lang PicoLisp>(de checkDistribution (Cnt Pm . Prg)

  (let Res NIL
     (do Cnt (accu 'Res (run Prg 1) 1))
     (let
        (N (/ Cnt (length Res))
           Min (*/ N (- 1000 Pm) 1000)
           Max (*/ N (+ 1000 Pm) 1000) )
        (for R Res
           (prinl (cdr R) " " (if (>= Max (cdr R) Min) "Good" "Bad")) ) ) ) )</lang>

Output:

: (checkDistribution 100000 5 (rand 1 7))
14299 Good
14394 Bad
14147 Bad
14418 Bad
14159 Bad
14271 Good
14312 Good

PureBasic

<lang PureBasic>Prototype RandNum_prt()

Procedure.s distcheck(*function.RandNum_prt, repetitions, delta.d)

 Protected text.s, maxIndex = 0
 Dim bucket(maxIndex) ;array will be resized as needed
 
 For i = 1 To repetitions ;populate buckets
   v = *function()
   If v > maxIndex
     maxIndex = v
     Redim bucket(maxIndex)
   EndIf 
   bucket(v) + 1
 Next 
 
 
 lbnd = Round((repetitions / maxIndex) * (100 - delta) / 100, #PB_Round_Up)
 ubnd = Round((repetitions / maxIndex) * (100 + delta) / 100, #PB_Round_Down)
 text = "Distribution check:" + #crlf$ + #crlf$
 text + "Total elements = " + Str(repetitions) + #crlf$ + #crlf$
 text + "Margin = " + StrF(delta, 2) + "% --> Lbound = " + Str(lbnd) + ", Ubound = " + Str(ubnd) + #crlf$
 
 For i = 1 To maxIndex 
   If bucket(i) < lbnd Or bucket(i) > ubnd
     text + #crlf$ + "Bucket " + Str(i) + " contains " + Str(bucket(i)) + " elements.  Skewed."
   EndIf 
 Next 
 ProcedureReturn text

EndProcedure

MessageRequester("Results", distcheck(@dice7(), 1000000, 0.20))</lang> A small delta was chosen to increase the chance of a skewed result in the sample output:

Distribution check:

Total elements = 1000000

Margin = 0.20% --> Lbound = 142572, Ubound = 143142

Bucket 1 contains 141977 elements.  Skewed.
Bucket 6 contains 143860 elements.  Skewed.

Python

Works with: Python version 3.1

<lang python>from collections import Counter from pprint import pprint as pp

def distcheck(fn, repeats, delta):

   \
   Bin the answers to fn() and check bin counts are within +/- delta %
   of repeats/bincount
   bin = Counter(fn() for i in range(repeats))
   target = repeats // len(bin)
   deltacount = int(delta / 100. * target)
   assert all( abs(target - count) < deltacount
               for count in bin.values() ), "Bin distribution skewed from %i +/- %i: %s" % (
                   target, deltacount, [ (key, target - count)
                                         for key, count in sorted(bin.items()) ]
                   )
   pp(dict(bin))</lang>

Sample output:

>>> distcheck(dice5, 1000000, 1)
{1: 200244, 2: 199831, 3: 199548, 4: 199853, 5: 200524}
>>> distcheck(dice5, 1000, 1)
Traceback (most recent call last):
  File "<pyshell#30>", line 1, in <module>
    distcheck(dice5, 1000, 1)
  File "C://Paddys/rand7fromrand5.py", line 54, in distcheck
    for key, count in sorted(bin.items()) ]
AssertionError: Bin distribution skewed from 200 +/- 2: [(1, 4), (2, -33), (3, 6), (4, 11), (5, 12)]

R

<lang r>distcheck <- function(fn, repetitions=1e4, delta=3) {

  if(is.character(fn))
  {
     fn <- get(fn)
  }
  if(!is.function(fn))
  {
     stop("fn is not a function")
  }
  samp <- fn(n=repetitions)   
  counts <- table(samp)
  expected <- repetitions/length(counts)
  lbound <- expected * (1 - 0.01*delta)
  ubound <- expected * (1 + 0.01*delta)
  status <- ifelse(counts < lbound, "under", 
     ifelse(counts > ubound, "over", "okay"))
  data.frame(value=names(counts), counts=as.vector(counts), status=status)   

} distcheck(dice7.vec)</lang>

Ruby

Translation of: Tcl

<lang ruby>def distcheck(n, delta=1)

 unless block_given?
   raise ArgumentError, "pass a block to this method"
 end
 
 h = Hash.new(0)
 n.times {h[ yield ] += 1}
 
 target = 1.0 * n / h.length
 h.each do |key, value| 
   if (value - target).abs > 0.01 * delta * n
     raise StandardError,
       "distribution potentially skewed for '#{key}': expected around #{target}, got #{value}"
   end
 end
 
 h.keys.sort.each {|k| print "#{k} #{h[k]} "}
 puts

end

if __FILE__ == $0

 begin
   distcheck(100_000) {rand(10)}
   distcheck(100_000) {rand > 0.95} 
 rescue StandardError => e
   p e
 end

end</lang>

output:

0 9986 1 9826 2 9861 3 10034 4 9876 5 10114 6 10329 7 9924 8 10123 9 9927 
#<StandardError: distribution potentially skewed for 'false': expected around 50000.0, got 94841>

Tcl

<lang tcl>proc distcheck {random times {delta 1}} {

   for {set i 0} {$i<$times} {incr i} {incr vals([uplevel 1 $random])}
   set target [expr {$times / [array size vals]}]
   foreach {k v} [array get vals] {
       if {abs($v - $target) > $times  * $delta / 100.0} {
          error "distribution potentially skewed for $k: expected around $target, got $v"
       }
   }
   foreach k [lsort -integer [array names vals]] {lappend result $k $vals($k)}
   return $result

}</lang> Demonstration: <lang tcl># First, a uniformly distributed random variable puts [distcheck {expr {int(10*rand())}} 100000]

  1. Now, one that definitely isn't!

puts [distcheck {expr {rand()>0.95}} 100000]</lang> Which produces this output (error in red):

0 10003 1 9851 2 10058 3 10193 4 10126 5 10002 6 9852 7 9964 8 9957 9 9994
distribution potentially skewed for 0: expected around 50000, got 94873