Verify distribution uniformity/Naive: Difference between revisions

Content added Content deleted

Inline

Revision as of 11:01, 11 August 2009

This task is an adjunct to Seven-dice from Five-dice.

Create a function to check that the random integers returned from a small-integer generator function have uniform distribution.

The function should take as arguments:

The function producing random integers.
The number of times to call the integer generator.
A 'delta' value of some sort that indicates how close to a flat distribution is close enough.

The function should produce:

Some indication of the distribution achieved.
An 'error' if the distribution is not flat enough.

Show the distribution checker working when the produced distribution is flat enough and when it is not. (Use a generator from Seven-dice from Five-dice).

See also:

Verify Distribution Uniformity with Chi-Squared Test‎

C

Library: Judy

<lang c>#include <stdio.h>

include <stdlib.h>
include <stdbool.h>
include <math.h>

include <Judy.h>

bool distcheck(int (*dist)(), int n, double D) {

 Pvoid_t h = (Pvoid_t) NULL;
 PWord_t value;
 PWord_t element;

 Word_t i;
 int h_length;

 // populate hashes
 for(i=0; i < n; i++) {
   int rn = dist();
   JLI(value, h, rn);
   ++*value;
 }

 JLC(h_length, h, 0, -1);

 double target = 1.0 * n / (double)h_length;

 i = 0;
 JLF(element, h, i);
 while(element != NULL) {
   if ( abs(*element - target) > 0.01*n*D ) {
     fprintf(stderr, "distribution potentially skewed for '%d': expected '%d', got '%d'\n",

i, (int)target, *element);

     JudyLFreeArray(&h, PJE0);
     return false; // bad distr.
   }
   JLN(element, h, i);
 }

 JudyLFreeArray(&h, PJE0);
 return true; // distr. ok

}

int main() {

 distcheck(rand, 1000000, 1);
 return 0;

}</lang>

OCaml

<lang ocaml>let distcheck fn n ?(delta=1.0) () =

 let h = Hashtbl.create 5 in
 for i = 1 to n do
   let v = fn() in
   let n = 
     try Hashtbl.find h v 
     with Not_found -> 0 
   in
   Hashtbl.replace h v (n+1)
 done;
 Hashtbl.iter (fun v n -> Printf.printf "%d => %d\n%!" v n) h;
 let target = (float n) /. float (Hashtbl.length h) in
 Hashtbl.iter (fun key value ->
   if abs_float(float value -. target) > 0.01 *. delta *. (float n)
   then (Printf.eprintf
     "distribution potentially skewed for '%d': expected around %f, got %d\n%!"
      key target value)
 ) h;

</lang>

Python

<lang python>from collections import Counter from pprint import pprint as pp

def distcheck(fn, repeats, delta):

   \
   Bin the answers to fn() and check bin counts are within +/- delta %
   of repeats/bincount
   bin = Counter(fn() for i in range(repeats))
   target = repeats // len(bin)
   deltacount = int(delta / 100. * target)
   assert all( abs(target - count) < deltacount
               for count in bin.values() ), "Bin distribution skewed from %i +/- %i: %s" % (
                   target, deltacount, [ (key, target - count)
                                         for key, count in sorted(bin.items()) ]
                   )
   pp(dict(bin))</lang>

Sample output:

>>> distcheck(dice5, 1000000, 1)
{1: 200244, 2: 199831, 3: 199548, 4: 199853, 5: 200524}
>>> distcheck(dice5, 1000, 1)
Traceback (most recent call last):
  File "<pyshell#30>", line 1, in <module>
    distcheck(dice5, 1000, 1)
  File "C://Paddys/rand7fromrand5.py", line 54, in distcheck
    for key, count in sorted(bin.items()) ]
AssertionError: Bin distribution skewed from 200 +/- 2: [(1, 4), (2, -33), (3, 6), (4, 11), (5, 12)]

Ruby

Translation of: Tcl

<lang ruby>def distcheck(n, delta=1)

 unless block_given?
   raise ArgumentError, "pass a block to this method"
 end
 
 h = Hash.new(0)
 n.times {h[ yield ] += 1}
 
 target = 1.0 * n / h.length
 h.each do |key, value| 
   if (value - target).abs > 0.01 * delta * n
     raise StandardError,
       "distribution potentially skewed for '#{key}': expected around #{target}, got #{value}"
   end
 end
 
 h.keys.sort.each {|k| print "#{k} #{h[k]} "}
 puts

end

if __FILE__ == $0

 begin
   distcheck(100_000) {rand(10)}
   distcheck(100_000) {rand > 0.95} 
 rescue StandardError => e
   p e
 end

end</lang>

output:

0 9986 1 9826 2 9861 3 10034 4 9876 5 10114 6 10329 7 9924 8 10123 9 9927 
#<StandardError: distribution potentially skewed for 'false': expected around 50000.0, got 94841>

Tcl

<lang tcl>proc distcheck {random times {delta 1}} {

   for {set i 0} {$i<$times} {incr i} {incr vals([uplevel 1 $random])}
   set target [expr {$times / [array size vals]}]
   foreach {k v} [array get vals] {
       if {abs($v - $target) > $times  * $delta / 100.0} {
          error "distribution potentially skewed for $k: expected around $target, got $v"
       }
   }
   foreach k [lsort -integer [array names vals]] {lappend result $k $vals($k)}
   return $result

}</lang> Demonstration: <lang tcl># First, a uniformly distributed random variable puts [distcheck {expr {int(10*rand())}} 100000]

Now, one that definitely isn't!

puts [distcheck {expr {rand()>0.95}} 100000]</lang> Which produces this output (error in red):

0 10003 1 9851 2 10058 3 10193 4 10126 5 10002 6 9852 7 9964 8 9957 9 9994
distribution potentially skewed for 0: expected around 50000, got 94873

@@ Line 19: / Line 19: @@
 =={{header|C}}==
-{{incorrect|C|I suppose there's some error in inserting / getting the values; it must be checked in relationship to Dice5/Dice7, since it gives always skewed, which is odd...}}
 {{libheader|Judy}}
 <lang c>#include <stdio.h>
@@ Line 33: / Line 31: @@
   Pvoid_t h = (Pvoid_t) NULL;
   PWord_t value;
+  PWord_t element;
-  Pvoid_t hi = (Pvoid_t)NULL;
+  Word_t i;
-  Word_t t; // temp for lazyness in using JudyLFreeArray directly
-  int i, j, h_length;
+  int h_length;
-  // populate hashes; the hi hash is needed to
+  // populate hashes
+  for(i=0; i < n; i++) {
-  // be able to iterate later on key(rn) - value pairs; indeed
-  // we could use JLF (first index in array) and then JLN (next index)
-  for(i=0, j=0; i < n; i++) {
     int rn = dist();
-    JLG(value, h, rn);
-    if ( value == NULL ) {
-      JLI(value, hi, j);
-      *value = rn;
-      j++;
-    }
     JLI(value, h, rn);
-    *value++;
+    ++*value;
   }
+  JLC(h_length, h, 0, -1);
-  h_length = j; // we could use JLC too to count how many idx are
-                // into the h array.
   double target = 1.0 * n / (double)h_length;
+  i = 0;
-  for(i=0; i < h_length; i++) {
-    JLG(value, hi, i);
+  JLF(element, h, i);
+  while(element != NULL) {
-    int k = *value;
+    if ( abs(*element - target) > 0.01*n*D ) {
-    JLG(value, h, k);
-    int v = *value; // now we have couple key-value
-    if ( abs(v - target) > 0.01*n*D ) {
       fprintf(stderr, "distribution potentially skewed for '%d': expected '%d', got '%d'\n",
-	      k, (int)target, v);
+	      i, (int)target, *element);
-      JLFA(t, h); JLFA(t, hi);
+      JudyLFreeArray(&h, PJE0);
       return false; // bad distr.
     }
+    JLN(element, h, i);
   }
-  JLFA(t, h); JLFA(t, hi);
+  JudyLFreeArray(&h, PJE0);
   return true; // distr. ok
-}
-int frand()
-{
-  return rand() % 10;
 }
 int main()
 {
-  distcheck(frand, 1000000, 1);
+  distcheck(rand, 1000000, 1);
   return 0;
 }</lang>