Verify distribution uniformity/Naive: Difference between revisions

From Rosetta Code
Content added Content deleted
(→‎{{header|C}}: it was a banal *value++ instead of ++*value or similar ;) however, rewritten to avoid the use of two hashes)
Line 19: Line 19:


=={{header|C}}==
=={{header|C}}==
{{incorrect|C|I suppose there's some error in inserting / getting the values; it must be checked in relationship to Dice5/Dice7, since it gives always skewed, which is odd...}}

{{libheader|Judy}}
{{libheader|Judy}}
<lang c>#include <stdio.h>
<lang c>#include <stdio.h>
Line 33: Line 31:
Pvoid_t h = (Pvoid_t) NULL;
Pvoid_t h = (Pvoid_t) NULL;
PWord_t value;
PWord_t value;
PWord_t element;
Pvoid_t hi = (Pvoid_t)NULL;


Word_t i;
Word_t t; // temp for lazyness in using JudyLFreeArray directly
int i, j, h_length;
int h_length;


// populate hashes; the hi hash is needed to
// populate hashes
for(i=0; i < n; i++) {
// be able to iterate later on key(rn) - value pairs; indeed
// we could use JLF (first index in array) and then JLN (next index)
for(i=0, j=0; i < n; i++) {
int rn = dist();
int rn = dist();
JLG(value, h, rn);
if ( value == NULL ) {
JLI(value, hi, j);
*value = rn;
j++;
}
JLI(value, h, rn);
JLI(value, h, rn);
*value++;
++*value;
}
}


JLC(h_length, h, 0, -1);
h_length = j; // we could use JLC too to count how many idx are

// into the h array.
double target = 1.0 * n / (double)h_length;
double target = 1.0 * n / (double)h_length;


i = 0;
for(i=0; i < h_length; i++) {
JLG(value, hi, i);
JLF(element, h, i);
while(element != NULL) {
int k = *value;
if ( abs(*element - target) > 0.01*n*D ) {
JLG(value, h, k);
int v = *value; // now we have couple key-value
if ( abs(v - target) > 0.01*n*D ) {
fprintf(stderr, "distribution potentially skewed for '%d': expected '%d', got '%d'\n",
fprintf(stderr, "distribution potentially skewed for '%d': expected '%d', got '%d'\n",
k, (int)target, v);
i, (int)target, *element);
JLFA(t, h); JLFA(t, hi);
JudyLFreeArray(&h, PJE0);
return false; // bad distr.
return false; // bad distr.
}
}
JLN(element, h, i);
}
}
JLFA(t, h); JLFA(t, hi);


JudyLFreeArray(&h, PJE0);
return true; // distr. ok
return true; // distr. ok
}


int frand()
{
return rand() % 10;
}
}


int main()
int main()
{
{
distcheck(frand, 1000000, 1);
distcheck(rand, 1000000, 1);
return 0;
return 0;
}</lang>
}</lang>

Revision as of 11:01, 11 August 2009

Task
Verify distribution uniformity/Naive
You are encouraged to solve this task according to the task description, using any language you may know.

This task is an adjunct to Seven-dice from Five-dice.

Create a function to check that the random integers returned from a small-integer generator function have uniform distribution.

The function should take as arguments:

  • The function producing random integers.
  • The number of times to call the integer generator.
  • A 'delta' value of some sort that indicates how close to a flat distribution is close enough.

The function should produce:

  • Some indication of the distribution achieved.
  • An 'error' if the distribution is not flat enough.

Show the distribution checker working when the produced distribution is flat enough and when it is not. (Use a generator from Seven-dice from Five-dice).

See also:

C

Library: Judy

<lang c>#include <stdio.h>

  1. include <stdlib.h>
  2. include <stdbool.h>
  3. include <math.h>
  1. include <Judy.h>

bool distcheck(int (*dist)(), int n, double D) {

 Pvoid_t h = (Pvoid_t) NULL;
 PWord_t value;
 PWord_t element;
 Word_t i;
 int h_length;
 // populate hashes
 for(i=0; i < n; i++) {
   int rn = dist();
   JLI(value, h, rn);
   ++*value;
 }
 JLC(h_length, h, 0, -1);
 double target = 1.0 * n / (double)h_length;
 i = 0;
 JLF(element, h, i);
 while(element != NULL) {
   if ( abs(*element - target) > 0.01*n*D ) {
     fprintf(stderr, "distribution potentially skewed for '%d': expected '%d', got '%d'\n",

i, (int)target, *element);

     JudyLFreeArray(&h, PJE0);
     return false; // bad distr.
   }
   JLN(element, h, i);
 }
 JudyLFreeArray(&h, PJE0);
 return true; // distr. ok

}

int main() {

 distcheck(rand, 1000000, 1);
 return 0;

}</lang>

OCaml

<lang ocaml>let distcheck fn n ?(delta=1.0) () =

 let h = Hashtbl.create 5 in
 for i = 1 to n do
   let v = fn() in
   let n = 
     try Hashtbl.find h v 
     with Not_found -> 0 
   in
   Hashtbl.replace h v (n+1)
 done;
 Hashtbl.iter (fun v n -> Printf.printf "%d => %d\n%!" v n) h;
 let target = (float n) /. float (Hashtbl.length h) in
 Hashtbl.iter (fun key value ->
   if abs_float(float value -. target) > 0.01 *. delta *. (float n)
   then (Printf.eprintf
     "distribution potentially skewed for '%d': expected around %f, got %d\n%!"
      key target value)
 ) h;
</lang>

Python

<lang python>from collections import Counter from pprint import pprint as pp

def distcheck(fn, repeats, delta):

   \
   Bin the answers to fn() and check bin counts are within +/- delta %
   of repeats/bincount
   bin = Counter(fn() for i in range(repeats))
   target = repeats // len(bin)
   deltacount = int(delta / 100. * target)
   assert all( abs(target - count) < deltacount
               for count in bin.values() ), "Bin distribution skewed from %i +/- %i: %s" % (
                   target, deltacount, [ (key, target - count)
                                         for key, count in sorted(bin.items()) ]
                   )
   pp(dict(bin))</lang>

Sample output:

>>> distcheck(dice5, 1000000, 1)
{1: 200244, 2: 199831, 3: 199548, 4: 199853, 5: 200524}
>>> distcheck(dice5, 1000, 1)
Traceback (most recent call last):
  File "<pyshell#30>", line 1, in <module>
    distcheck(dice5, 1000, 1)
  File "C://Paddys/rand7fromrand5.py", line 54, in distcheck
    for key, count in sorted(bin.items()) ]
AssertionError: Bin distribution skewed from 200 +/- 2: [(1, 4), (2, -33), (3, 6), (4, 11), (5, 12)]

Ruby

Translation of: Tcl

<lang ruby>def distcheck(n, delta=1)

 unless block_given?
   raise ArgumentError, "pass a block to this method"
 end
 
 h = Hash.new(0)
 n.times {h[ yield ] += 1}
 
 target = 1.0 * n / h.length
 h.each do |key, value| 
   if (value - target).abs > 0.01 * delta * n
     raise StandardError,
       "distribution potentially skewed for '#{key}': expected around #{target}, got #{value}"
   end
 end
 
 h.keys.sort.each {|k| print "#{k} #{h[k]} "}
 puts

end

if __FILE__ == $0

 begin
   distcheck(100_000) {rand(10)}
   distcheck(100_000) {rand > 0.95} 
 rescue StandardError => e
   p e
 end

end</lang>

output:

0 9986 1 9826 2 9861 3 10034 4 9876 5 10114 6 10329 7 9924 8 10123 9 9927 
#<StandardError: distribution potentially skewed for 'false': expected around 50000.0, got 94841>

Tcl

<lang tcl>proc distcheck {random times {delta 1}} {

   for {set i 0} {$i<$times} {incr i} {incr vals([uplevel 1 $random])}
   set target [expr {$times / [array size vals]}]
   foreach {k v} [array get vals] {
       if {abs($v - $target) > $times  * $delta / 100.0} {
          error "distribution potentially skewed for $k: expected around $target, got $v"
       }
   }
   foreach k [lsort -integer [array names vals]] {lappend result $k $vals($k)}
   return $result

}</lang> Demonstration: <lang tcl># First, a uniformly distributed random variable puts [distcheck {expr {int(10*rand())}} 100000]

  1. Now, one that definitely isn't!

puts [distcheck {expr {rand()>0.95}} 100000]</lang> Which produces this output (error in red):

0 10003 1 9851 2 10058 3 10193 4 10126 5 10002 6 9852 7 9964 8 9957 9 9994
distribution potentially skewed for 0: expected around 50000, got 94873