Verify distribution uniformity/Chi-squared test

Revision as of 22:30, 17 June 2014 by Underscore (talk | contribs) (Added Hy.)

In this task, write a function to verify that a given distribution of values is uniform by using the test to see if the distribution has a likelihood of happening of at least the significance level (conventionally 5%). The function should return a boolean that is true if the distribution is one that a uniform distribution (with appropriate number of degrees of freedom) may be expected to produce.

Task
Verify distribution uniformity/Chi-squared test
You are encouraged to solve this task according to the task description, using any language you may know.

More information about the distribution may be found at mathworld.

Ada

First, we specifay a simple package to compute the Chi-Square Distance from the uniform distribution: <lang Ada>package Chi_Square is

  type Flt is digits 18;
  type Bins_Type is array(Positive range <>) of Natural;
  
  function Distance(Bins: Bins_Type) return Flt;
  

end Chi_Square;</lang>

Next, we implement that package:

<lang Ada>package body Chi_Square is

  function Distance(Bins: Bins_Type) return Flt is
     Bad_Bins: Natural := 0;
     Sum: Natural := 0;
     Expected: Flt;
     Result: Flt;
  begin
     for I in Bins'Range loop
        if Bins(I) < 5 then 
           Bad_Bins := Bad_Bins + 1;
        end if;
        Sum := Sum + Bins(I);
     end loop;
     if 5*Bad_Bins > Bins'Length then 
        raise Program_Error with "too many (almost) empty bins";
     end if;
     
     Expected := Flt(Sum) / Flt(Bins'Length);
     Result := 0.0;
     for I in Bins'Range loop
        Result := Result + ((Flt(Bins(I)) - Expected)**2) / Expected;
     end loop;
     return Result;
  end Distance;
  

end Chi_Square;</lang>

Finally, we actually implement the Chi-square test. We do not actually compute the Chi-square probability; rather we hardcode a table of values for 5% significance level, which has been picked from Wikipedia [1]: <lang Ada>with Ada.Text_IO, Ada.Command_Line, Chi_Square; use Ada.Text_IO;

procedure Test_Chi_Square is

  package Ch2 renames Chi_Square; use Ch2;
  package FIO is new Float_IO(Flt);
  
  B: Bins_Type(1 .. Ada.Command_Line.Argument_Count);
  Bound_For_5_Per_Cent: constant array(Positive range <>) of Flt :=
    ( 1 => 3.84,   2 =>  5.99,  3 =>  7.82,  4 => 9.49,   5 =>  11.07,
      6 => 12.59,  7 => 14.07,  8 => 15.51,  9 => 16.92, 10 =>  18.31);
    -- picked from http://en.wikipedia.org/wiki/Chi-squared_distribution
  
  Dist: Flt; 
  

begin

  for I in B'Range loop
     B(I) := Natural'Value(Ada.Command_Line.Argument(I));
  end loop;
  Dist := Distance(B);
  Put("Degrees of Freedom:" & Integer'Image(B'Length-1) & ", Distance: ");
  FIO.Put(Dist, Fore => 6, Aft => 2, Exp => 0);
  if Dist <= Bound_For_5_Per_Cent(B'Length-1) then
     Put_Line("; (apparently uniform)");
  else
     Put_Line("; (deviates significantly from uniform)");
  end if;

end;</lang>

Output:
$ ./Test_Chi_Square 199809 200665 199607 200270 199649  
Degrees of Freedom: 4, Distance:      4.15; (apparently uniform)
$ ./Test_Chi_Square 522573 244456 139979 71531 21461
Degrees of Freedom: 4, Distance: 790063.28; (deviates significantly from uniform)



C

This first sections contains the functions required to compute the Chi-Squared probability. These are not needed if a library containing the necessary function is availabile (e.g. see Numerical Integration, Gamma function). <lang c>#include <stdlib.h>

  1. include <stdio.h>
  2. include <math.h>
  3. ifndef M_PI
  4. define M_PI 3.14159265358979323846
  5. endif

typedef double (* Ifctn)( double t); /* Numerical integration method */ double Simpson3_8( Ifctn f, double a, double b, int N) {

   int j;
   double l1;
   double h = (b-a)/N;
   double h1 = h/3.0;
   double sum = f(a) + f(b);
   for (j=3*N-1; j>0; j--) {
       l1 = (j%3)? 3.0 : 2.0;
       sum += l1*f(a+h1*j) ;
   }
   return h*sum/8.0;

}

  1. define A 12

double Gamma_Spouge( double z ) {

   int k;
   static double cspace[A];
   static double *coefs = NULL;
   double accum;
   double a = A;
   if (!coefs) {
       double k1_factrl = 1.0;
       coefs = cspace;
       coefs[0] = sqrt(2.0*M_PI);
       for(k=1; k<A; k++) {
           coefs[k] = exp(a-k) * pow(a-k,k-0.5) / k1_factrl;
           k1_factrl *= -k;
       }
   }
   accum = coefs[0];
   for (k=1; k<A; k++) {
       accum += coefs[k]/(z+k);
   }
   accum *= exp(-(z+a)) * pow(z+a, z+0.5);
   return accum/z;

}

double aa1; double f0( double t) {

   return  pow(t, aa1)*exp(-t); 

}

double GammaIncomplete_Q( double a, double x) {

   double y, h = 1.5e-2;  /* approximate integration step size */
   /* this cuts off the tail of the integration to speed things up */
   y = aa1 = a-1;
   while((f0(y) * (x-y) > 2.0e-8) && (y < x))   y += .4;
   if (y>x) y=x;
   return 1.0 - Simpson3_8( &f0, 0, y, (int)(y/h))/Gamma_Spouge(a);

}</lang> This section contains the functions specific to the task. <lang c>double chi2UniformDistance( double *ds, int dslen) {

   double expected = 0.0;
   double sum = 0.0;
   int k;
   for (k=0; k<dslen; k++) 
       expected += ds[k];
   expected /= k;
   for (k=0; k<dslen; k++) {
       double x = ds[k] - expected;
       sum += x*x;
   }
   return sum/expected;

}

double chi2Probability( int dof, double distance) {

   return GammaIncomplete_Q( 0.5*dof, 0.5*distance);

}

int chiIsUniform( double *dset, int dslen, double significance) {

   int dof = dslen -1;
   double dist = chi2UniformDistance( dset, dslen);
   return chi2Probability( dof, dist ) > significance;

}</lang> Testing <lang c>int main(int argc, char **argv) {

   double dset1[] = { 199809., 200665., 199607., 200270., 199649. };
   double dset2[] = { 522573., 244456., 139979.,  71531.,  21461. };
   double *dsets[] = { dset1, dset2 };
   int     dslens[] = { 5, 5 };
   int k, l;
   double  dist, prob;
   int dof;
   for (k=0; k<2; k++) {
       printf("Dataset: [ ");
       for(l=0;l<dslens[k]; l++) 
           printf("%.0f, ", dsets[k][l]);
           
       printf("]\n");
       dist = chi2UniformDistance(dsets[k], dslens[k]);
       dof = dslens[k]-1;
       printf("dof: %d  distance: %.4f", dof, dist);
       prob = chi2Probability( dof, dist );
       printf(" probability: %.6f", prob);
       printf(" uniform? %s\n", chiIsUniform(dsets[k], dslens[k], 0.05)? "Yes":"No");
   }
   return 0;

}</lang>

D

<lang d>import std.stdio, std.algorithm, std.mathspecial;

real x2Dist(T)(in T[] data) pure nothrow @safe @nogc {

   immutable avg = data.sum / data.length;
   immutable sqs = reduce!((a, b) => a + (b - avg) ^^ 2)(0.0L, data);
   return sqs / avg;

}

real x2Prob(in real dof, in real distance) pure nothrow @safe @nogc {

   return gammaIncompleteCompl(dof / 2, distance / 2);

}

bool x2IsUniform(T)(in T[] data, in real significance=0.05L) pure nothrow @safe @nogc {

   return x2Prob(data.length - 1.0L, x2Dist(data)) > significance;

}

void main() {

   immutable dataSets = [[199809, 200665, 199607, 200270, 199649],
                         [522573, 244456, 139979,  71531,  21461]];
   writefln(" %4s %12s  %12s %8s   %s",
            "dof", "distance", "probability", "Uniform?", "dataset");
   foreach (immutable ds; dataSets) {
       immutable dof = ds.length - 1;
       immutable dist = ds.x2Dist;
       immutable prob = x2Prob(dof, dist);
       writefln("%4d %12.3f  %12.8f    %5s    %6s",
                dof, dist, prob, ds.x2IsUniform ? "YES" : "NO", ds);
   }

}</lang>

Output:
  dof     distance   probability Uniform?   dataset
   4        4.146    0.38657083      YES    [199809, 200665, 199607, 200270, 199649]
   4   790063.276    0.00000000       NO    [522573, 244456, 139979,  71531,  21461]

Fortran

Works with: Fortran version 95

Instead of implementing the incomplete gamma function by ourselves, we bind to GNU Scientific Library; so we need a module to interface to the function we need (gsl_sf_gamma_inc)

<lang fortran>module GSLMiniBind

 implicit none
 interface
    real(c_double) function gsl_sf_gamma_inc(a, x) bind(C)
      use iso_c_binding
      real(c_double), value, intent(in) :: a, x
    end function gsl_sf_gamma_inc
 end interface

end module GSLMiniBind</lang>

Now we're ready to complete the task.

<lang fortran>program ChiTest

 use GSLMiniBind
 use iso_c_binding
 implicit none
 real, dimension(5) :: dset1 = (/ 199809., 200665., 199607., 200270., 199649. /)
 real, dimension(5) :: dset2 = (/ 522573., 244456., 139979.,  71531.,  21461. /)
 real :: dist, prob
 integer :: dof
 print *, "Dataset 1:"
 print *, dset1
 dist = chi2UniformDistance(dset1)
 dof = size(dset1) - 1
 write(*, '(A,I4,A,F12.4)') 'dof: ', dof, '   distance: ', dist
 prob = chi2Probability(dof, dist)
 write(*, '(A,F9.6)') 'probability: ', prob
 write(*, '(A,L)') 'uniform? ', chiIsUniform(dset1, 0.05)
 ! Lazy copy/past :|
 print *, "Dataset 2:"
 print *, dset2
 dist = chi2UniformDistance(dset2)
 dof = size(dset2) - 1
 write(*, '(A,I4,A,F12.4)') 'dof: ', dof, '   distance: ', dist
 prob = chi2Probability(dof, dist)
 write(*, '(A,F9.6)') 'probability: ', prob
 write(*, '(A,L)') 'uniform? ', chiIsUniform(dset2, 0.05)

contains

function chi2Probability(dof, distance)

 real :: chi2Probability
 integer, intent(in) :: dof
 real, intent(in) :: distance
 ! in order to make this work, we need linking with GSL library
 chi2Probability = gsl_sf_gamma_inc(real(0.5*dof, c_double), real(0.5*distance, c_double))

end function chi2Probability

function chiIsUniform(dset, significance)

 logical :: chiIsUniform
 real, dimension(:), intent(in) :: dset
 real, intent(in) :: significance
 integer :: dof
 real :: dist
 dof = size(dset) - 1
 dist = chi2UniformDistance(dset)
 chiIsUniform = chi2Probability(dof, dist) > significance

end function chiIsUniform

function chi2UniformDistance(ds)

 real :: chi2UniformDistance
 real, dimension(:), intent(in) :: ds
 real :: expected, summa = 0.0
 expected = sum(ds) / size(ds)
 summa = sum( (ds - expected) ** 2 )
 
 chi2UniformDistance = summa / expected

end function chi2UniformDistance

end program ChiTest</lang>

Go

Translation of: C

Go has a nice gamma function in the library. Otherwise, it's mostly a port from C. Note, this implementation of the incomplete gamma function works for these two test cases, but, I believe, has serious limitations. See talk page. <lang go>package main

import (

   "fmt"
   "math"

)

type ifctn func(float64) float64

func simpson38(f ifctn, a, b float64, n int) float64 {

   h := (b - a) / float64(n)
   h1 := h / 3
   sum := f(a) + f(b)
   for j := 3*n - 1; j > 0; j-- {
       if j%3 == 0 {
           sum += 2 * f(a+h1*float64(j))
       } else {
           sum += 3 * f(a+h1*float64(j))
       }
   }
   return h * sum / 8

}

func gammaIncQ(a, x float64) float64 {

   aa1 := a - 1
   var f ifctn = func(t float64) float64 {
       return math.Pow(t, aa1) * math.Exp(-t)
   }
   y := aa1
   h := 1.5e-2
   for f(y)*(x-y) > 2e-8 && y < x {
       y += .4
   }
   if y > x {
       y = x
   }
   return 1 - simpson38(f, 0, y, int(y/h/math.Gamma(a)))

}

func chi2ud(ds []int) float64 {

   var sum, expected float64
   for _, d := range ds {
       expected += float64(d)
   }
   expected /= float64(len(ds))
   for _, d := range ds {
       x := float64(d) - expected
       sum += x * x
   }
   return sum / expected

}

func chi2p(dof int, distance float64) float64 {

   return gammaIncQ(.5*float64(dof), .5*distance)

}

const sigLevel = .05

func main() {

   for _, dset := range [][]int{
       {199809, 200665, 199607, 200270, 199649},
       {522573, 244456, 139979, 71531, 21461},
   } {
       utest(dset)
   }

}

func utest(dset []int) {

   fmt.Println("Uniform distribution test")
   var sum int
   for _, c := range dset {
       sum += c
   }
   fmt.Println(" dataset:", dset)
   fmt.Println(" samples:                      ", sum)
   fmt.Println(" categories:                   ", len(dset))
   
   dof := len(dset) - 1
   fmt.Println(" degrees of freedom:           ", dof)
   dist := chi2ud(dset)
   fmt.Println(" chi square test statistic:    ", dist)
   
   p := chi2p(dof, dist)
   fmt.Println(" p-value of test statistic:    ", p)
   sig := p < sigLevel
   fmt.Printf(" significant at %2.0f%% level?      %t\n", sigLevel*100, sig)
   fmt.Println(" uniform?                      ", !sig, "\n")

}</lang> Output:

Uniform distribution test
 dataset: [199809 200665 199607 200270 199649]
 samples:                       1000000
 categories:                    5
 degrees of freedom:            4
 chi square test statistic:     4.14628
 p-value of test statistic:     0.3865708330827673
 significant at  5% level?      false
 uniform?                       true 

Uniform distribution test
 dataset: [522573 244456 139979 71531 21461]
 samples:                       1000000
 categories:                    5
 degrees of freedom:            4
 chi square test statistic:     790063.27594
 p-value of test statistic:     2.3528290427066167e-11
 significant at  5% level?      true
 uniform?                       false 

Hy

<lang lisp>(import

 [scipy.stats [chisquare]]
 [collections [Counter]])

(defn uniform? [f repeats &optional [alpha .05]]

 "Call 'f' 'repeats' times and do a chi-squared test for uniformity
 of the resulting discrete distribution. Return false iff the
 null hypothesis of uniformity is rejected for the test with
 size 'alpha'."
 (<= alpha (second (chisquare
   (.values (Counter (take repeats (repeatedly f))))))))</lang>

Examples of use:

<lang lisp>(import [random [randint]])

(for [f [

   (fn [] (randint 1 10))
   (fn [] (if (randint 0 1) (randint 1 9) (randint 1 10)))]]
 (print (uniform? f 5000)))</lang>

J

Solution (Tacit): <lang j>require 'stats/base'

countCats=: #@~. NB. counts the number of unique items getExpected=: #@] % [ NB. divides no of items by category count getObserved=: #/.~@] NB. counts frequency for each category calcX2=: [: +/ *:@(getObserved - getExpected) % getExpected NB. calculates test statistic calcDf=: <:@[ NB. calculates degrees of freedom for uniform distribution

NB.*isUniform v Tests (5%) whether y is uniformly distributed NB. result is: boolean describing if distribution y is uniform NB. y is: distribution to test NB. x is: optionally specify number of categories possible isUniform=: (countCats $: ]) : (0.95 > calcDf chisqcdf :: 1: calcX2)</lang>

Solution (Explicit): <lang j>require 'stats/base'

NB.*isUniformX v Tests (5%) whether y is uniformly distributed NB. result is: boolean describing if distribution y is uniform NB. y is: distribution to test NB. x is: optionally specify number of categories possible isUniformX=: verb define

 (#~. y) isUniformX y
 signif=. 0.95                    NB. set significance level
 expected=. (#y) % x              NB. number of items divided by the category count
 observed=. #/.~ y                NB. frequency count for each category
 X2=. +/ (*: observed - expected) % expected  NB. the test statistic
 degfreedom=. <: x                NB. degrees of freedom
 signif > degfreedom chisqcdf :: 1: X2

)</lang>

Example Usage: <lang j> FairDistrib=: 1e6 ?@$ 5

  UnfairDistrib=: (9.5e5 ?@$ 5) , (5e4 ?@$ 4)
  isUniformX FairDistrib

1

  isUniformX UnfairDistrib

0

  isUniform 4 4 4 5 5 5 5 5 5 5     NB. uniform if only 2 categories possible

1

  4 isUniform 4 4 4 5 5 5 5 5 5 5   NB. not uniform if 4 categories possible

0</lang>

Mathematica

This code explicity assumes a discrete uniform distribution since the chi square test is a poor test choice for continuous distributions and requires Mathematica version 2 or later <lang Mathematica>discreteUniformDistributionQ[data_, {min_Integer, max_Integer}, confLevel_: .05] := If[$VersionNumber >= 8,

 confLevel <= PearsonChiSquareTest[data, DiscreteUniformDistribution[{min, max}]],
 Block[{v, k = max - min, n = Length@data},
  v = (k + 1) (Plus @@ (((Length /@ Split[Sort@data]))^2))/n - n;
  GammaRegularized[k/2, 0, v/2] <= 1 - confLevel]]

discreteUniformDistributionQ[data_] :=discreteUniformDistributionQ[data, data[[Ordering[data][[{1, -1}]]]]]</lang> code used to create test data requires Mathematica version 6 or later <lang Mathematica>uniformData = RandomInteger[10, 100]; nonUniformData = Total@RandomInteger[10, {5, 100}];</lang> <lang Mathematica>{discreteUniformDistributionQ[uniformData],discreteUniformDistributionQ[nonUniformData]}</lang>

Output:
{True,False}

OCaml

This code needs to be compiled with library gsl.cma.

<lang ocaml>let sqr x = x *. x

let chi2UniformDistance distrib =

 let count, len = Array.fold_left (fun (s, c) e -> s + e, succ c)
 				   (0, 0) distrib in
 let expected = float count /. float len in
 let distance = Array.fold_left (fun s e ->
   s +. sqr (float e -. expected) /. expected
 ) 0. distrib in
 let dof = float (pred len) in
 dof, distance

let chi2Proba dof distance =

 Gsl_sf.gamma_inc_Q (0.5 *. dof) (0.5 *. distance)

let chi2IsUniform distrib significance =

 let dof, distance = chi2UniformDistance distrib in
 let likelihoodOfRandom = chi2Proba dof distance in
 likelihoodOfRandom > significance

let _ =

 List.iter (fun distrib ->
   let dof, distance = chi2UniformDistance distrib in
   Printf.printf "distribution ";
   Array.iter (Printf.printf "\t%d") distrib;
   Printf.printf "\tdistance %g" distance;
   Printf.printf "\t[%g > 0.05]" (chi2Proba dof distance);
   if chi2IsUniform distrib 0.05 then Printf.printf " fair\n"
   else Printf.printf " unfair\n"
 ) 
 [
   [| 199809; 200665; 199607; 200270; 199649 |];
   [| 522573; 244456; 139979; 71531; 21461 |]
 ]</lang>

Output

distribution    199809  200665  199607  200270  199649  distance 4.14628        [0.386571 > 0.05] fair
distribution    522573  244456  139979  71531   21461   distance 790063 [0 > 0.05] unfair

PARI/GP

The solution is very easy in GP since PARI includes a good incomplete gamma implementation; the sum function is also useful for clarity. Most of the code is just for displaying results.

The sample data for the test was taken from Go. <lang parigp>cumChi2(chi2,dof)={ my(g=gamma(dof/2)); incgam(dof/2,chi2/2,g)/g }; test(v,alpha=.05)={ my(chi2,p,s=sum(i=1,#v,v[i]),ave=s/#v); print("chi^2 statistic: ",chi2=sum(i=1,#v,(v[i]-ave)^2)/ave); print("p-value: ",p=cumChi2(chi2,#v-1)); if(p<alpha, print("Significant at the alpha = "alpha" level: not uniform"); , print("Not significant at the alpha = "alpha" level: uniform"); ) };

test([199809, 200665, 199607, 200270, 199649]) test([522573, 244456, 139979, 71531, 21461])</lang>

Perl 6

For the incomplete gamma function we use a series expansion related to Kummer's confluent hypergeometric function (see http://en.wikipedia.org/wiki/Incomplete_gamma_function#Evaluation_formulae). The gamma function is calculated in closed form, as we only need its value at integers and half integers.

<lang perl6>sub incomplete-γ-series($s, $z) {

   my \numers = $z X** 1..*;
   my \denoms = [\*] $s X+ 1..*;
   my $M = 1 + [+] (numers Z/ denoms) ... * < 1e-6;
   $z**$s / $s * exp(-$z) * $M;

}

sub postfix:<!>(Int $n) { [*] 2..$n }

sub Γ-of-half(Int $n where * > 0) {

   ($n %% 2) ?? (($_-1)!                            given  $n    div 2)
             !! ((2*$_)! / (4**$_ * $_!) * sqrt(pi) given ($n-1) div 2);

}

  1. degrees of freedom constrained due to numerical limitations

sub chi-squared-cdf(Int $k where 1..200, $x where * >= 0) {

   my $f = $k < 20 ?? 20 !! 10;
   given $x {
       when 0                    { 0.0 }
       when * < $k + $f*sqrt($k) { incomplete-γ-series($k/2, $x/2) / Γ-of-half($k) }
       default                   { 1.0 }
   }

}

sub chi-squared-test(@bins, :$significance = 0.05) {

   my $n = +@bins;
   my $N = [+] @bins;
   my $expected = $N / $n;
   my $chi-squared = [+] @bins.map: { ($^bin - $expected)**2 / $expected }
   my $p-value = 1 - chi-squared-cdf($n-1, $chi-squared);
   return (:$chi-squared, :$p-value, :uniform($p-value > $significance));

}

for [< 199809 200665 199607 200270 199649 >],

   [< 522573 244456 139979  71531  21461 >]
   -> $dataset

{

   my %t = chi-squared-test($dataset);
   say 'data: ', $dataset;
   say "χ² = {%t<chi-squared>}, p-value = {%t<p-value>.fmt('%.4f')}, uniform = {%t<uniform>}";

}</lang>

Output:
data: 199809 200665 199607 200270 199649
χ² = 4.14628, p-value = 0.3866, uniform = True
data: 522573 244456 139979 71531 21461
χ² = 790063.27594, p-value = 0.0000, uniform = False

Python

Implements the Chi Square Probability function with an integration. I'm sure there are better ways to do this. Compare to OCaml implementation. <lang python>import math import random

def GammaInc_Q( a, x):

   a1 = a-1
   a2 = a-2
   def f0( t ):
       return t**a1*math.exp(-t)
   def df0(t):
       return (a1-t)*t**a2*math.exp(-t)
   
   y = a1
   while f0(y)*(x-y) >2.0e-8 and y < x: y += .3
   if y > x: y = x
   h = 3.0e-4
   n = int(y/h)
   h = y/n
   hh = 0.5*h
   gamax = h * sum( f0(t)+hh*df0(t) for t in ( h*j for j in xrange(n-1, -1, -1)))
   return gamax/gamma_spounge(a)

c = None def gamma_spounge( z):

   global c
   a = 12
   if c is None:
      k1_factrl = 1.0
      c = []
      c.append(math.sqrt(2.0*math.pi))
      for k in range(1,a):
         c.append( math.exp(a-k) * (a-k)**(k-0.5) / k1_factrl )
         k1_factrl *= -k
   
   accm = c[0]
   for k in range(1,a):
       accm += c[k] / (z+k)
   accm *= math.exp( -(z+a)) * (z+a)**(z+0.5)
   return accm/z;

def chi2UniformDistance( dataSet ):

   expected = sum(dataSet)*1.0/len(dataSet)
   cntrd = (d-expected for d in dataSet)
   return sum(x*x for x in cntrd)/expected

def chi2Probability(dof, distance):

   return 1.0 - GammaInc_Q( 0.5*dof, 0.5*distance)

def chi2IsUniform(dataSet, significance):

   dof = len(dataSet)-1
   dist = chi2UniformDistance(dataSet)
   return chi2Probability( dof, dist ) > significance

dset1 = [ 199809, 200665, 199607, 200270, 199649 ] dset2 = [ 522573, 244456, 139979, 71531, 21461 ]

for ds in (dset1, dset2):

   print "Data set:", ds
   dof = len(ds)-1
   distance =chi2UniformDistance(ds)
   print "dof: %d distance: %.4f" % (dof, distance),
   prob = chi2Probability( dof, distance)
   print "probability: %.4f"%prob,
   print "uniform? ", "Yes"if chi2IsUniform(ds,0.05) else "No"</lang>

Output:

Data set: [199809, 200665, 199607, 200270, 199649]
dof: 4 distance: 4.146280 probability: 0.3866 uniform?  Yes
Data set: [522573, 244456, 139979, 71531, 21461]
dof: 4 distance: 790063.275940 probability: 0.0000 uniform?  No

R

R being a statistical computating language, the chi-squared test is built in with the function "chisq.test" <lang tcl> dset1=c(199809,200665,199607,200270,199649) dset2=c(522573,244456,139979,71531,21461)

chi2IsUniform<-function(dataset,significance=0.05){

 chi2IsUniform=(chisq.test(dataset)$p.value>significance)

}

for (ds in list(dset1,dset2)){

 print(c("Data set:",ds))
 print(chisq.test(ds))
 print(paste("uniform?",chi2IsUniform(ds)))

} </lang>

Output:

[1] "Data set:" "199809"    "200665"    "199607"    "200270"    "199649"   

        Chi-squared test for given probabilities

data:  ds 
X-squared = 4.1463, df = 4, p-value = 0.3866

[1] "uniform? TRUE"
[1] "Data set:" "522573"    "244456"    "139979"    "71531"     "21461"    

        Chi-squared test for given probabilities

data:  ds 
X-squared = 790063.3, df = 4, p-value < 2.2e-16

[1] "uniform? FALSE"

Racket

<lang racket>

  1. lang racket

(require

racket/flonum (planet williams/science:4:5/science)
(only-in (planet williams/science/unsafe-ops-utils) real->float))
(chi^2-goodness-of-fit-test observed expected df)
Given
observed, a sequence of observed frequencies
expected, a sequence of expected frequencies
df, the degrees of freedom
Result
P-value = 1-chi^2cdf(X^2,df) , the p-value

(define (chi^2-goodness-of-fit-test observed expected df)

 (define X^2 (for/sum ([o observed] [e expected])
               (/ (sqr (- o e)) e)))
 (- 1.0 (chi-squared-cdf X^2 df)))

(define (is-uniform? rand n α)

 ; Use significance level α to test whether
 ; n small random numbers generated by rand
 ; have a uniform distribution.
 ; Observed values:
 (define o (make-vector 10 0))
 ; generate n random integers from 0 to 9.
 (for ([_ (+ n 1)])
   (define r (rand 10))
   (vector-set! o r (+ (vector-ref o r) 1)))
 ; Expected values:
 (define ex (make-vector 10 (/ n 10)))
 ; Calculate the P-value:
 (define P (chi^2-goodness-of-fit-test o ex (- n 1)))
 
 ; If the P-value is larger than α we accept the
 ; hypothesis that the numbers are distributed uniformly.
 (> P α))
Test whether the builtin generator is uniform

(is-uniform? random 1000 0.05)

Test whether the constant generator fails

(is-uniform? (λ(_) 5) 1000 0.05) </lang> Output: <lang racket>

  1. t
  2. f

</lang>

Ruby

Translation of: Python

<lang ruby>def gammaInc_Q(a, x)

 a1, a2 = a-1, a-2
 f0  = lambda {|t| t**a1 * Math.exp(-t)}
 df0 = lambda {|t| (a1-t) * t**a2 * Math.exp(-t)}
 
 y = a1
 y += 0.3  while f0[y]*(x-y) > 2.0e-8 and y < x
 y = x  if y > x
 
 h = 3.0e-4
 n = (y/h).to_i
 h = y/n
 hh = 0.5 * h
 sum = 0
 (n-1).step(0, -1) do |j|
   t = h * j
   sum += f0[t] + hh * df0[t]
 end
 h * sum / gamma_spounge(a)

end

A = 12 k1_factrl = 1.0 coef = [Math.sqrt(2.0*Math::PI)] COEF = (1...A).each_with_object(coef) do |k,c|

 c << Math.exp(A-k) * (A-k)**(k-0.5) / k1_factrl
 k1_factrl *= -k

end

def gamma_spounge(z)

 accm = (1...A).inject(COEF[0]){|res,k| res += COEF[k] / (z+k)}
 accm * Math.exp(-(z+A)) * (z+A)**(z+0.5) / z

end

def chi2UniformDistance(dataSet)

 expected = dataSet.inject(:+).to_f / dataSet.size
 dataSet.map{|d|(d-expected)**2}.inject(:+) / expected

end

def chi2Probability(dof, distance)

 1.0 - gammaInc_Q(0.5*dof, 0.5*distance)

end

def chi2IsUniform(dataSet, significance=0.05)

 dof = dataSet.size - 1
 dist = chi2UniformDistance(dataSet)
 chi2Probability(dof, dist) > significance

end

dsets = [ [ 199809, 200665, 199607, 200270, 199649 ],

         [ 522573, 244456, 139979,  71531,  21461 ] ]

for ds in dsets

 puts "Data set:#{ds}"
 dof = ds.size - 1
 puts "  degrees of freedom: %d" % dof
 distance = chi2UniformDistance(ds)
 puts "  distance:           %.4f" % distance
 puts "  probability:        %.4f" % chi2Probability(dof, distance)
 puts "  uniform?            %s" % (chi2IsUniform(ds) ? "Yes" : "No")

end</lang>

Output:
Data set:[199809, 200665, 199607, 200270, 199649]
  degrees of freedom: 4
  distance:           4.1463
  probability:        0.3866
  uniform?            Yes
Data set:[522573, 244456, 139979, 71531, 21461]
  degrees of freedom: 4
  distance:           790063.2759
  probability:        -0.0000
  uniform?            No

Tcl

Works with: Tcl version 8.5
Library: Tcllib (Package: math::statistics)

<lang tcl>package require Tcl 8.5 package require math::statistics

proc isUniform {distribution {significance 0.05}} {

   set count [tcl::mathop::+ {*}[dict values $distribution]]
   set expected [expr {double($count) / [dict size $distribution]}]
   set X2 0.0
   foreach value [dict values $distribution] {

set X2 [expr {$X2 + ($value - $expected)**2 / $expected}]

   }
   set degreesOfFreedom [expr {[dict size $distribution] - 1}]
   set likelihoodOfRandom [::math::statistics::incompleteGamma \

[expr {$degreesOfFreedom / 2.0}] [expr {$X2 / 2.0}]]

   expr {$likelihoodOfRandom > $significance}

}</lang> Testing: <lang tcl>proc makeDistribution {operation {count 1000000}} {

   for {set i 0} {$i<$count} {incr i} {incr distribution([uplevel 1 $operation])}
   return [array get distribution]

}

set distFair [makeDistribution {expr int(rand()*5)}] puts "distribution \"$distFair\" assessed as [expr [isUniform $distFair]?{fair}:{unfair}]" set distUnfair [makeDistribution {expr int(rand()*rand()*5)}] puts "distribution \"$distUnfair\" assessed as [expr [isUniform $distUnfair]?{fair}:{unfair}]"</lang> Output:

distribution "0 199809 4 199649 1 200665 2 199607 3 200270" assessed as fair
distribution "4 21461 0 522573 1 244456 2 139979 3 71531" assessed as unfair