Verify distribution uniformity/Chi-squared test

Task

Write a function to verify that a given distribution of values is uniform by using the $\chi ^{2}$ test to see if the distribution has a likelihood of happening of at least the significance level (conventionally 5%). The function should return a boolean that is true if the distribution is one that a uniform distribution (with appropriate number of degrees of freedom) may be expected to produce.

Reference

an entry at the MathWorld website: chi-squared distribution.

Ada

First, we specify a simple package to compute the Chi-Square Distance from the uniform distribution: <lang Ada>package Chi_Square is

  type Flt is digits 18;
  type Bins_Type is array(Positive range <>) of Natural;
  
  function Distance(Bins: Bins_Type) return Flt;

end Chi_Square;</lang>

Next, we implement that package:

<lang Ada>package body Chi_Square is

  function Distance(Bins: Bins_Type) return Flt is
     Bad_Bins: Natural := 0;
     Sum: Natural := 0;
     Expected: Flt;
     Result: Flt;
  begin
     for I in Bins'Range loop
        if Bins(I) < 5 then 
           Bad_Bins := Bad_Bins + 1;
        end if;
        Sum := Sum + Bins(I);
     end loop;
     if 5*Bad_Bins > Bins'Length then 
        raise Program_Error with "too many (almost) empty bins";
     end if;
     
     Expected := Flt(Sum) / Flt(Bins'Length);
     Result := 0.0;
     for I in Bins'Range loop
        Result := Result + ((Flt(Bins(I)) - Expected)**2) / Expected;
     end loop;
     return Result;
  end Distance;

end Chi_Square;</lang>

Finally, we actually implement the Chi-square test. We do not actually compute the Chi-square probability; rather we hardcode a table of values for 5% significance level, which has been picked from Wikipedia [1]: <lang Ada>with Ada.Text_IO, Ada.Command_Line, Chi_Square; use Ada.Text_IO;

procedure Test_Chi_Square is

  package Ch2 renames Chi_Square; use Ch2;
  package FIO is new Float_IO(Flt);
  
  B: Bins_Type(1 .. Ada.Command_Line.Argument_Count);
  Bound_For_5_Per_Cent: constant array(Positive range <>) of Flt :=
    ( 1 => 3.84,   2 =>  5.99,  3 =>  7.82,  4 => 9.49,   5 =>  11.07,
      6 => 12.59,  7 => 14.07,  8 => 15.51,  9 => 16.92, 10 =>  18.31);
    -- picked from http://en.wikipedia.org/wiki/Chi-squared_distribution
  
  Dist: Flt;

begin

  for I in B'Range loop
     B(I) := Natural'Value(Ada.Command_Line.Argument(I));
  end loop;
  Dist := Distance(B);
  Put("Degrees of Freedom:" & Integer'Image(B'Length-1) & ", Distance: ");
  FIO.Put(Dist, Fore => 6, Aft => 2, Exp => 0);
  if Dist <= Bound_For_5_Per_Cent(B'Length-1) then
     Put_Line("; (apparently uniform)");
  else
     Put_Line("; (deviates significantly from uniform)");
  end if;

end;</lang>

Output:

$ ./Test_Chi_Square 199809 200665 199607 200270 199649  
Degrees of Freedom: 4, Distance:      4.15; (apparently uniform)
$ ./Test_Chi_Square 522573 244456 139979 71531 21461
Degrees of Freedom: 4, Distance: 790063.28; (deviates significantly from uniform)

C

This first sections contains the functions required to compute the Chi-Squared probability. These are not needed if a library containing the necessary function is availabile (e.g. see Numerical Integration, Gamma function). <lang c>#include <stdlib.h>

include <stdio.h>
include <math.h>
ifndef M_PI
define M_PI 3.14159265358979323846
endif

typedef double (* Ifctn)( double t); /* Numerical integration method */ double Simpson3_8( Ifctn f, double a, double b, int N) {

   int j;
   double l1;
   double h = (b-a)/N;
   double h1 = h/3.0;
   double sum = f(a) + f(b);

   for (j=3*N-1; j>0; j--) {
       l1 = (j%3)? 3.0 : 2.0;
       sum += l1*f(a+h1*j) ;
   }
   return h*sum/8.0;

}

define A 12

double Gamma_Spouge( double z ) {

   int k;
   static double cspace[A];
   static double *coefs = NULL;
   double accum;
   double a = A;

   if (!coefs) {
       double k1_factrl = 1.0;
       coefs = cspace;
       coefs[0] = sqrt(2.0*M_PI);
       for(k=1; k<A; k++) {
           coefs[k] = exp(a-k) * pow(a-k,k-0.5) / k1_factrl;
           k1_factrl *= -k;
       }
   }

   accum = coefs[0];
   for (k=1; k<A; k++) {
       accum += coefs[k]/(z+k);
   }
   accum *= exp(-(z+a)) * pow(z+a, z+0.5);
   return accum/z;

}

double aa1; double f0( double t) {

   return  pow(t, aa1)*exp(-t);

}

double GammaIncomplete_Q( double a, double x) {

   double y, h = 1.5e-2;  /* approximate integration step size */

   /* this cuts off the tail of the integration to speed things up */
   y = aa1 = a-1;
   while((f0(y) * (x-y) > 2.0e-8) && (y < x))   y += .4;
   if (y>x) y=x;

   return 1.0 - Simpson3_8( &f0, 0, y, (int)(y/h))/Gamma_Spouge(a);

}</lang> This section contains the functions specific to the task. <lang c>double chi2UniformDistance( double *ds, int dslen) {

   double expected = 0.0;
   double sum = 0.0;
   int k;

   for (k=0; k<dslen; k++) 
       expected += ds[k];
   expected /= k;

   for (k=0; k<dslen; k++) {
       double x = ds[k] - expected;
       sum += x*x;
   }
   return sum/expected;

}

double chi2Probability( int dof, double distance) {

   return GammaIncomplete_Q( 0.5*dof, 0.5*distance);

}

int chiIsUniform( double *dset, int dslen, double significance) {

   int dof = dslen -1;
   double dist = chi2UniformDistance( dset, dslen);
   return chi2Probability( dof, dist ) > significance;

}</lang> Testing <lang c>int main(int argc, char **argv) {

   double dset1[] = { 199809., 200665., 199607., 200270., 199649. };
   double dset2[] = { 522573., 244456., 139979.,  71531.,  21461. };
   double *dsets[] = { dset1, dset2 };
   int     dslens[] = { 5, 5 };
   int k, l;
   double  dist, prob;
   int dof;

   for (k=0; k<2; k++) {
       printf("Dataset: [ ");
       for(l=0;l<dslens[k]; l++) 
           printf("%.0f, ", dsets[k][l]);
           
       printf("]\n");
       dist = chi2UniformDistance(dsets[k], dslens[k]);
       dof = dslens[k]-1;
       printf("dof: %d  distance: %.4f", dof, dist);
       prob = chi2Probability( dof, dist );
       printf(" probability: %.6f", prob);
       printf(" uniform? %s\n", chiIsUniform(dsets[k], dslens[k], 0.05)? "Yes":"No");
   }
   return 0;

}</lang>

D

<lang d>import std.stdio, std.algorithm, std.mathspecial;

real x2Dist(T)(in T[] data) pure nothrow @safe @nogc {

   immutable avg = data.sum / data.length;
   immutable sqs = reduce!((a, b) => a + (b - avg) ^^ 2)(0.0L, data);
   return sqs / avg;

}

real x2Prob(in real dof, in real distance) pure nothrow @safe @nogc {

   return gammaIncompleteCompl(dof / 2, distance / 2);

}

bool x2IsUniform(T)(in T[] data, in real significance=0.05L) pure nothrow @safe @nogc {

   return x2Prob(data.length - 1.0L, x2Dist(data)) > significance;

}

void main() {

   immutable dataSets = [[199809, 200665, 199607, 200270, 199649],
                         [522573, 244456, 139979,  71531,  21461]];
   writefln(" %4s %12s  %12s %8s   %s",
            "dof", "distance", "probability", "Uniform?", "dataset");
   foreach (immutable ds; dataSets) {
       immutable dof = ds.length - 1;
       immutable dist = ds.x2Dist;
       immutable prob = x2Prob(dof, dist);
       writefln("%4d %12.3f  %12.8f    %5s    %6s",
                dof, dist, prob, ds.x2IsUniform ? "YES" : "NO", ds);
   }

}</lang>

Output:

  dof     distance   probability Uniform?   dataset
   4        4.146    0.38657083      YES    [199809, 200665, 199607, 200270, 199649]
   4   790063.276    0.00000000       NO    [522573, 244456, 139979,  71531,  21461]

Elixir

Translation of: Ruby

<lang elixir>defmodule Verify do

 defp gammaInc_Q(a, x) do
   a1 = a-1
   f0  = fn t -> :math.pow(t, a1) * :math.exp(-t) end
   df0 = fn t -> (a1-t) * :math.pow(t, a-2) * :math.exp(-t) end
   y = while_loop(f0, x, a1)
   n = trunc(y / 3.0e-4)
   h = y / n
   hh = 0.5 * h
   sum = Enum.reduce(n-1 .. 0, 0, fn j,sum ->
     t = h * j
     sum + f0.(t) + hh * df0.(t)
   end)
   h * sum / gamma_spounge(a, make_coef)
 end
 
 defp while_loop(f, x, y) do
   if f.(y)*(x-y) > 2.0e-8 and y < x, do: while_loop(f, x, y+0.3), else: min(x, y)
 end
 
 @a  12
 defp make_coef do
   coef0 = [:math.sqrt(2.0 * :math.pi)]
   {_, coef} = Enum.reduce(1..@a-1, {1.0, coef0}, fn k,{k1_factrl,c} ->
     h = :math.exp(@a-k) * :math.pow(@a-k, k-0.5) / k1_factrl
     {-k1_factrl*k, [h | c]}
   end)
   Enum.reverse(coef) |> List.to_tuple
 end
 
 defp gamma_spounge(z, coef) do
   accm = Enum.reduce(1..@a-1, elem(coef,0), fn k,res -> res + elem(coef,k) / (z+k) end)
   accm * :math.exp(-(z+@a)) * :math.pow(z+@a, z+0.5) / z
 end
 
 def chi2UniformDistance(dataSet) do
   expected = Enum.sum(dataSet) / length(dataSet)
   Enum.reduce(dataSet, 0, fn d,sum -> sum + (d-expected)*(d-expected) end) / expected
 end
 
 def chi2Probability(dof, distance) do
   1.0 - gammaInc_Q(0.5*dof, 0.5*distance)
 end
 
 def chi2IsUniform(dataSet, significance\\0.05) do
   dof = length(dataSet) - 1
   dist = chi2UniformDistance(dataSet)
   chi2Probability(dof, dist) > significance
 end

end

dsets = [ [ 199809, 200665, 199607, 200270, 199649 ],

         [ 522573, 244456, 139979,  71531,  21461 ] ]

Enum.each(dsets, fn ds ->

 IO.puts "Data set:#{inspect ds}"
 dof = length(ds) - 1
 IO.puts "  degrees of freedom: #{dof}"
 distance = Verify.chi2UniformDistance(ds)
 :io.fwrite "  distance:           ~.4f~n", [distance]
 :io.fwrite "  probability:        ~.4f~n", [Verify.chi2Probability(dof, distance)]
 :io.fwrite "  uniform?            ~s~n", [(if Verify.chi2IsUniform(ds), do: "Yes", else: "No")]

end)</lang>

Output:

Data set:[199809, 200665, 199607, 200270, 199649]
  degrees of freedom: 4
  distance:           4.1463
  probability:        0.3866
  uniform?            Yes
Data set:[522573, 244456, 139979, 71531, 21461]
  degrees of freedom: 4
  distance:           790063.2759
  probability:        -0.0000
  uniform?            No

Fortran

Works with: Fortran version 95

Library: GNU Scientific Library

Instead of implementing the incomplete gamma function by ourselves, we bind to GNU Scientific Library; so we need a module to interface to the function we need (gsl_sf_gamma_inc)

<lang fortran>module GSLMiniBind

 implicit none

 interface
    real(c_double) function gsl_sf_gamma_inc(a, x) bind(C)
      use iso_c_binding
      real(c_double), value, intent(in) :: a, x
    end function gsl_sf_gamma_inc
 end interface

end module GSLMiniBind</lang>

Now we're ready to complete the task.

<lang fortran>program ChiTest

 use GSLMiniBind
 use iso_c_binding
 implicit none

 real, dimension(5) :: dset1 = (/ 199809., 200665., 199607., 200270., 199649. /)
 real, dimension(5) :: dset2 = (/ 522573., 244456., 139979.,  71531.,  21461. /)

 real :: dist, prob
 integer :: dof

 print *, "Dataset 1:"
 print *, dset1
 dist = chi2UniformDistance(dset1)
 dof = size(dset1) - 1
 write(*, '(A,I4,A,F12.4)') 'dof: ', dof, '   distance: ', dist
 prob = chi2Probability(dof, dist)
 write(*, '(A,F9.6)') 'probability: ', prob
 write(*, '(A,L)') 'uniform? ', chiIsUniform(dset1, 0.05)

 ! Lazy copy/past :|
 print *, "Dataset 2:"
 print *, dset2
 dist = chi2UniformDistance(dset2)
 dof = size(dset2) - 1
 write(*, '(A,I4,A,F12.4)') 'dof: ', dof, '   distance: ', dist
 prob = chi2Probability(dof, dist)
 write(*, '(A,F9.6)') 'probability: ', prob
 write(*, '(A,L)') 'uniform? ', chiIsUniform(dset2, 0.05)

contains

function chi2Probability(dof, distance)

 real :: chi2Probability
 integer, intent(in) :: dof
 real, intent(in) :: distance

 ! in order to make this work, we need linking with GSL library
 chi2Probability = gsl_sf_gamma_inc(real(0.5*dof, c_double), real(0.5*distance, c_double))

end function chi2Probability

function chiIsUniform(dset, significance)

 logical :: chiIsUniform
 real, dimension(:), intent(in) :: dset
 real, intent(in) :: significance

 integer :: dof
 real :: dist

 dof = size(dset) - 1
 dist = chi2UniformDistance(dset)
 chiIsUniform = chi2Probability(dof, dist) > significance

end function chiIsUniform

function chi2UniformDistance(ds)

 real :: chi2UniformDistance
 real, dimension(:), intent(in) :: ds

 real :: expected, summa = 0.0

 expected = sum(ds) / size(ds)
 summa = sum( (ds - expected) ** 2 )
 
 chi2UniformDistance = summa / expected

end function chi2UniformDistance

end program ChiTest</lang>

Go

Translation of: C

Go has a nice gamma function in the library. Otherwise, it's mostly a port from C. Note, this implementation of the incomplete gamma function works for these two test cases, but, I believe, has serious limitations. See talk page. <lang go>package main

import (

   "fmt"
   "math"

)

type ifctn func(float64) float64

func simpson38(f ifctn, a, b float64, n int) float64 {

   h := (b - a) / float64(n)
   h1 := h / 3
   sum := f(a) + f(b)
   for j := 3*n - 1; j > 0; j-- {
       if j%3 == 0 {
           sum += 2 * f(a+h1*float64(j))
       } else {
           sum += 3 * f(a+h1*float64(j))
       }
   }
   return h * sum / 8

}

func gammaIncQ(a, x float64) float64 {

   aa1 := a - 1
   var f ifctn = func(t float64) float64 {
       return math.Pow(t, aa1) * math.Exp(-t)
   }
   y := aa1
   h := 1.5e-2
   for f(y)*(x-y) > 2e-8 && y < x {
       y += .4
   }
   if y > x {
       y = x
   }
   return 1 - simpson38(f, 0, y, int(y/h/math.Gamma(a)))

}

func chi2ud(ds []int) float64 {

   var sum, expected float64
   for _, d := range ds {
       expected += float64(d)
   }
   expected /= float64(len(ds))
   for _, d := range ds {
       x := float64(d) - expected
       sum += x * x
   }
   return sum / expected

}

func chi2p(dof int, distance float64) float64 {

   return gammaIncQ(.5*float64(dof), .5*distance)

}

const sigLevel = .05

func main() {

   for _, dset := range [][]int{
       {199809, 200665, 199607, 200270, 199649},
       {522573, 244456, 139979, 71531, 21461},
   } {
       utest(dset)
   }

}

func utest(dset []int) {

   fmt.Println("Uniform distribution test")
   var sum int
   for _, c := range dset {
       sum += c
   }
   fmt.Println(" dataset:", dset)
   fmt.Println(" samples:                      ", sum)
   fmt.Println(" categories:                   ", len(dset))
   
   dof := len(dset) - 1
   fmt.Println(" degrees of freedom:           ", dof)

   dist := chi2ud(dset)
   fmt.Println(" chi square test statistic:    ", dist)
   
   p := chi2p(dof, dist)
   fmt.Println(" p-value of test statistic:    ", p)

   sig := p < sigLevel
   fmt.Printf(" significant at %2.0f%% level?      %t\n", sigLevel*100, sig)
   fmt.Println(" uniform?                      ", !sig, "\n")

}</lang> Output:

Uniform distribution test
 dataset: [199809 200665 199607 200270 199649]
 samples:                       1000000
 categories:                    5
 degrees of freedom:            4
 chi square test statistic:     4.14628
 p-value of test statistic:     0.3865708330827673
 significant at  5% level?      false
 uniform?                       true 

Uniform distribution test
 dataset: [522573 244456 139979 71531 21461]
 samples:                       1000000
 categories:                    5
 degrees of freedom:            4
 chi square test statistic:     790063.27594
 p-value of test statistic:     2.3528290427066167e-11
 significant at  5% level?      true
 uniform?                       false

Hy

<lang lisp>(import

 [scipy.stats [chisquare]]
 [collections [Counter]])

(defn uniform? [f repeats &optional [alpha .05]]

 "Call 'f' 'repeats' times and do a chi-squared test for uniformity
 of the resulting discrete distribution. Return false iff the
 null hypothesis of uniformity is rejected for the test with
 size 'alpha'."
 (<= alpha (second (chisquare
   (.values (Counter (take repeats (repeatedly f))))))))</lang>

Examples of use:

<lang lisp>(import [random [randint]])

(for [f [

   (fn [] (randint 1 10))
   (fn [] (if (randint 0 1) (randint 1 9) (randint 1 10)))]]
 (print (uniform? f 5000)))</lang>

J

Solution (Tacit): <lang j>require 'stats/base'

countCats=: #@~. NB. counts the number of unique items getExpected=: #@] % [ NB. divides no of items by category count getObserved=: #/.~@] NB. counts frequency for each category calcX2=: [: +/ *:@(getObserved - getExpected) % getExpected NB. calculates test statistic calcDf=: <:@[ NB. calculates degrees of freedom for uniform distribution

NB.*isUniform v Tests (5%) whether y is uniformly distributed NB. result is: boolean describing if distribution y is uniform NB. y is: distribution to test NB. x is: optionally specify number of categories possible isUniform=: (countCats $: ]) : (0.95 > calcDf chisqcdf :: 1: calcX2)</lang>

Solution (Explicit): <lang j>require 'stats/base'

NB.*isUniformX v Tests (5%) whether y is uniformly distributed NB. result is: boolean describing if distribution y is uniform NB. y is: distribution to test NB. x is: optionally specify number of categories possible isUniformX=: verb define

 (#~. y) isUniformX y

 signif=. 0.95                    NB. set significance level
 expected=. (#y) % x              NB. number of items divided by the category count
 observed=. #/.~ y                NB. frequency count for each category
 X2=. +/ (*: observed - expected) % expected  NB. the test statistic
 degfreedom=. <: x                NB. degrees of freedom
 signif > degfreedom chisqcdf :: 1: X2

)</lang>

Example Usage: <lang j> FairDistrib=: 1e6 ?@$ 5

  UnfairDistrib=: (9.5e5 ?@$ 5) , (5e4 ?@$ 4)
  isUniformX FairDistrib

1

  isUniformX UnfairDistrib

0

  isUniform 4 4 4 5 5 5 5 5 5 5     NB. uniform if only 2 categories possible

1

  4 isUniform 4 4 4 5 5 5 5 5 5 5   NB. not uniform if 4 categories possible

0</lang>

Java

Translation of: D

Works with: Java version 8

<lang java>import static java.lang.Math.pow; import java.util.Arrays; import static java.util.Arrays.stream; import org.apache.commons.math3.special.Gamma;

public class Test {

   static double x2Dist(double[] data) {
       double avg = stream(data).sum() / data.length;
       double sqs = stream(data).reduce(0, (a, b) -> a + pow((b - avg), 2));
       return sqs / avg;
   }

   static double x2Prob(double dof, double distance) {
       return Gamma.regularizedGammaQ(dof / 2, distance / 2);
   }

   static boolean x2IsUniform(double[] data, double significance) {
       return x2Prob(data.length - 1.0, x2Dist(data)) > significance;
   }

   public static void main(String[] a) {
       double[][] dataSets = {{199809, 200665, 199607, 200270, 199649},
       {522573, 244456, 139979, 71531, 21461}};

       System.out.printf(" %4s %12s  %12s %8s   %s%n",
               "dof", "distance", "probability", "Uniform?", "dataset");

       for (double[] ds : dataSets) {
           int dof = ds.length - 1;
           double dist = x2Dist(ds);
           double prob = x2Prob(dof, dist);
           System.out.printf("%4d %12.3f  %12.8f    %5s    %6s%n",
                   dof, dist, prob, x2IsUniform(ds, 0.05) ? "YES" : "NO",
                   Arrays.toString(ds));
       }
   }

}</lang>

  dof     distance   probability Uniform?   dataset
   4        4,146    0,38657083      YES    [199809.0, 200665.0, 199607.0, 200270.0, 199649.0]
   4   790063,276    0,00000000       NO    [522573.0, 244456.0, 139979.0, 71531.0, 21461.0]

Julia

<lang julia># v0.6

using Distributions

function eqdist(data::Vector{T}, α::Float64=0.05)::Bool where T <: Real

   if ! (0 ≤ α ≤ 1); error("α must be in [0, 1]") end
   exp = mean(data)
   chisqval = sum((x - exp) ^ 2 for x in data) / exp
   pval = ccdf(Chisq(2), chisqval)
   return pval > α

end

data1 = [199809, 200665, 199607, 200270, 199649] data2 = [522573, 244456, 139979, 71531, 21461]

for data in (data1, data2)

   println("Data:\n$data")
   println("Hypothesis test: the original population is ", (eqdist(data) ? "" : "not "), "uniform.\n")

end</lang>

Output:

Data:
[199809, 200665, 199607, 200270, 199649]
Hypothesis test: the original population is uniform.

Data:
[522573, 244456, 139979, 71531, 21461]
Hypothesis test: the original population is not uniform.

Kotlin

This program reuses Kotlin code from the Gamma function and Numerical Integration tasks but otherwise is a translation of the C entry for this task. <lang scala>// version 1.1.51

typealias Func = (Double) -> Double

fun gammaLanczos(x: Double): Double {

   var xx = x
   val p = doubleArrayOf(
       0.99999999999980993,
     676.5203681218851,
   -1259.1392167224028,
     771.32342877765313,
    -176.61502916214059,
      12.507343278686905,
      -0.13857109526572012,
       9.9843695780195716e-6,
       1.5056327351493116e-7
   )
   val g = 7
   if (xx < 0.5) return Math.PI / (Math.sin(Math.PI * xx) * gammaLanczos(1.0 - xx))
   xx--
   var a = p[0]
   val t = xx + g + 0.5
   for (i in 1 until p.size) a += p[i] / (xx + i)
   return Math.sqrt(2.0 * Math.PI) * Math.pow(t, xx + 0.5) * Math.exp(-t) * a

}

fun integrate(a: Double, b: Double, n: Int, f: Func): Double {

   val h = (b - a) / n
   var sum = 0.0
   for (i in 0 until n) {
       val x = a + i * h
       sum += (f(x) + 4.0 * f(x + h / 2.0) + f(x + h)) / 6.0
   }
   return sum * h

}

fun gammaIncompleteQ(a: Double, x: Double): Double {

   val aa1 = a - 1.0
   fun f0(t: Double) = Math.pow(t, aa1) * Math.exp(-t)
   val h = 1.5e-2
   var y = aa1
   while ((f0(y) * (x - y) > 2.0e-8) && y < x) y += 0.4
   if (y > x) y = x
   return 1.0 - integrate(0.0, y, (y / h).toInt(), ::f0) / gammaLanczos(a)

}

fun chi2UniformDistance(ds: DoubleArray): Double {

   val expected = ds.average()
   val sum = ds.map { val x = it - expected; x * x }.sum()
   return sum / expected

}

fun chi2Probability(dof: Int, distance: Double) =

   gammaIncompleteQ(0.5 * dof, 0.5 * distance)

fun chiIsUniform(ds: DoubleArray, significance: Double):Boolean {

   val dof = ds.size - 1
   val dist = chi2UniformDistance(ds)
   return chi2Probability(dof, dist) > significance

}

fun main(args: Array<String>) {

   val dsets = listOf(
       doubleArrayOf(199809.0, 200665.0, 199607.0, 200270.0, 199649.0),
       doubleArrayOf(522573.0, 244456.0, 139979.0,  71531.0,  21461.0)
   )
   for (ds in dsets) {
       println("Dataset: ${ds.asList()}")
       val dist = chi2UniformDistance(ds)
       val dof = ds.size - 1
       print("DOF: $dof  Distance: ${"%.4f".format(dist)}")
       val prob = chi2Probability(dof, dist)
       print("  Probability: ${"%.6f".format(prob)}")
       val uniform = if (chiIsUniform(ds, 0.05)) "Yes" else "No"
       println("  Uniform? $uniform\n")
   }

}</lang>

Output:

Dataset: [199809.0, 200665.0, 199607.0, 200270.0, 199649.0]
DOF: 4  Distance: 4.1463  Probability: 0.386571  Uniform? Yes

Dataset: [522573.0, 244456.0, 139979.0, 71531.0, 21461.0]
DOF: 4  Distance: 790063.2759  Probability: 0.000000  Uniform? No

Mathematica

This code explicity assumes a discrete uniform distribution since the chi square test is a poor test choice for continuous distributions and requires Mathematica version 2 or later <lang Mathematica>discreteUniformDistributionQ[data_, {min_Integer, max_Integer}, confLevel_: .05] := If[$VersionNumber >= 8,

 confLevel <= PearsonChiSquareTest[data, DiscreteUniformDistribution[{min, max}]],
 Block[{v, k = max - min, n = Length@data},
  v = (k + 1) (Plus @@ (((Length /@ Split[Sort@data]))^2))/n - n;
  GammaRegularized[k/2, 0, v/2] <= 1 - confLevel]]

discreteUniformDistributionQ[data_] :=discreteUniformDistributionQ[data, data[[Ordering[data][[{1, -1}]]]]]</lang> code used to create test data requires Mathematica version 6 or later <lang Mathematica>uniformData = RandomInteger[10, 100]; nonUniformData = Total@RandomInteger[10, {5, 100}];</lang> <lang Mathematica>{discreteUniformDistributionQ[uniformData],discreteUniformDistributionQ[nonUniformData]}</lang>

Output:

{True,False}

OCaml

This code needs to be compiled with library gsl.cma.

<lang ocaml>let sqr x = x *. x

let chi2UniformDistance distrib =

 let count, len = Array.fold_left (fun (s, c) e -> s + e, succ c)
 				   (0, 0) distrib in
 let expected = float count /. float len in
 let distance = Array.fold_left (fun s e ->
   s +. sqr (float e -. expected) /. expected
 ) 0. distrib in
 let dof = float (pred len) in
 dof, distance

let chi2Proba dof distance =

 Gsl_sf.gamma_inc_Q (0.5 *. dof) (0.5 *. distance)

let chi2IsUniform distrib significance =

 let dof, distance = chi2UniformDistance distrib in
 let likelihoodOfRandom = chi2Proba dof distance in
 likelihoodOfRandom > significance

let _ =

 List.iter (fun distrib ->
   let dof, distance = chi2UniformDistance distrib in
   Printf.printf "distribution ";
   Array.iter (Printf.printf "\t%d") distrib;
   Printf.printf "\tdistance %g" distance;
   Printf.printf "\t[%g > 0.05]" (chi2Proba dof distance);
   if chi2IsUniform distrib 0.05 then Printf.printf " fair\n"
   else Printf.printf " unfair\n"
 ) 
 [
   [| 199809; 200665; 199607; 200270; 199649 |];
   [| 522573; 244456; 139979; 71531; 21461 |]
 ]</lang>

Output

distribution    199809  200665  199607  200270  199649  distance 4.14628        [0.386571 > 0.05] fair
distribution    522573  244456  139979  71531   21461   distance 790063 [0 > 0.05] unfair

PARI/GP

The solution is very easy in GP since PARI includes a good incomplete gamma implementation; the sum function is also useful for clarity. Most of the code is just for displaying results.

The sample data for the test was taken from Go. <lang parigp>cumChi2(chi2,dof)={ my(g=gamma(dof/2)); incgam(dof/2,chi2/2,g)/g }; test(v,alpha=.05)={ my(chi2,p,s=sum(i=1,#v,v[i]),ave=s/#v); print("chi^2 statistic: ",chi2=sum(i=1,#v,(v[i]-ave)^2)/ave); print("p-value: ",p=cumChi2(chi2,#v-1)); if(p<alpha, print("Significant at the alpha = "alpha" level: not uniform"); , print("Not significant at the alpha = "alpha" level: uniform"); ) };

test([199809, 200665, 199607, 200270, 199649]) test([522573, 244456, 139979, 71531, 21461])</lang>

Perl

Translation of: Raku

<lang perl>use List::Util qw(sum reduce); use constant pi => 3.14159265;

sub incomplete_G_series {

   my($s, $z) = @_;
   my $n = 10;
   push @numers, $z**$_ for 1..$n;
   my @denoms = $s+1;
   push @denoms, $denoms[-1]*($s+$_) for 2..$n;
   my $M = 1;
   $M += $numers[$_-1]/$denoms[$_-1] for 1..$n;
   $z**$s / $s * exp(-$z) * $M;

}

sub G_of_half {

   my($n) = @_;
   if ($n % 2) { f(2*$_) / (4**$_ * f($_)) * sqrt(pi) for int ($n-1) / 2 }
   else        { f(($n/2)-1) }

}

sub f { reduce { $a * $b } 1, 1 .. $_[0] } # factorial

sub chi_squared_cdf {

   my($k, $x) = @_;
   my $f = $k < 20 ? 20 : 10;
   if ($x == 0)                  { 0.0 }
   elsif ($x < $k + $f*sqrt($k)) { incomplete_G_series($k/2, $x/2) / G_of_half($k) }
   else                          { 1.0 }

} sub chi_squared_test {

   my(@bins) = @_;
   $significance = 0.05;
   my $n = @bins;
   my $N = sum @bins;
   my $expected = $N / $n;
   my $chi_squared = sum map { ($_ - $expected)**2 / $expected } @bins;
   my $p_value = 1 - chi_squared_cdf($n-1, $chi_squared);
   return $chi_squared, $p_value, $p_value > $significance ? 'True' : 'False';

}

for $dataset ([199809, 200665, 199607, 200270, 199649], [522573, 244456, 139979, 71531, 21461]) {

   printf "C2 = %10.3f, p-value = %.3f, uniform = %s\n", chi_squared_test(@$dataset);

}</lang>

Output:

C2 =      4.146, p-value = 0.387, uniform = True
C2 = 790063.276, p-value = 0.000, uniform = False

Phix

Translation of: Go

using gamma() from Gamma_function#Phix <lang Phix>function f(atom aa1, t)

   return power(t, aa1) * exp(-t)

end function

function simpson38(atom aa1, a, b, integer n)

   atom h := (b - a) / n,
        h1 := h / 3,
        tot := f(aa1,a) + f(aa1,b)
   for j=3*n-1 to 1 by -1 do
       tot += (3-(mod(j,3)=0)) * f(aa1,a+h1*j)
   end for
   return h * tot / 8

end function

function gammaIncQ(atom a, x)

   atom aa1 := a - 1,
        y := aa1,
        h := 1.5e-2
   while f(aa1,y)*(x-y) > 2e-8 and y < x do
       y += 0.4
   end while
   if y > x then y = x end if
   return 1 - simpson38(aa1, 0, y, floor(y/h/gamma(a)))

end function

function chi2ud(sequence ds)

   atom expected = sum(ds)/length(ds),
        tot = sum(sq_power(sq_sub(ds,expected),2))
   return tot / expected

end function

function chi2p(integer dof, atom distance)

   return gammaIncQ(0.5*dof, 0.5*distance)

end function

constant sigLevel = 0.05 constant tf = {"true","false"}

procedure utest(sequence dset)

   printf(1,"Uniform distribution test\n")
   integer tot = sum(dset)
   printf(1," dataset:%s\n",{sprint(dset)})
   printf(1," samples:                      %d\n", tot)
   printf(1," categories:                   %d\n", length(dset))

   integer dof := length(dset) - 1
   printf(1," degrees of freedom:           %d\n", dof)

   atom dist := chi2ud(dset)
   printf(1," chi square test statistic:    %g\n", dist)

   atom p := chi2p(dof, dist)
   printf(1," p-value of test statistic:    %g\n", p)

   bool sig := p < sigLevel
   printf(1," significant at %2.0f%% level?     %s\n", {sigLevel*100, tf[2-sig]})
   printf(1," uniform?                      %s\n",{tf[sig+1]})

end procedure

utest({199809, 200665, 199607, 200270, 199649}) utest({522573, 244456, 139979, 71531, 21461})</lang>

Output:

Uniform distribution test
 dataset:{199809,200665,199607,200270,199649}
 samples:                      1000000
 categories:                   5
 degrees of freedom:           4
 chi square test statistic:    4.14628
 p-value of test statistic:    0.386571
 significant at  5% level?     false
 uniform?                      true
Uniform distribution test
 dataset:{522573,244456,139979,71531,21461}
 samples:                      1000000
 categories:                   5
 degrees of freedom:           4
 chi square test statistic:    790063
 p-value of test statistic:    2.35282e-11
 significant at  5% level?     true
 uniform?                      false

Python

Python: Using only standard libraries

Implements the Chi Square Probability function with an integration. I'm sure there are better ways to do this. Compare to OCaml implementation. <lang python>import math import random

def GammaInc_Q( a, x):

   a1 = a-1
   a2 = a-2
   def f0( t ):
       return t**a1*math.exp(-t)

   def df0(t):
       return (a1-t)*t**a2*math.exp(-t)
   
   y = a1
   while f0(y)*(x-y) >2.0e-8 and y < x: y += .3
   if y > x: y = x

   h = 3.0e-4
   n = int(y/h)
   h = y/n
   hh = 0.5*h
   gamax = h * sum( f0(t)+hh*df0(t) for t in ( h*j for j in xrange(n-1, -1, -1)))

   return gamax/gamma_spounge(a)

c = None def gamma_spounge( z):

   global c
   a = 12

   if c is None:
      k1_factrl = 1.0
      c = []
      c.append(math.sqrt(2.0*math.pi))
      for k in range(1,a):
         c.append( math.exp(a-k) * (a-k)**(k-0.5) / k1_factrl )
         k1_factrl *= -k
   
   accm = c[0]
   for k in range(1,a):
       accm += c[k] / (z+k)
   accm *= math.exp( -(z+a)) * (z+a)**(z+0.5)
   return accm/z;

def chi2UniformDistance( dataSet ):

   expected = sum(dataSet)*1.0/len(dataSet)
   cntrd = (d-expected for d in dataSet)
   return sum(x*x for x in cntrd)/expected

def chi2Probability(dof, distance):

   return 1.0 - GammaInc_Q( 0.5*dof, 0.5*distance)

def chi2IsUniform(dataSet, significance):

   dof = len(dataSet)-1
   dist = chi2UniformDistance(dataSet)
   return chi2Probability( dof, dist ) > significance

dset1 = [ 199809, 200665, 199607, 200270, 199649 ] dset2 = [ 522573, 244456, 139979, 71531, 21461 ]

for ds in (dset1, dset2):

   print "Data set:", ds
   dof = len(ds)-1
   distance =chi2UniformDistance(ds)
   print "dof: %d distance: %.4f" % (dof, distance),
   prob = chi2Probability( dof, distance)
   print "probability: %.4f"%prob,
   print "uniform? ", "Yes"if chi2IsUniform(ds,0.05) else "No"</lang>

Output:

Data set: [199809, 200665, 199607, 200270, 199649]
dof: 4 distance: 4.146280 probability: 0.3866 uniform?  Yes
Data set: [522573, 244456, 139979, 71531, 21461]
dof: 4 distance: 790063.275940 probability: 0.0000 uniform?  No

Python: Using scipy

This uses the library routine scipy.stats.chisquare.

<lang python>from scipy.stats import chisquare

if __name__ == '__main__':

   dataSets = [[199809, 200665, 199607, 200270, 199649],
               [522573, 244456, 139979,  71531,  21461]]
   print(f"{'Distance':^12} {'pvalue':^12} {'Uniform?':^8} {'Dataset'}")
   for ds in dataSets:
       dist, pvalue = chisquare(ds)
       uni = 'YES' if pvalue > 0.05 else 'NO'
       print(f"{dist:12.3f} {pvalue:12.8f} {uni:^8} {ds}")</lang>

Output:

  Distance      pvalue    Uniform? Dataset
       4.146   0.38657083   YES    [199809, 200665, 199607, 200270, 199649]
  790063.276   0.00000000    NO    [522573, 244456, 139979, 71531, 21461]

R

R being a statistical computating language, the chi-squared test is built in with the function "chisq.test" <lang tcl> dset1=c(199809,200665,199607,200270,199649) dset2=c(522573,244456,139979,71531,21461)

chi2IsUniform<-function(dataset,significance=0.05){

 chi2IsUniform=(chisq.test(dataset)$p.value>significance)

}

for (ds in list(dset1,dset2)){

 print(c("Data set:",ds))
 print(chisq.test(ds))
 print(paste("uniform?",chi2IsUniform(ds)))

} </lang>

Output:

[1] "Data set:" "199809"    "200665"    "199607"    "200270"    "199649"   

        Chi-squared test for given probabilities

data:  ds 
X-squared = 4.1463, df = 4, p-value = 0.3866

[1] "uniform? TRUE"
[1] "Data set:" "522573"    "244456"    "139979"    "71531"     "21461"    

        Chi-squared test for given probabilities

data:  ds 
X-squared = 790063.3, df = 4, p-value < 2.2e-16

[1] "uniform? FALSE"

Racket

lang racket

(require

racket/flonum (planet williams/science:4:5/science)
(only-in (planet williams/science/unsafe-ops-utils) real->float))

(chi^2-goodness-of-fit-test observed expected df)
Given: observed, a sequence of observed frequencies
expected, a sequence of expected frequencies
df, the degrees of freedom
Result: P-value = 1-chi^2cdf(X^2,df) , the p-value

(define (chi^2-goodness-of-fit-test observed expected df)

 (define X^2 (for/sum ([o observed] [e expected])
               (/ (sqr (- o e)) e)))
 (- 1.0 (chi-squared-cdf X^2 df)))

(define (is-uniform? rand n α)

 ; Use significance level α to test whether
 ; n small random numbers generated by rand
 ; have a uniform distribution.

 ; Observed values:
 (define o (make-vector 10 0))
 ; generate n random integers from 0 to 9.
 (for ([_ (+ n 1)])
   (define r (rand 10))
   (vector-set! o r (+ (vector-ref o r) 1)))
 ; Expected values:
 (define ex (make-vector 10 (/ n 10)))

 ; Calculate the P-value:
 (define P (chi^2-goodness-of-fit-test o ex (- n 1)))
 
 ; If the P-value is larger than α we accept the
 ; hypothesis that the numbers are distributed uniformly.
 (> P α))

Test whether the builtin generator is uniform

(is-uniform? random 1000 0.05)

Test whether the constant generator fails

(is-uniform? (λ(_) 5) 1000 0.05) </lang> Output: <lang racket>

t
f

</lang>

Raku

(formerly Perl 6)

For the incomplete gamma function we use a series expansion related to Kummer's confluent hypergeometric function (see http://en.wikipedia.org/wiki/Incomplete_gamma_function#Evaluation_formulae). The gamma function is calculated in closed form, as we only need its value at integers and half integers.

<lang perl6>sub incomplete-γ-series($s, $z) {

   my \numers = $z X** 1..*;
   my \denoms = [\*] $s X+ 1..*;
   my $M = 1 + [+] (numers Z/ denoms) ... * < 1e-6;
   $z**$s / $s * exp(-$z) * $M;

}

sub postfix:<!>(Int $n) { [*] 2..$n }

sub Γ-of-half(Int $n where * > 0) {

   ($n %% 2) ?? (($_-1)!                            given  $n    div 2)
             !! ((2*$_)! / (4**$_ * $_!) * sqrt(pi) given ($n-1) div 2);

}

degrees of freedom constrained due to numerical limitations

sub chi-squared-cdf(Int $k where 1..200, $x where * >= 0) {

   my $f = $k < 20 ?? 20 !! 10;
   given $x {
       when 0                    { 0.0 }
       when * < $k + $f*sqrt($k) { incomplete-γ-series($k/2, $x/2) / Γ-of-half($k) }
       default                   { 1.0 }
   }

}

sub chi-squared-test(@bins, :$significance = 0.05) {

   my $n = +@bins;
   my $N = [+] @bins;
   my $expected = $N / $n;
   my $chi-squared = [+] @bins.map: { ($^bin - $expected)**2 / $expected }
   my $p-value = 1 - chi-squared-cdf($n-1, $chi-squared);
   return (:$chi-squared, :$p-value, :uniform($p-value > $significance));

}

for [< 199809 200665 199607 200270 199649 >],

   [< 522573 244456 139979  71531  21461 >]
   -> $dataset

{

   my %t = chi-squared-test($dataset);
   say 'data: ', $dataset;
   say "χ² = {%t<chi-squared>}, p-value = {%t<p-value>.fmt('%.4f')}, uniform = {%t<uniform>}";

}</lang>

Output:

data: 199809 200665 199607 200270 199649
χ² = 4.14628, p-value = 0.3866, uniform = True
data: 522573 244456 139979 71531 21461
χ² = 790063.27594, p-value = 0.0000, uniform = False

REXX

Translation of: Go

Programming notes:

The use of the pow was elided as it can just be replaced with t**(a-1).

The gamma was replaced with a simple version. The argument for gamma is (in the cases used herein) always
positive, and is either an integer, or a number which is a multiple of ¹/₂, both of these cases can be calculated with
a straight─forward calculation. <lang rexx>/*REXX program performs a chi─squared test to verify a given distribution is uniform. */ numeric digits length( pi() ) - length(.) /*enough decimal digs for calculations.*/ @.=; @.1= 199809 200665 199607 200270 199649

                                           @.2= 522573 244456 139979  71531  21461
       do s=1  while @.s\==;  call uTest @.s  /*invoke  uTest with a data set of #'s.*/
       end   /*s*/

exit /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ !: procedure; parse arg x; p=1; do j=2 to x; p= p*j; end /*j*/; return p chi2p: procedure; parse arg dof, distance; return gammaI( dof/2, distance/2 ) f: parse arg t; if t=0 then return 0; return t ** (a-1) * exp(-t) e: e =2.718281828459045235360287471352662497757247093699959574966967627724; return e pi: pi=3.141592653589793238462643383279502884197169399375105820974944592308; return pi /*──────────────────────────────────────────────────────────────────────────────────────*/ !!: procedure; parse arg x; if x<2 then return 1; p= x

                    do k=2+x//2  to x-1  by 2;   p= p*k;   end  /*k*/;          return p

/*──────────────────────────────────────────────────────────────────────────────────────*/ chi2ud: procedure: parse arg ds; sum=0; expect= 0

                                      do j=1  for words(ds); expect= expect + word(ds, j)
                                      end   /*j*/
       expect = expect / words(ds)
                                      do k=1  for words(ds)
                                      sum= sum   +   (word(ds, k) - expect) **2
                                      end   /*k*/
       return sum / expect

/*──────────────────────────────────────────────────────────────────────────────────────*/ exp: procedure; parse arg x; ix= x%1; if abs(x-ix)>.5 then ix= ix + sign(x); x= x-ix

       z=1; _=1; w=z;    do j=1;  _= _*x/j;  z= (z + _)/1;  if z==w  then leave;      w=z
                         end  /*j*/;         if z\==0  then z= z * e()**ix;      return z

/*──────────────────────────────────────────────────────────────────────────────────────*/ gamma: procedure; parse arg x; if datatype(x, 'W') then return !(x-1) /*Int? Use fact*/

       n= trunc(x)                     /*at this point, X is pos and a multiple of 1/2.*/
       d= !!(n+n - 1)                  /*compute the double factorial of:    2*n - 1.  */
       if n//2  then p= -1             /*if  N  is  odd,   then use a negative unity.  */
                else p=  1             /*if  N  is even,   then use a positive unity.  */
       if x>0   then return p * d * sqrt(pi()) / (2**n)
                     return p * (2**n) * sqrt(pi()) / d

/*──────────────────────────────────────────────────────────────────────────────────────*/ gammaI: procedure; parse arg a,x; y= a-1; do while f(y)*(x-y) > 2e-8 & y<x; y= y + .4

                                            end  /*while*/
       y= min(x, y)
       return 1   -   simp38(0, y,   y / 0.015 / gamma(a-1) % 1)

/*──────────────────────────────────────────────────────────────────────────────────────*/ simp38: procedure; parse arg a, b, n; h= (b-a) / n; h1= h / 3

       sum= f(a) + f(b)
                            do j=3*n-1   by -1   while j>0
                            if j//3 == 0  then sum= sum   +   2 * f(a + h1*j)
                                          else sum= sum   +   3 * f(a + h1*j)
                            end   /*j*/
       return h * sum / 8

/*──────────────────────────────────────────────────────────────────────────────────────*/ sqrt: procedure; parse arg x; if x=0 then return 0; d=digits(); numeric digits; h= d+6

       numeric form; m.=9; parse value format(x,2,1,,0) 'E0' with g "E" _ .;g=g *.5'e'_%2
         do j=0  while h>9;      m.j=h;               h=h%2+1;       end  /*j*/
         do k=j+5  to 0  by -1;  numeric digits m.k;  g=(g+x/g)*.5;  end  /*k*/; return g

/*──────────────────────────────────────────────────────────────────────────────────────*/ uTest: procedure; parse arg dset; sum= 0; pad= left(, 11); sigLev= 1/20 /*5%*/

       say;   say '     '   center(" Uniform distribution test ", 75, '═')
       #= words(dset);                                              sigPC= sigLev*100/1
                            do j=1  for #;      sum= sum  +  word(dset, j)
                            end   /*j*/
                 say pad "                  dataset: "  dset
                 say pad "                  samples: "  sum
                 say pad "               categories: "  #
                 say pad "       degrees of freedom: "  # - 1
       dist= chi2ud(dset)
       P= chi2p(# - 1,  dist)
       sig = (abs(P) < dist * sigLev)
                 say pad "significant at " sigPC'%  level? '  word('no yes',    sig  + 1)
                 say pad "   is the dataset uniform? "        word('no yes', (\(sig))+ 1)
       return</lang>

output when using the default inputs:

      ════════════════════════ Uniform distribution test ════════════════════════
                              dataset:  199809 200665 199607 200270 199649
                              samples:  1000000
                           categories:  5
                   degrees of freedom:  4
            significant at  5%  level?  no
               is the dataset uniform?  yes

      ════════════════════════ Uniform distribution test ════════════════════════
                              dataset:  522573 244456 139979 71531 21461
                              samples:  1000000
                           categories:  5
                   degrees of freedom:  4
            significant at  5%  level?  yes
               is the dataset uniform?  no

Ruby

Translation of: Python

<lang ruby>def gammaInc_Q(a, x)

 a1, a2 = a-1, a-2
 f0  = lambda {|t| t**a1 * Math.exp(-t)}
 df0 = lambda {|t| (a1-t) * t**a2 * Math.exp(-t)}
 
 y = a1
 y += 0.3  while f0[y]*(x-y) > 2.0e-8 and y < x
 y = x  if y > x
 
 h = 3.0e-4
 n = (y/h).to_i
 h = y/n
 hh = 0.5 * h
 sum = 0
 (n-1).step(0, -1) do |j|
   t = h * j
   sum += f0[t] + hh * df0[t]
 end
 h * sum / gamma_spounge(a)

end

A = 12 k1_factrl = 1.0 coef = [Math.sqrt(2.0*Math::PI)] COEF = (1...A).each_with_object(coef) do |k,c|

 c << Math.exp(A-k) * (A-k)**(k-0.5) / k1_factrl
 k1_factrl *= -k

end

def gamma_spounge(z)

 accm = (1...A).inject(COEF[0]){|res,k| res += COEF[k] / (z+k)}
 accm * Math.exp(-(z+A)) * (z+A)**(z+0.5) / z

end

def chi2UniformDistance(dataSet)

 expected = dataSet.inject(:+).to_f / dataSet.size
 dataSet.map{|d|(d-expected)**2}.inject(:+) / expected

end

def chi2Probability(dof, distance)

 1.0 - gammaInc_Q(0.5*dof, 0.5*distance)

end

def chi2IsUniform(dataSet, significance=0.05)

 dof = dataSet.size - 1
 dist = chi2UniformDistance(dataSet)
 chi2Probability(dof, dist) > significance

end

dsets = [ [ 199809, 200665, 199607, 200270, 199649 ],

         [ 522573, 244456, 139979,  71531,  21461 ] ]

for ds in dsets

 puts "Data set:#{ds}"
 dof = ds.size - 1
 puts "  degrees of freedom: %d" % dof
 distance = chi2UniformDistance(ds)
 puts "  distance:           %.4f" % distance
 puts "  probability:        %.4f" % chi2Probability(dof, distance)
 puts "  uniform?            %s" % (chi2IsUniform(ds) ? "Yes" : "No")

end</lang>

Output:

Data set:[199809, 200665, 199607, 200270, 199649]
  degrees of freedom: 4
  distance:           4.1463
  probability:        0.3866
  uniform?            Yes
Data set:[522573, 244456, 139979, 71531, 21461]
  degrees of freedom: 4
  distance:           790063.2759
  probability:        -0.0000
  uniform?            No

Scala

Output:

See it yourself by running in your browser Scastie (remote JVM).

Library: Scala Math Statistic

Library: Scastie qualified

Works with: Scala version 2.13

<lang Scala>import org.apache.commons.math3.special.Gamma.regularizedGammaQ

object ChiSquare extends App {

 private val dataSets: Seq[Seq[Double]] =
   Seq(
     Seq(199809, 200665, 199607, 200270, 199649),
     Seq(522573, 244456, 139979, 71531, 21461)
   )

 private def χ2IsUniform(data: Seq[Double], significance: Double) =
   χ2Prob(data.size - 1.0, χ2Dist(data)) > significance

 private def χ2Dist(data: Seq[Double]) = {
   val avg = data.sum / data.size

   data.reduce((a, b) => a + math.pow(b - avg, 2)) / avg
 }

 private def χ2Prob(dof: Double, distance: Double) =
   regularizedGammaQ(dof / 2, distance / 2)

 printf(" %4s %10s  %12s %8s   %s%n",
   "d.f.", "χ²distance", "χ²probability", "Uniform?", "dataset")
 dataSets.foreach { ds =>
   val (dist, dof) = (χ2Dist(ds), ds.size - 1)

   printf("%4d %11.3f  %13.8f    %5s    %6s%n",
     dof, dist, χ2Prob(dof.toDouble, dist), if (χ2IsUniform(ds, 0.05)) "YES" else "NO", ds.mkString(", "))
 }

}</lang>

Sidef

<lang ruby># Confluent hypergeometric function of the first kind F_1(a;b;z) func F1(a, b, z, limit=100) {

   sum(0..limit, {|k|
       rising_factorial(a, k) / rising_factorial(b, k) * z**k / k!
   })

}

func γ(a,x) { # lower incomplete gamma function γ(a,x)

   #a**(-1) * x**a * F1(a, a+1, -x)            # simpler formula
   a**(-1) * x**a * exp(-x) * F1(1, a+1, x)    # slightly better convergence

}

func P(a,z) { # regularized gamma function P(a,z)

   γ(a,z) / Γ(a)

}

func chi_squared_cdf (k, x) {

   var f = (k<20 ? 20 : 10)
   given(x) {
       when (0) { 0 }
       case (. < (k + f*sqrt(k))) { P(k/2, x/2) }
       else { 1 }
   }

}

func chi_squared_test(arr, significance = 0.05) {

   var n = arr.len
   var N = arr.sum
   var expected = N/n
   var χ_squared = arr.sum_by {|v| (v-expected)**2 / expected }
   var p_value = (1 - chi_squared_cdf(n-1, χ_squared))
   [χ_squared, p_value, p_value > significance]

}

[

   %n< 199809 200665 199607 200270 199649 >,
   %n< 522573 244456 139979  71531  21461 >,

].each {|dataset|

   var r = chi_squared_test(dataset)
   say "data: #{dataset}"
   say "χ² = #{r[0]}, p-value = #{r[1].round(-4)}, uniform = #{r[2]}\n"

}</lang>

Output:

data: [199809, 200665, 199607, 200270, 199649]
χ² = 4.14628, p-value = 0.3866, uniform = true

data: [522573, 244456, 139979, 71531, 21461]
χ² = 790063.27594, p-value = 0, uniform = false

Tcl

Works with: Tcl version 8.5

Library: Tcllib (Package: math::statistics)

<lang tcl>package require Tcl 8.5 package require math::statistics

proc isUniform {distribution {significance 0.05}} {

   set count [tcl::mathop::+ {*}[dict values $distribution]]
   set expected [expr {double($count) / [dict size $distribution]}]
   set X2 0.0
   foreach value [dict values $distribution] {

set X2 [expr {$X2 + ($value - $expected)**2 / $expected}]

   }
   set degreesOfFreedom [expr {[dict size $distribution] - 1}]
   set likelihoodOfRandom [::math::statistics::incompleteGamma \

[expr {$degreesOfFreedom / 2.0}] [expr {$X2 / 2.0}]]

   expr {$likelihoodOfRandom > $significance}

}</lang> Testing: <lang tcl>proc makeDistribution {operation {count 1000000}} {

   for {set i 0} {$i<$count} {incr i} {incr distribution([uplevel 1 $operation])}
   return [array get distribution]

}

set distFair [makeDistribution {expr int(rand()*5)}] puts "distribution \"$distFair\" assessed as [expr [isUniform $distFair]?{fair}:{unfair}]" set distUnfair [makeDistribution {expr int(rand()*rand()*5)}] puts "distribution \"$distUnfair\" assessed as [expr [isUniform $distUnfair]?{fair}:{unfair}]"</lang> Output:

distribution "0 199809 4 199649 1 200665 2 199607 3 200270" assessed as fair
distribution "4 21461 0 522573 1 244456 2 139979 3 71531" assessed as unfair

VBA

The built in worksheetfunction ChiSq_Dist of Excel VBA is used. Output formatted like R. <lang vb>Private Function Test4DiscreteUniformDistribution(ObservationFrequencies() As Variant, Significance As Single) As Boolean

   'Returns true if the observed frequencies pass the Pearson Chi-squared test at the required significance level.
   Dim Total As Long, Ei As Long, i As Integer
   Dim ChiSquared As Double, DegreesOfFreedom As Integer, p_value As Double
   Debug.Print "[1] ""Data set:"" ";
   For i = LBound(ObservationFrequencies) To UBound(ObservationFrequencies)
       Total = Total + ObservationFrequencies(i)
       Debug.Print ObservationFrequencies(i); " ";
   Next i
   DegreesOfFreedom = UBound(ObservationFrequencies) - LBound(ObservationFrequencies)
   'This is exactly the number of different categories minus 1
   Ei = Total / (DegreesOfFreedom + 1)
   For i = LBound(ObservationFrequencies) To UBound(ObservationFrequencies)
       ChiSquared = ChiSquared + (ObservationFrequencies(i) - Ei) ^ 2 / Ei
   Next i
   p_value = 1 - WorksheetFunction.ChiSq_Dist(ChiSquared, DegreesOfFreedom, True)
   Debug.Print
   Debug.Print "   Chi-squared test for given frequencies"
   Debug.Print "X-squared ="; ChiSquared; ", ";
   Debug.Print "df ="; DegreesOfFreedom; ", ";
   Debug.Print "p-value = "; Format(p_value, "0.0000")
   Test4DiscreteUniformDistribution = p_value > Significance

End Function Public Sub test()

   Dim O() As Variant
   O = [{199809,200665,199607,200270,199649}]
   Debug.Print "[1] ""Uniform? "; Test4DiscreteUniformDistribution(O, 0.05); """"
   O = [{522573,244456,139979,71531,21461}]
   Debug.Print "[1] ""Uniform? "; Test4DiscreteUniformDistribution(O, 0.05); """"

End Sub</lang>

{{out}

[1] "Data set:"  199809   200665   199607   200270   199649  
   Chi-squared test for given frequencies
X-squared = 4.14628 , df = 4 , p-value = 0.3866
[1] "Uniform? True"
[1] "Data set:"  522573   244456   139979   71531   21461  
   Chi-squared test for given frequencies
X-squared = 790063.27594 , df = 4 , p-value = 0.0000
[1] "Uniform? False"

Wren

Translation of: Kotlin

Library: Wren-math

Library: Wren-fmt

<lang ecmascript>import "/math" for Math, Nums import "/fmt" for Fmt

var integrate = Fn.new { |a, b, n, f|

   var h = (b - a) / n
   var sum = 0
   for (i in 0...n) {
       var x = a + i*h
       sum = sum + (f.call(x) + 4 * f.call(x + h/2) + f.call(x + h)) / 6
   }
   return sum * h

}

var gammaIncomplete = Fn.new { |a, x|

   var am1 = a - 1
   var f0 = Fn.new { |t| t.pow(am1) * Math.exp(-t) }
   var h = 1.5e-2
   var y = am1
   while ((f0.call(y) * (x - y) > 2e-8) && y < x) y = y + 0.4
   if (y > x) y = x
   return 1 - integrate.call(0, y, (y/h).truncate, f0) / Math.gamma(a)

}

var chi2UniformDistance = Fn.new { |ds|

   var expected = Nums.mean(ds)
   var sum = Nums.sum(ds.map { |d| (d - expected).pow(2) }.toList)
   return sum / expected

}

var chi2Probability = Fn.new { |dof, dist| gammaIncomplete.call(0.5*dof, 0.5*dist) }

var chiIsUniform = Fn.new { |ds, significance|

   var dof = ds.count - 1
   var dist = chi2UniformDistance.call(ds)
   return chi2Probability.call(dof, dist) > significance

}

var dsets = [

   [199809, 200665, 199607, 200270, 199649],
   [522573, 244456, 139979,  71531,  21461]

] for (ds in dsets) {

   System.print("Dataset: %(ds)")
   var dist = chi2UniformDistance.call(ds)
   var dof = ds.count - 1
   Fmt.write("DOF: $d  Distance: $.4f", dof, dist)
   var prob = chi2Probability.call(dof, dist)
   Fmt.write("  Probability: $.6f", prob)
   var uniform = chiIsUniform.call(ds, 0.05) ? "Yes" : "No"
   System.print("  Uniform? %(uniform)\n")

}</lang>

Output:

Dataset: [199809, 200665, 199607, 200270, 199649]
DOF: 4  Distance: 4.1463  Probability: 0.386571  Uniform? Yes

Dataset: [522573, 244456, 139979, 71531, 21461]
DOF: 4  Distance: 790063.2759  Probability: 0.000000  Uniform? No

zkl

Translation of: C

Translation of: D

<lang zkl>/* Numerical integration method */ fcn Simpson3_8(f,a,b,N){ // fcn,double,double,Int --> double

  h,h1:=(b - a)/N, h/3.0;
  h*[1..3*N - 1].reduce('wrap(sum,j){
     l1:=(if(j%3) 3.0 else 2.0);
     sum + l1*f(a + h1*j);
  },f(a) + f(b))/8.0;

}

const A=12; fcn Gamma_Spouge(z){ // double --> double

  var coefs=fcn{  // this runs only once, at construction time
     a,coefs:=A.toFloat(),(A).pump(List(),0.0);
     k1_factrl:=1.0;
     coefs[0]=(2.0*(0.0).pi).sqrt();
     foreach k in ([1.0..A-1]){
        coefs[k]=(a - k).exp() * (a - k).pow(k - 0.5) / k1_factrl;

k1_factrl*=-k;

     }
     coefs
  }();

  ( [1..A-1].reduce('wrap(accum,k){ accum + coefs[k]/(z + k) },coefs[0])
    * (-(z + A)).exp()*(z + A).pow(z + 0.5) )
  / z;

}

fcn f0(t,aa1){ t.pow(aa1)*(-t).exp() }

fcn GammaIncomplete_Q(a,x){ // double,double --> double

  h:=1.5e-2;  /* approximate integration step size */
  /* this cuts off the tail of the integration to speed things up */
  y:=a - 1; f:=f0.fp1(y);
  while((f(y)*(x - y)>2.0e-8) and (y<x)){ y+=0.4; }
  if(y>x) y=x;
  1.0 - Simpson3_8(f,0.0,y,(y/h).toInt())/Gamma_Spouge(a);

}</lang> <lang zkl>fcn chi2UniformDistance(ds){ // --> double

  dslen   :=ds.len();
  expected:=dslen.reduce('wrap(sum,k){ sum + ds[k] },0.0)/dslen;
  sum     := dslen.reduce('wrap(sum,k){ x:=ds[k] - expected; sum + x*x },0.0);
  sum/expected

}

fcn chi2Probability(dof,distance){ GammaIncomplete_Q(0.5*dof, 0.5*distance) }

fcn chiIsUniform(dset,significance=0.05){

  significance < chi2Probability(-1.0 + dset.len(),chi2UniformDistance(dset))

}</lang> <lang zkl>datasets:=T( T(199809.0, 200665.0, 199607.0, 200270.0, 199649.0),

            T(522573.0, 244456.0, 139979.0,  71531.0,  21461.0) );

println(" %4s %12s %12s %8s %s".fmt(

       "dof", "distance", "probability", "Uniform?", "dataset"));

foreach ds in (datasets){

  dof :=ds.len() - 1;
  dist:=chi2UniformDistance(ds);
  prob:=chi2Probability(dof,dist);
  println("%4d %12.3f  %12.8f    %5s    %6s".fmt(
           dof, dist, prob, chiIsUniform(ds) and "YES" or "NO",

ds.concat(","))); }</lang>

Output:

  dof     distance   probability Uniform?   dataset
   4        4.146    0.38657083      YES    199809,200665,199607,200270,199649
   4   790063.276    0.00000000       NO    522573,244456,139979,71531,21461