Verify distribution uniformity/Chi-squared test: Difference between revisions
(forgot header) |
(→{{header|C}}: improvements, comments) |
||
Line 5: | Line 5: | ||
[http://mathworld.wolfram.com/Chi-SquaredDistribution.html mathworld]. |
[http://mathworld.wolfram.com/Chi-SquaredDistribution.html mathworld]. |
||
=={{header|C}}== |
=={{header|C}}== |
||
This first sections contains the functions required to computer the Chi-Squared probability. |
|||
These are not needed if a library containing the necessary function is availabile. |
|||
<lang c>#include <stdlib.h> |
<lang c>#include <stdlib.h> |
||
#include <stdio.h> |
#include <stdio.h> |
||
Line 11: | Line 13: | ||
typedef double (* Ifctn)( double t); |
typedef double (* Ifctn)( double t); |
||
/* Numerical integration method */ |
|||
double Simpson3_8( Ifctn f, double a, double b, int N) |
double Simpson3_8( Ifctn f, double a, double b, int N) |
||
{ |
{ |
||
Line 65: | Line 67: | ||
double a1 = a-1; |
double a1 = a-1; |
||
double a2 = a-2; |
double a2 = a-2; |
||
double y, h = 1.5e-2; |
double y, h = 1.5e-2; /* approximate integration step size */ |
||
double gma; |
|||
int n; |
|||
/* this will cut of tail of the integration to speed things up */ |
|||
y = aa1 = a1; |
y = aa1 = a1; |
||
while((f0(y) * (x-y) > 2.0e-8) && (y < x)) y += .4; |
while((f0(y) * (x-y) > 2.0e-8) && (y < x)) y += .4; |
||
if (y>x) y=x; |
if (y>x) y=x; |
||
return 1.0 - Simpson3_8( &f0, 0, y, (int)(y/h))/Gamma_Spounge(a); |
|||
}</lang> |
|||
gma = Simpson3_8( &f0, 0, y, n); |
|||
This section contains the functions specific to the task. |
|||
return 1.0 - gma/Gamma_Spounge(a); |
|||
⚫ | |||
} |
|||
⚫ | |||
{ |
{ |
||
double expected = 0.0; |
double expected = 0.0; |
Revision as of 23:28, 11 January 2010
You are encouraged to solve this task according to the task description, using any language you may know.
In this task, write a function to verify that a given distribution of values is uniform by using the test to see if the distribution has a likelihood of happening of at least the significance level (conventionally 5%). The function should return a boolean that is true if the distribution is one that a uniform distribution (with appropriate number of degrees of freedom) may be expected to produce.
More information about the distribution may be found at mathworld.
C
This first sections contains the functions required to computer the Chi-Squared probability. These are not needed if a library containing the necessary function is availabile. <lang c>#include <stdlib.h>
- include <stdio.h>
- include <math.h>
- define M_PI 3.14159265359
typedef double (* Ifctn)( double t); /* Numerical integration method */ double Simpson3_8( Ifctn f, double a, double b, int N) {
int j; double l1; double rng = b-a; double h = rng/N; double h1 = h/3.0; double sum = f(a) + f(b);
for (j=3*N-1; j>0; j--) { l1 = (j%3)? 3.0 : 2.0; sum += l1*f(a+h1*j) ; } return h*sum/8.0;
}
- define A 12
double Gamma_Spounge( double z ) {
int k; double cspace[A]; double *coefs = NULL; double accum; double a = A;
if (!coefs) { double k1_factrl = 1.0; coefs = cspace; coefs[0] = sqrt(2.0*M_PI); for(k=1; k<A; k++) { coefs[k] = exp(a-k) * pow(a-k,k-0.5) / k1_factrl; k1_factrl *= -k; } }
accum = coefs[0]; for (k=1; k<A; k++) { accum += coefs[k]/(z+k); } accum *= exp(-(z+a)) * pow(z+a, z+0.5); return accum/z;
}
double aa1; double f0( double t) {
return pow(t, aa1)*exp(-t);
}
double GammaIncomplete_Q( double a, double x) {
double a1 = a-1; double a2 = a-2; double y, h = 1.5e-2; /* approximate integration step size */
/* this will cut of tail of the integration to speed things up */ y = aa1 = a1; while((f0(y) * (x-y) > 2.0e-8) && (y < x)) y += .4; if (y>x) y=x;
return 1.0 - Simpson3_8( &f0, 0, y, (int)(y/h))/Gamma_Spounge(a);
}</lang> This section contains the functions specific to the task. <lang c>double chi2UniformDistance( double *ds, int dslen) {
double expected = 0.0; double sum = 0.0; int k;
for (k=0; k<dslen; k++) expected += ds[k]; expected /= k;
for (k=0; k<dslen; k++) { double x = ds[k] - expected; sum += x*x; } return sum/expected;
}
double chi2Probability( int dof, double distance) {
return GammaIncomplete_Q( 0.5*dof, 0.5*distance);
}
int chiIsUniform( double *dset, int dslen, double significance) {
int dof = dslen -1; double dist = chi2UniformDistance( dset, dslen); return chi2Probability( dof, dist ) > significance;
}</lang> Testing <lang c>int main(int argc, char **argv) {
double dset1[] = { 199809., 200665., 199607., 200270., 199649. }; double dset2[] = { 522573., 244456., 139979., 71531., 21461. }; double *dsets[] = { dset1, dset2 }; int dslens[] = { 5, 5 }; int k, l; double dist, prob; int dof;
for (k=0; k<2; k++) { printf("Dataset: [ "); for(l=0;l<dslens[k]; l++) printf("%.0f, ", dsets[k][l]); printf("]\n"); dist = chi2UniformDistance(dsets[k], dslens[k]); dof = dslens[k]-1; printf("dof: %d distance: %.4f", dof, dist); prob = chi2Probability( dof, dist ); printf(" probability: %.6f", prob); printf(" uniform? %s\n", chiIsUniform(dsets[k], dslens[k], 0.05)? "Yes":"No"); } return 0;
}</lang>
J
J has two types of verb definitions, Explicit and Tacit.
This is an Explicit solution: <lang j>require 'stats/base'
NB.*isUniform v Tests whether y is uniformly distributed NB. result is: boolean describing if distribution y is uniform NB. y is: distribution to test NB. x is: optionally specify number of categories possible isUniform=: verb define
(#~. y) isUniform y
signif=. 0.95 NB. set significance level expected=. (#y) % x NB. number of items divided by the category count observed=. #/.~ y NB. frequency count for each category X2=. +/ (*: observed - expected) % expected NB. the test statistic degfreedom=. <: x NB. degrees of freedom signif > degfreedom chisqcdf :: 1: X2
)</lang> This is the equivalent Tacit solution: <lang j>require 'stats/base'
SIGNIF=: 0.95 NB. set significance level countCats=: #@~. NB. counts the number of unique items getExpected=: #@] % [ NB. divides no of items by category count getObserved=: #/.~@] NB. counts frequency for each category calcX2=: [: +/ *:@(getObserved - getExpected) % getExpected NB. calculates test statistic calcDf=: <:@[ NB. calculates degrees of freedom for uniform distribution
NB.*isUniformT v Tests whether y is uniformly distributed NB. result is: boolean describing if distribution y is uniform NB. y is: distribution to test NB. x is: optionally specify number of categories possible isUniformT=: (countCats $: ]) : (SIGNIF > calcDf chisqcdf :: 1: calcX2)</lang>
Verbs and Distributions for testing: <lang j>freqCount=: ~. ,. #/.~
testFair=: monad define
'distribution ', (":,freqCount y) , ' assessed as ', (isUniform y) {:: ;:'unfair fair'
)
FairDistrib=: 1e6 ?@$ 5 UnfairDistrib=: (9.5e5 ?@$ 5) , (5e4 ?@$ 4)</lang>
Usage: <lang j> testFair FairDistrib distribution 4 143155 0 143085 1 143111 2 142706 6 142666 5 142365 3 142912 assessed as fair
testFair UnfairDistrib
distribution 0 203086 1 201897 2 202648 3 202388 4 189981 assessed as unfair
isUniformT 4 4 4 5 5 5 5 5 5 5 NB. uniform if only 2 categories possible
1
4 isUniformT 4 4 4 5 5 5 5 5 5 5 NB. not uniform if 4 categories possible
0</lang>
OCaml
This code needs to be compiled with library gsl.cma.
<lang ocaml>let sqr x = x *. x
let chi2UniformDistance distrib =
let count, len = Array.fold_left (fun (s, c) e -> s + e, succ c) (0, 0) distrib in let expected = float count /. float len in let distance = Array.fold_left (fun s e -> s +. sqr (float e -. expected) /. expected ) 0. distrib in let dof = float (pred len) in dof, distance
let chi2Proba dof distance =
Gsl_sf.gamma_inc_Q (0.5 *. dof) (0.5 *. distance)
let chi2IsUniform distrib significance =
let dof, distance = chi2UniformDistance distrib in let likelihoodOfRandom = chi2Proba dof distance in likelihoodOfRandom > significance
let _ =
List.iter (fun distrib -> let dof, distance = chi2UniformDistance distrib in Printf.printf "distribution "; Array.iter (Printf.printf "\t%d") distrib; Printf.printf "\tdistance %g" distance; Printf.printf "\t[%g > 0.05]" (chi2Proba dof distance); if chi2IsUniform distrib 0.05 then Printf.printf " fair\n" else Printf.printf " unfair\n" ) [ [| 199809; 200665; 199607; 200270; 199649 |]; [| 522573; 244456; 139979; 71531; 21461 |] ]</lang>
Output
distribution 199809 200665 199607 200270 199649 distance 4.14628 [0.386571 > 0.05] fair distribution 522573 244456 139979 71531 21461 distance 790063 [0 > 0.05] unfair
Python
Implements the Chi Square Probability function with an integration. I'm sure there are better ways to do this. Compare to OCaml implementation. <lang python>import math import random
def GammaInc_Q( a, x):
a1 = a-1 a2 = a-2 def f0( t ): return pow(t,a1)*math.exp(-t)
def df0(t): return (a1-t)*pow(t,a2)*math.exp(-t) y = a1 while( f0(y)*(x-y) >2.0e-8 ) and (y < x): y += .3 if y > x: y = x
h = 3.0e-4 n = int(y/h) h = y/n hh = 0.5*h gamax = h * sum( f0(t)+hh*df0(t) for t in ( h*j for j in xrange(n-1, -1, -1)))
return gamax/gamma_spounge(a)
c = None def gamma_spounge( z):
global c a = 12
if c is None: k1_factrl = 1.0 c = [] c.append(math.sqrt(2.0*math.pi)) for k in range(1,a): c.append( math.exp(a-k) * pow(a-k, k-0.5) / k1_factrl ) k1_factrl *= -k accm = c[0] for k in range(1,a): accm += c[k] / (z+k) accm *= math.exp( -(z+a)) * pow(z+a, z+0.5) return accm/z;
def chi2UniformDistance( dataSet ):
expected = sum(dataSet)*1.0/len(dataSet) cntrd = (d-expected for d in dataSet) return sum(x*x for x in cntrd)/expected
def chi2Probability(dof, distance):
return 1.0 - GammaInc_Q( 0.5*dof, 0.5*distance)
def chi2IsUniform(dataSet, significance):
dof = len(dataSet)-1 dist = chi2UniformDistance(dataSet) return chi2Probability( dof, dist ) > significance
dset1 = [ 199809, 200665, 199607, 200270, 199649 ] dset2 = [ 522573, 244456, 139979, 71531, 21461 ]
for ds in (dset1, dset2):
print "Data set:", ds dof = len(ds)-1 distance =chi2UniformDistance(ds) print "dof: %d distance: %.4f" % (dof, distance), prob = chi2Probability( dof, distance) print "probability: %.4f"%prob, print "uniform? ", "Yes"if chi2IsUniform(ds,0.05) else "No"</lang>
Output:
Data set: [199809, 200665, 199607, 200270, 199649] dof: 4 distance: 4.146280 probability: 0.3866 uniform? Yes Data set: [522573, 244456, 139979, 71531, 21461] dof: 4 distance: 790063.275940 probability: 0.0000 uniform? No
Tcl
<lang tcl>package require Tcl 8.5 package require math::statistics
proc isUniform {distribution {significance 0.05}} {
set count [tcl::mathop::+ {*}[dict values $distribution]] set expected [expr {double($count) / [dict size $distribution]}] set X2 0.0 foreach value [dict values $distribution] {
set X2 [expr {$X2 + ($value - $expected)**2 / $expected}]
} set degreesOfFreedom [expr {[dict size $distribution] - 1}] set likelihoodOfRandom [::math::statistics::incompleteGamma \
[expr {$degreesOfFreedom / 2.0}] [expr {$X2 / 2.0}]]
expr {$likelihoodOfRandom > $significance}
}</lang> Testing: <lang tcl>proc makeDistribution {operation {count 1000000}} {
for {set i 0} {$i<$count} {incr i} {incr distribution([uplevel 1 $operation])} return [array get distribution]
}
set distFair [makeDistribution {expr int(rand()*5)}] puts "distribution \"$distFair\" assessed as [expr [isUniform $distFair]?{fair}:{unfair}]" set distUnfair [makeDistribution {expr int(rand()*rand()*5)}] puts "distribution \"$distUnfair\" assessed as [expr [isUniform $distUnfair]?{fair}:{unfair}]"</lang> Output:
distribution "0 199809 4 199649 1 200665 2 199607 3 200270" assessed as fair distribution "4 21461 0 522573 1 244456 2 139979 3 71531" assessed as unfair