Verify distribution uniformity/Chi-squared test: Difference between revisions

← Older edit

Verify distribution uniformity/Chi-squared test (view source)

Revision as of 10:29, 16 February 2024

9,954 bytes added , 3 months ago

m

→‎{{header|Wren}}: Minor tidy

PureFox

9,485

edits

Revision as of 22:27, 14 December 2022 (view source) Peak (talk \| contribs) (preferably a two-tailed test) ← Older edit		Latest revision as of 10:29, 16 February 2024 (view source) PureFox (talk \| contribs) m (→‎{{header\|Wren}}: Minor tidy)
(10 intermediate revisions by 3 users not shown)
Line 10: ;Reference: :*   an entry at the MathWorld website:   [http://mathworld.wolfram.com/Chi-SquaredDistribution.html chi-squared distribution]. ; Related task: :* [[Statistics/Chi-squared_distribution]] <br><br> =={{header\|11l}}== {{trans\|Python}} Line 289 ⟶ 290: return 0; }</syntaxhighlight> =={{header\|C#}}== {{trans\|Go}} <syntaxhighlight lang="C#"> using System; class Program { public delegate double Func(double x); public static double Simpson38(Func f, double a, double b, int n) { double h = (b - a) / n; double h1 = h / 3; double sum = f(a) + f(b); for (int j = 3 * n - 1; j > 0; j--) { if (j % 3 == 0) { sum += 2 * f(a + h1 * j); } else { sum += 3 * f(a + h1 * j); } } return h * sum / 8; } // Lanczos Approximation for Gamma Function private static double SpecialFunctionGamma(double z) { double[] p = { 676.5203681218851, -1259.1392167224028, 771.32342877765313, -176.61502916214059, 12.507343278686905, -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7 }; if (z < 0.5) return Math.PI / (Math.Sin(Math.PI * z) * SpecialFunctionGamma(1 - z)); z -= 1; double x = 0.99999999999980993; for (int i = 0; i < p.Length; i++) { x += p[i] / (z + i + 1); } double t = z + p.Length - 0.5; return Math.Sqrt(2 * Math.PI) * Math.Pow(t, z + 0.5) * Math.Exp(-t) * x; } public static double GammaIncQ(double a, double x) { double aa1 = a - 1; Func f = t => Math.Pow(t, aa1) * Math.Exp(-t); double y = aa1; double h = 1.5e-2; while (f(y) * (x - y) > 2e-8 && y < x) { y += .4; } if (y > x) { y = x; } return 1 - Simpson38(f, 0, y, (int)(y / h / SpecialFunctionGamma(a))); } public static double Chi2Ud(int[] ds) { double sum = 0, expected = 0; foreach (var d in ds) { expected += d; } expected /= ds.Length; foreach (var d in ds) { double x = d - expected; sum += x * x; } return sum / expected; } public static double Chi2P(int dof, double distance) { return GammaIncQ(.5 * dof, .5 * distance); } const double SigLevel = .05; static void Main(string[] args) { int[][] datasets = new int[][] { new int[] { 199809, 200665, 199607, 200270, 199649 }, new int[] { 522573, 244456, 139979, 71531, 21461 }, }; foreach (var dset in datasets) { UTest(dset); } } static void UTest(int[] dset) { Console.WriteLine("Uniform distribution test"); int sum = 0; foreach (var c in dset) { sum += c; } Console.WriteLine($" dataset: {string.Join(", ", dset)}"); Console.WriteLine($" samples: {sum}"); Console.WriteLine($" categories: {dset.Length}"); int dof = dset.Length - 1; Console.WriteLine($" degrees of freedom: {dof}"); double dist = Chi2Ud(dset); Console.WriteLine($" chi square test statistic: {dist}"); double p = Chi2P(dof, dist); Console.WriteLine($" p-value of test statistic: {p}"); bool sig = p < SigLevel; Console.WriteLine($" significant at {SigLevel * 100}% level? {sig}"); Console.WriteLine($" uniform? {!sig}\n"); } } </syntaxhighlight> {{out}} <pre> Uniform distribution test dataset: 199809, 200665, 199607, 200270, 199649 samples: 1000000 categories: 5 degrees of freedom: 4 chi square test statistic: 4.14628 p-value of test statistic: 0.386570833082767 significant at 5% level? False uniform? True Uniform distribution test dataset: 522573, 244456, 139979, 71531, 21461 samples: 1000000 categories: 5 degrees of freedom: 4 chi square test statistic: 790063.27594 p-value of test statistic: 2.35282904270662E-11 significant at 5% level? True uniform? False </pre> =={{header\|C++}}== <syntaxhighlight lang="c++"> #include <iostream> #include <vector> #include <cmath> #include <iomanip> void print_vector(const std::vector<int32_t>& list) { std::cout << "["; for ( uint64_t i = 0; i < list.size() - 1; ++i ) { std::cout << list[i] << ", "; } std::cout << list.back() << "]" << std::endl; } bool is_significant(const double p_value, const double significance_level) { return p_value > significance_level; } // The normalised lower incomplete gamma function. double gamma_cdf(const double aX, const double aK) { double result = 0.0; for ( uint32_t m = 0; m <= 99; ++m ) { result += pow(aX, m) / tgamma(aK + m + 1); } result = pow(aX, aK) exp(-aX); return std::isnan(result) ? 1.0 : result; } // The cumulative probability function of the Chi-squared distribution. double cdf(const double aX, const double aK) { if ( aX > 1'000 && aK < 100 ) { return 1.0; } return ( aX > 0.0 && aK > 0.0 ) ? gamma_cdf(aX / 2, aK / 2) : 0.0; } void chi_squared_test(const std::vector<int32_t>& observed) { double sum = 0.0; for ( uint64_t i = 0; i < observed.size(); ++i ) { sum += observed[i]; } const double expected = sum / observed.size(); const int32_t degree_freedom = observed.size() - 1; double test_statistic = 0.0; for ( uint64_t i = 0; i < observed.size(); ++i ) { test_statistic += pow(observed[i] - expected, 2) / expected; } const double p_value = 1.0 - cdf(test_statistic, degree_freedom); std::cout << "\nUniform distribution test" << std::setprecision(6) << std::endl; std::cout << " observed values : "; print_vector(observed); std::cout << " expected value : " << expected << std::endl; std::cout << " degrees of freedom: " << degree_freedom << std::endl; std::cout << " test statistic : " << test_statistic << std::endl; std::cout.setf(std::ios::fixed); std::cout << " p-value : " << p_value << std::endl; std::cout.unsetf(std::ios::fixed); std::cout << " is 5% significant?: " << std::boolalpha << is_significant(p_value, 0.05) << std::endl; } int main() { const std::vector<std::vector<int32_t>> datasets = { { 199809, 200665, 199607, 200270, 199649 }, { 522573, 244456, 139979, 71531, 21461 } }; for ( std::vector<int32_t> dataset : datasets ) { chi_squared_test(dataset); } } </syntaxhighlight> {{ out }} <pre> Uniform distribution test observed values : [199809, 200665, 199607, 200270, 199649] expected value : 200000 degrees of freedom: 4 test statistic : 4.14628 p-value : 0.386571 is 5% significant?: true Uniform distribution test observed values : [522573, 244456, 139979, 71531, 21461] expected value : 200000 degrees of freedom: 4 test statistic : 790063 p-value : 0.000000 is 5% significant?: false </pre> =={{header\|D}}== Line 732 ⟶ 1,000: 4 4,146 0,38657083 YES [199809.0, 200665.0, 199607.0, 200270.0, 199649.0] 4 790063,276 0,00000000 NO [522573.0, 244456.0, 139979.0, 71531.0, 21461.0]</pre> =={{header\|jq}}== {{works with\|jq}} '''Also works with gojq, the Go implementation of jq.''' This entry uses a two-tailed test, as is appropriate for this type of problem as illustrated by the last example. The test is based on the assumption that the sample size is large enough for the χ2 distribution to be used. The implementation of `Chi2_cdf` here uses the recursion relation for the gamma function and should be both fast, accurate and quite robust. For an industrial-strength algorithm, see e.g. https://people.sc.fsu.edu/~jburkardt/c_src/asa239/asa239.c '''Generic Functions''' <syntaxhighlight lang=jq> def round($dec): if type == "string" then . else pow(10;$dec) as $m \| . * $m \| floor / $m end; # sum of squares def ss(s): reduce s as $x (0; . + ($x * $x)); # Cumulative density function of the chi-squared distribution with $k # degrees of freedom # The recursion formula for gamma is used for efficiency and robustness. def Chi2_cdf($x; $k): if $x == 0 then 0 elif $x > (1e3 * $k) then 1 else 1e-15 as $tol # for example \| { s: 0, m: 0, term: (1 / ((($k/2)+1)\|gamma)) } \| until (.term\|length < $tol; # length here is abs .s += .term \| .m += 1 \| .term = (($x/2) / (($k/2) + .m )) ) \| .s ( ((-$x/2) + ($k/2)*(($x/2)\|log)) \| exp) end ; </syntaxhighlight> '''Tasks''' <syntaxhighlight lang=jq> # Input: array of frequencies def chi2UniformDistance: (add / length) as $expected \| ss(.[] - $expected) / $expected; # Input: a number # Output: an indication of the probability of observing this value or higher # assuming the value is drawn from a chi-squared distribution with $dof degrees # of freedom def chi2Probability($dof): (1 - Chi2_cdf(.; $dof)) \| if . < 1e-10 then "< 1e-10" else . end; # Input: array of frequencies # Output: result of a two-tailed test based on the chi-squared statistic # assuming the sample size is large enough def chiIsUniform($significance): (length - 1) as $dof \| chi2UniformDistance \| Chi2_cdf(.; $dof) as $cdf \| if $cdf then ($significance/2) as $s \| $cdf > $s and $cdf < (1-$s) else false end; def dsets: [ [199809, 200665, 199607, 200270, 199649], [522573, 244456, 139979, 71531, 21461], [19,14,6,18,7,5,1], # low entropy [9,11,9,10,15,11,5], # high entropy [20,20,20] # made-up ]; def task: dsets[] \| "Dataset: \(.)", ( chi2UniformDistance as $dist \| (length - 1) as $dof \| "DOF: \($dof) D (Distance): \($dist)", " Estimated probability of observing a value >= D: \($dist\|chi2Probability($dof)\|round(2))", " Uniform? \( (select(chiIsUniform(0.05)) \| "Yes") // "No" )\n" ) ; task </syntaxhighlight> <pre> Dataset: [199809,200665,199607,200270,199649] DOF: 4 D (Distance): 4.14628 Estimated probability of observing a value >= D: 0.38 Uniform? Yes Dataset: [522573,244456,139979,71531,21461] DOF: 4 D (Distance): 790063.27594 Estimated probability of observing a value >= D: < 1e-10 Uniform? No Dataset: [19,14,6,18,7,5,1] DOF: 6 D (Distance): 29.2 Estimated probability of observing a value >= D: 0 Uniform? No Dataset: [9,11,9,10,15,11,5] DOF: 6 D (Distance): 5.4 Estimated probability of observing a value >= D: 0.49 Uniform? Yes Dataset: [20,20,20] DOF: 2 D (Distance): 0 Estimated probability of observing a value >= D: 1 Uniform? No </pre> =={{header\|Julia}}== Line 1,917 ⟶ 2,300: {{libheader\|Wren-math}} {{libheader\|Wren-fmt}} <syntaxhighlight lang="~~ecmascript~~wren">import "./math" for Math, Nums import "./fmt" for Fmt var integrate = Fn.new { \|a, b, n, f\|