Talk:Welch's t-test: Difference between revisions

→‎cannot run python libraries: move to Hailholyghost's talk page (the problem was unrelated to this task)
m (Eoraptor moved page Talk:Calculate P-Value to Talk:Welch's t-test: "p-value" alone is meaningless, it could apply to any statistical test)
(→‎cannot run python libraries: move to Hailholyghost's talk page (the problem was unrelated to this task))
 
(62 intermediate revisions by 3 users not shown)
Line 458:
 
:: Sounds like Math::MPFR needs to get cpan to give better diagnostics for that kind of failure? (But why not show both versions? Are they too big or something?) --[[User:Rdm|Rdm]] ([[User talk:Rdm|talk]]) 02:01, 15 November 2017 (UTC)
 
I have a more portable version of the current perl code, using the new integration by reduction method, and verifying against the R code as I did with the C code:
 
<lang perl>
#!/usr/bin/env perl
 
# Script preamble: enable strictures and promote every warning to a fatal
# error.  When a warning fires, a "<script name>.FAIL" marker file is left
# in the starting directory recording the message and the directory the
# script was in when it died, then the script aborts.
use strict; use warnings; use Cwd 'getcwd';# use feature 'say';
my $TOP_DIRECTORY = getcwd();
local $SIG{__WARN__} = sub {#kill the program if there are any warnings
my $message = shift;
my $fail_filename = "$TOP_DIRECTORY/$0.FAIL";
open my $fh, '>', $fail_filename or die "Can't write $fail_filename: $!";
printf $fh ("$message @ %s\n", getcwd());
close $fh;
die "$message\n";
};#http://perlmaven.com/how-to-capture-and-save-warnings-in-perl
 
sub betain {
    # Incomplete beta function ratio I_x(p, q), evaluated with the Soper
    # reduction formula (AS 63, Majumder & Bhattacharjee); translated from
    # John Burkardt's C version.  $log_beta is log(Beta(p, q)).
    # Out-of-range arguments simply echo $x back, as the C original does.
    my ($x, $p, $q, $log_beta) = @_;
    my $acu = 10**(-15);   # convergence tolerance

    return $x if ( $p <= 0.0 ) || ( $q <= 0.0 );   # invalid shape parameters
    return $x if ( $x < 0.0 )  || ( 1.0 < $x );    # argument outside [0,1]
    return $x if ( $x == 0.0 ) || ( $x == 1.0 );   # endpoints are exact

    my $psq = $p + $q;

    # Work on whichever tail converges faster; remember if we swapped.
    my ($xx, $cx, $pp, $qq, $swapped);
    if ( $p < $psq * $x ) {
        ($xx, $cx, $pp, $qq, $swapped) = (1.0 - $x, $x, $q, $p, 1);
    }
    else {
        ($xx, $cx, $pp, $qq, $swapped) = ($x, 1.0 - $x, $p, $q, 0);
    }

    my $term  = 1.0;
    my $ai    = 1.0;
    my $value = 1.0;
    # int() truncates toward zero for these positive values, matching the
    # C integer conversion in the original.
    my $ns    = int( $qq + $cx * $psq );

    # Soper reduction formula.
    my $rx   = $xx / $cx;
    my $temp = $qq - $ai;
    $rx = $xx if $ns == 0;

    while (1) {
        $term = $term * $temp * $rx / ( $pp + $ai );
        $value = $value + $term;
        $temp = abs ( $term );

        if ( $temp <= $acu && $temp <= $acu * $value ) {
            # Converged: rescale, and undo the tail swap if one was made.
            $value = $value * exp ( $pp * log ( $xx )
                + ( $qq - 1.0 ) * log ( $cx ) - $log_beta ) / $pp;
            $value = 1.0 - $value if $swapped;
            last;
        }

        $ai = $ai + 1.0;
        $ns = $ns - 1;

        if ( 0 <= $ns ) {
            $temp = $qq - $ai;
            $rx = $xx if $ns == 0;
        }
        else {
            $temp = $psq;
            $psq = $psq + 1.0;
        }
    }

    return $value;
}
 
#lgamma subroutine is less precise than "use POSIX 'lgamma'" but Macs don't have POSIX
sub lgamma {
    # Natural log of the gamma function via the Lanczos approximation
    # (Numerical Recipes coefficients, accuracy roughly 1e-10).
    # Source: http://hea-www.harvard.edu/~alexey/calc-src.txt
    my ($z) = @_;
    my @coeff = (0.0, 76.18009172947146, -86.50532032941677,
                 24.01409824083091, -1.231739572450155,
                 0.1208650973866179e-2, -0.5395239384953e-5);
    my $sqrt_two_pi = 2.5066282746310005;
    my $tmp = $z + 5.5;
    $tmp = ($z + 0.5) * log($tmp) - $tmp;
    my $series = 1.000000000190015;
    my $denom  = $z;
    foreach my $k (1 .. 6) {
        $denom += 1.0;
        $series += $coeff[$k] / $denom;
    }
    return $tmp + log($sqrt_two_pi * $series / $z);
}
 
use List::Util 'sum';
# Welch's two-sided t-test.
#
# Arguments: two array references holding the samples.
# Returns the p-value for the null hypothesis that the two samples have
# equal means, without assuming equal variances.  Degenerate inputs
# (a sample with fewer than two observations, identical means, or both
# variances zero) return 1.0.
#
# Fix over the previous revision: the sample sizes are computed once up
# front and used consistently (the degrees-of-freedom numerator previously
# mixed `$array2_size` with a redundant `scalar @$array2`).
sub calculate_Pvalue {
    my ($array1, $array2) = @_;
    my $size1 = scalar @$array1;
    my $size2 = scalar @$array2;

    # A t-test needs at least two observations per sample.
    return 1.0 if $size1 <= 1;
    return 1.0 if $size2 <= 1;

    my $mean1 = sum(@$array1) / $size1;
    my $mean2 = sum(@$array2) / $size2;
    return 1.0 if $mean1 == $mean2;

    # Unbiased sample variances (divisor n-1).
    my $variance1 = 0.0;
    my $variance2 = 0.0;
    foreach my $value (@$array1) {
        $variance1 += ($value - $mean1) * ($value - $mean1);
    }
    foreach my $value (@$array2) {
        $variance2 += ($value - $mean2) * ($value - $mean2);
    }
    return 1.0 if ($variance1 == 0.0) && ($variance2 == 0.0);
    $variance1 = $variance1 / ($size1 - 1);
    $variance2 = $variance2 / ($size2 - 1);

    my $WELCH_T_STATISTIC = ($mean1 - $mean2)
        / sqrt($variance1/$size1 + $variance2/$size2);
    # Welch-Satterthwaite approximation of the degrees of freedom.
    my $DEGREES_OF_FREEDOM = (($variance1/$size1 + $variance2/$size2)**2)
        /
        (
            ($variance1*$variance1)/($size1*$size1*($size1-1)) +
            ($variance2*$variance2)/($size2*$size2*($size2-1))
        );
    # Two-sided p-value via the regularized incomplete beta function:
    # p = I_x(nu/2, 1/2) with x = nu/(t^2 + nu).
    # 0.57236494292470009 is lgamma(0.5) = log(sqrt(pi)).
    my $A = $DEGREES_OF_FREEDOM/2;
    my $x = $DEGREES_OF_FREEDOM/($WELCH_T_STATISTIC*$WELCH_T_STATISTIC + $DEGREES_OF_FREEDOM);
    return betain($x, $A, 0.5, lgamma($A) + 0.57236494292470009 - lgamma($A + 0.5));
}
 
my @d1 = (27.5,21.0,19.0,23.6,17.0,17.9,16.9,20.1,21.9,22.6,23.1,19.6,19.0,21.7,21.4);
my @d2 = (27.1,22.0,20.8,23.4,23.4,23.5,25.8,22.0,24.8,20.2,21.9,22.1,22.9,20.5,24.4);
my @d3 = (17.2,20.9,22.6,18.1,21.7,21.4,23.5,24.2,14.7,21.8);
my @d4 = (21.5,22.8,21.0,23.0,21.6,23.6,22.5,20.7,23.4,21.8,20.7,21.7,21.5,22.5,23.6,21.5,22.5,23.5,21.5,21.8);
my @d5 = (19.8,20.4,19.6,17.8,18.5,18.9,18.3,18.9,19.5,22.0);
my @d6 = (28.2,26.6,20.1,23.3,25.2,22.1,17.7,27.6,20.6,13.7,23.2,17.5,20.6,18.0,23.9,21.6,24.3,20.4,24.0,13.2);
my @d7 = (30.02,29.99,30.11,29.97,30.01,29.99);
my @d8 = (29.89,29.93,29.72,29.98,30.02,29.98);
my @x = (3.0,4.0,1.0,2.1);
my @y = (490.2,340.0,433.9);
my @s1 = (1.0/15,10.0/62.0);
my @s2 = (1.0/10,2/50.0);
my @v1 = (0.010268,0.000167,0.000167);
my @v2 = (0.159258,0.136278,0.122389);
my @z1 = (9/23.0,21/45.0,0/38.0);
my @z2 = (0/44.0,42/94.0,0/22.0);

# Reference p-values from R's t.test(a, b, alternative="two.sided", var.equal=FALSE).
my @CORRECT_ANSWERS = (0.021378001462867,
0.148841696605327,
0.0359722710297968,
0.090773324285671,
0.0107515611497845,
0.00339907162713746,
0.52726574965384,
0.545266866977794);

# One entry per test: the two samples and the label used in the report.
# This replaces eight copy-pasted print/accumulate stanzas; output is
# identical to the previous revision.
my @test_cases = (
    [\@d1, \@d2, '1'],
    [\@d3, \@d4, '2'],
    [\@d5, \@d6, '3'],
    [\@d7, \@d8, '4'],
    [\@x,  \@y,  '5'],
    [\@v1, \@v2, '6'],
    [\@s1, \@s2, '7'],
    [\@z1, \@z2, 'z'],
);

my $error = 0.0;
foreach my $i (0 .. $#test_cases) {
    my ($left, $right, $label) = @{ $test_cases[$i] };
    my $pvalue = calculate_Pvalue($left, $right);
    # Accumulate the absolute deviation from the R reference values.
    $error += abs($pvalue - $CORRECT_ANSWERS[$i]);
    printf("Test sets %s p-value = %g\n", $label, $pvalue);
}

printf("the cumulative error is %g\n", $error);</lang>
 
I think this should replace the current perl code (because frankly, I can't get all the modules surrounding the current one to work). I realize that Macs don't have POSIX, so I replaced the 'lgamma' with something more portable. It should work anywhere with Perl 5.0 and up.
Let me know what you think.--[[User:Hailholyghost|Hailholyghost]] ([[User talk:Hailholyghost|talk]]) 01:49, 5 January 2018 (UTC)
 
: I like the existing code. Try the following simple changes to the current code:
<pre>
3c3
< use Math::AnyNum qw(gamma pi);
---
> use Math::Cephes qw(gamma $PI);
38c38
< (gamma($sa + 1) * sqrt(pi) / gamma($sa + 1.5)))->numify;
---
> (gamma($sa + 1) * sqrt($PI) / gamma($sa + 1.5)));
</pre>
It gives similar results and the dependency chain should be much easier. There are a lot of machines that don't have MPFR installed, and some that don't even have GMP. The downside is that Math::AnyNum is a much more functional module that builds on others. But Math::Cephes is pretty cool as a standalone native precision library. I've thought about adding gamma to the ntheory module but it's complicated (for me at least). [[User:Danaj|Danaj]] ([[User talk:Danaj|talk]]) 22:31, 5 January 2018 (UTC)
 
I tried the existing module again, and it still won't even install.
The entire purpose of creating this Rosetta Code page was because I had numerous issues installing libraries in the first place, and I kept getting responses like "but the libraries do work!" despite evidence to the contrary. If a task solution is not portable, it shouldn't be shown. Also, the Perl translation I've done works very differently than the current one, more accurately, as I show. Unlike the current code, I can actually run my code, and it should run anywhere.
What is the Rosetta code policy for using an "alternative solution" on Rosetta Code?
 
: It's unclear with your message above as to whether you tried installing Math::AnyNum again or are replying to my comment about using Math::Cephes. Without the MPFR library installed in your O/S, Math::MPFR can't install, and therefore Math::AnyNum can't install. The libraries ''do'' work for those who have the requisite software installed. Your argument about portability turns into "don't ever use modules outside of core" which I don't believe is the way many people use Perl.
: This is mostly moot as I've added a translation of the C code which uses only List::Util's sum, based in part on your translation above. Hopefully this will resolve your issues with the page.
: I don't know what sort of official policies there are, but my take for Perl, which is intimately tied with CPAN, is that I like to see a version that uses no modules or only core modules (e.g. List::Util), followed by, if appropriate, versions using modules. The first version is useful in that it can run everywhere with no module installation, and it shows all the work. The versions using modules are one or more of faster, shorter, clearer, more functional, or more idiomatic. Especially if the no-module version is very long, I think it's best to reverse the order -- show the 2-10 line easy solutions, then the long "if you want to write everything by hand:" method.
: It is also not unusual, especially as new tasks get more complex, to see the first code added to a page using a module. This lets us get a Perl solution up quickly and succinctly, and later someone can write a huge everything-by-hand version. I suppose there is another discussion of whether it would make sense to have a "common library" pointer so we don't have to keep pasting in the same helper functions on 10+ tasks, which would let one concentrate more on the task.
 
thank you for your work, this new Perl version is better than what I had done.--[[User:Hailholyghost|Hailholyghost]] ([[User talk:Hailholyghost|talk]]) 12:47, 10 January 2018 (UTC)
 
== New title ==
someone changed the title of the page to something which doesn't fit the page... this calculates p-value, it *uses* the Welch's t-test, it doesn't return it. This is wrong.--[[User:Hailholyghost|Hailholyghost]] ([[User talk:Hailholyghost|talk]]) 20:11, 7 December 2017 (UTC)
:This sentence does not even mean anything. Please define "using Welch's t-test", and "returning Welch's t-test". [[User:Eoraptor|Eoraptor]] ([[User talk:Eoraptor|talk]]) 15:55, 13 December 2017 (UTC)
 
== Task's goal, and pretending to do numerical integration when it has nothing to do with the task ==
 
Funnily, while the creator of the page disagrees with its renaming to "Welch's t-test", arguing that it's mainly about integration, he keeps giving arguments to the contrary. His last addition to the task description states: "This is meant to translate R's <code>t.test(vector1,vector2, alternative="two.sided", var.equal=FALSE)</code> for calculation of the p-value." That is, this is precisely Welch's t-test. And by the way, this has '''nothing''' to do with integration, in any well-thought implementation of the incomplete gamma function in software libraries (used by R or any other statistical package). If Rosetta Code is in any way supposed to show good practice, using numerical integration in such a case is just... silly.
 
But even the C program, by the creator of the page, shows an extremely poor implementation of numerical integration, with fixed step-size (65535), probably because it looked large enough to him, even though the integrand is not bounded when the number of degrees of freedoms increases: upper bound of the integral gets nearer to 1, and the denominator sqrt(1-r) gets nearer to 0 near the upper integration bound. With no step control? Seriously? I tried to explain. Now I'm tired of trying and I will simply suggest to open a book on numerical analysis and start learning stuff. Maybe it's not too late.
 
The sad part is: while Hailholyghost does not seem to understand the task he has created, nor the basics of the mathematics really involved, the task itself is perfectly acceptable and quite interesting. But it's really about Welch's t-test, dude.
 
[[User:Eoraptor|Eoraptor]] ([[User talk:Eoraptor|talk]]) 15:44, 13 December 2017 (UTC)
 
Eoraptor, if you know how to do the integration better than I did, using the algorithm you posted https://dl.acm.org/citation.cfm?id=131776, while still keeping everything in GNU99 C standard (without unusable libraries, as many do), I'm all ears. In fact I would appreciate the benefit of your evidently advanced knowledge. I am a chemist, not a programmer.
 
Let me clarify why I created this page in the first place: I could not find how to get the p-value in GNU99 C after weeks of google-searching. I spent hours trying to get existing libraries to work (none would work) and I wanted something portable that would still work in a few months or years. I couldn't find or understand R's source code for how it calculates this. So I wrote it in a portable, stable language like C, just based on my memory of high school calculus and Wikipedia pages. I spent weeks looking for how to do this on the internet, and wanted to save the work I had done. I am aware that the integration could be better. I would appreciate your help if you can offer it to make the C implementation better (faster and/or more accurate) using the numerical analysis algorithms you had mentioned.--[[User:Hailholyghost|Hailholyghost]] ([[User talk:Hailholyghost|talk]]) 19:54, 13 December 2017 (UTC)
 
How can I add this page to a list of pages needing attention, specifically implementing Algorithm 708 https://dl.acm.org/citation.cfm?id=131776 integration into the C example?--[[User:Hailholyghost|Hailholyghost]] ([[User talk:Hailholyghost|talk]]) 20:02, 13 December 2017 (UTC)
:Regarding the algorithm, I'll have a look. It's only a matter of translating Fortran code to C. I do not consider it's mandatory to solve the task, as many languages have built-in features to do that (most "matrix languages" have also common CDF and/or special functions). However, it could be useful in C and others. [http://people.sc.fsu.edu/~jburkardt/ John Burkardt] has done a great work at translating Fortran code to different languages, but TOMS 708 does not seem to be in the list. The original code may even be simplified, given the constraints on the arguments (I need to check, but since there are different code paths according to arguments, it's likely). Even direct integration could be considered (not in production code, but on RC it's acceptable if other solutions are really too complicated), but it requires some care: either transform the integral into something smoother, or control the error. A caution note, too: I'll have to check the copyright. If it's not clear, there are other libraries out there (SLATEC, DCDFLIB, or whatever is used by R, scipy or Octave, which are all open source).
:
:Considering your own problem, did you consider linking Fortran code to your program? It's a very common way to use time-proven Fortran libraries without rewriting them (which is time consuming and error prone). For instance, R, scipy and Scilab heavily use Fortran behind the scene. And nobody would translate LAPACK, for instance. Another possibility would be to use [http://www.netlib.org/f2c/ f2c]. Another, obvious remark: GSL has the [https://www.gnu.org/software/gsl/manual/html_node/Incomplete-Beta-Function.html incomplete beta function] and the [https://www.gnu.org/software/gsl/manual/html_node/The-t_002ddistribution.html t distribution]. On Linux you just have to get it with your favourite package manager, and on Windows you can compile it. Relatively difficult, but doable: you will need MinGW-w64 and MSYS2. It's probably easier to compile a Fortran library and use it with your C program, though.
:
:A piece of general advice for your projects: don't reinvent the wheel, especially for numerical code. It's extremely difficult to get it right. And there are tons of free good libraries (start with Netlib, and the free matrix languages). And by the way, if the task is not too heavy, consider a matrix language in the first place, it's relatively slow, but it's extremely easy to work with. MATLAB is the gold standard, but the free ones are good too. If you have anything related to statistics, I strongly suggest R (and good news: linking C or Fortran code to R is fairly easy).
:[[User:Eoraptor|Eoraptor]] ([[User talk:Eoraptor|talk]]) 06:59, 14 December 2017 (UTC)
 
I always have difficulties with libraries, in a lot of ways they're like mythical creatures/unicorns to me. Consider http://people.sc.fsu.edu/~jburkardt/c_src/asa063/asa063.html which has C code (supposedly). None of it compiles! It's a lot like the user who overwrote my working Perl code to use some library I can't install. I couldn't get GSL to work either. What I wrote for the p-value calculation here may be awkward, but it at least does what it says it would do.
 
Consider: <code>con@e:~/Scripts/ASA$ gcc -o asa063 asa063.c -Wall -pedantic -std=c99 -lm
asa063.c: In function ‘betain’:
asa063.c:357:10: warning: unused variable ‘betain’ [-Wunused-variable]
double betain;
^
asa063.c: In function ‘timestamp’:
asa063.c:502:10: warning: variable ‘len’ set but not used [-Wunused-but-set-variable]
size_t len;
^
/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o: In function `_start':
(.text+0x20): undefined reference to `main'
collect2: error: ld returned 1 exit status</code>
 
I modified none of this. Perhaps you know how his asa063.c could compile?--[[User:Hailholyghost|Hailholyghost]] ([[User talk:Hailholyghost|talk]]) 13:28, 14 December 2017 (UTC)
:Actually, it compiles, but it does not contain a "main" function, and you are trying to compile to an executable. You can't. Use "gcc -c" to only compile to an object file, that you will be able to link later. For instance:
:gcc -c aux1.c
:gcc -c aux2.c
:gcc prog.c aux1.o aux2.o
:There is also a warning, but it's probably harmless. [[User:Eoraptor|Eoraptor]] ([[User talk:Eoraptor|talk]]) 15:35, 14 December 2017 (UTC)
 
==Implementing Algorithm... existing libraries don't calculate correct numbers==
 
I'm attempting to integrate what I got from the library, but its function <code>betain</code> does not produce the same ratio as the integral (which I know is approximately correct).
How can I use the Betain function in my function <code>Pvalue</code>?
 
<lang c>
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <limits.h>
 
/******************************************************************************/
// x =? nu / (t^2 + nu)
// p =? nu/2
// q =? 1/2
//
 
double betain ( double x, double p, double q, double beta, int *restrict ifault )
 
/******************************************************************************/
/*
Purpose:
 
BETAIN computes the incomplete Beta function ratio.
 
Licensing:
 
This code is distributed under the GNU LGPL license.
 
Modified:
 
05 November 2010
 
Author:
 
Original FORTRAN77 version by KL Majumder, GP Bhattacharjee.
C version by John Burkardt.
 
Reference:
 
KL Majumder, GP Bhattacharjee,
Algorithm AS 63:
The incomplete Beta Integral,
Applied Statistics,
Volume 22, Number 3, 1973, pages 409-411.
 
Parameters:
https://www.jstor.org/stable/2346797?seq=1#page_scan_tab_contents
Input, double X, the argument, between 0 and 1.
 
Input, double P, Q, the parameters, which
must be positive.
 
Input, double BETA, the logarithm of the complete
beta function.
 
Output, int *IFAULT, error flag.
0, no error.
nonzero, an error occurred.
 
Output, double BETAIN, the value of the incomplete
Beta function ratio.
*/
{
double acu = 0.1E-14;
double ai;
// double betain;
double cx;
int indx;
int ns;
double pp;
double psq;
double qq;
double rx;
double temp;
double term;
double value;
double xx;
 
value = x;
ifault = 0;
//Check the input arguments.
if ( (p <= 0.0) || (q <= 0.0 )){
// *ifault = 1;
return value;
}
if ( x < 0.0 || 1.0 < x )
{
*ifault = 2;
return value;
}
/*
Special cases.
*/
if ( x == 0.0 || x == 1.0 ) {
return value;
}
psq = p + q;
cx = 1.0 - x;
 
if ( p < psq * x )
{
xx = cx;
cx = x;
pp = q;
qq = p;
indx = 1;
}
else
{
xx = x;
pp = p;
qq = q;
indx = 0;
}
 
term = 1.0;
ai = 1.0;
value = 1.0;
ns = ( int ) ( qq + cx * psq );
/*
Use the Soper reduction formula.
*/
rx = xx / cx;
temp = qq - ai;
if ( ns == 0 )
{
rx = xx;
}
 
for ( ; ; )
{
term = term * temp * rx / ( pp + ai );
value = value + term;;
temp = fabs ( term );
 
if ( temp <= acu && temp <= acu * value )
{
value = value * exp ( pp * log ( xx )
+ ( qq - 1.0 ) * log ( cx ) - beta ) / pp;
 
if ( indx )
{
value = 1.0 - value;
}
break;
}
 
ai = ai + 1.0;
ns = ns - 1;
 
if ( 0 <= ns )
{
temp = qq - ai;
if ( ns == 0 )
{
rx = xx;
}
}
else
{
temp = psq;
psq = psq + 1.0;
}
}
 
return value;
}
/******************************************************************************/
 
double r8_max ( double x, double y )

/******************************************************************************/
/*
  Purpose:

    R8_MAX returns the maximum of two R8's.

  Licensing:

    This code is distributed under the GNU LGPL license.

  Modified:

    18 August 2004

  Author:

    John Burkardt

  Parameters:

    Input, double X, Y, the quantities to compare.

    Output, double R8_MAX, the maximum of X and Y.
*/
{
  /* Same comparison direction as the original (y < x), so behavior is
     identical even for unordered (NaN) inputs. */
  return ( y < x ) ? x : y;
}
 
double Pvalue (const double *restrict ARRAY1, const size_t ARRAY1_SIZE, const double *restrict ARRAY2, const size_t ARRAY2_SIZE) {//calculate a p-value based on an array
/*
  Welch's two-sided t-test: returns the p-value for the null hypothesis that
  the two samples have equal means, without assuming equal variances.
  Returns 1.0 for degenerate inputs (a sample with <= 1 element, equal means,
  or a computed value that is not a valid probability).  Exits the program if
  either array contains a non-finite value.

  Fix over the previous revision: the betain() cross-check at the bottom now
  passes the LOGARITHM of the complete beta function (betain subtracts it in
  log space) instead of exp() of it, and a valid int* for the error flag
  instead of a null pointer.
*/
	if (ARRAY1_SIZE <= 1) {
		return 1.0;
	} else if (ARRAY2_SIZE <= 1) {
		return 1.0;
	}
	double fmean1 = 0.0, fmean2 = 0.0;
	for (size_t x = 0; x < ARRAY1_SIZE; x++) {//get sum of values in ARRAY1
		if (isfinite(ARRAY1[x]) == 0) {//check to make sure this is a real number
			puts("Got a non-finite number in 1st array, can't calculate P-value.");
			exit(EXIT_FAILURE);
		}
		fmean1 += ARRAY1[x];
	}
	fmean1 /= ARRAY1_SIZE;
	for (size_t x = 0; x < ARRAY2_SIZE; x++) {//get sum of values in ARRAY2
		if (isfinite(ARRAY2[x]) == 0) {//check to make sure this is a real number
			puts("Got a non-finite number in 2nd array, can't calculate P-value.");
			exit(EXIT_FAILURE);
		}
		fmean2 += ARRAY2[x];
	}
	fmean2 /= ARRAY2_SIZE;
	if (fmean1 == fmean2) {
		return 1.0;//if the means are equal, the p-value is 1, leave the function
	}
	/* Unbiased sample variances (divisor n-1). */
	double unbiased_sample_variance1 = 0.0, unbiased_sample_variance2 = 0.0;
	for (size_t x = 0; x < ARRAY1_SIZE; x++) {
		unbiased_sample_variance1 += (ARRAY1[x]-fmean1)*(ARRAY1[x]-fmean1);
	}
	for (size_t x = 0; x < ARRAY2_SIZE; x++) {
		unbiased_sample_variance2 += (ARRAY2[x]-fmean2)*(ARRAY2[x]-fmean2);
	}
	unbiased_sample_variance1 = unbiased_sample_variance1/(ARRAY1_SIZE-1);
	unbiased_sample_variance2 = unbiased_sample_variance2/(ARRAY2_SIZE-1);
	const double WELCH_T_STATISTIC = (fmean1-fmean2)/sqrt(unbiased_sample_variance1/ARRAY1_SIZE+unbiased_sample_variance2/ARRAY2_SIZE);
	/* Welch-Satterthwaite approximation of the degrees of freedom. */
	const double DEGREES_OF_FREEDOM = pow((unbiased_sample_variance1/ARRAY1_SIZE+unbiased_sample_variance2/ARRAY2_SIZE),2.0)//numerator
	/
	(
		(unbiased_sample_variance1*unbiased_sample_variance1)/(ARRAY1_SIZE*ARRAY1_SIZE*(ARRAY1_SIZE-1))+
		(unbiased_sample_variance2*unbiased_sample_variance2)/(ARRAY2_SIZE*ARRAY2_SIZE*(ARRAY2_SIZE-1))
	);
	/* p = I_x(nu/2, 1/2) with x = nu/(t^2 + nu), evaluated here by numerically
	   integrating the beta integrand with Simpson's rule.
	   NOTE(review): a fixed step count with an integrand singular at 1 loses
	   accuracy as x approaches 1 -- the AS 63 betain() routine below is the
	   preferable evaluation. */
	const double a = DEGREES_OF_FREEDOM/2, x = DEGREES_OF_FREEDOM/(WELCH_T_STATISTIC*WELCH_T_STATISTIC+DEGREES_OF_FREEDOM);
	const unsigned int N = 65536;//increase N for a tighter convergence
	const double h = x/N;//step size for integrating with Simpson's method
	double sum1 = 0.0, sum2 = 0.0;
	for(unsigned int i = 1;i < N+1; i++) {//integrate by Simpson's method (sometimes blows up at i = 0, so start @ 1)
		sum1 += (pow(h * i + h / 2.0,a-1))/(sqrt(1-(h * i + h / 2.0)));
		sum2 += (pow(h * i,a-1))/(sqrt(1-h * i));
	}
	double return_value = ((h / 6.0) * ((pow(x,a-1))/(sqrt(1-x)) + 4.0 * sum1 + 2.0 * sum2))/(exp(lgammal(a)+0.57236494292470009-lgammal(a+0.5)));//0.5723649... is lgammal(0.5)
	if ((isinf(return_value) != 0) || (isnan(return_value) != 0) || (return_value > 1.0)) {
		return 1.0;
	} else {
		/* Cross-check the quadrature result against AS 63 betain(). */
		int ifault = 0;
		printf("%g =? %g\n", return_value, betain(x, a, 0.5, lgammal(a)+0.57236494292470009-lgammal(a+0.5), &ifault));
		return return_value;
	}
}
//-------------------
int main(void) {
const double d1[] = {27.5,21.0,19.0,23.6,17.0,17.9,16.9,20.1,21.9,22.6,23.1,19.6,19.0,21.7,21.4};
const double d2[] = {27.1,22.0,20.8,23.4,23.4,23.5,25.8,22.0,24.8,20.2,21.9,22.1,22.9,20.5,24.4};
const double d3[] = {17.2,20.9,22.6,18.1,21.7,21.4,23.5,24.2,14.7,21.8};
const double d4[] = {21.5,22.8,21.0,23.0,21.6,23.6,22.5,20.7,23.4,21.8,20.7,21.7,21.5,22.5,23.6,21.5,22.5,23.5,21.5,21.8};
const double d5[] = {19.8,20.4,19.6,17.8,18.5,18.9,18.3,18.9,19.5,22.0};
const double d6[] = {28.2,26.6,20.1,23.3,25.2,22.1,17.7,27.6,20.6,13.7,23.2,17.5,20.6,18.0,23.9,21.6,24.3,20.4,24.0,13.2};
const double d7[] = {30.02,29.99,30.11,29.97,30.01,29.99};
const double d8[] = {29.89,29.93,29.72,29.98,30.02,29.98};
const double x[] = {3.0,4.0,1.0,2.1};
const double y[] = {490.2,340.0,433.9};
const double v1[] = {0.010268,0.000167,0.000167};
const double v2[] = {0.159258,0.136278,0.122389};
const double s1[] = {1.0/15,10.0/62.0};
const double s2[] = {1.0/10,2/50.0};
const double z1[] = {9/23.0,21/45.0,0/38.0};
const double z2[] = {0/44.0,42/94.0,0/22.0};
printf("Test sets 1 p-value = %g\n",Pvalue(d1,sizeof(d1)/sizeof(*d1),d2,sizeof(d2)/sizeof(*d2)));
printf("Test sets 2 p-value = %g\n",Pvalue(d3,sizeof(d3)/sizeof(*d3),d4,sizeof(d4)/sizeof(*d4)));
printf("Test sets 3 p-value = %g\n",Pvalue(d5,sizeof(d5)/sizeof(*d5),d6,sizeof(d6)/sizeof(*d6)));
printf("Test sets 4 p-value = %g\n",Pvalue(d7,sizeof(d7)/sizeof(*d7),d8,sizeof(d8)/sizeof(*d8)));
printf("Test sets 5 p-value = %g\n",Pvalue(x,sizeof(x)/sizeof(*x),y,sizeof(y)/sizeof(*y)));
printf("Test sets 6 p-value = %g\n",Pvalue(v1,sizeof(v1)/sizeof(*v1),v2,sizeof(v2)/sizeof(*v2)));
printf("Test sets 7 p-value = %g\n",Pvalue(s1,sizeof(s1)/sizeof(*s1),s2,sizeof(s2)/sizeof(*s2)));
printf("Test sets z p-value = %g\n", Pvalue(z1, 3, z2, 3));
const double g41[] = {1.062, 0.774, 0.909};
const double g412[] = {1.459, 0.674, 0.732};
const double g414[]= {1.174, 1.406, 1.536};
printf("41 gr -vs- 41.2 gr p-value = %g\n", Pvalue(g41, 3, g412, 3));
printf("41 gr -vs- 41.4 gr p-value = %g\n", Pvalue(g41, 3, g414, 3));
printf("41.2 gr -vs- 41.4 gr p-value = %g\n", Pvalue(g412, 3, g414, 3));
/*
Test sets 1 p-value = 2.137830e-02
Test sets 2 p-value = 1.488426e-01
Test sets 3 p-value = 3.597278e-02
Test sets 4 p-value = 9.077370e-02
Test sets 5 p-value = 1.075156e-02
Test sets 6 p-value = 3.399076e-03
Test sets 7 p-value = 5.272635e-01
 
with N = 199 and Simpson's 1st method:
Test sets 1 p-value = 2.292482e-02
Test sets 2 p-value = 1.535105e-01
Test sets 3 p-value = 3.857763e-02
Test sets 4 p-value = 9.265142e-02
Test sets 5 p-value = 1.076123e-02
Test sets 6 p-value = 3.414684e-03
Test sets 7 p-value = 5.266487e-01
*/
return 0;
}
</lang>
--[[User:Hailholyghost|Hailholyghost]] ([[User talk:Hailholyghost|talk]]) 11:47, 28 December 2017 (UTC)
:Replace
<lang c>int *restrict z = 0;
printf("%g =? %g\n", return_value, betain(x, a, 0.5, exp(lgammal(a)+0.57236494292470009-lgammal(a+0.5)), z));</lang>
:
:With
<lang c>int z = 0;
printf("%g =? %g\n", return_value, betain(x, a, 0.5, lgammal(a)+0.57236494292470009-lgammal(a+0.5), &z));</lang>
:
:There is another problem: you damaged the original function, by replacing <code>*ifault = 0;</code> with <code>ifault = 0;</code>, replacing ifault with a null pointer. This will crash the program with a null pointer dereference whenever the arguments passed to betain are incorrect (hence ifault is assigned a value). I didn't check if you introduced another problem in the code.
:Your statement that ''existing libraries don't calculate correct numbers'' is wrong, at least for this one.
:[[User:Eoraptor|Eoraptor]] ([[User talk:Eoraptor|talk]]) 19:31, 28 December 2017 (UTC)
 
=== Stata translation of Burkardt's C translation of AS 63 ===
For the record, here is a quick Stata translation of this betain implementation.
 
The original C code can be simplified in several places using shortcuts (it could be simplified even more in C). Also, ifault is not used here: instead, the function returns a missing value if the arguments are invalid. And log(beta(p,q)) is computed inside betain, rather than passed as an argument.
 
<lang stata>mata
/*
This code is distributed under the GNU LGPL license.
Modified 28dec2017
Author:
Original Fortran 77 version by KL Majumder, GP Bhattacharjee.
C version by John Burkardt.
Stata version by Jean-Claude Arbaut.
*/
function betain(x,p,q) {
/*
Incomplete beta function ratio I_x(p,q), via the Soper reduction formula
(AS 63).  Unlike the C original, log(beta(p,q)) is computed here rather
than passed in, and invalid arguments return Stata's missing value (.)
instead of setting an ifault flag.
*/
/* Check the input arguments. */
if (p<=0 | q<=0 | x<0 | x>1) return(.)

/* Special cases: the ratio is exactly x at the endpoints. */
if (x==0 | x==1) {
return(x)
}
acu = 1e-15
lnbeta = lngamma(p)+lngamma(q)-lngamma(p+q)

/* Change tail if necessary and determine S. */
psq = p+q
if (p<psq*x) {
xx = 1-x
cx = x
pp = q
qq = p
indx = 1
}
else {
xx = x
cx = 1-x
pp = p
qq = q
indx = 0
}
term = ai = value = 1
ns = floor(qq+cx*psq)

/* Use the Soper reduction formula. */
rx = xx/cx
temp = qq-ai
if (ns==0) rx = xx
while(1) {
term = term*temp*rx/(pp+ai)
value = value+term
temp = abs(term)
/* Converged: rescale and undo the tail swap if one was made. */
if (temp<=acu & temp<=acu*value) {
value = value*exp(pp*log(xx)+(qq-1)*log(cx)-lnbeta)/pp
return(indx?1-value:value)
}
ai++
if (--ns>=0) {
temp = qq-ai
if (ns==0) rx = xx
}
else temp = psq++
}
}
end</lang>
 
A note on the license: I used the same license as John Burkardt, but I am not certain it's permitted. The [http://lib.stat.cmu.edu/apstat/ StatLib site] states that ''The Royal Statistical Society holds the copyright to these routines, but has given its permission for their distribution provided that no fee is charged.'' However, LGPL allows redistribution in nonfree software.
 
[[User:Eoraptor|Eoraptor]] ([[User talk:Eoraptor|talk]]) 21:31, 28 December 2017 (UTC)
1,336

edits