Statistics/Normal distribution: Difference between revisions

Content added Content deleted

Inline

Revision as of 16:06, 18 April 2012

The Normal (or Gaussian) distribution is a frequently used distribution in statistics. While most programming languages provide a uniformly distributed random number generator, one can derive normally distributed random numbers from a uniform generator.

The task

Take a uniform random number generator and create a large (you decide how large) set of numbers that follow a normal (Gaussian) distribution. Calculate the dataset's mean and stddev, and show the histogram here.
Mention any native language support for the generation of normally distributed random numbers.

Reference

You may refer to code in Statistics/Basic if available.

Go

Box-Muller method shown here. Go has a normally distributed random function in the standard library, as shown in the Go Random numbers solution. It uses the ziggurat method. <lang go>package main

import (

   "fmt"
   "math"
   "math/rand"
   "strings"

)

// Box-Muller func norm2() (s, c float64) {

   r := math.Sqrt(-2 * math.Log(rand.Float64()))
   s, c = math.Sincos(2 * math.Pi * rand.Float64())
   return s * r, c * r

}

func main() {

   const (
       n     = 10000
       bins  = 12
       sig   = 3
       scale = 100
   )
   var sum, sumSq float64
   h := make([]int, bins)
   for i, accum := 0, func(v float64) {
       sum += v
       sumSq += v * v
       b := int((v + sig) * bins / sig / 2)
       if b >= 0 && b < bins {
           h[b]++
       }
   }; i < n/2; i++ {
       v1, v2 := norm2()
       accum(v1)
       accum(v2)
   }
   m := sum / n
   fmt.Println("mean:", m)
   fmt.Println("stddev:", math.Sqrt(sumSq/float64(n)-m*m))
   for _, p := range h {
       fmt.Println(strings.Repeat("*", p/scale))
   }

}</lang> Output:

mean: -0.0034970888831523488
stddev: 1.0040682925006286

*
****
*********
***************
*******************
******************
**************
*********
****
*

Mathematica

<lang Mathematica>(* x is defined with := so every occurrence draws a fresh uniform random number *)
x := RandomReal[1]

(* SampleNormal[n]: build n Box-Muller samples, print their count, mean and
   standard deviation, and return a histogram of them. *)
SampleNormal[n_] := (Print[#//Length, " numbers, Mean : ", #//Mean, ", StandardDeviation : ", #//StandardDeviation];
   Histogram[#, BarOrigin -> Left, Axes -> False])& [Table[(-2*Log[x])^0.5*Cos[2*Pi*x], {n}]]

(* Invocation: SampleNormal[ 10000 ] ->10000 numbers, Mean : -0.0122308, StandardDeviation : 1.00646 *)</lang>

MATLAB / Octave

<lang Matlab> N = 100000;

 x = randn(N,1);
 mean(x)
 std(x) 
 [nn,xx] = hist(x,100);
 bar(xx,nn);</lang>

Liberty BASIC

Uses LB Statistics/Basic <lang lb>call sample 100000

end

'// sample n: generate n normally distributed values, print their count,
'// extremes, mean and standard deviation, then print a text histogram.
sub sample n

    dim dat( n)
    '// Standard normal (mean 0, stddev 1).
    for i =1 to n
        dat( i) =normalDist( 0, 1)
    next i

    '// show mean, standard deviation. Find max, min.
    mx  =-1000
    mn  = 1000
    sum =0
    sSq =0
    for i =1 to n
        d =dat( i)
        mx =max( mx, d)
        mn =min( mn, d)
        sum =sum +d
        sSq =sSq +d^2
    next i
    print n; " data terms used."

    mean =sum / n
    print "Largest term was "; mx; " & smallest was "; mn
    range =mx -mn
    print "Mean ="; mean

    print "Stddev ="; ( sSq /n -mean^2)^0.5

    '// show histogram
    nBins =50
    dim bins( nBins)
    for i =1 to n
        z =int( ( dat( i) -mn) /range *nBins)
        '// the largest value maps to nBins, which is never printed; keep it in the last bin
        if z >nBins -1 then z =nBins -1
        bins( z) =bins( z) +1
    next i
    for b =0 to nBins -1
        for j =1 to int( nBins *bins( b) /n *30)
            print "#";
        next j
        print
    next b
    print

end sub

'// normalDist( m, s): one normally distributed value with mean m and
'// standard deviation s, via the Box Muller method.
function normalDist( m, s)

    u =1 -rnd( 1)   '// guards against log( 0) should rnd( 1) return 0
    v =rnd( 1)
    normalDist =m +s *( -2 *log( u))^0.5 *cos( 2 *3.14159265 *v)

end function</lang>

100000 data terms used.
Largest term was 4.12950792 & smallest was -4.37934139
Mean =-0.26785425e-2
Stddev =1.00097669

#
##
###
#####
########
############
################
########################
##############################
######################################
##############################################
########################################################
###################################################################
##############################################################################
#######################################################################################
################################################################################################
####################################################################################################
########################################################################################################
#####################################################################################################
##############################################################################################
#########################################################################################
##################################################################################
#########################################################################
##############################################################
####################################################
##########################################
#################################
##########################
##################
#############
#########
######
####
##
#
#

PARI/GP

Works with: PARI/GP version 2.4.3 and above

<lang parigp>\\ rnormal(): one standard-normal value via the Box-Muller transform.
rnormal()={
  my(u1=random(1.),u2=random(1.));
  \\ Could easily be extended with a second normal at very little cost.
  sqrt(-2*log(u1))*cos(2*Pi*u2)
};

\\ mean(v): arithmetic mean of the entries of v.
mean(v)={
  sum(i=1,#v,v[i])/#v
};

\\ stdev(v,mu): population standard deviation of v; mu defaults to mean(v).
stdev(v,mu="")={
  if(mu=="",mu=mean(v));
  sqrt(sum(i=1,#v,(v[i]-mu)^2)/#v)
};

\\ histogram(v,bins,low,high): counts of the entries of v in `bins`
\\ equal-width buckets spanning [low,high).
histogram(v,bins=16,low=0,high=1)={
  my(u=vector(bins),width=(high-low)/bins);
  for(i=1,#v,u[(v[i]-low)\width+1]++);
  u
};

\\ show(n): generate n normal values, print mean, stddev and a histogram.
show(n)={
  my(v=vector(n,i,rnormal()),m=mean(v),s=stdev(v,m),h,sz=ceil(n/300));
  print("mean: ",m);
  print("stddev: ",s);
  h=histogram(v,,vecmin(v)-.1,vecmax(v)+.1);
  for(i=1,#h,for(j=1,h[i]\sz,print1("#"));print());
};

show(10^4)</lang>

For versions before 2.4.3, define <lang parigp>rreal()={

 my(pr=32*ceil(default(realprecision)*log(10)/log(4294967296))); \\ Current precision
 random(2^pr)*1.>>pr

};</lang> and use rreal() in place of random(1.).

Perl 6

<lang perl6>constant τ = 2 * pi;

# Produces an unbounded lazy sequence of values drawn with the Box-Muller
# transform, each scaled by $σ and shifted by $m. Callers slice off as many
# values as they need.
sub normdist ($m, $σ) {

   gather loop {
       # radius from one uniform random number
       my $r = sqrt -2 * log rand;
       # angle from a second uniform random number
       my $Θ = τ * rand;
       # each (radius, angle) pair yields two values: the cos and sin projections
       take $r * $_($Θ) * $σ + $m for &cos, &sin;
   }

}

# Generates $size values from normdist($mean, $stddev), prints the sample
# mean and standard deviation, then prints a histogram with one row per
# rounded integer value.
sub MAIN ($size = 100000, $mean = 50, $stddev = 4) {

   my @dataset = normdist($mean,$stddev)[^$size];

   my $m = [+](@dataset) / $size;
   say (:$m);

   # Use the sample mean $m here, not the requested $mean; otherwise the
   # result is inflated whenever the two differ.
   my $σ = sqrt( [+](@dataset X** 2) / $size - $m**2 );
   say (:$σ);

   (my %hash){.round}++ for @dataset;
   my $scale = 180 * $stddev / $size;
   # partial-block glyphs used for the fractional part of each bar
   constant @subbar = < ⎸ ▏ ▎ ▍ ▌ ▋ ▊ ▉ █ >;
   for %hash.keys».Int.minmax(+*) -> $i {
       my $x = (%hash{$i} // 0) * $scale;
       my $full = floor $x;
       my $part = 8 * ($x - $full);
       say $i, "\t", '█' x $full, @subbar[$part];
   }

}</lang>

Output:

"m" => 50.006107405837142e0
"σ" => 4.0814435639885254e0
33	⎸
34	⎸
35	⎸
36	▏
37	▎
38	▊
39	█▋
40	███⎸
41	█████▊
42	██████████⎸
43	███████████████▋
44	███████████████████████▏
45	████████████████████████████████▌
46	███████████████████████████████████████████▍
47	██████████████████████████████████████████████████████▏
48	███████████████████████████████████████████████████████████████▏
49	█████████████████████████████████████████████████████████████████████▋
50	███████████████████████████████████████████████████████████████████████▊
51	█████████████████████████████████████████████████████████████████████▌
52	███████████████████████████████████████████████████████████████⎸
53	██████████████████████████████████████████████████████▎
54	███████████████████████████████████████████⎸
55	████████████████████████████████▌
56	███████████████████████▍
57	███████████████▉
58	█████████▉
59	█████▍
60	███▍
61	█▋
62	▊
63	▍
64	▏
65	⎸
66	⎸
67	⎸

PureBasic

<lang purebasic>Procedure.f randomf(resolution = 2147483647)

 ProcedureReturn Random(resolution) / resolution

EndProcedure

; Returns one value computed from two randomf() draws with the
; Box Muller method.
; NOTE(review): if randomf() can return 0, Log(0) is not finite here - confirm
; the range of Random(resolution).
Procedure.f normalDist() ;Box Muller method

   ProcedureReturn Sqr(-2 * Log(randomf())) * Cos(2 * #PI * randomf())

EndProcedure

; Generates n values with normalDist(), prints how many were used, the
; largest and smallest value, the mean and the standard deviation, and then
; prints a text histogram with nBins + 1 rows scaled to a maximum width of
; #normalizedMaxValue characters.
Procedure sample(n, nBins = 50)

  Protected i, maxBinValue, binNumber, tickMarks
  Protected.f d, mean, sum, sumSq, mx, mn, range

  Dim dat.f(n)
  For i = 1 To n
    dat(i) = normalDist()
  Next

  ;show mean, standard deviation, find max & min.
  mx  = -1000
  mn  =  1000
  sum = 0
  sumSq = 0
  For i = 1 To n
    d = dat(i)
    If d > mx: mx = d: EndIf
    If d < mn: mn = d: EndIf
    sum + d
    sumSq + d * d
  Next

  PrintN(Str(n) + " data terms used.")
  PrintN("Largest term was " + StrF(mx) + " & smallest was " + StrF(mn))
  mean = sum / n
  PrintN("Mean = " + StrF(mean))
  ;standard deviation = square root of (mean of squares - square of mean)
  PrintN("Stddev = " + StrF(Sqr((sumSq / n) - mean * mean)))

  ;show histogram
  range = mx - mn
  Dim bins(nBins)
  For i = 1 To n
    binNumber = Int(nBins * (dat(i) - mn) / range)
    bins(binNumber) + 1
  Next

  ;find the fullest bin so the bars can be normalized against it
  maxBinValue = 1
  For i = 0 To nBins
    If bins(i) > maxBinValue
      maxBinValue = bins(i)
    EndIf
  Next

  #normalizedMaxValue = 70
  For binNumber = 0 To nBins
    tickMarks = Round(bins(binNumber) * #normalizedMaxValue / maxBinValue, #PB_Round_Nearest)
    PrintN(ReplaceString(Space(tickMarks), " ", "#"))
  Next
  PrintN("")

EndProcedure

If OpenConsole()

 sample(100000)
 
 Print(#CRLF$ + #CRLF$ + "Press ENTER to exit"): Input()
 CloseConsole()

EndIf</lang> Sample output:

100000 data terms used.
Largest term was 4.5352029800 & smallest was -4.5405135155
Mean = 0.0012346541
Stddev = 0.9959455132





#
###
######
##########
##################
############################
#########################################
#####################################################
################################################################
######################################################################
######################################################################
################################################################
#####################################################
#########################################
#############################
##################
##########
######
###
#

SAS

<lang sas>/* Generate n Box-Muller pairs (x, y) from uniform draws, plus z from the
   built-in normal generator rannor, then summarize and plot all three. */
data test;
n=100000;
twopi=2*constant('pi');
do i=1 to n;
   u=ranuni(0);
   v=ranuni(0);
   r=sqrt(-2*log(u));
   x=r*cos(twopi*v);
   y=r*sin(twopi*v);
   z=rannor(0);
   output;
end;
keep x y z;

proc means mean stddev;

proc univariate;
histogram /normal;

run;

/*
Variable            Mean         Std Dev

x             -0.0052720       0.9988467
y            0.000023995       1.0019996
z              0.0012857       1.0056536
*/</lang>

Tcl

<lang tcl>package require Tcl 8.5

# Uses the Box-Muller transform to compute a pair of normal random numbers.
# The second number of each pair is cached in savednormalrandom and returned
# (scaled by stddev and shifted by mean) on the next call.
proc tcl::mathfunc::nrand {mean stddev} {
    variable savednormalrandom
    if {[info exists savednormalrandom]} {
        # [unset] returns the empty string, so this yields the value and
        # clears the cache in one step.
        return [expr {$savednormalrandom*$stddev + $mean}][unset savednormalrandom]
    }
    set r [expr {sqrt(-2*log(rand()))}]
    set theta [expr {2*3.1415927*rand()}]
    set savednormalrandom [expr {$r*sin($theta)}]
    expr {$r*cos($theta)*$stddev + $mean}
}

# Draws $size values from nrand(0.5, 0.2), prints their count, mean and
# standard deviation, and prints a histogram with $slotfactor slots per unit.
proc stats {size {slotfactor 10}} {
    set sum 0.0
    set sum2 0.0
    for {set i 0} {$i < $size} {incr i} {
        set r [expr { nrand(0.5, 0.2) }]
        incr histo([expr {int(floor($r*$slotfactor))}])
        set sum [expr {$sum + $r}]
        set sum2 [expr {$sum2 + $r**2}]
    }
    set mean [expr {$sum / $size}]
    set stddev [expr {sqrt($sum2/$size - $mean**2)}]
    puts "$size numbers"
    puts "Mean:   $mean"
    puts "StdDev: $stddev"
    foreach i [lsort -integer [array names histo]] {
        puts [string repeat "*" [expr {$histo($i)*350/int($size)}]]
    }
}

stats 100
puts ""
stats 1000
puts ""
stats 10000
puts ""
stats 100000 20</lang> Sample output:

100 numbers
Mean:   0.49355955990390254
StdDev: 0.19651396178121985
***
*******
**************
***********************************
********************************************************
******************************************************************
*************************************************************************
******************************************
**************************************
**************

1000 numbers
Mean:   0.5066940614105869
StdDev: 0.2016794788065389


*
*****
**************
****************************
**********************************************************
****************************************************************
*************************************************************
******************************************************
***********************************
************
*********
*

10000 numbers
Mean:   0.49980964730768285
StdDev: 0.1968441612522318

*
*****
***************
*******************************
*****************************************************
******************************************************************
*******************************************************************
****************************************************
*********************************
***************
*****
*



100000 numbers
Mean:   0.49960438950922254
StdDev: 0.20060211160998606





*
**
***
******
*********
**************
******************
***********************
*****************************
********************************
**********************************
**********************************
********************************
****************************
***********************
******************
*************
*********
******
***
**
*

The blank lines in the output are where the number of samples is too small to even merit a single unit on the histogram.