Statistics/Basic: Difference between revisions
m (→{{header|Go}}: a little more on the extra credit) |
(CoffeeScript) |
||
Line 119: | Line 119: | ||
} |
} |
||
}</lang> |
}</lang> |
||
=={{header|CoffeeScript}}== |
|||
<lang coffeescript> |
|||
generate_statistics = (n) -> |
|||
hist = {} |
|||
update_hist = (r) -> |
|||
hist[Math.floor 10*r] ||= 0 |
|||
hist[Math.floor 10*r] += 1 |
|||
sum = 0 |
|||
sum_squares = 0.0 |
|||
for i in [1..n] |
|||
r = Math.random() |
|||
sum += r |
|||
sum_squares += r*r |
|||
update_hist r |
|||
mean = sum / n |
|||
stddev = Math.sqrt((sum_squares / n) - mean*mean) |
|||
[n, mean, stddev, hist] |
|||
display_statistics = (n, mean, stddev, hist) -> |
|||
console.log "-- Stats for sample size #{n}" |
|||
console.log "mean: #{mean}" |
|||
console.log "sdev: #{stddev}" |
|||
for x, cnt of hist |
|||
bars = repeat "=", Math.floor(cnt*300/n) |
|||
console.log "#{x/10}: #{bars} #{cnt}" |
|||
repeat = (c, n) -> |
|||
s = '' |
|||
s += c for i in [1..n] |
|||
s |
|||
for n in [100, 1000, 10000, 1000000] |
|||
[n, mean, stddev, hist] = generate_statistics n |
|||
display_statistics n, mean, stddev, hist |
|||
</lang> |
|||
output |
|||
<lang> |
|||
> coffee stats.coffee |
|||
-- Stats for sample size 100 |
|||
mean: 0.5058459933893755 |
|||
sdev: 0.2752669422150894 |
|||
0: ================== 6 |
|||
0.1: ============================================= 15 |
|||
0.2: =========================== 9 |
|||
0.3: ===================== 7 |
|||
0.4: ============================================= 15 |
|||
0.5: ======================== 8 |
|||
0.6: ================================= 11 |
|||
0.7: ========================================== 14 |
|||
0.8: ===================== 7 |
|||
0.9: ======================== 8 |
|||
-- Stats for sample size 1000 |
|||
mean: 0.49664502244861797 |
|||
sdev: 0.2942483939245344 |
|||
0: ========================== 89 |
|||
0.1: ===================================== 126 |
|||
0.2: =========================== 93 |
|||
0.3: ==================================== 121 |
|||
0.4: =========================== 93 |
|||
0.5: ====================== 75 |
|||
0.6: ================================ 108 |
|||
0.7: ======================== 82 |
|||
0.8: ============================== 101 |
|||
0.9: ================================= 112 |
|||
-- Stats for sample size 10000 |
|||
mean: 0.4985696110446239 |
|||
sdev: 0.29007446138438986 |
|||
0: ============================== 1005 |
|||
0.1: ============================== 1016 |
|||
0.2: ============================== 1022 |
|||
0.3: ============================== 1012 |
|||
0.4: ============================ 958 |
|||
0.5: =============================== 1035 |
|||
0.6: ============================= 974 |
|||
0.7: ============================= 968 |
|||
0.8: ============================= 973 |
|||
0.9: =============================== 1037 |
|||
-- Stats for sample size 1000000 |
|||
mean: 0.5001718024678293 |
|||
sdev: 0.2887130780006248 |
|||
0: ============================== 100113 |
|||
0.1: ============================= 99830 |
|||
0.2: ============================== 100029 |
|||
0.3: ============================= 99732 |
|||
0.4: ============================= 99911 |
|||
0.5: ============================= 99722 |
|||
0.6: ============================== 100780 |
|||
0.7: ============================= 99812 |
|||
0.8: ============================= 99875 |
|||
0.9: ============================== 100196 |
|||
</lang> |
|||
=={{header|Go}}== |
=={{header|Go}}== |
Revision as of 18:50, 9 January 2012
Statistics is all about large groups of numbers. When talking about a set of sampled data, most frequently used is their mean value and standard deviation (stddev). If you have set of data where , the mean is , while the stddev is .
When examining a large quantity of data, one often uses a histogram, which shows the counts of data samples falling into a prechosen set of intervals (or bins). When plotted, often as bar graphs, it visually indicates how often each data value occurs.
Task Using your language's random number routine, generate real numbers in the range of [0, 1]. It doesn't matter if you chose to use open or closed range. Create 100 of such numbers (i.e. sample size 100) and calculate their mean and stddev. Do so for sample size of 1,000 and 10,000, maybe even higher if you feel like. Show a histogram of any of these sets. Do you notice some patterns about the standard deviation?
Extra Sometimes so much data need to be processed that it's impossible to keep all of them at once. Can you calculate the mean, stddev and histogram of a trillion numbers? (You don't really need to do a trillion numbers, just show how it can be done.)
- Hint
For a finite population with equal probabilities at all points, one can derive:
C
Sample code. <lang C>#include <stdio.h>
- include <stdlib.h>
- include <math.h>
- include <stdint.h>
- define n_bins 10
double rand01() { return rand() / (RAND_MAX + 1.0); }
double avg(int count, double *stddev, int *hist) { double x[count]; double m = 0, s = 0;
for (int i = 0; i < n_bins; i++) hist[i] = 0; for (int i = 0; i < count; i++) { m += (x[i] = rand01()); hist[(int)(x[i] * n_bins)] ++; }
m /= count; for (int i = 0; i < count; i++) s += x[i] * x[i]; *stddev = sqrt(s / count - m * m); return m; }
void hist_plot(int *hist) { int max = 0, step = 1; double inc = 1.0 / n_bins;
for (int i = 0; i < n_bins; i++) if (hist[i] > max) max = hist[i];
/* scale if numbers are too big */ if (max >= 60) step = (max + 59) / 60;
for (int i = 0; i < n_bins; i++) { printf("[%5.2g,%5.2g]%5d ", i * inc, (i + 1) * inc, hist[i]); for (int j = 0; j < hist[i]; j += step) printf("#"); printf("\n"); } }
/* record for moving average and stddev. Values kept are sums and sum data^2
* to avoid excessive precision loss due to divisions, but some loss is inevitable */
typedef struct { uint64_t size; double sum, x2; uint64_t hist[n_bins]; } moving_rec;
void moving_avg(moving_rec *rec, double *data, int count) { double sum = 0, x2 = 0; /* not adding data directly to the sum in case both recorded sum and * count of this batch are large; slightly less likely to lose precision*/ for (int i = 0; i < count; i++) { sum += data[i]; x2 += data[i] * data[i]; rec->hist[(int)(data[i] * n_bins)]++; }
rec->sum += sum; rec->x2 += x2; rec->size += count; }
int main() { double m, stddev; int hist[n_bins], samples = 10;
while (samples <= 10000) { m = avg(samples, &stddev, hist); printf("size %5d: %g %g\n", samples, m, stddev); samples *= 10; }
printf("\nHistograph:\n"); hist_plot(hist);
printf("\nMoving average:\n N Mean Sigma\n"); moving_rec rec = { 0, 0, 0, {0} }; double data[100]; for (int i = 0; i < 10000; i++) { for (int j = 0; j < 100; j++) data[j] = rand01();
moving_avg(&rec, data, 100);
if ((i % 1000) == 999) { printf("%4lluk %f %f\n", rec.size/1000, rec.sum / rec.size, sqrt(rec.x2 * rec.size - rec.sum * rec.sum)/rec.size ); } } }</lang>
CoffeeScript
<lang coffeescript> generate_statistics = (n) ->
hist = {}
update_hist = (r) -> hist[Math.floor 10*r] ||= 0 hist[Math.floor 10*r] += 1
sum = 0 sum_squares = 0.0
for i in [1..n] r = Math.random() sum += r sum_squares += r*r update_hist r mean = sum / n stddev = Math.sqrt((sum_squares / n) - mean*mean)
[n, mean, stddev, hist]
display_statistics = (n, mean, stddev, hist) ->
console.log "-- Stats for sample size #{n}" console.log "mean: #{mean}" console.log "sdev: #{stddev}" for x, cnt of hist bars = repeat "=", Math.floor(cnt*300/n) console.log "#{x/10}: #{bars} #{cnt}"
repeat = (c, n) ->
s = s += c for i in [1..n] s
for n in [100, 1000, 10000, 1000000]
[n, mean, stddev, hist] = generate_statistics n display_statistics n, mean, stddev, hist
</lang>
output <lang> > coffee stats.coffee -- Stats for sample size 100 mean: 0.5058459933893755 sdev: 0.2752669422150894 0: ================== 6 0.1: ============================================= 15 0.2: =========================== 9 0.3: ===================== 7 0.4: ============================================= 15 0.5: ======================== 8 0.6: ================================= 11 0.7: ========================================== 14 0.8: ===================== 7 0.9: ======================== 8 -- Stats for sample size 1000 mean: 0.49664502244861797 sdev: 0.2942483939245344 0: ========================== 89 0.1: ===================================== 126 0.2: =========================== 93 0.3: ==================================== 121 0.4: =========================== 93 0.5: ====================== 75 0.6: ================================ 108 0.7: ======================== 82 0.8: ============================== 101 0.9: ================================= 112 -- Stats for sample size 10000 mean: 0.4985696110446239 sdev: 0.29007446138438986 0: ============================== 1005 0.1: ============================== 1016 0.2: ============================== 1022 0.3: ============================== 1012 0.4: ============================ 958 0.5: =============================== 1035 0.6: ============================= 974 0.7: ============================= 968 0.8: ============================= 973 0.9: =============================== 1037 -- Stats for sample size 1000000 mean: 0.5001718024678293 sdev: 0.2887130780006248 0: ============================== 100113 0.1: ============================= 99830 0.2: ============================== 100029 0.3: ============================= 99732 0.4: ============================= 99911 0.5: ============================= 99722 0.6: ============================== 100780 0.7: ============================= 99812 0.8: ============================= 99875 0.9: ============================== 100196 </lang>
Go
<lang go>package main
import (
"fmt" "math" "math/rand" "strings"
)
func main() {
sample(100) sample(1000) sample(10000)
}
func sample(n int) {
// generate data d := make([]float64, n) for i := range d { d[i] = rand.Float64() } // show mean, standard deviation var sum, ssq float64 for _, s := range d { sum += s ssq += s * s } fmt.Println(n, "numbers") m := sum / float64(n) fmt.Println("Mean: ", m) fmt.Println("Stddev:", math.Sqrt(ssq/float64(n)-m*m)) // show histogram h := make([]int, 10) for _, s := range d { h[int(s*10)]++ } for _, c := range h { fmt.Println(strings.Repeat("*", c*205/int(n))) } fmt.Println()
}</lang> Output:
100 numbers Mean: 0.5231064889267764 Stddev: 0.292668237816841 **************** **************** ************************ ********************** ****************** ****************** **************** ************************** ************************ ******************** 1000 numbers Mean: 0.496026080160094 Stddev: 0.2880988956436907 ********************* ******************** ***************** *********************** ****************** ********************** ******************** ********************* ****************** ******************* 10000 numbers Mean: 0.5009091903581223 Stddev: 0.289269693719711 ******************* ******************** ******************** ******************** ********************* ******************** ******************* ******************* ******************** *********************
The usual approach to the extra problem is sampling. That is, to not do it.
To show really show how computations could be done a trillion numbers however, here is an outline of a map reduce strategy. The main task indicated that numbers should be generated before doing any computations on them. Consistent with that, The function getSegment returns data based on a starting and ending index, as if it were accessing some large data store.
The following runs comfortably on a simulated data size of 10 million. To scale to a trillion, and to use real data, you would want to use a technique like Distributed_programming#Go to distribute work across multiple computers, and on each computer, use a technique like Parallel_calculations#Go to distribute work across multiple cores within each computer. You would tune parameters like the constant threshold in the code below to optimize cache performance. <lang go>package main
import (
"fmt" "math" "math/rand" "strings"
)
func main() {
bigSample(1e7)
}
func bigSample(n int64) {
sum, ssq, h := reduce(0, n) // compute final statistics and output as above fmt.Println(n, "numbers") m := sum / float64(n) fmt.Println("Mean: ", m) fmt.Println("Stddev:", math.Sqrt(ssq/float64(n)-m*m)) for _, c := range h { fmt.Println(strings.Repeat("*", c*205/int(n))) } fmt.Println()
}
const threshold = 1e6
func reduce(start, end int64) (sum, ssq float64, h []int) {
n := end - start if n < threshold { d := getSegment(start, end) return computeSegment(d) } // map to two sub problems half := (start + end) / 2 sum1, ssq1, h1 := reduce(start, half) sum2, ssq2, h2 := reduce(half, end) // combine results for i, c := range h2 { h1[i] += c } return sum1 + sum2, ssq1 + ssq2, h1
}
func getSegment(start, end int64) []float64 {
d := make([]float64, end-start) for i := range d { d[i] = rand.Float64() } return d
}
func computeSegment(d []float64) (sum, ssq float64, h []int) {
for _, s := range d { sum += s ssq += s * s } h = make([]int, 10) for _, s := range d { h[int(s*10)]++ } return
}</lang> Output:
10000000 numbers Mean: 0.4999673191148989 Stddev: 0.2886663876567514 ******************** ******************** ******************** ******************** ******************** ******************** ******************** ******************** ******************** ********************
Icon and Unicon
The following uses the stddev procedure from the Standard_deviation task. In this example,
<lang Icon>procedure main(A)
W := 50 # avg width for histogram bar B := 10 # histogram bins if *A = 0 then put(A,100) # 100 if none specified
while N := get(A) do { # once per argument
write("\nN=",N)
N := 0 < integer(N) | next # skip if invalid stddev() # reset m := 0. H := list(B,0) # Histogram of every i := 1 to N do { # calc running ... s := stddev(r := ?0) # ... std dev m +:= r/N # ... mean H[integer(*H*r)+1] +:= 1 # ... histogram }
write("mean=",m) write("stddev=",s) every i := 1 to *H do # show histogram write(right(real(i)/*H,5)," : ",repl("*",integer(*H*50./N*H[i]))) }
end</lang>
Output:
N=100 mean=0.4941076275054806 stddev=0.2812938788216594 0.1 : **************************************** 0.2 : ******************************************************* 0.3 : ******************************************************* 0.4 : ********************************************************************** 0.5 : **************************************** 0.6 : ********************************************* 0.7 : **************************************** 0.8 : ***************************************************************** 0.9 : **************************************** 1.0 : ************************************************** N=10000 mean=0.4935428224375008 stddev=0.2884171825227816 0.1 : *************************************************** 0.2 : *************************************************** 0.3 : *************************************************** 0.4 : ************************************************** 0.5 : **************************************************** 0.6 : ************************************************* 0.7 : *********************************************** 0.8 : ************************************************ 0.9 : ************************************************** 1.0 : *********************************************** N=1000000 mean=0.4997503773607869 stddev=0.2886322440610256 0.1 : ************************************************* 0.2 : ************************************************** 0.3 : ************************************************** 0.4 : ************************************************** 0.5 : ************************************************* 0.6 : ************************************************** 0.7 : ************************************************* 0.8 : ************************************************* 0.9 : ************************************************** 1.0 : *************************************************
J
J has library routines to compute mean and standard deviation: <lang j> require'statfns'
(mean,stddev) ?1000#0
0.484669 0.287482
(mean,stddev) ?10000#0
0.503642 0.290777
(mean,stddev) ?100000#0
0.499677 0.288726</lang>
And, for a histogram:
<lang j>histogram=: <: @ (#/.~) @ (i.@#@[ , I.) require'plot' plot ((%*1+i.)100) ([;histogram) ?10000#0</lang>
but these are not quite what is being asked for here.
Instead:
<lang j>histogram=: <: @ (#/.~) @ (i.@#@[ , I.)
meanstddevP=:3 :0
NB. compute mean and std dev of y random numbers NB. picked from even distribution between 0 and 1 NB. and display a normalized ascii histogram for this sample NB. note: should use population mean, not sample mean, for stddev NB. given the equation specified for this task. h=.s=.t=. 0 buckets=. (%~1+i.)10 for_n.i.<.y%1e6 do. data=. ?1e6#0 h=.h+ buckets histogram data s=.s+ +/ data t=.t+ +/(data-0.5)^2 end. data=. ?(1e6|y)#0 h=.h+ buckets histogram data s=.s+ +/ data t=.t++/(data-0.5)^2 smoutput (<.300*h%y)#"0'#' (s%y),%:t%y
)</lang>
Example use:
<lang j> meanstddevP 1000
0.488441 0.289744
meanstddevP 10000
0.49697 0.289433
meanstddevP 100000
0.500872 0.288241</lang>
(That said, note that these numbers are random, so reported standard deviation will vary with the random sample being tested.)
This could handle a trillion random numbers on a bog-standard computer, but I am not inclined to wait that long.
Liberty BASIC
Be aware that the PRNG in LB has a SLIGHT bias. <lang lb> call sample 100 call sample 1000 call sample 10000
end
sub sample n
dim dat( n) for i =1 to n dat( i) =rnd( 1) next i
'// show mean, standard deviation sum =0 sSq =0 for i =1 to n sum =sum +dat( i) sSq =sSq +dat( i)^2 next i print n; " data terms used."
mean =sum / n print "Mean ="; mean
print "Stddev ="; ( sSq /n -mean^2)^0.5
'// show histogram nBins =10 dim bins( nBins) for i =1 to n z =int( nBins *dat( i)) bins( z) =bins( z) +1 next i for b =0 to nBins -1 for j =1 to int( nBins *bins( b)) /n *70) print "#"; next j print next b print
end sub </lang>
100000 data terms used. Mean =0.49870232 Stddev =0.28926563 ###################################################################### ###################################################################### ###################################################################### ###################################################################### ##################################################################### ##################################################################### ##################################################################### ##################################################################### ###################################################################### #####################################################################
Mathematica
<lang Mathematica>Sample[n_]:= (Print[#//Length," numbers, Mean : ",#//Mean,", StandardDeviation : ",#//StandardDeviation ];
BarChart[BinCounts[#,{0,1,.1}], Axes->False, BarOrigin->Left])&[(RandomReal[1,#])&[ n ]]
Sample/@{100,1 000,10 000,1 000 000}
100 numbers, Mean : 0.478899, StandardDeviation : 0.322265
1000 numbers, Mean : 0.503383, StandardDeviation : 0.278352
10000 numbers, Mean : 0.498278, StandardDeviation : 0.28925
1000000 numbers, Mean : 0.500248, StandardDeviation : 0.288713</lang>
PARI/GP
<lang parigp>mean(v)={
sum(i=1,#v,v[i])/#v
}; stdev(v,mu="")={
if(mu=="",mu=mean(v)); sqrt(sum(i=1,#v,(v[i]-mu)^2))/#v
}; histogram(v,bins=16,low=0,high=1)={
my(u=vector(bins),width=(high-low)/bins); for(i=1,#v,u[(v[i]-low)\width+1]++); u
}; show(n)={
my(v=vector(n,i,random(1.)),mu=mean(v),s=stdev(v,mu),h=histogram(v),sz=ceil(n/50/16)); for(i=1,16,for(j=1,h[i]\sz,print1("#"));print()); print("Mean: "mu); print("Stdev: "s);
}; show(100); show(1000); show(10000);</lang>
Perl
<lang perl>my @histogram = (); my $hist_rows = 10; my $sum = 0; my $sum_squares = 0; my $n = $ARGV[ 0 ];
for( my $counter = 0; $counter < $n; $counter ++ ){
my $current = rand( 1 ); $sum += $current; $sum_squares += $current ** 2; $histogram[ $current * $hist_rows ] += 1;
}
my $mean = $sum / $n;
print "$n numbers\n" .
"Mean: $mean\n" . "Stddev: " . sqrt(($sum_squares / $n) - ($mean ** 2)) . "\n";
for( my $row_counter = 0; $row_counter < $hist_rows; $row_counter ++ ){
printf( "%.1f - %.1f : ", ($row_counter/$hist_rows), ((1/$hist_rows) + ($row_counter/$hist_rows)));
for( my $counter = 0; $counter < ($histogram[ $row_counter ]*(30/($n/10))); $counter ++ ){ # the extra math on the condition is to normalize the rows of the # histogram to be 30 chars long print "*"; } print "\n";
}</lang>
Usage:
perl rand_statistics.pl (number of values)
$ perl rand_statistics.pl 100 100 numbers Mean: 0.531591369804339 Stddev: 0.28440375340793 0.0 - 0.1 : *************************** 0.1 - 0.2 : ************************ 0.2 - 0.3 : *************************** 0.3 - 0.4 : ************************ 0.4 - 0.5 : ********************************* 0.5 - 0.6 : ************************************ 0.6 - 0.7 : ************************************ 0.7 - 0.8 : ****************** 0.8 - 0.9 : *************************************** 0.9 - 1.0 : ************************************ $ perl rand_statistics.pl 1000 1000 numbers Mean: 0.51011452684812 Stddev: 0.29490201218115 0.0 - 0.1 : ****************************** 0.1 - 0.2 : ******************************* 0.2 - 0.3 : *************************** 0.3 - 0.4 : ***************************** 0.4 - 0.5 : ********************************** 0.5 - 0.6 : **************************** 0.6 - 0.7 : ************************ 0.7 - 0.8 : ************************************* 0.8 - 0.9 : ******************************** 0.9 - 1.0 : ********************************* $ perl rand_statistics.pl 10000 10000 numbers Mean: 0.495329167703333 Stddev: 0.285944419431566 0.0 - 0.1 : ***************************** 0.1 - 0.2 : ******************************* 0.2 - 0.3 : ********************************* 0.3 - 0.4 : ******************************* 0.4 - 0.5 : ****************************** 0.5 - 0.6 : ******************************* 0.6 - 0.7 : ****************************** 0.7 - 0.8 : ****************************** 0.8 - 0.9 : ***************************** 0.9 - 1.0 : ****************************** $ perl rand_statistics.pl 10000000 10000000 numbers Mean: 0.499973935749229 Stddev: 0.2887231680817 0.0 - 0.1 : ****************************** 0.1 - 0.2 : ******************************* 0.2 - 0.3 : ****************************** 0.3 - 0.4 : ******************************* 0.4 - 0.5 : ****************************** 0.5 - 0.6 : ******************************* 0.6 - 0.7 : ****************************** 0.7 - 0.8 : ****************************** 0.8 - 0.9 : ******************************* 0.9 - 1.0 : *******************************
PicoLisp
The following has no limit on the number of samples. The 'statistics' function accepts an executable body 'Prg', which it calls repeatedly to get the samples. <lang PicoLisp>(scl 6)
(de statistics (Cnt . Prg)
(prinl Cnt " numbers") (let (Sum 0 Sqr 0 Hist (need 10 NIL 0)) (do Cnt (let N (run Prg 1) # Get next number (inc 'Sum N) (inc 'Sqr (*/ N N 1.0)) (inc (nth Hist (inc (/ N 0.1)))) ) ) (let M (*/ Sum Cnt) (prinl "Mean: " (round M)) (prinl "StdDev: " (round (sqrt (* 1.0 (- (*/ Sqr Cnt) (*/ M M 1.0)) ) ) ) ) ) (for (I . H) Hist (prin (format I 1) " ") (do (*/ H 400 Cnt) (prin '=)) (prinl) ) ) )</lang>
Test: <lang PicoLisp>(statistics 100
(rand 0 (dec 1.0)) )
(prinl)
(statistics 10000
(rand 0 (dec 1.0)) )
(prinl)
(statistics 1000000
(rand 0 (dec 1.0)) )
(prinl)</lang> Output:
100 numbers Mean: 0.501 StdDev: 0.284 0.1 ======================================== 0.2 ==================================== 0.3 ==================================================== 0.4 ======================== 0.5 ======================== 0.6 ================================================================ 0.7 ======================================================== 0.8 ==================================== 0.9 ======================== 1.0 ============================================ 10000 numbers Mean: 0.501 StdDev: 0.288 0.1 ======================================= 0.2 ======================================== 0.3 ======================================= 0.4 ========================================= 0.5 ========================================= 0.6 ======================================== 0.7 ========================================= 0.8 ======================================== 0.9 ======================================== 1.0 ======================================== 1000000 numbers Mean: 0.500 StdDev: 0.289 0.1 ======================================== 0.2 ======================================== 0.3 ======================================== 0.4 ======================================== 0.5 ======================================== 0.6 ======================================== 0.7 ======================================== 0.8 ======================================== 0.9 ======================================== 1.0 ========================================
PureBasic
Changes were made from the Liberty BASIC version to normalize the histogram as well as implement a random float function. <lang purebasic>Procedure.f randomf()
#RNG_max_resolution = 2147483647 ProcedureReturn Random(#RNG_max_resolution) / #RNG_max_resolution
EndProcedure
Procedure sample(n)
Protected i, nBins, binNumber, tickMarks, maxBinValue Protected.f sum, sumSq, mean Dim dat.f(n) For i = 1 To n dat(i) = randomf() Next ;show mean, standard deviation For i = 1 To n sum + dat(i) sumSq + dat(i) * dat(i) Next i PrintN(Str(n) + " data terms used.") mean = sum / n PrintN("Mean =" + StrF(mean)) PrintN("Stddev =" + StrF((sumSq / n) - Sqr(mean * mean))) ;show histogram nBins = 10 Dim bins(nBins) For i = 1 To n binNumber = Int(nBins * dat(i)) bins(binNumber) + 1 Next maxBinValue = 1 For i = 0 To nBins If bins(i) > maxBinValue maxBinValue = bins(i) EndIf Next #normalizedMaxValue = 70 For binNumber = 0 To nBins tickMarks = Int(bins(binNumber) * #normalizedMaxValue / maxBinValue) PrintN(ReplaceString(Space(tickMarks), " ", "#")) Next PrintN("")
EndProcedure
If OpenConsole()
sample(100) sample(1000) sample(10000) Print(#CRLF$ + #CRLF$ + "Press ENTER to exit"): Input() CloseConsole()
EndIf</lang> Sample output:
100 data terms used. Mean =0.4349198639 Stddev =-0.1744846404 ######################################################### ######################################### ################################ ################################################################# ################################ ##################################################### ###################################################################### ################ ######################## ################ 1000 data terms used. Mean =0.4960154891 Stddev =-0.1691310555 ############################################################### ####################################################### ############################################################# ###################################################################### ########################################################## ############################################################## #################################################################### ############################################################### ############################################################# ##################################################### 10000 data terms used. Mean =0.5042046309 Stddev =-0.1668083966 ################################################################## ################################################################ ################################################################## #################################################################### ################################################################ ###################################################################### #################################################################### ################################################################### #################################################################### ####################################################################
Python
The second function, sd2 only needs to go once through the numbers and so can more efficiently handle large streams of numbers. <lang python>def sd1(numbers):
if numbers: mean = sum(numbers) / len(numbers) sd = (sum((n - mean)**2 for n in numbers) / len(numbers))**0.5 return sd, mean else: return 0, 0
def sd2(numbers):
if numbers: sx = sxx = n = 0 for x in numbers: sx += x sxx += x*x n += 1 sd = (n * sxx - sx*sx)**0.5 / n return sd, sx / n else: return 0, 0
def histogram(numbers):
h = [0] * 10 maxwidth = 50 # characters for n in numbers: h[int(n*10)] += 1 mx = max(h) print() for n, i in enumerate(h): print('%3.1f: %s' % (n / 10, '+' * int(i / mx * maxwidth))) print()
if __name__ == '__main__':
import random for i in range(1,8): n = [random.random() for i in range(10**i)] print("\n##\n## %i numbers\n##" % 10**i) print(' Naive method: sd: %8.6f, mean: %8.6f' % sd1(n)) print(' Second method: sd: %8.6f, mean: %8.6f' % sd2(n)) histogram(n)</lang>
- Section of output
for larger sets of random numbers, the distribution of numbers between the bins of the histogram evens out.
... ## ## 100 numbers ## Naive method: sd: 0.288911, mean: 0.508686 Second method: sd: 0.288911, mean: 0.508686 0.0: +++++++++++++++++++++++++++++++ 0.1: ++++++++++++++++++++++++++++ 0.2: +++++++++++++++++++++++++ 0.3: ++++++++++++++++++++++++++++++++++++++++++++++++++ 0.4: ++++++++++++++++++ 0.5: +++++++++++++++++++++++++++++++ 0.6: ++++++++++++++++++ 0.7: +++++++++++++++++++++++++++++++++++++ 0.8: ++++++++++++++++++++++++++++++++++++++++ 0.9: +++++++++++++++++++++++++++++++ ... ## ## 10000000 numbers ## Naive method: sd: 0.288750, mean: 0.499839 Second method: sd: 0.288750, mean: 0.499839 0.0: ++++++++++++++++++++++++++++++++++++++++++++++++++ 0.1: +++++++++++++++++++++++++++++++++++++++++++++++++ 0.2: +++++++++++++++++++++++++++++++++++++++++++++++++ 0.3: +++++++++++++++++++++++++++++++++++++++++++++++++ 0.4: +++++++++++++++++++++++++++++++++++++++++++++++++ 0.5: +++++++++++++++++++++++++++++++++++++++++++++++++ 0.6: +++++++++++++++++++++++++++++++++++++++++++++++++ 0.7: +++++++++++++++++++++++++++++++++++++++++++++++++ 0.8: +++++++++++++++++++++++++++++++++++++++++++++++++ 0.9: +++++++++++++++++++++++++++++++++++++++++++++++++
Ruby
<lang ruby>def generate_statistics(n)
t = Time.now sum = sum2 = 0.0 hist = Array.new(10, 0) n.times do r = rand sum += r sum2 += r**2 hist[(10*r).to_i] += 1 end mean = sum / n stddev = Math::sqrt((sum2 / n) - mean**2)
puts "size: #{n}" puts "mean: #{mean}" puts "stddev: #{stddev}" hist.each {|x| puts "=" * (75*x/hist.max)} puts "(elapsed: #{Time.now.to_i - t.to_i})" puts ""
end
[1_000, 1_000_000, 1_000_000_000].each {|n| generate_statistics n}</lang>
output
size: 1000 mean: 0.49480969251917134 stddev: 0.2866407835263572 ===================================================================== ====================================================================== =============================================================== =========================================================== ============================================================= =========================================================================== ======================================================================= ===================================================================== ================================================================== ======================================================== (elapsed: 0) size: 1000000 mean: 0.49965649774638726 stddev: 0.288787703478284 =========================================================================== ========================================================================== ========================================================================== ========================================================================== ========================================================================== ========================================================================== ========================================================================== ========================================================================== ========================================================================== ========================================================================== (elapsed: 14) size: 1000000000 mean: 0.49999951488216027 stddev: 0.2886774344977368 ========================================================================== ========================================================================== =========================================================================== ========================================================================== ========================================================================== ========================================================================== ========================================================================== ========================================================================== ========================================================================== ========================================================================== (elapsed: 7880)
Tcl
<lang tcl>package require Tcl 8.5 proc stats {size} {
set sum 0.0 set sum2 0.0 for {set i 0} {$i < $size} {incr i} {
set r [expr {rand()}]
incr histo([expr {int(floor($r*10))}]) set sum [expr {$sum + $r}] set sum2 [expr {$sum2 + $r**2}]
} set mean [expr {$sum / $size}] set stddev [expr {sqrt($sum2/$size - $mean**2)}] puts "$size numbers" puts "Mean: $mean" puts "StdDev: $stddev" foreach i {0 1 2 3 4 5 6 7 8 9} {
# The 205 is a magic factor stolen from the Go solution puts [string repeat "*" [expr {$histo($i)*205/int($size)}]]
}
}
stats 100 puts "" stats 1000 puts "" stats 10000</lang> Output:
100 numbers Mean: 0.4801193240797704 StdDev: 0.28697057708153784 ************** ********************************** ******************** ************** **************************** **************** ************** **************************** **************** **************** 1000 numbers Mean: 0.49478823525495275 StdDev: 0.2821543810265757 ******************* ****************** ************************ ******************** ******************* ********************** ********************* ******************** ****************** ****************** 10000 numbers Mean: 0.49928563715870816 StdDev: 0.2888258479070212 ******************** ********************* ******************** ******************** ******************* ********************* ******************* ******************** ********************* ********************
As can be seen, increasing the sample size reduces the variation between the buckets, showing that the rand()
function at least approximates a uniform distribution. (Because Tcl 8.5 supports arbitrary precision integer arithmetic there is no reason in principle why the details for a trillion numbers couldn't be calculated, but it would take quite a while.)