Fivenum: Difference between revisions

Content added Content deleted

Inline

Revision as of 22:36, 13 March 2018

Many big data or scientific programs use boxplots to show distributions of data. In addition, sometimes saving large arrays for boxplots can be impractical and use extreme amounts of RAM. It can be useful to save large arrays as arrays with five numbers to save memory.

For example, the R programming language implements Tukey's five-number summary as the fivenum function.

Task

Given an array of numbers, compute the five-number summary.

Note

While these five numbers can be used to draw a boxplot, statistical packages will typically need extra data. Moreover, while there is a consensus about the "box" of the boxplot, there are variations among statistical packages for the whiskers.

C

Translation of: Kotlin

<lang c>#include <stdio.h>

include <stdlib.h>

double median(double *x, int start, int end_inclusive) {

   int size = end_inclusive - start + 1;
   if (size <= 0) {
       printf("Array slice cannot be empty\n");
       exit(1);
   }
   int m = start + size / 2;
   if (size % 2) return x[m];
   return (x[m - 1] + x[m]) / 2.0;

}

int compare (const void *a, const void *b) {

   double aa = *(double*)a; 
   double bb = *(double*)b;
   if (aa > bb) return 1;
   if (aa < bb) return -1;
   return 0;

}

int fivenum(double *x, double *result, int x_len) {

   int i, m, lower_end;
   for (i = 0; i < x_len; i++) {
       if (x[i] != x[i]) {
          printf("Unable to deal with arrays containing NaN\n\n");
          return 1;
       }
   } 
   qsort(x, x_len, sizeof(double), compare);
   result[0] = x[0];
   result[2] = median(x, 0, x_len - 1);
   result[4] = x[x_len - 1];
   m = x_len / 2;
   lower_end = (x_len % 2) ? m : m - 1;
   result[1] = median(x, 0, lower_end);
   result[3] = median(x, m, x_len - 1);
   return 0;

}

int show(double *result, int places) {

   int i;
   char f[7];
   sprintf(f, "%%.%dlf", places);
   printf("[");
   for (i = 0; i < 5; i++) {     
       printf(f, result[i]);
       if (i < 4) printf(", ");
   }
   printf("]\n\n");

}

int main() {

   double result[5];

   double x1[11] = {15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0};
   if (!fivenum(x1, result, 11)) show(result, 1);

   double x2[6] = {36.0, 40.0, 7.0, 39.0, 41.0, 15.0};
   if (!fivenum(x2, result, 6)) show(result, 1);

   double x3[20] = {
        0.14082834,  0.09748790,  1.73131507,  0.87636009, -1.95059594,  0.73438555,
       -0.03035726,  1.46675970, -0.74621349, -0.72588772,  0.63905160,  0.61501527,
       -0.98983780, -1.00447874, -0.62759469,  0.66206163,  1.04312009, -0.10305385,
        0.75775634,  0.32566578
   };
   if (!fivenum(x3, result, 20)) show(result, 9);

   return 0;

}</lang>

Output:

[6.0, 25.5, 40.0, 42.5, 49.0]

[7.0, 15.0, 37.5, 40.0, 41.0]

[-1.950595940, -0.676741205, 0.233247060, 0.746070945, 1.731315070]

Go

Translation of: Perl

<lang go>package main

import (

   "fmt"
   "math"
   "sort"

)

func fivenum(a []float64) (n5 [5]float64) {

   sort.Float64s(a)
   n := float64(len(a))
   n4 := float64((len(a)+3)/2) / 2
   d := []float64{1, n4, (n + 1) / 2, n + 1 - n4, n}
   for e, de := range d {
       floor := int(de - 1)
       ceil := int(math.Ceil(de - 1))
       n5[e] = .5 * (a[floor] + a[ceil])
   }
   return

}

var (

   x1 = []float64{36, 40, 7, 39, 41, 15}
   x2 = []float64{15, 6, 42, 41, 7, 36, 49, 40, 39, 47, 43}
   x3 = []float64{
       0.14082834, 0.09748790, 1.73131507, 0.87636009, -1.95059594,
       0.73438555, -0.03035726, 1.46675970, -0.74621349, -0.72588772,
       0.63905160, 0.61501527, -0.98983780, -1.00447874, -0.62759469,
       0.66206163, 1.04312009, -0.10305385, 0.75775634, 0.32566578,
   }

)

func main() {

   fmt.Println(fivenum(x1))
   fmt.Println(fivenum(x2))
   fmt.Println(fivenum(x3))

}</lang>

Output:

[7 15 37.5 40 41]
[6 25.5 40 42.5 49]
[-1.95059594 -0.676741205 0.23324706 0.746070945 1.73131507]

Alternate:

This solution is aimed at handling larger data sets more efficiently. It replaces the O(n log n) sort with O(n) quickselect. It also does not attempt to reproduce the R result exactly, to average values to get a median of an even number of data values, or otherwise estimate quantiles. The quickselect here leaves the input partitioned around the selected value, which allows another small optimization: The first quickselect call partitions the full input around the median. The second call, to get the first quartile, thus only has to process the partition up to the median. The third call, to get the minimum, only has to process the partition up to the first quartile. The 3rd quartile and maximum are obtained similarly. <lang go>package main

import (

   "fmt"
   "math/rand"

)

func fivenum(a []float64) (n [5]float64) {

   last := len(a) - 1
   m := last / 2
   n[2] = qsel(a, m)
   q1 := len(a) / 4
   n[1] = qsel(a[:m], q1)
   n[0] = qsel(a[:q1], 0)
   a = a[m:]
   q3 := last - m - q1
   n[3] = qsel(a, q3)
   a = a[q3:]
   n[4] = qsel(a, len(a)-1)
   return

}

func qsel(a []float64, k int) float64 {

   for len(a) > 1 {
       px := rand.Intn(len(a))
       pv := a[px]
       last := len(a) - 1
       a[px], a[last] = a[last], pv
       px = 0
       for i, v := range a[:last] {
           if v < pv {
               a[px], a[i] = v, a[px]
               px++
           }
       }
       a[px], a[last] = pv, a[px]
       if px == k {
           return pv
       }
       if k < px {
           a = a[:px]
       } else {
           a = a[px+1:]
           k -= px + 1
       }
   }
   return a[0]

}

var (

   x1 = []float64{36, 40, 7, 39, 41, 15}
   x2 = []float64{15, 6, 42, 41, 7, 36, 49, 40, 39, 47, 43}
   x3 = []float64{
       0.14082834, 0.09748790, 1.73131507, 0.87636009, -1.95059594,
       0.73438555, -0.03035726, 1.46675970, -0.74621349, -0.72588772,
       0.63905160, 0.61501527, -0.98983780, -1.00447874, -0.62759469,
       0.66206163, 1.04312009, -0.10305385, 0.75775634, 0.32566578,
   }

)

func main() {

   fmt.Println(fivenum(x1))
   fmt.Println(fivenum(x2))
   fmt.Println(fivenum(x3))

}</lang>

Output:

[7 15 36 40 41]
[6 15 40 43 49]
[-1.95059594 -0.62759469 0.14082834 0.73438555 1.73131507]

Java

Translation of: Kotlin

<lang java>import java.util.Arrays;

public class Fivenum {

   static double median(double[] x, int start, int endInclusive) {
       int size = endInclusive - start + 1;
       if (size <= 0) throw new IllegalArgumentException("Array slice cannot be empty");
       int m = start + size / 2;
       return (size % 2 == 1) ? x[m] : (x[m - 1] + x[m]) / 2.0;
   }

   static double[] fivenum(double[] x) {
       for (Double d : x) {
           if (d.isNaN())
               throw new IllegalArgumentException("Unable to deal with arrays containing NaN");
       }
       double[] result = new double[5];
       Arrays.sort(x);
       result[0] = x[0];
       result[2] = median(x, 0, x.length - 1);
       result[4] = x[x.length - 1];
       int m = x.length / 2;
       int lowerEnd = (x.length % 2 == 1) ? m : m - 1;
       result[1] = median(x, 0, lowerEnd);
       result[3] = median(x, m, x.length - 1);
       return result;
   }

   public static void main(String[] args) {
       double xl[][] = {
           {15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0},
           {36.0, 40.0, 7.0, 39.0, 41.0, 15.0},
           {
                0.14082834,  0.09748790,  1.73131507,  0.87636009, -1.95059594,  0.73438555,
               -0.03035726,  1.46675970, -0.74621349, -0.72588772,  0.63905160,  0.61501527,
               -0.98983780, -1.00447874, -0.62759469,  0.66206163,  1.04312009, -0.10305385,
                0.75775634,  0.32566578
           }
       };
       for (double[] x : xl) System.out.printf("%s\n\n", Arrays.toString(fivenum(x)));
   }

}</lang>

Output:

[6.0, 25.5, 40.0, 42.5, 49.0]

[7.0, 15.0, 37.5, 40.0, 41.0]

[-1.95059594, -0.676741205, 0.23324706, 0.746070945, 1.73131507]

Julia

Works with: Julia version 0.6

<lang julia>function mediansorted(x::AbstractVector{T}, i::Integer, l::Integer)::T where T

   len = l - i + 1
   len > zero(len) || throw(ArgumentError("Array slice cannot be empty."))
   mid = i + len ÷ 2
   return isodd(len) ? x[mid] : (x[mid-1] + x[mid]) / 2

end

function fivenum(x::AbstractVector{T}) where T<:AbstractFloat

   r = Vector{T}(5)
   xs = sort(x)
   mid::Int = length(xs) ÷ 2
   lowerend::Int = isodd(length(xs)) ? mid : mid - 1
   r[1] = xs[1]
   r[2] = mediansorted(xs, 1, lowerend)
   r[3] = mediansorted(xs, 1, endof(xs))
   r[4] = mediansorted(xs, mid, endof(xs))
   r[end] = xs[end]
   return r

end

for v in ([15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0],

         [36.0, 40.0, 7.0, 39.0, 41.0, 15.0],
         [0.14082834,  0.09748790,  1.73131507,  0.87636009, -1.95059594,  0.73438555,
         -0.03035726,  1.46675970, -0.74621349, -0.72588772,  0.63905160,  0.61501527,
         -0.98983780, -1.00447874, -0.62759469,  0.66206163,  1.04312009, -0.10305385,
          0.75775634,  0.32566578])
   println("# ", v, "\n -> ", fivenum(v))

end</lang>

Output:

# [15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0]
 -> [6.0, 15.0, 40.0, 42.0, 49.0]
# [36.0, 40.0, 7.0, 39.0, 41.0, 15.0]
 -> [7.0, 11.0, 37.5, 39.5, 41.0]
# [0.140828, 0.0974879, 1.73132, 0.87636, -1.9506, 0.734386, -0.0303573, 1.46676, -0.746213, -0.725888, 0.639052, 0.615015, -0.989838, -1.00448, -0.627595,0.662062, 1.04312, -0.103054, 0.757756, 0.325666]
 -> [-1.9506, -0.725888, 0.233247, 0.734386, 1.73132]

Kotlin

The following uses Tukey's method for calculating the lower and upper quartiles (or 'hinges') which is what the R function, fivenum, appears to use.

As arrays containing NaNs and nulls cannot really be dealt with in a sensible fashion in Kotlin, they've been excluded altogether. <lang scala>// version 1.2.21

fun median(x: DoubleArray, start: Int, endInclusive: Int): Double {

   val size = endInclusive - start + 1
   require (size > 0) { "Array slice cannot be empty" }
   val m = start + size / 2
   return if (size % 2 == 1) x[m] else (x[m - 1] + x[m]) / 2.0

}

fun fivenum(x: DoubleArray): DoubleArray {

   require(x.none { it.isNaN() }) { "Unable to deal with arrays containing NaN" }
   val result = DoubleArray(5)
   x.sort()
   result[0] = x[0]
   result[2] = median(x, 0, x.size - 1)
   result[4] = x[x.lastIndex]
   val m = x.size / 2
   var lowerEnd = if (x.size % 2 == 1) m else m - 1
   result[1] = median(x, 0, lowerEnd)
   result[3] = median(x, m, x.size - 1)
   return result

}

fun main(args: Array<String>) {

   var xl = listOf(
       doubleArrayOf(15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0),
       doubleArrayOf(36.0, 40.0, 7.0, 39.0, 41.0, 15.0),
       doubleArrayOf(
            0.14082834,  0.09748790,  1.73131507,  0.87636009, -1.95059594,  0.73438555,
           -0.03035726,  1.46675970, -0.74621349, -0.72588772,  0.63905160,  0.61501527,
           -0.98983780, -1.00447874, -0.62759469,  0.66206163,  1.04312009, -0.10305385,
            0.75775634,  0.32566578
       )
   )
   xl.forEach { println("${fivenum(it).asList()}\n") }

}</lang>

Output:

[6.0, 25.5, 40.0, 42.5, 49.0]

[7.0, 15.0, 37.5, 40.0, 41.0]

[-1.95059594, -0.676741205, 0.23324706, 0.746070945, 1.73131507]

Perl

Translation of: R

Works with: Perl 5.10

<lang Perl>#!/usr/bin/env perl

use strict; use warnings; use Cwd 'getcwd'; use feature 'say'; my $TOP_DIRECTORY = getcwd();

sub log_error_and_die {

   my $error = shift;

https://codereview.stackexchange.com/questions/182010/parallel-processing-in-different-directories-in-perl?noredirect=1#comment345753_182010

   my $fail_filename = "$TOP_DIRECTORY/$0.FAIL";
   open my $fh, '>', $fail_filename or die "Can't write $fail_filename: $!";
   print $fh $error;

   die $error;

}

local $SIG{__WARN__} = sub {

   my $message = shift;
   log_error_and_die( sprintf( '%s @ %s', $message, getcwd() ) );

};

use POSIX qw(ceil floor);

sub fivenum {

  my $array = shift;
  my $n = scalar @{ $array };
  if ($n == 0) {
     print "no values were entered into fivenum.\n";
     die;
  }
  my @x = sort {$a <=> $b} @{ $array };
  my $n4 = floor(($n+3)/2)/2;
  my @d = (1, $n4, ($n +1)/2, $n+1-$n4, $n);#d <- c(1, n4, (n + 1)/2, n + 1 - n4, n)
  my @sum_array;
  foreach my $e (0..4) {
     my $floor = floor($d[$e]-1);
     my $ceil  =  ceil($d[$e]-1);
     push @sum_array, (0.5 * ($x[$floor] + $x[$ceil]));
  }
  return @sum_array;

}

my @x = qw(0.14082834 0.09748790 1.73131507 0.87636009 -1.95059594 0.73438555 -0.03035726 1.46675970 -0.74621349 -0.72588772 0.63905160 0.61501527

-0.98983780 -1.00447874 -0.62759469  0.66206163  1.04312009 -0.10305385
 0.75775634  0.32566578);

my @y = fivenum(\@x);

say join (',', @y); </lang>

Output:

 -1.95059594,-0.676741205,0.23324706,0.746070945,1.73131507

Perl 6

Translation of: Perl

<lang perl6>sub fourths ( Int $end ) {

   my $end_22 = $end div 2 / 2;

   return 0, $end_22, $end/2, $end - $end_22, $end;

} sub fivenum ( @nums ) {

   my @x = @nums.sort(+*)
       or die 'Input must have at least one element';

   my @d = fourths(@x.end);

   return ( @x[@d».floor] Z+ @x[@d».ceiling] ) »/» 2;

}

say .&fivenum for [15, 6, 42, 41, 7, 36, 49, 40, 39, 47, 43],

                 [36, 40, 7, 39, 41, 15], [
   0.14082834,  0.09748790,  1.73131507,  0.87636009, -1.95059594,
   0.73438555, -0.03035726,  1.46675970, -0.74621349, -0.72588772,
   0.63905160,  0.61501527, -0.98983780, -1.00447874, -0.62759469,
   0.66206163,  1.04312009, -0.10305385,  0.75775634,  0.32566578,

]; </lang>

Output:

(6 25.5 40 42.5 49)
(7 15 37.5 40 41)
(-1.95059594 -0.676741205 0.23324706 0.746070945 1.73131507)

Python

Translation of: Perl

Work with: Python 2 <lang python>from __future__ import division import math import sys

def fivenum(array):

   n = len(array)
   if n == 0:
       print "you entered an empty array."
       sys.exit()
   x = sorted(array)
   
   n4 = math.floor((n+3.0)/2.0)/2.0
   d = [1, n4, (n+1)/2, n+1-n4, n]
   sum_array = []
   
   for e in range(5):
       floor = int(math.floor(d[e] - 1))
       ceil = int(math.ceil(d[e] - 1))
       sum_array.append(0.5 * (x[floor] + x[ceil]))
   
   return sum_array

x = [0.14082834, 0.09748790, 1.73131507, 0.87636009, -1.95059594, 0.73438555, -0.03035726, 1.46675970, -0.74621349, -0.72588772, 0.63905160, 0.61501527, -0.98983780, -1.00447874, -0.62759469, 0.66206163, 1.04312009, -0.10305385, 0.75775634, 0.32566578]

y = fivenum(x) print y</lang>

Output:

[-1.95059594, -0.676741205, 0.23324706, 0.746070945, 1.73131507]

R

The fivenum function is built-in, see R manual.

<lang R>x <- c(0.14082834, 0.09748790, 1.73131507, 0.87636009, -1.95059594, 0.73438555,-0.03035726, 1.46675970, -0.74621349, -0.72588772, 0.63905160, 0.61501527, -0.98983780, -1.00447874, -0.62759469, 0.66206163, 1.04312009, -0.10305385, 0.75775634, 0.32566578) > fivenum(x) [1] -1.9505959 -0.6767412 0.2332471 0.7460709 1.7313151</lang>

REXX

Programming note: this REXX program uses a unity─based array. <lang rexx>/*REXX program computes the five─number summary (LO─value, p25, medium, p75, HI─value).*/ parse arg x if x= then x= 15 6 42 41 7 36 49 40 39 47 43 /*Not specified? Then use the defaults*/ say 'input numbers: ' space(x) /*display the original list of numbers.*/ call 5num /*invoke the five-number function. */ say ' five-numbers: ' result /*display " " " results. */ exit /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ bSort: procedure expose @.; parse arg n; m=n-1 /*N: is the number of @ array elements.*/

          do m=m  for m  by -1  until ok;  ok=1 /*keep sorting the  @  array 'til done.*/
            do j=1  for m;  k=j + 1;   if @.j<=@.k  then iterate    /*In order?   Good.*/
            _=@.j @.k 0; parse var _ @.k @.j ok /*swap two elements;  flag as not done.*/
            end   /*j*/
          end     /*m*/;          return

/*──────────────────────────────────────────────────────────────────────────────────────*/ med: arg s,e; $=e-s+1; m=s+$%2; if $//2 then return @.m; _=m-1; return (@._+@.m)/2 /*──────────────────────────────────────────────────────────────────────────────────────*/ 5num: #=words(x); if #==0 then return '***error*** array is empty.'

      parse var x . 1 LO . 1 HI .               /*assume values for LO and HI (for now)*/
      q2=# % 2
                    do j=1  for #;    @.j=word(x,j);   ?=datatype(@.j, 'N')
                    if \?  then return '***error***  element'  j  "isn't numeric: "   @.j
                    LO=min(LO, @.j);  HI=max(HI, @.j)
                    end   /*j*/                /* [↑]  traipse thru array, find min,max*/
      call bSort #                             /*use a bubble sort  (easiest to code). */
      if #//2  then p25=q2;  else p25=q2 - 1   /*calculate the second quartile number. */
      return LO  med(1,p25)  med(1,#)  med(q2,#)  HI  /*return list of the five numbers*/</lang>

output when using the default input of: 15 6 42 41 7 36 49 40 39 47 43

input numbers:  15 6 42 41 7 36 49 40 39 47 43
 five-numbers:  6 15 40 42 49

output when using the (internal) default inputs of: 36 40 7 39 41 15

input numbers:  36 40 7 39 41 15
 five-numbers:  7 11 37.5 39.5 41

SAS

<lang sas>/* build a dataset */ data test; do i=1 to 10000; x=rannor(12345); output; end; keep x; run;

/* compute the five numbers */ proc means data=test min p25 median p75 max; var x; run;</lang>

Output

Analysis Variable : x
Minimum	25th Pctl	Median	75th Pctl	Maximum
-4.0692299	-0.6533022	0.0066299	0.6768043	4.1328026

Stata

First build a dataset:

<lang stata>clear set seed 17760704 qui set obs 10000 gen x=rnormal()</lang>

The summarize command produces all the required statistics, and more:

<lang stata>qui sum x, detail di r(min),r(p25),r(p50),r(p75),r(max)</lang>

Output

-3.6345866 -.66536 .0026834 .68398139 3.7997103

It's also possible to use the tabstat command

<lang stata>tabstat x, s(mi q ma)</lang>

Output

    variable |       min       p25       p50       p75       max
-------------+--------------------------------------------------
           x | -3.634587   -.66536  .0026834  .6839814   3.79971
----------------------------------------------------------------

Another example:

<lang stata>clear mat a=0.14082834\0.09748790\1.73131507\0.87636009\-1.95059594\ /// 0.73438555\-0.03035726\1.46675970\-0.74621349\-0.72588772\ /// 0.63905160\0.61501527\-0.98983780\-1.00447874\-0.62759469\ /// 0.66206163\1.04312009\-0.10305385\0.75775634\0.32566578 svmat a tabstat a1, s(mi q ma)</lang>

Output

    variable |       min       p25       p50       p75       max
-------------+--------------------------------------------------
          a1 | -1.950596 -.6767412  .2332471   .746071  1.731315
----------------------------------------------------------------

zkl

Uses GNU GSL library. <lang zkl>var [const] GSL=Import("zklGSL"); // libGSL (GNU Scientific Library) fcn fiveNum(v){ // V is a GSL Vector, --> min, 1st qu, median, 3rd qu, max

  v.sort();
  return(v.min(),v.quantile(0.25),v.median(),v.quantile(0.75),v.max())

}</lang> <lang zkl>fiveNum(GSL.VectorFromData(

  15.0, 6.0, 42.0, 41.0, 7.0, 36.0, 49.0, 40.0, 39.0, 47.0, 43.0)).println();

println(fiveNum(GSL.VectorFromData(36.0, 40.0, 7.0, 39.0, 41.0, 15.0)));

v:=GSL.VectorFromData(

  0.14082834,  0.09748790,  1.73131507,  0.87636009, -1.95059594,  0.73438555,
 -0.03035726,  1.46675970, -0.74621349, -0.72588772,  0.63905160,  0.61501527,
 -0.98983780, -1.00447874, -0.62759469,  0.66206163,  1.04312009, -0.10305385,
  0.75775634,  0.32566578);

println(fiveNum(v));</lang>

Output:

L(6,25.5,40,42.5,49)
L(7,20.25,37.5,39.75,41)
L(-1.9506,-0.652168,0.233247,0.740228,1.73132)