Entropy

You are encouraged to solve this task according to the task description, using any language you may know.

Calculate the information entropy (Shannon entropy) of a given input string.

Entropy is the expected value of the information content in a system. In general, the Shannon entropy of a variable X is defined as:

H(X) = \sum_{x\in\Omega} P(x) I(x)

where the information content is I(x) = -\log_b P(x). If the base of the logarithm b = 2, the result is expressed in bits, a unit of information. Therefore, given a string S of length n where P(s_i) is the relative frequency of character s_i, the entropy of the string in bits is:

H(S) = -\sum_{i=1}^{n} P(s_i) \log_2 P(s_i)

For this task, use "1223334444" as an example. The result should be around 1.84644 bits.
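As a quick check, the distinct characters of "1223334444" occur with relative frequencies P(1) = 0.1, P(2) = 0.2, P(3) = 0.3 and P(4) = 0.4, so

H = -(0.1 \log_2 0.1 + 0.2 \log_2 0.2 + 0.3 \log_2 0.3 + 0.4 \log_2 0.4) \approx 1.84644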

Related Task: Fibonacci_word

Ada

Uses Ada 2012.

with Ada.Text_IO, Ada.Float_Text_IO, Ada.Numerics.Elementary_Functions;

procedure Count_Entropy is

   package TIO renames Ada.Text_IO;

   Count: array(Character) of Natural := (others => 0);
   Sum:   Natural := 0;
   Line:  String  := "1223334444";

begin
   for I in Line'Range loop   -- count the characters
      Count(Line(I)) := Count(Line(I))+1;
      Sum := Sum + 1;
   end loop;

   declare   -- compute the entropy and print it
      function P(C: Character) return Float is (Float(Count(C)) / Float(Sum));
      use Ada.Numerics.Elementary_Functions, Ada.Float_Text_IO;
      Result: Float := 0.0;
   begin
      for Ch in Character loop
         Result := Result -
           (if P(Ch)=0.0 then 0.0 else P(Ch) * Log(P(Ch), Base => 2.0));
      end loop;
      Put(Result, Fore => 1, Aft => 5, Exp => 0);
   end;
end Count_Entropy;

Aime

integer i, l;
record r;
real h, x;
text s;

s = argv(1);
l = length(s);

i = l;
while (i) {
    i -= 1;
    rn_a_integer(r, cut(s, i, 1), 1);
}

h = 0;
if (r_first(r, s)) {
    do {
        x = r_q_integer(r, s);
        x /= l;
        h -= x * log2(x);
    } while (r_greater(r, s, s));
}

o_real(6, h);
o_newline();

Examples:

$ aime -a tmp/entr 1223334444
1.846439
$ aime -a tmp/entr 'Rosetta Code is the best site in the world!'
3.646513
$ aime -a tmp/entr 1234567890abcdefghijklmnopqrstuvwxyz
5.169925

ALGOL 68

Works with: ALGOL 68G version Any - tested with release 2.8.win32
# calculate the shannon entropy of a string                                #
 
PROC shannon entropy = ( STRING s )REAL:
BEGIN
    INT string length = ( UPB s - LWB s ) + 1;

    # count the occurrences of each character #
    [ 0 : max abs char ]INT char count;
    FOR char pos FROM LWB char count TO UPB char count DO
        char count[ char pos ] := 0
    OD;
    FOR char pos FROM LWB s TO UPB s DO
        char count[ ABS s[ char pos ] ] +:= 1
    OD;

    # calculate the entropy, we use log base 10 and then convert #
    # to log base 2 after calculating the sum                    #
    REAL entropy := 0;
    FOR char pos FROM LWB char count TO UPB char count DO
        IF char count[ char pos ] /= 0
        THEN
            # have a character that occurs in the string #
            REAL probability = char count[ char pos ] / string length;
            entropy -:= probability * log( probability )
        FI
    OD;

    entropy / log( 2 )
END; # shannon entropy #


main:
(
    # test the shannon entropy routine #
    print( ( shannon entropy( "1223334444" ), newline ) )
)
 
Output:
+1.84643934467102e  +0

AutoHotkey

MsgBox, % Entropy(1223334444)
 
Entropy(n)
{
    a := [], len := StrLen(n), m := n
    while StrLen(m)
    {
        s := SubStr(m, 1, 1)
        m := RegExReplace(m, s, "", c)
        a[s] := c
    }
    for key, val in a
    {
        m := Log(p := val / len)
        e -= p * m / Log(2)
    }
    return, e
}
Output:
1.846440

AWK

#!/usr/bin/awk -f
{
    for (i = 1; i <= length($0); i++) {
        H[substr($0, i, 1)]++;
        N++;
    }
}

END {
    for (i in H) {
        p = H[i]/N;
        E -= p * log(p);
    }
    print E/log(2);
}
Usage:
 echo 1223334444 | ./entropy.awk
1.84644

Burlesque

blsq ) "1223334444"F:u[vv^^{1\/?/2\/LG}m[?*++
1.8464393446710157

C

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#define MAXLEN 100 /* maximum string length */

int makehist(const char *S, int *hist, int len) {
    int wherechar[256];
    int i, histlen = 0;
    for (i = 0; i < 256; i++) wherechar[i] = -1;
    for (i = 0; i < len; i++) {
        unsigned char c = (unsigned char)S[i];
        if (wherechar[c] == -1) {
            wherechar[c] = histlen;
            histlen++;
        }
        hist[wherechar[c]]++;
    }
    return histlen;
}

double entropy(const int *hist, int histlen, int len) {
    int i;
    double H = 0.0;
    for (i = 0; i < histlen; i++) {
        H -= (double)hist[i] / len * log2((double)hist[i] / len);
    }
    return H;
}

int main(void) {
    char S[MAXLEN];
    int len, *hist, histlen;
    double H;
    scanf("%99[^\n]", S); /* width limit guards against buffer overflow */
    len = strlen(S);
    hist = calloc(len, sizeof(int));
    histlen = makehist(S, hist, len);
    /* hist now has no order (known to the program) but that doesn't matter */
    H = entropy(hist, histlen, len);
    printf("%lf\n", H);
    free(hist);
    return 0;
}

Examples:

$ ./entropy
1223334444
1.846439
$ ./entropy
Rosetta Code is the best site in the world!
3.646513

C++

#include <string>
#include <map>
#include <iostream>
#include <algorithm>
#include <cmath>

// named log_2 to avoid clashing with std::log2 from <cmath>
double log_2( double number ) {
    return log( number ) / log( 2.0 ) ;
}

int main( int argc , char *argv[ ] ) {
    std::string teststring( argv[ 1 ] ) ;
    std::map<char , int> frequencies ;
    for ( char c : teststring )
        frequencies[ c ]++ ;
    int numlen = teststring.length( ) ;
    double infocontent = 0 ;
    for ( std::pair<char , int> p : frequencies ) {
        double freq = static_cast<double>( p.second ) / numlen ;
        infocontent += freq * log_2( freq ) ;
    }
    infocontent *= -1 ;
    std::cout << "The information content of " << teststring
              << " is " << infocontent << " !\n" ;
    return 0 ;
}
Output:
The information content of 1223334444 is 1.84644 !

Clojure

(defn entropy [s]
  (let [len (count s), log-2 (Math/log 2)]
    (->> (frequencies s)
         (map (fn [[_ v]]
                (let [rf (/ v len)]
                  (-> (Math/log rf) (/ log-2) (* rf) Math/abs))))
         (reduce +))))
Output:
(entropy "1223334444")
1.8464393446710154

C#

Translation of C++.

 
using System;
using System.Collections.Generic;

namespace Entropy
{
    class Program
    {
        public static double logtwo(double num)
        {
            return Math.Log(num) / Math.Log(2);
        }

        public static void Main(string[] args)
        {
        label1:
            string input = Console.ReadLine();
            double infoC = 0;
            Dictionary<char, double> table = new Dictionary<char, double>();

            foreach (char c in input)
            {
                if (table.ContainsKey(c))
                    table[c]++;
                else
                    table.Add(c, 1);
            }

            double freq;
            foreach (KeyValuePair<char, double> letter in table)
            {
                freq = letter.Value / input.Length;
                infoC += freq * logtwo(freq);
            }
            infoC *= -1;
            Console.WriteLine("The Entropy of {0} is {1}", input, infoC);
            goto label1;
        }
    }
}
 
Output:
The Entropy of 1223334444 is 1.84643934467102

Without using Hashtables or Dictionaries:

using System;

namespace Entropy
{
    class Program
    {
        public static double logtwo(double num)
        {
            return Math.Log(num) / Math.Log(2);
        }

        static double Contain(string x, char k)
        {
            double count = 0;
            foreach (char y in x)
            {
                if (y.Equals(k))
                    count++;
            }
            return count;
        }

        public static void Main(string[] args)
        {
        label1:
            string input = Console.ReadLine();
            double infoC = 0;
            double freq;
            string k = "";
            foreach (char c1 in input)
            {
                if (!k.Contains(c1.ToString()))
                    k += c1;
            }
            foreach (char c in k)
            {
                freq = Contain(input, c) / (double)input.Length;
                infoC += freq * logtwo(freq);
            }
            infoC /= -1;
            Console.WriteLine("The Entropy of {0} is {1}", input, infoC);
            goto label1;
        }
    }
}

CoffeeScript

entropy = (s) ->
  freq = (s) ->
    result = {}
    for ch in s.split ""
      result[ch] ?= 0
      result[ch]++
    return result

  frq = freq s
  n = s.length
  ((frq[f]/n for f of frq).reduce ((e, p) -> e - p * Math.log(p)), 0) * Math.LOG2E
 
console.log "The entropy of the string '1223334444' is #{entropy '1223334444'}"
Output:
The entropy of the string '1223334444' is 1.8464393446710157

Common Lisp

(defun entropy (input-string)
  (let ((frequency-table (make-hash-table :test 'equal))
        (entropy 0))
    (map 'nil
         #'(lambda (c)
             (setf (gethash c frequency-table)
                   (if (gethash c frequency-table)
                       (+ (gethash c frequency-table) 1)
                       1)))
         (coerce input-string 'list))
    (maphash #'(lambda (k v)
                 (setf entropy
                       (+ entropy
                          (* -1
                             (/ v (length input-string))
                             (log (/ v (length input-string)) 2)))))
             frequency-table)
    entropy))
 

D

import std.stdio, std.algorithm, std.math;

double entropy(T)(T[] s)
pure nothrow if (__traits(compiles, s.sort())) {
    immutable sLen = s.length;
    return s
           .sort()
           .group
           .map!(g => g[1] / double(sLen))
           .map!(p => -p * p.log2)
           .sum;
}

void main() {
    "1223334444"d.dup.entropy.writeln;
}
Output:
1.84644

Emacs Lisp

(defun shannon-entropy (input)
  (let ((freq-table (make-hash-table))
        (entropy 0)
        (length (+ (length input) 0.0)))
    (mapcar (lambda (x)
              (puthash x
                       (+ 1 (gethash x freq-table 0))
                       freq-table))
            input)
    (maphash (lambda (k v)
               (setq entropy (+ entropy
                                (* (/ v length)
                                   (log (/ v length) 2)))))
             freq-table)
    (- entropy)))
Output:

After adding the above to the Emacs runtime, you can run the function interactively in the scratch buffer as shown below (type C-j at the end of the first line and Emacs will place the output on the second line).

(shannon-entropy "1223334444")
1.8464393446710154

Erlang

 
-module( entropy ).
 
-export( [shannon/1, task/0] ).
 
shannon( String ) -> shannon_information_content( lists:foldl(fun count/2, dict:new(), String), erlang:length(String) ).
 
task() -> shannon( "1223334444" ).
 
 
 
count( Character, Dict ) -> dict:update_counter( Character, 1, Dict ).
 
shannon_information_content( Dict, String_length ) ->
    {_String_length, Acc} = dict:fold( fun shannon_information_content/3, {String_length, 0.0}, Dict ),
    Acc / math:log( 2 ).

shannon_information_content( _Character, How_many, {String_length, Acc} ) ->
    Frequency = How_many / String_length,
    {String_length, Acc - (Frequency * math:log(Frequency))}.
 
Output:
24> entropy:task().
1.8464393446710157

Euler Math Toolbox

>function entropy (s) ...
$ v=strtochar(s);
$ m=getmultiplicities(unique(v),v);
$ m=m/sum(m);
$ return sum(-m*logbase(m,2))
$endfunction
>entropy("1223334444")
1.84643934467


F#

open System
 
let ld x = Math.Log x / Math.Log 2.
 
let entropy (s : string) =
    let n = float s.Length
    Seq.groupBy id s
    |> Seq.map (fun (_, vals) -> float (Seq.length vals) / n)
    |> Seq.fold (fun e p -> e - p * ld p) 0.
 
printfn "%f" (entropy "1223334444")
Output:
1.846439

friendly interactive shell

Sort of hacky, but friendly interactive shell isn't really optimized for mathematical tasks (in fact, it doesn't even have associative arrays).

function entropy
    for arg in $argv
        set name count_$arg
        if not count $$name > /dev/null
            set $name 0
            set values $values $arg
        end
        set $name (math $$name + 1)
    end
    set entropy 0
    for value in $values
        set name count_$value
        set entropy (echo "
scale = 50
p = "$$name" / "(count $argv)"
$entropy - p * l(p)
" | bc -l)
    end
    echo "$entropy / l(2)" | bc -l
end
entropy (echo 1223334444 | fold -w1)
Output:
1.84643934467101549345

Forth

: flog2 ( f -- f ) fln 2e fln f/ ;

create freq 256 cells allot

: entropy ( str len -- f )
  freq 256 cells erase
  tuck
  bounds do
    i c@ cells freq +
    1 swap +!
  loop
  0e
  256 0 do
    i cells freq + @ ?dup if
      s>f dup s>f f/
      fdup flog2 f* f-
    then
  loop
  drop ;
 
s" 1223334444" entropy f. \ 1.84643934467102 ok
 

Fortran

The GNU/Linux compilation instructions, along with a sample run, are among the comments at the start of the Fortran 2008 source. The program acquires its input from the command line argument, demonstrating the fairly new get_command_argument intrinsic subroutine. The algorithm is a rough translation of the J solution.

 
!-*- mode: compilation; default-directory: "/tmp/" -*-
!Compilation started at Tue May 21 21:43:12
!
!a=./f && make $a && OMP_NUM_THREADS=2 $a 1223334444
!gfortran -std=f2008 -Wall -ffree-form -fall-intrinsics f.f08 -o f
! Shannon entropy of 1223334444 is 1.84643936
!
!Compilation finished at Tue May 21 21:43:12
 
program shannonEntropy
  implicit none
  integer             :: num, L, status
  character(len=2048) :: s
  num = 1
  call get_command_argument(num, s, L, status)
  if ((0 /= status) .or. (L .eq. 0)) then
    write(0,*)'Expected a command line argument with some length.'
  else
    write(6,*)'Shannon entropy of '//(s(1:L))//' is ', se(s(1:L))
  endif

contains

  ! algebra
  !
  ! 2**x = y
  ! x*log(2) = log(y)
  ! x = log(y)/log(2)

  ! NB. The J solution
  ! entropy=: +/@:-@(* 2&^.)@(#/.~ % #)
  !    entropy '1223334444'
  ! 1.84644

  real function se(s)
    implicit none
    character(len=*), intent(in) :: s
    integer, dimension(256) :: tallies
    real, dimension(256)    :: norm
    tallies = 0
    call TallyKey(s, tallies)
    ! J's #/. works with the set of items in the input.
    ! TallyKey is sufficiently close that, with the merge, it gets the correct result.
    norm = tallies / real(len(s))
    se = sum(-(norm*log(merge(1.0, norm, norm .eq. 0))/log(2.0)))
  end function se

  subroutine TallyKey(s, counts)
    character(len=*), intent(in)         :: s
    integer, dimension(256), intent(out) :: counts
    integer :: i, j
    counts = 0
    do i = 1, len(s)
      j = iachar(s(i:i))
      counts(j) = counts(j) + 1
    end do
  end subroutine TallyKey

end program shannonEntropy
 

Go

package main

import (
    "fmt"
    "math"
)

const s = "1223334444"

func main() {
    m := map[rune]float64{}
    for _, r := range s {
        m[r]++
    }
    hm := 0.
    for _, c := range m {
        hm += c * math.Log2(c)
    }
    const l = float64(len(s))
    // Because p = c/l for each count c,
    // -sum(p*log2(p)) = log2(l) - sum(c*log2(c))/l.
    fmt.Println(math.Log2(l) - hm/l)
}
Output:
1.8464393446710152

Groovy

String.metaClass.getShannonEntropy = {
    -delegate.inject([:]) { map, v -> map[v] = (map[v] ?: 0) + 1; map }.values().inject(0.0) { sum, v ->
        def p = (BigDecimal)v / delegate.size()
        sum + p * Math.log(p) / Math.log(2)
    }
}

Testing

[ '1223334444': '1.846439344671',
  '1223334444555555555': '1.969811065121',
  '122333': '1.459147917061',
  '1227774444': '1.846439344671',
  aaBBcccDDDD: '1.936260027482',
  '1234567890abcdefghijklmnopqrstuvwxyz': '5.169925004424',
  'Rosetta Code': '3.084962500407' ].each { s, expected ->

    println "Checking $s has a Shannon entropy of $expected"
    assert sprintf('%.12f', s.shannonEntropy) == expected
}
Output:
Checking 1223334444 has a Shannon entropy of 1.846439344671
Checking 1223334444555555555 has a Shannon entropy of 1.969811065121
Checking 122333 has a Shannon entropy of 1.459147917061
Checking 1227774444 has a Shannon entropy of 1.846439344671
Checking aaBBcccDDDD has a Shannon entropy of 1.936260027482
Checking 1234567890abcdefghijklmnopqrstuvwxyz has a Shannon entropy of 5.169925004424
Checking Rosetta Code has a Shannon entropy of 3.084962500407

Haskell

import Data.List

main = print $ entropy "1223334444"

entropy s =
  sum . map lg' . fq' . map (fromIntegral . length) . group . sort $ s
  where
    lg' c = (c *) . logBase 2 $ 1.0 / c
    fq' c = let sc = sum c in map (/ sc) c

Icon and Unicon

Note: the second equation above sums across the length of the string (for the example, that would be a sum of 10 terms), while the cited answer comes from summing across the distinct characters in the string (a sum of 4 terms). The code shown here assumes the latter, which is consistent with the Wikipedia description, and works in both Icon and Unicon.
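As a quick cross-check of that reading (in Python for convenience; this sketch is not part of the Icon solution):

from collections import Counter
from math import log2

s = "1223334444"
n = len(s)
p = {ch: cnt / n for ch, cnt in Counter(s).items()}

# Summing over the four distinct characters (what the solutions do):
h_distinct = -sum(pc * log2(pc) for pc in p.values())
# Summing over all ten character positions (a literal reading of the formula):
h_positions = -sum(p[ch] * log2(p[ch]) for ch in s)

print(h_distinct)   # 1.8464393446710154 -- matches the cited answer
print(h_positions)  # 4.939318034012947  -- does not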

procedure main(a)
s := !a | "1223334444"
write(H(s))
end
 
procedure H(s)
P := table(0.0)
every P[!s] +:= 1.0/*s
every (h := 0.0) -:= P[c := key(P)] * log(P[c],2)
return h
end
Output:
->en
1.846439344671015
->

J

Solution:
   entropy=:  +/@:-@(* 2&^.)@(#/.~ % #)
Example:
   entropy '1223334444'
1.84644

Java

Translation of: NetRexx
Translation of: REXX
Works with: Java version 7+
import java.util.Map;
import java.util.HashMap;

public class REntropy {

    @SuppressWarnings("boxing")
    public static double getShannonEntropy(String s) {
        int n = 0;
        Map<Character, Integer> occ = new HashMap<>();

        for (int c_ = 0; c_ < s.length(); ++c_) {
            char cx = s.charAt(c_);
            if (occ.containsKey(cx)) {
                occ.put(cx, occ.get(cx) + 1);
            } else {
                occ.put(cx, 1);
            }
            ++n;
        }

        double e = 0.0;
        for (Map.Entry<Character, Integer> entry : occ.entrySet()) {
            double p = (double) entry.getValue() / n;
            e += p * log2(p);
        }
        return -e;
    }

    private static double log2(double a) {
        return Math.log(a) / Math.log(2);
    }

    public static void main(String[] args) {
        String[] sstr = {
            "1223334444",
            "1223334444555555555",
            "122333",
            "1227774444",
            "aaBBcccDDDD",
            "1234567890abcdefghijklmnopqrstuvwxyz",
            "Rosetta Code",
        };

        for (String ss : sstr) {
            double entropy = REntropy.getShannonEntropy(ss);
            System.out.printf("Shannon entropy of %40s: %.12f%n", "\"" + ss + "\"", entropy);
        }
    }
}
Output:
Shannon entropy of                             "1223334444": 1.846439344671
Shannon entropy of                    "1223334444555555555": 1.969811065278
Shannon entropy of                                 "122333": 1.459147917027
Shannon entropy of                             "1227774444": 1.846439344671
Shannon entropy of                            "aaBBcccDDDD": 1.936260027532
Shannon entropy of   "1234567890abcdefghijklmnopqrstuvwxyz": 5.169925001442
Shannon entropy of                           "Rosetta Code": 3.084962500721

JavaScript

Works with: ECMA-262 (5.1)

The process function builds a histogram of character frequencies, then iterates over it.

The entropy function calls process and evaluates the frequencies as they are passed back.

(function(shannon) {
    // Create a dictionary of character frequencies and iterate over it.
    function process(s, evaluator) {
        var h = Object.create(null), k;
        s.split('').forEach(function(c) {
            h[c] && h[c]++ || (h[c] = 1);
        });
        if (evaluator) for (k in h) evaluator(k, h[k]);
        return h;
    }
    // Measure the entropy of a string in bits per symbol.
    shannon.entropy = function(s) {
        var sum = 0, len = s.length;
        process(s, function(k, f) {
            var p = f / len;
            sum -= p * Math.log(p) / Math.log(2);
        });
        return sum;
    };
})(window.shannon = window.shannon || {});

// Log the Shannon entropy of a string.
function logEntropy(s) {
    console.log('Entropy of "' + s + '" in bits per symbol:', shannon.entropy(s));
}

logEntropy('1223334444');
logEntropy('0');
logEntropy('01');
logEntropy('0123');
logEntropy('01234567');
logEntropy('0123456789abcdef');
Output:
Entropy of "1223334444" in bits per symbol: 1.8464393446710154
Entropy of "0" in bits per symbol: 0
Entropy of "01" in bits per symbol: 1
Entropy of "0123" in bits per symbol: 2
Entropy of "01234567" in bits per symbol: 3
Entropy of "0123456789abcdef" in bits per symbol: 4

jq

For efficiency with long strings, we use a hash (a JSON object) to compute the frequencies.

The helper function, counter, could be defined as an inner function of entropy, but for the sake of clarity and because it is independently useful, it is defined separately.

# Input: an array of strings.
# Output: an object with the strings as keys, the values of which are the corresponding frequencies.
def counter:
reduce .[] as $item ( {}; .[$item] += 1 ) ;
 
# entropy in bits of the input string
def entropy:
(explode | map( [.] | implode ) | counter
| [ .[] | . * (.|log) ] | add) as $sum
| ((length|log) - ($sum / length)) / (2|log) ;
Example:
"1223334444" | entropy # => 1.8464393446710154

Julia

A one-liner, probably not efficient on very long strings.

entropy(s)=-sum(x->x*log(2,x), [count(x->x==c,s)/length(s) for c in unique(s)])
Output:
julia> entropy("1223334444")
1.8464393446710154

Lang5

: -rot rot rot ; [] '__A set : dip swap __A swap 1 compress append '__A
set execute __A -1 extract nip ; : nip swap drop ; : sum '+ reduce ;
: 2array 2 compress ; : comb "" split ; : lensize length nip ;
: <group> #( a -- 'a )
grade subscript dup 's dress distinct strip
length 1 2array reshape swap
'A set
 : `filter(*) A in A swap select ;
'`filter apply
 ;
 
: elements(*) lensize ;
: entropy #( s -- n )
length "<group> 'elements apply" dip /
dup neg swap log * 2 log / sum ;
 
"1223334444" comb entropy . # 1.84643934467102

Mathematica

shE[s_String] := -Plus @@ ((# Log[2., #]) & /@ ((Length /@ Gather[#])/Length[#]) &[Characters[s]])
Example:
 shE["1223334444"]
1.84644
shE["Rosetta Code"]
3.08496

MATLAB / Octave

This version allows for any input vectors, including strings, floats, negative integers, etc.

function E = entropy(d)
if ischar(d), d=abs(d); end;
[Y,I,J] = unique(d);
H = sparse(J,1,1);
p = full(H(H>0))/length(d);
E = -sum(p.*log2(p));
end;
Usage:
> entropy('1223334444')
ans = 1.8464

NetRexx

Translation of: REXX
/* NetRexx */
options replace format comments java crossref savelog symbols

runSample(Arg)
return

-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/* REXX ***************************************************************
* 28.02.2013 Walter Pachl
**********************************************************************/
method getShannonEntropy(s = "1223334444") public static
  --trace var occ c chars n cn i e p pl
  Numeric Digits 30
  occ = 0
  chars = ''
  n = 0
  cn = 0
  Loop i = 1 To s.length()
    c = s.substr(i, 1)
    If chars.pos(c) = 0 Then Do
      cn = cn + 1
      chars = chars || c
      End
    occ[c] = occ[c] + 1
    n = n + 1
    End i
  p = ''
  Loop ci = 1 To cn
    c = chars.substr(ci, 1)
    p[c] = occ[c] / n
    End ci
  e = 0
  Loop ci = 1 To cn
    c = chars.substr(ci, 1)
    pl = log2(p[c])
    e = e + p[c] * pl
    End ci
  Return -e

-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
method log2(a = double) public static binary returns double
  return Math.log(a) / Math.log(2)

-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
method runSample(Arg) public static
  parse Arg sstr
  if sstr = '' then
    sstr = '1223334444' -
           '1223334444555555555' -
           '122333' -
           '1227774444' -
           'aaBBcccDDDD' -
           '1234567890abcdefghijklmnopqrstuvwxyz' -
           'Rosetta_Code'
  say 'Calculating Shannon''s entropy for the following list:'
  say '['(sstr.space(1, ',')).changestr(',', ', ')']'
  say
  entropies = 0
  ssMax = 0
  -- This crude sample substitutes a '_' character for a space in the input strings
  loop w_ = 1 to sstr.words()
    ss = sstr.word(w_)
    ssMax = ssMax.max(ss.length())
    ss_ = ss.changestr('_', ' ')
    entropy = getShannonEntropy(ss_)
    entropies[ss] = entropy
    end w_
  loop report = 1 to sstr.words()
    ss = sstr.word(report)
    ss_ = ss.changestr('_', ' ')
    Say 'Shannon entropy of' ('"'ss_'"').right(ssMax + 2)':' entropies[ss].format(null, 12)
    end report
  return
 
Output:
Calculating Shannon's entropy for the following list:
[1223334444, 1223334444555555555, 122333, 1227774444, aaBBcccDDDD, 1234567890abcdefghijklmnopqrstuvwxyz, Rosetta_Code]

Shannon entropy of                           "1223334444": 1.846439344671
Shannon entropy of                  "1223334444555555555": 1.969811065278
Shannon entropy of                               "122333": 1.459147917027
Shannon entropy of                           "1227774444": 1.846439344671
Shannon entropy of                          "aaBBcccDDDD": 1.936260027532
Shannon entropy of "1234567890abcdefghijklmnopqrstuvwxyz": 5.169925001442
Shannon entropy of                         "Rosetta Code": 3.084962500721

Nimrod

import tables, math
 
proc entropy(s: string): float =
  var t = initCountTable[char]()
  for c in s: t.inc(c)
  for x in t.values: result -= x/s.len * log2(x/s.len)
 
echo entropy("1223334444")


Pascal

Free Pascal (http://freepascal.org).

 
PROGRAM entropytest;

USES StrUtils, Math;

TYPE FArray = ARRAY of CARDINAL;

VAR strng: STRING = '1223334444';

// list unique characters in a string
FUNCTION uniquechars(str: STRING): STRING;
VAR n: CARDINAL;
BEGIN
  uniquechars := '';
  FOR n := 1 TO length(str) DO
    IF (PosEx(str[n],str,n)>0)
      AND (PosEx(str[n],uniquechars,1)=0)
        THEN uniquechars += str[n];
END;

// obtain a list of character-frequencies for a string
// given a string containing its unique characters
FUNCTION frequencies(str,ustr: STRING): FArray;
VAR u,s,p,o: CARDINAL;
BEGIN
  SetLength(frequencies, Length(ustr)+1);
  p := 0;
  FOR u := 1 TO length(ustr) DO
    FOR s := 1 TO length(str) DO BEGIN
      o := p; p := PosEx(ustr[u],str,s);
      IF (p>o) THEN INC(frequencies[u]);
    END;
END;

// Obtain the Shannon entropy of a string
FUNCTION entropy(s: STRING): EXTENDED;
VAR pf : FArray;
    us : STRING;
    i,l: CARDINAL;
BEGIN
  us := uniquechars(s);
  pf := frequencies(s,us);
  l := length(s);
  entropy := 0.0;
  FOR i := 1 TO length(us) DO
    entropy -= pf[i]/l * log2(pf[i]/l);
END;

BEGIN
  Writeln('Entropy of "',strng,'" is ',entropy(strng):2:5, ' bits.');
END.
 
Output:
Entropy of "1223334444" is 1.84644 bits.

PARI/GP

entropy(s)=s=Vec(s);my(v=vecsort(s,,8));-sum(i=1,#v,(x->x*log(x))(sum(j=1,#s,v[i]==s[j])/#s))/log(2)
>entropy("1223334444")
%1 = 1.8464393446710154934341977463050452232

Perl

sub entropy {
    my %count; $count{$_}++ for @_;
    my $entropy = 0;
    for (values %count) {
        my $p = $_/@_;
        $entropy -= $p * log $p;
    }
    $entropy / log 2
}
 
print entropy split //, "1223334444";

Perl 6

sub entropy(@a) {
    [+] map -> \p { p * -log p }, bag(@a).values »/» +@a;
}
 
say log(2) R/ entropy '1223334444'.comb;
Output:
1.84643934467102

In case we would like to add this function to Perl 6's core, here is one way it could be done:

use MONKEY_TYPING;
augment class Bag {
    method entropy {
        [+] map -> \p { -p * log p },
            self.values »/» +self;
    }
}
 
say '1223334444'.comb.Bag.entropy / log 2;

PL/I

*process source xref attributes or(!);
/*--------------------------------------------------------------------
* 08.08.2014 Walter Pachl translated from REXX version 1
*-------------------------------------------------------------------*/
ent: Proc Options(main);
  Dcl (index,length,log2,substr) Builtin;
  Dcl sysprint Print;
  Dcl occ(100) Bin fixed(31) Init((100)0);
  Dcl (n,cn,ci,i,pos) Bin fixed(31) Init(0);
  Dcl chars Char(100) Var Init('');
  Dcl s Char(100) Var Init('1223334444');
  Dcl c Char(1);
  Dcl (occf,p(100)) Dec Float(18);
  Dcl e Dec Float(18) Init(0);
  Do i=1 To length(s);
    c=substr(s,i,1);
    pos=index(chars,c);
    If pos=0 Then Do;
      pos=length(chars)+1;
      cn+=1;
      chars=chars!!c;
      End;
    occ(pos)+=1;
    n+=1;
    End;
  Do ci=1 To cn;
    occf=occ(ci);
    p(ci)=occf/n;
    End;
  Do ci=1 To cn;
    e=e+p(ci)*log2(p(ci));
    End;
  Put Edit('s='''!!s!!''' Entropy=',-e)(Skip,a,f(15,12));
  End;
Output:
s='1223334444' Entropy= 1.846439344671

Python

Python: Longer version

from __future__ import division
import math

def hist(source):
    hist = {}
    l = 0
    for e in source:
        l += 1
        if e not in hist:
            hist[e] = 0
        hist[e] += 1
    return (l, hist)

def entropy(hist, l):
    elist = []
    for v in hist.values():
        c = v / l
        elist.append(-c * math.log(c, 2))
    return sum(elist)

def printHist(h):
    flip = lambda (k,v): (v,k)
    h = sorted(h.iteritems(), key=flip)
    print 'Sym\thi\tfi\tInf'
    for (k,v) in h:
        print '%s\t%f\t%f\t%f' % (k, v, v/l, -math.log(v/l, 2))


source = "1223334444"
(l, h) = hist(source)
print '.[Results].'
print 'Length', l
print 'Entropy:', entropy(h, l)
printHist(h)
Output:
.[Results].
Length 10
Entropy: 1.84643934467
Sym	hi	fi	Inf
1	1.000000	0.100000	3.321928
2	2.000000	0.200000	2.321928
3	3.000000	0.300000	1.736966
4	4.000000	0.400000	1.321928

Python: More succinct version

The Counter class (in the collections module) is only available in Python >= 2.7.

>>> import math
>>> from collections import Counter
>>>
>>> def entropy(s):
...     p, lns = Counter(s), float(len(s))
...     return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
...
>>> entropy("1223334444")
1.8464393446710154
>>>

R

entropy = function(s) {
    freq = prop.table(table(strsplit(s, '')[[1]]))
    -sum(freq * log(freq, base = 2))
}
 
print(entropy("1223334444")) # 1.846439

Racket

#lang racket
(require math)
(provide entropy hash-entropy list-entropy digital-entropy)

(define (hash-entropy h)
  (define (log2 x) (/ (log x) (log 2)))
  (define n (for/sum [(c (in-hash-values h))] c))
  (- (for/sum ([c (in-hash-values h)] #:unless (zero? c))
       (* (/ c n) (log2 (/ c n))))))

(define (list-entropy x) (hash-entropy (samples->hash x)))

(define entropy (compose list-entropy string->list))
(define digital-entropy (compose entropy number->string))

(module+ test
  (require rackunit)
  (check-= (entropy "1223334444") 1.8464393446710154 1E-8)
  (check-= (digital-entropy 1223334444) (entropy "1223334444") 1E-8)
  (check-= (digital-entropy 1223334444) 1.8464393446710154 1E-8)
  (check-= (entropy "xggooopppp") 1.8464393446710154 1E-8))

(module+ main (entropy "1223334444"))
Output:
 1.8464393446710154

REXX

version 1

/* REXX ***************************************************************
* 28.02.2013 Walter Pachl
* 12.03.2013 Walter Pachl typo in log corrected. thanx for testing
* 22.05.2013 -"- extended the logic to accept other strings
* 25.05.2013 -"- 'my' log routine is apparently incorrect
* 25.05.2013 -"- problem identified & corrected
**********************************************************************/

Numeric Digits 30
Parse Arg s
If s='' Then
  s="1223334444"
occ.=0
chars=''
n=0
cn=0
Do i=1 To length(s)
  c=substr(s,i,1)
  If pos(c,chars)=0 Then Do
    cn=cn+1
    chars=chars||c
    End
  occ.c=occ.c+1
  n=n+1
  End
Do ci=1 To cn
  c=substr(chars,ci,1)
  p.c=occ.c/n
  /* say c p.c */
  End
e=0
Do ci=1 To cn
  c=substr(chars,ci,1)
  e=e+p.c*log(p.c,30,2)
  End
Say 'Version 1:' s 'Entropy' format(-e,,12)
Exit
 
log: Procedure
/***********************************************************************
* Return log(x) -- with specified precision and a specified base
* Three different series are used for the ranges  0 to 0.5
*                                                 0.5 to 1.5
*                                                 1.5 to infinity
* 03.09.1992 Walter Pachl
* 25.05.2013 -"- 'my' log routine is apparently incorrect
* 25.05.2013 -"- problem identified & corrected
***********************************************************************/
  Parse Arg x,prec,b
  If prec='' Then prec=9
  Numeric Digits (2*prec)
  Numeric Fuzz 3
  Select
    When x<=0 Then r='*** invalid argument ***'
    When x<0.5 Then Do
      z=(x-1)/(x+1)
      o=z
      r=z
      k=1
      Do i=3 By 2
        ra=r
        k=k+1
        o=o*z*z
        r=r+o/i
        If r=ra Then Leave
        End
      r=2*r
      End
    When x<1.5 Then Do
      z=(x-1)
      o=z
      r=z
      k=1
      Do i=2 By 1
        ra=r
        k=k+1
        o=-o*z
        r=r+o/i
        If r=ra Then Leave
        End
      End
    Otherwise /* 1.5<=x */ Do
      z=(x+1)/(x-1)
      o=1/z
      r=o
      k=1
      Do i=3 By 2
        ra=r
        k=k+1
        o=o/(z*z)
        r=r+o/i
        If r=ra Then Leave
        End
      r=2*r
      End
    End
  If b<>'' Then
    r=r/log(b,prec)
  Numeric Digits (prec)
  r=r+0
  Return r
/* REXX ***************************************************************
* Test program to compare Versions 1 and 2
* (the latter tweaked to be acceptable by my (oo)Rexx
* and to give the same output.)
* version 1 was extended to accept the strings of the incorrect flag
* 22.05.2013 Walter Pachl (I won't analyze the minor differences)
* 25.05.2013 I did now analyze and had to discover that
* 'my' log routine is apparently incorrect
* 25.05.2013 problem identified & corrected
*********************************************************************/

Call both '1223334444'
Call both '1223334444555555555'
Call both '122333'
Call both '1227774444'
Call both 'aaBBcccDDDD'
Call both '1234567890abcdefghijklmnopqrstuvwxyz'
Exit
both:
Parse Arg s
Call entropy s
Call entropy2 s
Say ' '
Return
 
Output:
Version 1: 1223334444 Entropy 1.846439344671
Version 2: 1223334444 Entropy 1.846439344671

Version 1: 1223334444555555555 Entropy 1.969811065278
Version 2: 1223334444555555555 Entropy 1.969811065278

Version 1: 122333 Entropy 1.459147917027
Version 2: 122333 Entropy 1.459147917027

Version 1: 1227774444 Entropy 1.846439344671
Version 2: 1227774444 Entropy 1.846439344671

Version 1: 1234567890abcdefghijklmnopqrstuvwxyz Entropy 5.169925001442
Version 2: 1234567890abcdefghijklmnopqrstuvwxyz Entropy 5.169925001442

version 2

REXX doesn't have a BIF for LOG or LN, so the subroutine (function) LOG2 is included herein.
The LOG2 subroutine is included here only for functionality, not to document how to calculate LOG2 using REXX.

/*REXX program calculates the information entropy  for a given char str.*/
numeric digits 30 /*use thirty digits for precision*/
parse arg $; if $=='' then $=1223334444 /*obtain optional input*/
n=0; @.=0; L=length($); $$=
 
do j=1 for L; _=substr($,j,1) /*process each character in $ str*/
if @._==0 then do; n=n+1 /*if unique, bump char counter. */
$$=$$ || _ /*add this character to the list.*/
end
@._ = @._+1 /*keep track of this char count. */
end /*j*/
sum=0 /*calc info entropy for each char*/
do i=1 for n; _=substr($$,i,1) /*obtain a char from unique list.*/
sum=sum - @._/L * log2(@._/L) /*add (negatively) the entropies.*/
end /*i*/
 
say ' input string: ' $
say 'string length: ' L
say ' unique chars: ' n ; say
say 'the information entropy of the string ──► ' format(sum,,12) " bits."
exit /*stick a fork in it, we're done.*/
/*──────────────────────────────────LOG2 subroutine───────────────────────────*/
log2: procedure; parse arg x 1 xx; ig= x>1.5; is=1-2*(ig\==1); ii=0
numeric digits digits()+5 /* [↓] precision of E must be > digits().*/
e=2.7182818284590452353602874713526624977572470936999595749669676277240766303535
do while ig & xx>1.5 | \ig&xx<.5; _=e; do j=-1; iz=xx* _**-is
if j>=0 then if ig & iz<1 | \ig&iz>.5 then leave; _=_*_; izz=iz; end /*j*/
xx=izz; ii=ii+is*2**j; end /*while*/; x=x* e**-ii-1; z=0; _=-1; p=z
do k=1; _=-_*x; z=z+_/k; if z=p then leave; p=z; end /*k*/
r=z+ii; if arg()==2 then return r; return r/log2(2,0)
Output:
when using the default input of: 1223334444
 input string:  1223334444
string length:  10
 unique chars:  4

the information entropy of the string ──►  1.846439344671  bits.
Output:
when using the input of: Rosetta Code
 input string:  Rosetta Code
string length:  12
 unique chars:  9

the information entropy of the string ──►  3.084962500721  bits.

Ruby

Works with: Ruby version 1.9
def entropy(s)
  counts = Hash.new(0)
  s.each_char { |c| counts[c] += 1 }

  counts.values.reduce(0) do |entropy, count|
    freq = count / s.length.to_f
    entropy - freq * Math.log2(freq)
  end
end

One-liner, same performance (or better):

def entropy2(s)
  s.each_char.group_by(&:to_s).values.map { |x| x.length / s.length.to_f }.reduce(0) { |e, x| e - x*Math.log2(x) }
end

Rust

// works for Rust 0.9
fn entropy(s: &str) -> f32 {
    let mut entropy: f32 = 0.0;
    let mut histogram = [0, ..256];
    let len = s.len();

    for i in range(0, len) { histogram[s[i]] += 1; }
    for i in range(0, 256) {
        if histogram[i] > 0 {
            let ratio = (histogram[i] as f32 / len as f32) as f32;
            entropy -= (ratio * log2(ratio)) as f32;
        }
    }

    entropy
}

Scala

import scala.math._
 
def entropy( v:String ) = { v
  .groupBy (a => a)
  .values
  .map( i => i.length.toDouble / v.length )
  .map( p => -p * log10(p) / log10(2) )
  .sum
}
 
// Confirm that "1223334444" has an entropy of about 1.84644
assert( math.round( entropy("1223334444") * 100000 ) * 0.00001 == 1.84644 )

Scheme

A version capable of calculating multidimensional entropy.

 
(define (entropy input)
  (define (close? a b)
    (define (norm x y)
      (define (infinite_norm m n)
        (define (absminus p q)
          (cond ((null? p) '())
                (else (cons (abs (- (car p) (car q))) (absminus (cdr p) (cdr q))))))
        (define (mm l)
          (cond ((null? (cdr l)) (car l))
                ((> (car l) (cadr l)) (mm (cons (car l) (cddr l))))
                (else (mm (cdr l)))))
        (mm (absminus m n)))
      (if (pair? x) (infinite_norm x y) (abs (- x y))))
    (let ((epsilon 0.2))
      (< (norm a b) epsilon)))
  (define (freq-list x)
    (define (f x)
      (define (count a b)
        (cond ((null? b) 1)
              (else (+ (if (close? a (car b)) 1 0) (count a (cdr b))))))
      (let ((t (car x)) (tt (cdr x)))
        (count t tt)))
    (define (g x)
      (define (filter a b)
        (cond ((null? b) '())
              ((close? a (car b)) (filter a (cdr b)))
              (else (cons (car b) (filter a (cdr b))))))
      (let ((t (car x)) (tt (cdr x)))
        (filter t tt)))
    (cond ((null? x) '())
          (else (cons (f x) (freq-list (g x))))))
  (define (scale x)
    (define (sum x)
      (if (null? x) 0.0 (+ (car x) (sum (cdr x)))))
    (let ((z (sum x)))
      (map (lambda (m) (/ m z)) x)))
  (define (cal x)
    (if (null? x) 0 (+ (* (car x) (/ (log (car x)) (log 2))) (cal (cdr x)))))
  (- (cal (scale (freq-list input)))))

(entropy (list 1 2 2 3 3 3 4 4 4 4))
(entropy (list (list 1 1) (list 1.1 1.1) (list 1.2 1.2) (list 1.3 1.3) (list 1.5 1.5) (list 1.6 1.6)))
 
Output:
1.8464393446710154 bits

1.4591479170272448 bits

Seed7

$ include "seed7_05.s7i";
  include "float.s7i";
  include "math.s7i";

const func float: entropy (in string: stri) is func
  result
    var float: entropy is 0.0;
  local
    var hash [char] integer: count is (hash [char] integer).value;
    var char: ch is ' ';
    var float: p is 0.0;
  begin
    for ch range stri do
      if ch in count then
        incr(count[ch]);
      else
        count @:= [ch] 1;
      end if;
    end for;
    for key ch range count do
      p := flt(count[ch]) / flt(length(stri));
      entropy -:= p * log(p) / log(2.0);
    end for;
  end func;

const proc: main is func
  begin
    writeln(entropy("1223334444") digits 5);
  end func;
Output:
1.84644

Tcl

proc entropy {str} {
    set log2 [expr log(2)]
    foreach char [split $str ""] {dict incr counts $char}
    set entropy 0.0
    foreach count [dict values $counts] {
        set freq [expr {$count / double([string length $str])}]
        set entropy [expr {$entropy - $freq * log($freq)/$log2}]
    }
    return $entropy
}

Demonstration:

puts [format "entropy = %.5f" [entropy "1223334444"]]
puts [format "entropy = %.5f" [entropy "Rosetta Code"]]
Output:
entropy = 1.84644
entropy = 3.08496

XPL0

code real RlOut=48, Ln=54;      \intrinsic routines
string 0;                       \use zero-terminated strings

func StrLen(A);  \Return number of characters in an ASCIIZ string
char A;
int  I;
for I:= 0, -1>>1-1 do
    if A(I) = 0 then return I;

func real Entropy(Str);  \Return Shannon entropy of string
char Str;
int  Len, I, Count(128);
real Sum, Prob;
[Len:= StrLen(Str);
for I:= 0 to 127 do Count(I):= 0;
for I:= 0 to Len-1 do           \count number of each character in string
    Count(Str(I)):= Count(Str(I)) + 1;
Sum:= 0.0;
for I:= 0 to 127 do
    if Count(I) # 0 then        \(avoid Ln(0.0) error)
        [Prob:= float(Count(I)) / float(Len);   \probability of char in string
        Sum:= Sum + Prob*Ln(Prob);
        ];
return -Sum/Ln(2.0);
];

RlOut(0, Entropy("1223334444"))
Output:
    1.84644

zkl

Translation of: D
fcn entropy(text){
text.pump(Void,fcn(c,freq){ c=c.toAsc(); freq[c]+=1; freq }
.fp1( (0).pump(256,List,0.0).copy() )) // array[256] of 0.0
.filter() // remove all zero entries from array
.apply('/(text.len())) // (num of char)/len
.apply(fcn(p){-p*p.log()}) // |p*ln(p)|
.sum(0.0)/(2.0).log(); // sum * ln(e)/ln(2) to convert to log2
}
 
entropy("1223334444").println(" bits");
Output:
1.84644 bits