K-means++ clustering

You are encouraged to solve this task according to the task description, using any language you may know.
K-means++ clustering is a classification of data, so that points assigned to the same cluster are similar (in some sense). It is identical to the K-means algorithm, except for the selection of initial conditions.
[Example image: a circular distribution of data partitioned into 7 colored clusters, similar to the top of a beach ball. The data was partitioned into 7 clusters using the K-means algorithm.]

The task is to implement the K-means++ algorithm. Produce a function which takes two arguments: the number of clusters K, and the dataset to classify. K is a positive integer and the dataset is a list of points in the Cartesian plane. The output is a list of clusters (related sets of points, according to the algorithm).

For extra credit (in order):

  1. Provide a function to exercise your code, which generates a list of random points.
  2. Provide a visualization of your results, including centroids (see example image).
  3. Generalize the function to polar coordinates (in radians).
  4. Generalize the function to points in an arbitrary N space (i.e. instead of x,y pairs, points are N-tuples representing coordinates in ℝ^N).
    If this is different or more difficult than the [naive] solution for ℝ^2, discuss what had to change to support N dimensions.

Extra credit is only awarded if the examples given demonstrate the feature in question. To earn credit for 1. and 2., visualize 6 clusters of 30,000 points in ℝ^2. It is not necessary to provide visualization for spaces higher than ℝ^2, but the examples should demonstrate features 3. and 4. if the solution has them.
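The heart of K-means++ is its seeding rule: the first center is picked uniformly at random, and each subsequent center is sampled from the data with probability proportional to the squared distance between each point and its nearest already-chosen center. As a reference point for the entries below, here is a minimal sketch of that rule (in Python; points are assumed to be plain N-tuples, the distance is squared Euclidean, and the function name kmeanspp_seeds is illustrative, not prescribed by the task):

import random

def kmeanspp_seeds(points, k):
    """Pick k initial centers by the K-means++ D^2-weighting rule (sketch)."""
    centers = [random.choice(points)]
    while len(centers) < k:
        # Squared distance from each point to its nearest chosen center.
        d2 = [min(sum((a - b) ** 2 for a, b in zip(pt, ctr))
                  for ctr in centers)
              for pt in points]
        # Sample the next center with probability proportional to d2.
        r = random.uniform(0, sum(d2))
        acc = 0.0
        for pt, w in zip(points, d2):
            acc += w
            if acc >= r:
                centers.append(pt)
                break
    return centers

Lloyd's algorithm then proceeds from these seeds exactly as in plain K-means: assign every point to its nearest center, recompute each center as the centroid of its points, and repeat until the assignments (almost) stop changing.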

C

K++-C.png

Output is in EPS. A 100,000-point EPS file can take quite a while to display.

To extend the code to handle dimensions higher than 2, just make point_t have more coordinates, and change the dist2 distance function (and the point generation, of course) accordingly. Neither is difficult to do, but it would make the code somewhat longer and less focused on demonstrating the algorithm, so I chose simplicity and clarity over extra credit.
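To make the dimension change concrete, here is a small sketch (in Python, purely for brevity) of what the squared-distance function becomes when points are N-tuples; the C dist2 below is simply the N = 2 special case:

def dist2(a, b):
    """Squared Euclidean distance between two N-tuples of coordinates."""
    return sum((ai - bi) ** 2 for ai, bi in zip(a, b))

assert dist2((0.0, 0.0), (3.0, 4.0)) == 25.0            # the 2-D case
assert dist2((1.0, 2.0, 2.0), (0.0, 0.0, 0.0)) == 9.0   # a 3-D point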

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef struct { double x, y; int group; } point_t, *point;

double randf(double m)
{
    return m * rand() / (RAND_MAX - 1.);
}

point gen_xy(int count, double radius)
{
    double ang, r;
    point p, pt = malloc(sizeof(point_t) * count);

    /* note: this is not a uniform 2-d distribution */
    for (p = pt + count; p-- > pt;) {
        ang = randf(2 * M_PI);
        r = randf(radius);
        p->x = r * cos(ang);
        p->y = r * sin(ang);
    }

    return pt;
}

inline double dist2(point a, point b)
{
    double x = a->x - b->x, y = a->y - b->y;
    return x*x + y*y;
}

inline int
nearest(point pt, point cent, int n_cluster, double *d2)
{
    int i, min_i;
    point c;
    double d, min_d;

#   define for_n for (c = cent, i = 0; i < n_cluster; i++, c++)
    min_d = HUGE_VAL;
    min_i = pt->group;
    for_n {
        if (min_d > (d = dist2(c, pt))) {
            min_d = d; min_i = i;
        }
    }
    if (d2) *d2 = min_d;
    return min_i;
}

void kpp(point pts, int len, point cent, int n_cent)
{
#   define for_len for (j = 0, p = pts; j < len; j++, p++)
    int i, j;
    int n_cluster;
    double sum, *d = malloc(sizeof(double) * len);

    point p, c;
    cent[0] = pts[ rand() % len ];
    for (n_cluster = 1; n_cluster < n_cent; n_cluster++) {
        /* sum of squared distances to the nearest chosen center */
        sum = 0;
        for_len {
            nearest(p, cent, n_cluster, d + j);
            sum += d[j];
        }
        /* pick the next center with probability proportional to d^2 */
        sum = randf(sum);
        for_len {
            if ((sum -= d[j]) > 0) continue;
            cent[n_cluster] = pts[j];
            break;
        }
    }
    for_len p->group = nearest(p, cent, n_cluster, 0);
    free(d);
}

point lloyd(point pts, int len, int n_cluster)
{
    int i, j, min_i;
    int changed;

    point cent = malloc(sizeof(point_t) * n_cluster), p, c;

    /* assign init grouping randomly */
    //for_len p->group = j % n_cluster;

    /* or call k++ init */
    kpp(pts, len, cent, n_cluster);

    do {
        /* group element for centroids are used as counters */
        for_n { c->group = 0; c->x = c->y = 0; }
        for_len {
            c = cent + p->group;
            c->group++;
            c->x += p->x; c->y += p->y;
        }
        for_n { c->x /= c->group; c->y /= c->group; }

        changed = 0;
        /* find closest centroid of each point */
        for_len {
            min_i = nearest(p, cent, n_cluster, 0);
            if (min_i != p->group) {
                changed++;
                p->group = min_i;
            }
        }
    } while (changed > (len >> 10)); /* stop when 99.9% of points are good */

    for_n { c->group = i; }

    return cent;
}

void print_eps(point pts, int len, point cent, int n_cluster)
{
#   define W 400
#   define H 400
    int i, j;
    point p, c;
    double min_x, max_x, min_y, max_y, scale, cx, cy;
    double *colors = malloc(sizeof(double) * n_cluster * 3);

    for_n {
        colors[3*i + 0] = (3 * (i + 1) % 11)/11.;
        colors[3*i + 1] = (7 * i % 11)/11.;
        colors[3*i + 2] = (9 * i % 11)/11.;
    }

    max_x = max_y = -(min_x = min_y = HUGE_VAL);
    for_len {
        if (max_x < p->x) max_x = p->x;
        if (min_x > p->x) min_x = p->x;
        if (max_y < p->y) max_y = p->y;
        if (min_y > p->y) min_y = p->y;
    }
    scale = W / (max_x - min_x);
    if (scale > H / (max_y - min_y)) scale = H / (max_y - min_y);
    cx = (max_x + min_x) / 2;
    cy = (max_y + min_y) / 2;

    printf("%%!PS-Adobe-3.0\n%%%%BoundingBox: -5 -5 %d %d\n", W + 10, H + 10);
    printf( "/l {rlineto} def /m {rmoveto} def\n"
            "/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n"
            "/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
            " gsave 1 setgray fill grestore gsave 3 setlinewidth"
            " 1 setgray stroke grestore 0 setgray stroke }def\n"
    );
    for_n {
        printf("%g %g %g setrgbcolor\n",
            colors[3*i], colors[3*i + 1], colors[3*i + 2]);
        for_len {
            if (p->group != i) continue;
            printf("%.3f %.3f c\n",
                (p->x - cx) * scale + W / 2,
                (p->y - cy) * scale + H / 2);
        }
        printf("\n0 setgray %g %g s\n",
            (c->x - cx) * scale + W / 2,
            (c->y - cy) * scale + H / 2);
    }
    printf("\n%%%%EOF");
    free(colors);
#   undef for_n
#   undef for_len
}

#define PTS 100000
#define K 11
int main()
{
    int i;
    point v = gen_xy(PTS, 10);
    point c = lloyd(v, PTS, K);
    print_eps(v, PTS, c, K);
    // free(v); free(c);
    return 0;
}

D

Translation of: C
import std.stdio, std.math, std.random, std.typecons, std.algorithm;

// On Windows this uses the printf from the Microsoft C runtime,
// that doesn't handle real type and some of the C99 format
// specifiers, but it's faster for bulk printing.
extern(C) nothrow int printf(const char*, ...);

struct Point {
    immutable double x, y; // Or float.
    size_t cluster;
}

Point[] generatePoints(in size_t nPoints,
                       in double radius,
                       ref Xorshift rnd)
in {
    assert(nPoints > 0);
    assert(radius > 0);
} out(result) {
    assert(result.length == nPoints);
    foreach (const ref p; result) {
        assert(p.cluster == 0);
        assert(!p.x.isNaN && !p.y.isNaN);
    }
} body {
    Point[] points;
    points.reserve(nPoints);

    // This is not a uniform 2D distribution.
    foreach (immutable i; 0 .. nPoints) {
        immutable r = uniform(0.0, radius, rnd);
        immutable ang = uniform(0.0, 2 * PI, rnd);
        points ~= Point(r * ang.cos, r * ang.sin); // Sincos?
    }

    return points;
}


struct ClusterCenter {
    double x, y;
    void opAssign(in ref Point p) pure nothrow @nogc {
        this.x = p.x;
        this.y = p.y;
    }
}


const(ClusterCenter)[] lloyd(Point[] points,
                             in size_t nclusters,
                             ref Xorshift rnd)
in {
    assert(points.length >= nclusters);
    assert(nclusters > 0);
    foreach (const ref p; points)
        assert(!p.x.isNaN && !p.y.isNaN);
} out(result) {
    assert(result.length == nclusters);
    foreach (const ref cc; result)
        assert(!cc.x.isNaN && !cc.y.isNaN);
} body {
    /// Distance and index of the closest cluster center.
    static Tuple!(size_t, double)
    nearestClusterCenter(in ref Point point,
                         in ClusterCenter[] centers) pure nothrow @nogc
    in {
        assert(centers.length > 0);
    } out(result) {
        assert(result[0] < centers.length);
        immutable ClusterCenter c = centers[result[0]];
        immutable d = (c.x - point.x) ^^ 2 + (c.y - point.y) ^^ 2;
        assert(feqrel(result[1], d) > 45); // Arbitrary.
    } body {
        static double sqrDistance2D(in ref ClusterCenter a,
                                    in ref Point b) pure nothrow @nogc {
            return (a.x - b.x) ^^ 2 + (a.y - b.y) ^^ 2;
        }

        size_t minIndex = point.cluster;
        double minDist = double.max;

        foreach (immutable i, const ref cc; centers) {
            immutable d = sqrDistance2D(cc, point);
            if (minDist > d) {
                minDist = d;
                minIndex = i;
            }
        }

        return tuple(minIndex, minDist);
    }


    static void kMeansPP(Point[] points,
                         ClusterCenter[] centers,
                         ref Xorshift rnd)
    in {
        assert(points.length >= centers.length);
        assert(centers.length > 0);
    } body {
        centers[0] = points[uniform(0, $, rnd)];
        auto d = new double[points.length];

        foreach (immutable i; 1 .. centers.length) {
            double sum = 0;
            foreach (immutable j, const ref p; points) {
                d[j] = nearestClusterCenter(p, centers[0 .. i])[1];
                sum += d[j];
            }

            sum = uniform(0.0, sum, rnd);

            foreach (immutable j, immutable dj; d) {
                sum -= dj;
                if (sum > 0)
                    continue;
                centers[i] = points[j];
                break;
            }
        }

        foreach (ref p; points)
            // Implicit cast of Hconst!ClusterCenter
            // to ClusterCenter[].
            p.cluster = nearestClusterCenter(p, centers)[0];
    }


    auto centers = new ClusterCenter[nclusters];
    kMeansPP(points, centers, rnd);
    auto clusterSizes = new size_t[centers.length];

    size_t changed;
    do {
        // Find clusters centroids.
        centers[] = ClusterCenter(0, 0);
        clusterSizes[] = 0;

        foreach (immutable i, const ref p; points)
            with (centers[p.cluster]) {
                clusterSizes[p.cluster]++;
                x += p.x;
                y += p.y;
            }

        foreach (immutable i, ref cc; centers) {
            cc.x /= clusterSizes[i];
            cc.y /= clusterSizes[i];
        }

        // Find closest centroid of each point.
        changed = 0;
        foreach (ref p; points) {
            immutable minI = nearestClusterCenter(p, centers)[0];
            if (minI != p.cluster) {
                changed++;
                p.cluster = minI;
            }
        }
        // Stop when 99.9% of points are good.
    } while (changed > (points.length >> 10));

    return centers;
}


void printEps(in Point[] points, in ClusterCenter[] centers,
              in size_t W = 400, in size_t H = 400) nothrow
in {
    assert(points.length >= centers.length);
    assert(centers.length > 0);
    assert(W > 0 && H > 0);
    foreach (const ref p; points)
        assert(!p.x.isNaN && !p.y.isNaN);
    foreach (const ref cc; centers)
        assert(!cc.x.isNaN && !cc.y.isNaN);
} body {
    auto findBoundingBox() nothrow @nogc {
        double min_x, max_x, min_y, max_y;
        max_x = max_y = -double.max;
        min_x = min_y = double.max;

        foreach (const ref p; points) {
            if (max_x < p.x) max_x = p.x;
            if (min_x > p.x) min_x = p.x;
            if (max_y < p.y) max_y = p.y;
            if (min_y > p.y) min_y = p.y;
        }
        assert(max_x > min_x && max_y > min_y);

        return tuple(min(W / (max_x - min_x), H / (max_y - min_y)),
                     (max_x + min_x) / 2, (max_y + min_y) / 2);
    }
    //immutable (scale, cx, cy) = findBoundingBox();
    immutable sc_cx_cy = findBoundingBox();
    immutable double scale = sc_cx_cy[0];
    immutable double cx = sc_cx_cy[1];
    immutable double cy = sc_cx_cy[2];

    static immutable struct Color { immutable double r, g, b; }

    immutable size_t k = centers.length;
    Color[] colors;
    colors.reserve(centers.length);
    foreach (immutable i; 0 .. centers.length)
        colors ~= Color((3 * (i + 1) % k) / double(k),
                        (7 * i % k) / double(k),
                        (9 * i % k) / double(k));

    printf("%%!PS-Adobe-3.0\n%%%%BoundingBox: -5 -5 %d %d\n",
           W + 10, H + 10);

    printf("/l {rlineto} def /m {rmoveto} def\n" ~
           "/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n" ~
           "/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath " ~
           " gsave 1 setgray fill grestore gsave 3 setlinewidth" ~
           " 1 setgray stroke grestore 0 setgray stroke }def\n");

    foreach (immutable i, const ref cc; centers) {
        printf("%g %g %g setrgbcolor\n", colors[i].tupleof);

        foreach (const ref p; points) {
            if (p.cluster != i)
                continue;
            printf("%.3f %.3f c\n",
                   (p.x - cx) * scale + W / 2,
                   (p.y - cy) * scale + H / 2);
        }

        printf("\n0 setgray %g %g s\n",
               (cc.x - cx) * scale + W / 2,
               (cc.y - cy) * scale + H / 2);
    }

    "\n%%%%EOF".printf;
}


void main() {
    enum size_t nPoints = 100_000;
    enum size_t nClusters = 11; // k.
    auto rnd = 1.Xorshift; // For speed and repeatability.

    auto points = generatePoints(nPoints, 10, rnd);
    const clusterCenters = lloyd(points, nClusters, rnd);
    printEps(points, clusterCenters);
}

Compiled with ldc2, it's about as fast as the C entry.

Euler Math Toolbox

 
>type kmeanscluster
function kmeanscluster (x: numerical, k: index)
    n=rows(x); m=cols(x);
    i=floor((0:k)/k*(n-1))+1;
    means=zeros(k,m);
    loop 1 to k;
        means[#]=sum(x[i[#]:(i[#+1]-1)]')'/(i[#+1]-i[#]);
    end;
    j=1:n;
    loop 1 to n;
        d=sum((x[#]-means)^2);
        j[#]=extrema(d')[2];
    end;
    repeat
        loop 1 to k;
            i=nonzeros(j==#);
            if cols(i)==0 then means[#]=1;
            else means[#]=(sum(x[i]')/cols(i))';
            endif;
        end;
        jold=j;
        loop 1 to n;
            d=sum((x[#]-means)^2);
            j[#]=extrema(d')[2];
        end;
        if all(jold==j) then break; endif;
    end
    return j
endfunction
 

Let us apply it to random data.

 
>load clustering.e
 Functions for clustering data.
>np=5; m=3*normal(np,2);
% Spread n points randomly around these points.
>n=5000; x=m[intrandom(1,n,np)]+normal(n,2);
% The function kmeanscluster contains the algorithm. It returns the
% indices of the clusters the points belong to.
>j=kmeanscluster(x,np);
% We plot each point with a color representing its cluster.
>P=x'; ...
>  plot2d(P[1],P[2],r=totalmax(abs(m))+2,color=10+j,points=1,style="."); ...
>  loop 1 to np; plot2d(m[#,1],m[#,2],points=1,style="o#",add=1); end; ...
>  insimg;
 

ClusteringEulerMathToolbox.png

Go

package main

import (
    "fmt"
    "image"
    "image/color"
    "image/draw"
    "image/png"
    "math"
    "math/rand"
    "os"
    "time"
)

type r2 struct {
    x, y float64
}

type r2c struct {
    r2
    c int // cluster number
}

// kmpp implements K-means++, satisfying the basic task requirement
func kmpp(k int, data []r2c) {
    kMeans(data, kmppSeeds(k, data))
}

// kmppSeeds is the ++ part.
// It generates the initial means for the k-means algorithm.
func kmppSeeds(k int, data []r2c) []r2 {
    s := make([]r2, k)
    s[0] = data[rand.Intn(len(data))].r2
    d2 := make([]float64, len(data))
    for i := 1; i < k; i++ {
        var sum float64
        for j, p := range data {
            _, dMin := nearest(p, s[:i])
            d2[j] = dMin * dMin
            sum += d2[j]
        }
        target := rand.Float64() * sum
        j := 0
        for sum = d2[0]; sum < target; sum += d2[j] {
            j++
        }
        s[i] = data[j].r2
    }
    return s
}

// nearest finds the nearest mean to a given point.
// return values are the index of the nearest mean, and the distance from
// the point to the mean.
func nearest(p r2c, mean []r2) (int, float64) {
    iMin := 0
    dMin := math.Hypot(p.x-mean[0].x, p.y-mean[0].y)
    for i := 1; i < len(mean); i++ {
        d := math.Hypot(p.x-mean[i].x, p.y-mean[i].y)
        if d < dMin {
            dMin = d
            iMin = i
        }
    }
    return iMin, dMin
}

// kMeans algorithm. Lloyd's
func kMeans(data []r2c, mean []r2) {
    // initial assignment
    for i, p := range data {
        cMin, _ := nearest(p, mean)
        data[i].c = cMin
    }
    mLen := make([]int, len(mean))
    for {
        // update means
        for i := range mean {
            mean[i] = r2{}
            mLen[i] = 0
        }
        for _, p := range data {
            mean[p.c].x += p.x
            mean[p.c].y += p.y
            mLen[p.c]++
        }
        for i := range mean {
            inv := 1 / float64(mLen[i])
            mean[i].x *= inv
            mean[i].y *= inv
        }
        // make new assignments, count changes
        var changes int
        for i, p := range data {
            if cMin, _ := nearest(p, mean); cMin != p.c {
                changes++
                data[i].c = cMin
            }
        }
        if changes == 0 {
            return
        }
    }
}

// parameters for extra credit exercises
type ecParam struct {
    k          int
    nPoints    int
    xBox, yBox int
    stdv       int
}

// extra credit 1 and 2:
func main() {
    ec := &ecParam{6, 30000, 300, 200, 30}

    origin, data := genECData(ec)
    vis(ec, data, "origin")
    fmt.Println("Data set origins:")
    fmt.Println("    x      y")
    for _, o := range origin {
        fmt.Printf("%5.1f  %5.1f\n", o.x, o.y)
    }

    kmpp(ec.k, data)

    fmt.Println(
        "\nCluster centroids, mean distance from centroid, number of points:")
    fmt.Println("    x      y  distance  points")
    cent := make([]r2, ec.k)
    cLen := make([]int, ec.k)
    inv := make([]float64, ec.k)
    for _, p := range data {
        cent[p.c].x += p.x
        cent[p.c].y += p.y
        cLen[p.c]++
    }
    for i, iLen := range cLen {
        inv[i] = 1 / float64(iLen)
        cent[i].x *= inv[i]
        cent[i].y *= inv[i]
    }
    dist := make([]float64, ec.k)
    for _, p := range data {
        dist[p.c] += math.Hypot(p.x-cent[p.c].x, p.y-cent[p.c].y)
    }
    for i, iLen := range cLen {
        fmt.Printf("%5.1f  %5.1f  %8.1f  %6d\n",
            cent[i].x, cent[i].y, dist[i]*inv[i], iLen)
    }
    vis(ec, data, "clusters")
}

// genECData generates random data for extra credit tasks.
// k origin points are randomly selected in a bounding box.
// nPoints/k coordinates are then generated for each origin point.
// The x and y coordinates of the data are normally distributed
// with standard deviation stdv. Thus data coordinates are not
// constrained to the origin box; they can range to +/- max float64.
func genECData(ec *ecParam) (orig []r2, data []r2c) {
    rand.Seed(time.Now().UnixNano())
    orig = make([]r2, ec.k)
    data = make([]r2c, ec.nPoints)
    for i, n := 0, 0; i < ec.k; i++ {
        x := rand.Float64() * float64(ec.xBox)
        y := rand.Float64() * float64(ec.yBox)
        orig[i] = r2{x, y}
        for j := ec.nPoints / ec.k; j > 0; j-- {
            data[n].x = rand.NormFloat64()*float64(ec.stdv) + x
            data[n].y = rand.NormFloat64()*float64(ec.stdv) + y
            data[n].c = i
            n++
        }
    }
    return
}

// vis writes a .png for extra credit 2.
func vis(ec *ecParam, data []r2c, fn string) {
    colors := make([]color.NRGBA, ec.k)
    for i := range colors {
        i3 := i * 3
        third := i3 / ec.k
        frac := uint8((i3 % ec.k) * 255 / ec.k)
        switch third {
        case 0:
            colors[i] = color.NRGBA{frac, 255 - frac, 0, 255}
        case 1:
            colors[i] = color.NRGBA{0, frac, 255 - frac, 255}
        case 2:
            colors[i] = color.NRGBA{255 - frac, 0, frac, 255}
        }
    }
    bounds := image.Rect(-ec.stdv, -ec.stdv, ec.xBox+ec.stdv, ec.yBox+ec.stdv)
    im := image.NewNRGBA(bounds)
    draw.Draw(im, bounds, image.NewUniform(color.White), image.ZP, draw.Src)
    fMinX := float64(bounds.Min.X)
    fMaxX := float64(bounds.Max.X)
    fMinY := float64(bounds.Min.Y)
    fMaxY := float64(bounds.Max.Y)
    for _, p := range data {
        imx := math.Floor(p.x)
        imy := math.Floor(float64(ec.yBox) - p.y)
        if imx >= fMinX && imx < fMaxX && imy >= fMinY && imy < fMaxY {
            im.SetNRGBA(int(imx), int(imy), colors[p.c])
        }
    }
    f, err := os.Create(fn + ".png")
    if err != nil {
        fmt.Println(err)
        return
    }
    err = png.Encode(f, im)
    if err != nil {
        fmt.Println(err)
    }
    err = f.Close()
    if err != nil {
        fmt.Println(err)
    }
}

Text output:

Data set origins:
    x      y
256.8  188.6
 91.7   51.2
201.8  100.2
161.6  102.8
 78.9  152.9
 97.8   17.4

Cluster centroids, mean distance from centroid, number of points:
    x      y  distance  points
152.4  102.1      30.9    5654
104.8    8.7      31.4    4947
211.3   99.4      32.0    4961
 78.3   57.7      29.4    4817
257.7  191.4      36.5    4915
 76.9  156.5      35.0    4706

Visualization. Original clusters on left, discovered clusters on right.

GoOrigin.png

GoClusters.png

J

Solution:
   NB. Selection of initial centroids, per K-means++
   initialCentroids =: (] , randomCentroid)^:(<:@:]`(,:@:seedCentroid@:[))~
   seedCentroid     =: {~ ?@#
   randomCentroid   =: [ {~ [: wghtProb [: <./ distance/~
   distance         =: +/&.:*:@:-"1   NB. Extra credit #4 (N-dimensional is the same as 2-dimensional in J)
   wghtProb         =: 1&$: : ((%{:)@:(+/\)@:] I. [ ?@$ 0:)"0 1   NB. Due to Roger Hui http://j.mp/lj5Pnt

   NB. Having selected the initial centroids, the standard K-means algo follows
   centroids       =: ([ mean/.~ closestCentroid)^:(]`_:`initialCentroids)
   closestCentroid =: [: (i.<./)"1 distance/
   mean            =: +/ % #
Extra credit:
   randMatrix   =: ?@$&0     NB. Extra credit #1
   packPoints   =: <"1@:|:   NB. Extra credit #2: Visualization code due to Max Harms http://j.mp/l8L45V
   plotClusters =: dyad define   NB. as is the example image in this task
     require 'plot'
     pd 'reset;aspect 1;type dot;pensize 2'
     pd@:packPoints&> y
     pd 'type marker;markersize 1.5;color 0 0 0'
     pd@:packPoints x
     pd 'markersize 0.8;color 255 255 0'
     pd@:packPoints x
     pd 'show'
   )

   NB. Extra credit #3: Polar coordinates are not available in this version
   NB. but wouldn't be hard to provide with &.cartToPole .
Example:
   plotRandomClusters =: 3&$: : (dyad define)
     dataset =. randMatrix 2 {. y,2

     centers  =. x centroids dataset
     clusters =. centers (closestCentroid~ </. ]) dataset
     centers plotClusters clusters
   )

   plotRandomClusters 300           NB. 300 points, 3 clusters
   6 plotRandomClusters 30000       NB. 3e4 points, 6 clusters
   10 plotRandomClusters 17000 5    NB. 17e3 points, 10 clusters, 5 dimensions

Mathematica

Solution: the initial k-means code comes from http://mathematica.stackexchange.com/questions/7441/k-means-clustering, extended here to k-means++ by introducing the function initM. (Pictures of the result could not be uploaded.)

initM[list_List, k_Integer, distFunc_Symbol] :=
  Module[{m = {RandomChoice[list]}, n, d},
   While[Length[m] < k,
    n = First@Nearest[m, #] & /@ list;
    d = Apply[distFunc, Transpose[{n, list}], {1}];
    m = Append[m, RandomChoice[d -> list]]
    ];
   m
   ];
kmeanspp[list_, k_,
  opts : OptionsPattern[{DistanceFunction ->
      SquaredEuclideanDistance, "RandomSeed" -> {}}]] :=
 BlockRandom[SeedRandom[OptionValue["RandomSeed"]];
  Module[{m = initM[list, k, OptionValue[DistanceFunction]], update,
    partition, clusters}, update[] := m = Mean /@ clusters;
   partition[_] := (clusters =
      GatherBy[list,
       RandomChoice@
         Nearest[m, #, (# -> OptionValue[#] &@DistanceFunction)] &];
     update[]);
   FixedPoint[partition, list];
   {clusters, m}
   ]
  ];

Extra credit:
1. No changes are required for N dimensions; it just works.
2. Random data can be generated with

dim = 3;
points = 3000;
l = RandomReal[1, {points, dim}];

or

l = Select[ RandomReal[{-1, 1}, {points,2}], 
   EuclideanDistance[#, {0, 0}] <= 1 &];

or

x1 = RandomVariate[MultinormalDistribution[{0, 0}, {{1, 0}, {0, 20}}],
   points]; 
x2 = 
 RandomVariate[MultinormalDistribution[{10, 0}, {{1, 0}, {0, 20}}], 
  points];
l = Join[x1, x2];

3. Data can be visualized. In 2D:

dim = 2;
points = 30000;
l = RandomReal[1, {points, dim}];
k = 6
r1 = kmeanspp[l, k];
p1 = ListPlot[r1[[1]]];
p2 = ListPlot[r1[[2]],PlotMarkers -> {"#"}];
Show[{p1, p2}]

3D:

dim = 3;
points = 3000;
l = RandomReal[1, {points, dim}];
k = 6
r1 = kmeanspp[l, k];
p1 = ListPointPlot3D[r1[[1]]];
p2 = ListPointPlot3D[r1[[2]]];
Show[{p1, p2}]


Another version

KMeans[k_, data_] :=
 Module[{Renew, Label, Iteration},
  clusters = RandomSample[data, k];
  Label[clusters_] :=
   Flatten[Table[
     Ordering[
      Table[EuclideanDistance[data[[i]], clusters[[j]]], {j,
        Length[clusters]}], 1], {i, Length[data]}]];
  Renew[labels_] :=
   Module[{position},
    position = PositionIndex[labels];
    Return[Table[Mean[data[[position[[i]]]]], {i, Length[position]}]]];
  Iteration[labels_, clusters_] :=
   Module[{newlabels, newclusters},
    newclusters = Renew[labels];
    newlabels = Label[newclusters];
    If[newlabels == labels, labels,
     Iteration[newlabels, newclusters]]];
  Return[Iteration[clusters, Label[clusters]]]]

Perl 6

Works with: rakudo version 2015-09-16

We use Complex numbers to represent points in the plane. We feed the algorithm with three artificially made clouds of points so we can easily see if the output makes sense.
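The complex-number representation works because abs($a - $b) is the Euclidean distance between two points of the plane, so abs(...)**2 is exactly the squared distance used by the D²-weighted seeding. A quick sanity check (in Python, whose complex type behaves the same way):

# |a - b| of complex numbers is the Euclidean distance between them as 2-D points.
a, b = complex(3, 2), complex(0, 0)
print(abs(a - b) ** 2)  # ~13, i.e. 3**2 + 2**2 up to rounding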

sub infix:«-means++»(Int $K, @data) {
    my @means = @data.pick;
    until @means == $K {
        my @cumulD2 = [\+] @data.map: -> $x {
            min @means.map: { abs($x - $_)**2 }
        }
        my $rand = rand * @cumulD2[*-1];
        @means.push: @data[
            (^@data).first: { @cumulD2[$_] > $rand }
        ];
    }
    sub cluster { @data.classify: -> $x { @means.min: { abs($_ - $x) } } }
    loop (
        my %cluster;
        1e-10 < [+] (@means Z- keys (%cluster = cluster))».abs X** 2;
        @means = %cluster.values.map( { .elems R/ [+] @$_ } )
    ) { ; }
    return @means;
}

my @centers = 0, 5, 3 + 2i;
my @data = flat @centers.map: { ($_ + .5 - rand + (.5 - rand) * i) xx 100 }
@data.=pick(*);
.say for 3-means++ @data;
Output:
5.04622376429502+0.0145269848483031i
0.0185674577571743+0.0298199687431731i
2.954898072093+2.14922298688815i

Python

Translation of: D
from math import pi, sin, cos
from collections import namedtuple
from random import random, choice
from copy import copy

try:
    import psyco
    psyco.full()
except ImportError:
    pass


FLOAT_MAX = 1e100


class Point:
    __slots__ = ["x", "y", "group"]

    def __init__(self, x=0.0, y=0.0, group=0):
        self.x, self.y, self.group = x, y, group


def generate_points(npoints, radius):
    points = [Point() for _ in xrange(npoints)]

    # note: this is not a uniform 2-d distribution
    for p in points:
        r = random() * radius
        ang = random() * 2 * pi
        p.x = r * cos(ang)
        p.y = r * sin(ang)

    return points


def nearest_cluster_center(point, cluster_centers):
    """Distance and index of the closest cluster center"""
    def sqr_distance_2D(a, b):
        return (a.x - b.x) ** 2 + (a.y - b.y) ** 2

    min_index = point.group
    min_dist = FLOAT_MAX

    for i, cc in enumerate(cluster_centers):
        d = sqr_distance_2D(cc, point)
        if min_dist > d:
            min_dist = d
            min_index = i

    return (min_index, min_dist)


def kpp(points, cluster_centers):
    cluster_centers[0] = copy(choice(points))
    d = [0.0 for _ in xrange(len(points))]

    for i in xrange(1, len(cluster_centers)):
        sum = 0
        for j, p in enumerate(points):
            d[j] = nearest_cluster_center(p, cluster_centers[:i])[1]
            sum += d[j]

        sum *= random()

        for j, di in enumerate(d):
            sum -= di
            if sum > 0:
                continue
            cluster_centers[i] = copy(points[j])
            break

    for p in points:
        p.group = nearest_cluster_center(p, cluster_centers)[0]


def lloyd(points, nclusters):
    cluster_centers = [Point() for _ in xrange(nclusters)]

    # call k++ init
    kpp(points, cluster_centers)

    lenpts10 = len(points) >> 10

    changed = 0
    while True:
        # group element for centroids are used as counters
        for cc in cluster_centers:
            cc.x = 0
            cc.y = 0
            cc.group = 0

        for p in points:
            cluster_centers[p.group].group += 1
            cluster_centers[p.group].x += p.x
            cluster_centers[p.group].y += p.y

        for cc in cluster_centers:
            cc.x /= cc.group
            cc.y /= cc.group

        # find closest centroid of each PointPtr
        changed = 0
        for p in points:
            min_i = nearest_cluster_center(p, cluster_centers)[0]
            if min_i != p.group:
                changed += 1
                p.group = min_i

        # stop when 99.9% of points are good
        if changed <= lenpts10:
            break

    for i, cc in enumerate(cluster_centers):
        cc.group = i

    return cluster_centers


def print_eps(points, cluster_centers, W=400, H=400):
    Color = namedtuple("Color", "r g b")

    colors = []
    for i in xrange(len(cluster_centers)):
        colors.append(Color((3 * (i + 1) % 11) / 11.0,
                            (7 * i % 11) / 11.0,
                            (9 * i % 11) / 11.0))

    max_x = max_y = -FLOAT_MAX
    min_x = min_y = FLOAT_MAX

    for p in points:
        if max_x < p.x: max_x = p.x
        if min_x > p.x: min_x = p.x
        if max_y < p.y: max_y = p.y
        if min_y > p.y: min_y = p.y

    scale = min(W / (max_x - min_x),
                H / (max_y - min_y))
    cx = (max_x + min_x) / 2
    cy = (max_y + min_y) / 2

    print "%%!PS-Adobe-3.0\n%%%%BoundingBox: -5 -5 %d %d" % (W + 10, H + 10)

    print ("/l {rlineto} def /m {rmoveto} def\n" +
           "/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n" +
           "/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath " +
           " gsave 1 setgray fill grestore gsave 3 setlinewidth" +
           " 1 setgray stroke grestore 0 setgray stroke }def")

    for i, cc in enumerate(cluster_centers):
        print ("%g %g %g setrgbcolor" %
               (colors[i].r, colors[i].g, colors[i].b))

        for p in points:
            if p.group != i:
                continue
            print ("%.3f %.3f c" % ((p.x - cx) * scale + W / 2,
                                    (p.y - cy) * scale + H / 2))

        print ("\n0 setgray %g %g s" % ((cc.x - cx) * scale + W / 2,
                                        (cc.y - cy) * scale + H / 2))

    print "\n%%%%EOF"


def main():
    npoints = 30000
    k = 7  # # clusters

    points = generate_points(npoints, 10)
    cluster_centers = lloyd(points, k)
    print_eps(points, cluster_centers)


main()


Racket

The k-means clustering:

 
#lang racket
(require racket/dict
         math/distributions)

;; Divides the set of points into k clusters
;; using the standard k-means clustering algorithm
(define (k-means data k #:initialization (init k-means++))
  (define (iteration centroids)
    (map centroid (clusterize data centroids)))
  (fixed-point iteration (init data k) #:same-test small-shift?))

;; Finds the centroid for a set of points
(define (centroid pts)
  (vector-map (curryr / (length pts))
              (for/fold ([sum (car pts)]) ([x (in-list (cdr pts))])
                (vector-map + x sum))))

;; Divides the set of points into clusters
;; using given centroids
(define (clusterize data centroids)
  (for*/fold ([res (map list centroids)]) ([x (in-list data)])
    (define c (argmin (distanse-to x) centroids))
    (dict-set res c (cons x (dict-ref res c)))))

;; Stop criterion: all centroids change their positions
;; by less than 0.1% of the minimal distance between centroids.
(define (small-shift? c1 c2)
  (define min-distance
    (apply min
           (for*/list ([x (in-list c2)]
                       [y (in-list c2)] #:unless (equal? x y))
             ((metric) x y))))
  (for/and ([a (in-list c1)] [b (in-list c2)])
    (< ((metric) a b) (* 0.001 min-distance))))
 

Initialization methods

 
;; picks k points from a dataset randomly
(define (random-choice data k)
  (for/list ([i (in-range k)])
    (list-ref data (random (length data)))))

;; uses k-means++ algorithm
(define (k-means++ data k)
  (for/fold ([centroids (random-choice data 1)]) ([i (in-range (- k 1))])
    (define weights
      (for/list ([x (in-list data)])
        (apply min (map (distanse-to x) centroids))))
    (define new-centroid
      (sample (discrete-dist data weights)))
    (cons new-centroid centroids)))
 

Different metrics

 
(define (euclidean-distance a b)
  (for/sum ([x (in-vector a)] [y (in-vector b)])
    (sqr (- x y))))

(define (manhattan-distance a b)
  (for/sum ([x (in-vector a)] [y (in-vector b)])
    (abs (- x y))))

(define metric (make-parameter euclidean-distance))
(define (distanse-to x) (curry (metric) x))
 

The fixed point operator

 
(define (fixed-point f x0 #:same-test [same? equal?])
  (let loop ([x x0] [fx (f x0)])
    (if (same? x fx) fx (loop fx (f fx)))))
 

Creating sample clusters

Clustering using k-means++ method.
Clustering using random seeds.
Clustering using k-means++ method with Manhattan distance.
Clustering using k-means++ method almost always handles the difficult case.
Clustering using random seeds may give poor results.


 
(define (gaussian-cluster N
                          #:stdev (σ 1)
                          #:center (r0 #(0 0))
                          #:dim (d 2))
  (for/list ([i (in-range N)])
    (define r (for/vector ([j (in-range d)]) (sample (normal-dist 0 σ))))
    (vector-map + r r0)))

(define (uniform-cluster N
                         #:radius (R 1)
                         #:center (r0 #(0 0)))
  (for/list ([i (in-range N)])
    (define r (* R (sqrt (sample (uniform-dist)))))
    (define φ (* 2 pi (sample (uniform-dist))))
    (vector-map + r0 (vector (* r (cos φ)) (* r (sin φ))))))
 

Visualization

 
(require plot)

(define (show-clustering data k #:method (method k-means++))
  (define c (k-means data k #:initialization method))
  (display
   (plot
    (append
     (for/list ([d (clusterize data c)]
                [i (in-naturals)])
       (points d #:color i #:sym 'fullcircle1))
     (list (points c
                   #:sym 'fullcircle7
                   #:fill-color 'yellow
                   #:line-width 3)))
    #:title (format "Initializing by ~a" (object-name method)))))
 

Testing

 
(module+ test
  (define circle (uniform-cluster 30000))
  ;; using k-means++ method
  (show-clustering circle 6)
  ;; using standard k-means method
  (show-clustering circle 6 #:method random-choice)
  ;; using manhattan distance
  (parameterize ([metric manhattan-distance])
    (show-clustering circle 6)))
 

The difficult case.

 
(module+ test
  (define clouds
    (append
     (gaussian-cluster 1000 #:stdev 0.5 #:center #(0 0))
     (gaussian-cluster 1000 #:stdev 0.5 #:center #(2 3))
     (gaussian-cluster 1000 #:stdev 0.5 #:center #(2.5 -1))
     (gaussian-cluster 1000 #:stdev 0.5 #:center #(6 0))))

  ;; using k-means++ method
  (show-clustering clouds 4)
  ;; using standard k-means method
  (show-clustering clouds 4 #:method random-choice))
 

Multi-dimensional case.

 
(module+ test
  (define 5d-data
    (append
     (gaussian-cluster 1000 #:dim 5 #:center #(2 0 0 0 0))
     (gaussian-cluster 1000 #:dim 5 #:center #(0 2 0 0 0))
     (gaussian-cluster 1000 #:dim 5 #:center #(0 0 2 0 0))
     (gaussian-cluster 1000 #:dim 5 #:center #(0 0 0 2 0))
     (gaussian-cluster 1000 #:dim 5 #:center #(0 0 0 0 2))))

  (define centroids (k-means 5d-data 5))

  (map (curry vector-map round) centroids))
 

Output shows that centroids were found correctly.

(#(-0.0 2.0 -0.0 0.0 0.0)
 #(0.0 0.0 -0.0 2.0 -0.0)
 #(2.0 -0.0 -0.0 -0.0 -0.0)
 #(-0.0 -0.0 2.0 0.0 0.0)
 #(-0.0 -0.0 0.0 0.0 2.0))

Rust

Translation of: Python (the initial point selection part)
 
extern crate csv;
extern crate getopts;
extern crate gnuplot;
extern crate nalgebra;
extern crate num;
extern crate rand;
extern crate rustc_serialize;
extern crate test;

use getopts::Options;
use gnuplot::{Axes2D, AxesCommon, Color, Figure, Fix, PointSize, PointSymbol};
use nalgebra::{DVector, Iterable};
use rand::{Rng, SeedableRng, StdRng};
use rand::distributions::{IndependentSample, Range};
use std::f64::consts::PI;
use std::env;

type Point = DVector<f64>;

struct Cluster<'a> {
    members: Vec<&'a Point>,
    center: Point,
}

struct Stats {
    centroids: Vec<Point>,
    mean_d_from_centroid: DVector<f64>,
}

/// DVector doesn't implement BaseFloat, so a custom distance function is required.
fn sqdist(p1: &Point, p2: &Point) -> f64 {
    (p1.clone() - p2.clone()).iter().map(|x| x * x).fold(0f64, |a, b| a + b)
}

/// Returns (distance^2, index) tuple of winning point.
fn nearest(p: &Point, candidates: &Vec<Point>) -> (f64, usize) {
    let (dsquared, the_index) = candidates.iter()
        .enumerate()
        .fold((sqdist(p, &candidates[0]), 0),
              |(d, index), next| {
                  let dprime = sqdist(p, &candidates[next.0]);
                  if dprime < d {
                      (dprime, next.0)
                  } else {
                      (d, index)
                  }
              });
    (dsquared, the_index)
}

/// Computes starting centroids and makes initial assignments.
fn kpp(points: &Vec<Point>, k: usize, rng: &mut StdRng) -> Stats {
    let mut centroids: Vec<Point> = Vec::new();
    // Random point for first centroid guess:
    centroids.push(points[rng.gen::<usize>() % points.len()].clone());
    let mut dists: Vec<f64> = vec![0f64; points.len()];

    for _ in 1..k {
        let mut sum = 0f64;
        for (j, p) in points.iter().enumerate() {
            let (dsquared, _) = nearest(&p, &centroids);
            dists[j] = dsquared;
            sum += dsquared;
        }

        // This part chooses the next cluster center with a probability proportional to d^2
        sum *= rng.next_f64();
        for (j, d) in dists.iter().enumerate() {
            sum -= *d;
            if sum <= 0f64 {
                centroids.push(points[j].clone());
                break;
            }
        }
    }

    let clusters = assign_clusters(points, &centroids);
    compute_stats(&clusters)
}

fn assign_clusters<'a>(points: &'a Vec<Point>, centroids: &Vec<Point>) -> Vec<Cluster<'a>> {
    let mut clusters: Vec<Cluster> = Vec::new();

    for _ in 0..centroids.len() {
        clusters.push(Cluster {
            members: Vec::new(),
            center: DVector::new_zeros(points[0].len()),
        });
    }

    for p in points.iter() {
        let (_, nearest_index) = nearest(p, centroids);
        clusters[nearest_index].center = clusters[nearest_index].center.clone() + p.clone();
        clusters[nearest_index].members.push(p);
    }

    for i in 0..clusters.len() {
        clusters[i].center = clusters[i].center.clone() / clusters[i].members.len() as f64;
    }

    clusters
}

/// Computes centroids and mean-distance-from-centroid for each cluster.
fn compute_stats(clusters: &Vec<Cluster>) -> Stats {
    let mut centroids = Vec::new();
    let mut means_vec = Vec::new();

    for c in clusters.iter() {
        let pts = &c.members;
        let seed: DVector<f64> = DVector::new_zeros(pts[0].len());
        let centroid = pts.iter().fold(seed, |a, &b| a + b.clone()) / pts.len() as f64;
        means_vec.push(pts.iter().fold(0f64, |acc, pt| acc + sqdist(pt, &centroid).sqrt()) /
                       pts.len() as f64);
        centroids.push(centroid);
    }

    Stats {
        centroids: centroids,
        mean_d_from_centroid: DVector::from_slice(means_vec.len(), means_vec.as_slice()),
    }
}

fn lloyd<'a>(points: &'a Vec<Point>,
             k: usize,
             stoppage_delta: f64,
             max_iter: u32,
             rng: &mut StdRng)
             -> (Vec<Cluster<'a>>, Stats) {

    let mut clusters = Vec::new();
    // Choose starting centroids and make initial assignments
    let mut stats = kpp(points, k, rng);

    for i in 1..max_iter {
        let last_means: DVector<f64> = stats.mean_d_from_centroid.clone();
        clusters = assign_clusters(points, &stats.centroids);
        stats = compute_stats(&clusters);
        let err = sqdist(&stats.mean_d_from_centroid, &last_means).sqrt();
        if err < stoppage_delta {
            println!("Stoppage condition reached on iteration {}", i);
            return (clusters, stats);
        }
        // Console output
        print!("Iter {}: ", i);
        for (cen, mu) in stats.centroids.iter().zip(stats.mean_d_from_centroid.iter()) {
            print_dvec(cen);
            print!(" {:1.2} | ", mu);
        }
        print!("{:1.5}\n", err);
    }

    println!("Stoppage condition not reached by iteration {}", max_iter);
    (clusters, stats)
}

/// Uniform sampling on the unit disk.
fn generate_points(n: u32, rng: &mut StdRng) -> Vec<Point> {
    let r_range = Range::new(0f64, 1f64);
    let theta_range = Range::new(0f64, 2f64 * PI);
    let mut points: Vec<Point> = Vec::new();

    for _ in 0..n {
        let root_r = r_range.ind_sample(rng).sqrt();
        let theta = theta_range.ind_sample(rng);
        points.push(DVector::<f64>::from_slice(2, &[root_r * theta.cos(), root_r * theta.sin()]));
    }

    points
}

// Plot clusters (2d only). Closure idiom allows us to borrow and mutate the Axes2D.
fn viz(clusters: Vec<Cluster>, stats: Stats, k: usize, n: u32, e: f64) {
    let mut fg = Figure::new();
    {
        let prep = |fg: &mut Figure| {
            let axes: &mut Axes2D = fg.axes2d();
            let title: String = format!("k = {}, n = {}, e = {:4}", k, n, e);
            let centroids_x = stats.centroids.iter().map(|c| c[0]);
            let centroids_y = stats.centroids.iter().map(|c| c[1]);
            for cluster in clusters.iter() {
                axes.points(cluster.members.iter().map(|p| p[0]),
                            cluster.members
                                .iter()
                                .map(|p| p[1]),
                            &[PointSymbol('O'), PointSize(0.25)]);
            }
            axes.set_aspect_ratio(Fix(1.0))
                .points(centroids_x,
                        centroids_y,
                        &[PointSymbol('o'), PointSize(1.5), Color("black")])
                .set_title(&title[..], &[]);
        };
        prep(&mut fg);
    }
    fg.show();
}

fn print_dvec(v: &DVector<f64>) {
    print!("(");
    for elem in v.at.iter().take(v.len() - 1) {
        print!("{:+1.2}, ", elem)
    }
    print!("{:+1.2})", v.at.iter().last().unwrap());
}

fn print_usage(program: &str, opts: Options) {
    let brief = format!("Usage: {} [options]", program);
    print!("{}", opts.usage(&brief));
}

fn main() {
    let args: Vec<String> = env::args().collect();
    let mut k: usize = 7;
    let mut n: u32 = 30000;
    let mut e: f64 = 1e-3;
    let max_iterations = 100u32;

    let mut opts = Options::new();
    opts.optflag("?", "help", "Print this help menu");
    opts.optopt("k",
                "",
                "Number of clusters to assign (default: 7)",
                "<clusters>");
    opts.optopt("n",
                "",
                "Operate on this many points on the unit disk (default: 30000)",
                "<pts>");
    opts.optopt("e",
                "",
                "Min delta in norm of successive cluster centroids to continue (default: 1e-3)",
                "<eps>");
    opts.optopt("f", "", "Read points from file (overrides -n)", "<csv>");

    let program = args[0].clone();
    let matches = match opts.parse(&args[1..]) {
        Ok(m) => m,
        Err(f) => panic!(f.to_string()),
    };
    if matches.opt_present("?") {
        print_usage(&program, opts);
        return;
    }
    match matches.opt_str("k") {
        None => {}
        Some(x) => k = x.parse::<usize>().unwrap(),
    };
    match matches.opt_str("n") {
        None => {}
        Some(x) => n = x.parse::<u32>().unwrap(),
    };
    match matches.opt_str("e") {
        None => {}
        Some(x) => e = x.parse::<f64>().unwrap(),
    };

    let seed: &[_] = &[1, 2, 3, 4];
    let mut rng: StdRng = SeedableRng::from_seed(seed);

    let mut points: Vec<Point>;

    match matches.opt_str("f") {
        None => {
            // Proceed with random 2d data
            points = generate_points(n, &mut rng)
        }
        Some(file) => {
            points = Vec::new();
            let mut rdr = csv::Reader::from_file(file.clone()).unwrap();
            for row in rdr.records().map(|r| r.unwrap()) {
                // row is Vec<String>
                let floats: Vec<f64> = row.iter().map(|s| s.parse::<f64>().unwrap()).collect();
                points.push(DVector::<f64>::from_slice(floats.len(), floats.as_slice()));
            }
            assert!(points.iter().all(|v| v.len() == points[0].len()));
            n = points.len() as u32;
            println!("Read {} points from {}", points.len(), file.clone());
        }
    };

    assert!(points.len() >= k);
    let (clusters, stats) = lloyd(&points, k, e, max_iterations, &mut rng);

    println!(" k       centroid{}mean dist    pop",
             std::iter::repeat(" ").take((points[0].len() - 2) * 7 + 7).collect::<String>());
    println!("===  {}  ===========  =====",
             std::iter::repeat("=").take(points[0].len() * 7 + 2).collect::<String>());
    for i in 0..clusters.len() {
        print!(" {:>1}   ", i);
        print_dvec(&stats.centroids[i]);
        print!("   {:1.2}       {:>4}\n",
               stats.mean_d_from_centroid[i],
               clusters[i].members.len());
    }

    if points[0].len() == 2 {
        viz(clusters, stats, k, n, e)
    }
}
 

[Plots exist but file upload is broken at the moment.]

Output of run on 30k points on the unit disk:

Stoppage condition reached on iteration 10
 k       centroid       mean dist    pop
===  ================  ===========  =====
 0    (+0.34, -0.61)      0.27       4425
 1    (+0.70, -0.01)      0.26       4293
 2    (-0.37, -0.59)      0.27       4319
 3    (+0.35, +0.61)      0.26       4368
 4    (-0.00, +0.01)      0.25       4095
 5    (-0.34, +0.62)      0.26       4190
 6    (-0.71, +0.04)      0.26       4310

Extra credit 4: Use of the DVector type in the nalgebra crate gives some arithmetic vector operations for free, and generalizes to n dimensions with no work. Here is the output of running this program on the 4-D Fisher Iris data (I don't think this data clusters well):

k       centroid                     mean dist    pop
===  ==============================  ===========  =====
0    (+5.00, +3.43, +1.46, +0.25)      0.49         49
1    (+5.88, +2.74, +4.39, +1.43)      0.73         61
2    (+6.85, +3.08, +5.72, +2.05)      0.73         39

Tcl

Translation of: C
Library: Tcllib (Package: math::constants)
package require Tcl 8.5
package require math::constants
math::constants::constants pi
proc tcl::mathfunc::randf m {expr {$m * rand()}}

proc genXY {count radius} {
    global pi
    for {set i 0} {$i < $count} {incr i} {
        set ang [expr {randf(2 * $pi)}]
        set r [expr {randf($radius)}]
        lappend pt [list [expr {$r*cos($ang)}] [expr {$r*sin($ang)}] -1]
    }
    return $pt
}
proc dist2 {a b} {
    lassign $a ax ay
    lassign $b bx by
    return [expr {($ax-$bx)**2 + ($ay-$by)**2}]
}

proc nearest {pt cent {d2var ""}} {
    set minD 1e30
    set minI [lindex $pt 2]
    set i -1
    foreach c $cent {
        incr i
        set d [dist2 $c $pt]
        if {$minD > $d} {
            set minD $d
            set minI $i
        }
    }
    if {$d2var ne ""} {
        upvar 1 $d2var d2
        set d2 $minD
    }
    return $minI
}

proc kpp {ptsVar centVar numClusters} {
    upvar 1 $ptsVar pts $centVar cent
    set idx [expr {int([llength $pts] * rand())}]
    set cent [list [lindex $pts $idx]]
    for {set nCent 1} {$nCent < $numClusters} {incr nCent} {
        set sum 0
        set d {}
        foreach p $pts {
            nearest $p $cent dd
            set sum [expr {$sum + $dd}]
            lappend d $dd
        }
        set sum [expr {randf($sum)}]
        foreach p $pts dj $d {
            set sum [expr {$sum - $dj}]
            if {$sum <= 0} {
                lappend cent $p
                break
            }
        }
    }
    set i -1
    foreach p $pts {
        lset pts [incr i] 2 [nearest $p $cent]
    }
}

proc lloyd {ptsVar numClusters} {
    upvar 1 $ptsVar pts
    kpp pts cent $numClusters
    while 1 {
        # Find centroids for round
        set groupCounts [lrepeat [llength $cent] 0]
        foreach p $pts {
            lassign $p cx cy group
            lset groupCounts $group [expr {[lindex $groupCounts $group] + 1}]
            lset cent $group 0 [expr {[lindex $cent $group 0] + $cx}]
            lset cent $group 1 [expr {[lindex $cent $group 1] + $cy}]
        }
        set i -1
        foreach groupn $groupCounts {
            incr i
            lset cent $i 0 [expr {[lindex $cent $i 0] / $groupn}]
            lset cent $i 1 [expr {[lindex $cent $i 1] / $groupn}]
        }

        set changed 0
        set i -1
        foreach p $pts {
            incr i
            set minI [nearest $p $cent]
            if {$minI != [lindex $p 2]} {
                incr changed
                lset pts $i 2 $minI
            }
        }
        if {$changed < ([llength $pts] >> 10)} break
    }
    set i -1
    foreach c $cent {
        lset cent [incr i] 2 $i
    }
    return $cent
}

Demonstration/visualization code:

Library: Tk
package require Tk
image create photo disp -width 400 -height 400
pack [label .l -image disp]
update
proc plot {x y color} {
    disp put $color -to [expr {int(200+19.9*$x)}] [expr {int(200+19.9*$y)}]
}
apply {{} {
    set POINTS [genXY 100000 10]
    set CENTROIDS [lloyd POINTS 11]
    foreach c $CENTROIDS {
        lappend colors [list [list [format "#%02x%02x%02x" \
                [expr {64+int(128*rand())}] [expr {64+int(128*rand())}] \
                [expr {64+int(128*rand())}]]]]
    }
    foreach pt $POINTS {
        lassign $pt px py group
        plot $px $py [lindex $colors $group]
    }
    foreach c $CENTROIDS {
        lassign $c cx cy group
        plot $cx $cy black
    }
}}

XPL0

As with C, simplicity and clarity were chosen over extra credit. Also, the dataset is global, and the arrays are separate instead of being packed into two arguments and passed into the KMeans procedure. Hopefully the animated display, showing the convergence of the clusters, compensates somewhat for these sins. Alas, image uploads appear to be broken.

include c:\cxpl\codes;                  \intrinsic 'code' declarations

def     N = 30000;                      \number of points
def     K = 6;                          \number of clusters
int     Px(N), Py(N), Pc(N),            \coordinates of points and their cluster
        Cx(K), Cy(K);                   \coordinates of centroid of cluster


func Centroid;  \Find new centroids of points grouped with current centroids
int  Change, Cx0(K), Cy0(K), C, Count, I;
[Change:= false;
for C:= 0 to K-1 do                     \for each centroid...
    [Cx0(C):= Cx(C);  Cy0(C):= Cy(C);   \save current centroid
    Cx(C):= 0;  Cy(C):= 0;  Count:= 0;  \find new centroid
    for I:= 0 to N-1 do                 \for all points
        if Pc(I) = C then               \ grouped with current centroid...
            [Cx(C):= Cx(C) + Px(I);
             Cy(C):= Cy(C) + Py(I);
             Count:= Count+1;
            ];
    Cx(C):= Cx(C)/Count;  Cy(C):= Cy(C)/Count;
    if Cx(C)#Cx0(C) or Cy(C)#Cy0(C) then Change:= true;
    ];
return Change;
];


proc Voronoi;   \Group points with their nearest centroid
int  D2, MinD2, I, C;   \distance squared, minimum distance squared
[for I:= 0 to N-1 do    \for each point...
    [MinD2:= -1>>1;     \find closest centroid
    for C:= 0 to K-1 do
        [D2:= sq(Px(I)-Cx(C)) + sq(Py(I)-Cy(C));
        if D2 < MinD2 then
            [MinD2:= D2;  Pc(I):= C];   \update closest centroid
        ];
    ];
];


proc KMeans;    \Group points into K clusters
int  Change, I;
repeat  Voronoi;
        Change:= Centroid;
        SetVid($101);                   \show result on 640x480x8 screen
        for I:= 0 to N-1 do Point(Px(I), Py(I), Pc(I)+1);
        for I:= 0 to K-1 do Point(Cx(I), Cy(I), \bright white\ $F);
until   Change = false;


proc Random(X, Y);      \Return random X,Y biased for polar coordinates
int  X, Y;
real A, D;
[D:= float(Ran(240));                   \distance: 0..239
A:= float(Ran(314159*2)) / 10000.0;     \angle: 0..2pi
X(0):= fix(D*Cos(A)) + 320;             \rectangular coords centered on screen
Y(0):= fix(D*Sin(A)) + 240;
];


int I;
[for I:= 0 to N-1 do Random(@Px(I), @Py(I));    \random set of points
for I:= 0 to K-1 do Random(@Cx(I), @Cy(I));     \random set of cluster centroids
KMeans;
I:= ChIn(1);            \wait for keystroke
SetVid($03);            \restore normal text screen
]