K-means++ clustering

To extend the code to handle dimensions higher than 2, give <code>POINT</code> more coordinates, update the <code>dist2</code> distance function, and update the centroid computation in the <code>lloyd</code> K-Means function. Multidimensional scaling will be needed to visualize the output.
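A minimal sketch of that change (illustrative only; the coordinate name <code>z</code> is an assumption, and the entry below remains strictly 2-D; the centroid sums in <code>lloyd</code> would need a matching <code>z</code> accumulator):

<syntaxhighlight lang="c">typedef struct {
    double x;
    double y;
    double z;      /* added third coordinate */
    int group;
} POINT;

/* squared Euclidean distance in 3-D */
double dist2(POINT * a, POINT * b)
{
    double dx = a->x - b->x;
    double dy = a->y - b->y;
    double dz = a->z - b->z;
    return dx * dx + dy * dy + dz * dz;
}</syntaxhighlight>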
 
This code uses the function <code>kppFaster</code> to find the initial centroids; it is faster than the original function <code>kpp</code>, especially with large data sets. <code>kppFaster</code> keeps an array of each point's shortest distance to the previously selected centroids and selects new centroids with a bisection search over the cumulative distances, so it does not need the function <code>nearestDistance</code>. The original functions <code>kpp</code> and <code>nearestDistance</code> are included here for comparison.
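For reference, k-means++ selects each new centroid from the data points with probability proportional to the squared distance <math>D(x)^2</math> from each point to its nearest already-chosen centroid:

<math>P(x) = \frac{D(x)^2}{\sum_{x'} D(x')^2}</math>

Because <code>dist2</code> already returns squared distances, the array of cumulative distances built by <code>kppFaster</code> is the unnormalized cumulative distribution of this weighting; drawing a uniform value in <math>[0, \textstyle\sum_{x'} D(x')^2)</math> and bisecting locates the selected point in <math>O(\log n)</math> time.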
 
<syntaxhighlight lang="c"># define NUMBER_OF_POINTS 100000
# define NUMBER_OF_CLUSTERS 11
# define MAXIMUM_ITERATIONS 100
# define RADIUS 10.0
 
 
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
 
 
typedef struct {
double x;
double y;
int group;
} POINT;
 
/*------------------------------------------------------
   gen_xy

   This function allocates and returns an array of points
   placed at random within a circle of the given radius.
------------------------------------------------------*/
POINT * gen_xy(int num_pts, double radius)
{
int i;
double ang, r;
POINT * pts;
   pts = (POINT*) malloc(sizeof(POINT) * num_pts);

   /* Note: this is not a uniform 2-d distribution. */
   for (i = 0; i < num_pts; i++) {
      ang = 2.0 * M_PI * rand() / (double) RAND_MAX;
      r = radius * rand() / (double) RAND_MAX;
      pts[i].x = r * cos(ang);
      pts[i].y = r * sin(ang);
      pts[i].group = 0;
   }
   return pts;
} /* end, gen_xy */

/*------------------------------------------------------
   dist2

   This function returns the squared distance between
   two data points.
------------------------------------------------------*/
double dist2(POINT * a, POINT * b)
{
   double x = a->x - b->x;
   double y = a->y - b->y;

return x*x + y*y;
}
 
/*------------------------------------------------------
nearest
 
This function returns the index of the cluster centroid
nearest to the data point passed to this function.
------------------------------------------------------*/
int nearest(POINT * pt, POINT * cent, int n_cluster)
{
   int i, clusterIndex;
   double d, min_d;

   min_d = HUGE_VAL;
   clusterIndex = pt->group;
   for (i = 0; i < n_cluster; i++) {
      d = dist2(&cent[i], pt);
      if (min_d > d) {
         min_d = d;
clusterIndex = i;
}
}
return clusterIndex;
}

/*------------------------------------------------------
nearestDistance
 
This function returns the distance of the cluster centroid
nearest to the data point passed to this function.
------------------------------------------------------*/
double nearestDistance(POINT * pt, POINT * cent, int n_cluster)
{
   int i;
   double d, min_d;

   min_d = HUGE_VAL;
   for (i = 0; i < n_cluster; i++) {
      d = dist2(&cent[i], pt);
      if (min_d > d) {
         min_d = d;
}
}
 
return min_d;
}
 
/*----------------------------------------------------------------------
bisectionSearch
This function performs a bisection search on an array of values
sorted in increasing order, and returns the index of the first
element greater than the search value passed as a parameter.
This code is adapted from code by Andy Allinger given to the public
domain, which was in turn adapted from public domain code for spline
evaluation by Rondall Jones (Sandia National Laboratories).
 
Input:
x A pointer to an array of values in increasing order to be searched.
n The number of elements in the input array x.
v The search value.
Output:
Returns the index of the first element greater than the search value, v.
----------------------------------------------------------------------*/
int bisectionSearch(double *x, int n, double v)
{
int il, ir, i;
 
if (n < 1) {
return 0;
}
/* If v is less than x(0) or greater than x(n-1) */
if (v < x[0]) {
return 0;
}
else if (v > x[n-1]) {
return n - 1;
}
/*bisection search */
il = 0;
ir = n - 1;
 
i = (il + ir) / 2;
while ( i != il ) {
if (x[i] <= v) {
il = i;
} else {
ir = i;
}
i = (il + ir) / 2;
}
 
if (x[i] <= v)
i = ir;
return i;
} /* end of bisectionSearch */
 
/*-------------------------------------------------------
kppFaster
This function uses the K-Means++ method to select
the cluster centroids.
This code is adapted from code by Andy Allinger given to the
public domain.
 
Input:
pts A pointer to an array of data points.
num_pts The number of points in the pts array.
centroids A pointer to an array to receive the centroids.
num_clusters The number of clusters to be found.
Output:
centroids A pointer to the array of centroids found.
-------------------------------------------------------*/
void kppFaster(POINT * pts, int num_pts, POINT * centroids,
int num_clusters)
{
int j;
int selectedIndex;
int cluster;
double sum;
double d;
double random;
double * cumulativeDistances;
double * shortestDistance;
 
cumulativeDistances = (double*) malloc(sizeof(double) * num_pts);
shortestDistance = (double*) malloc(sizeof(double) * num_pts);
 
/* Pick the first cluster centroid at random. */
selectedIndex = rand() % num_pts;
centroids[0] = pts[ selectedIndex ];
for (j = 0; j < num_pts; ++j)
shortestDistance[j] = HUGE_VAL;
/* Select the centroids for the remaining clusters. */
for (cluster = 1; cluster < num_clusters; cluster++) {
/* For each point find its closest distance to any of
the previous cluster centers */
for ( j = 0; j < num_pts; j++ ) {
d = dist2(&pts[j], &centroids[cluster-1] );
if (d < shortestDistance[j])
shortestDistance[j] = d;
}
/* Create an array of the cumulative distances. */
sum = 0.0;
for (j = 0; j < num_pts; j++) {
sum += shortestDistance[j];
cumulativeDistances[j] = sum;
}
 
/* Select a point at random. Those with greater distances
have a greater probability of being selected. */
random = (float) rand() / (float) RAND_MAX * sum;
selectedIndex = bisectionSearch(cumulativeDistances, num_pts, random);
/* assign the selected point as the center */
centroids[cluster] = pts[selectedIndex];
}
 
/* Assign each point the index of its nearest cluster centroid. */
for (j = 0; j < num_pts; j++)
pts[j].group = nearest(&pts[j], centroids, num_clusters);
 
free(shortestDistance);
free(cumulativeDistances);
 
return;
} /* end, kppFaster */
 
/*-------------------------------------------------------
   kpp

   This function uses the K-Means++ method to select
   the cluster centroids (the original, slower version).
-------------------------------------------------------*/
void kpp(POINT * pts, int num_pts, POINT * centroids,
               int num_clusters)
{
   int j;
   int cluster;
   double sum;
double * distances;
 
distances = (double*) malloc(sizeof(double) * num_pts);
 
/* Pick the first cluster centroid at random. */
centroids[0] = pts[ rand() % num_pts ];
/* Select the centroids for the remaining clusters. */
   for (cluster = 1; cluster < num_clusters; cluster++) {
      /* Sum each point's distance to its nearest selected centroid. */
      sum = 0;
      for (j = 0; j < num_pts; j++) {
         distances[j] = nearestDistance(&pts[j], centroids, cluster);
         sum += distances[j];
      }
      /* Draw a random threshold in [0, sum). */
      sum = (double) rand() / (double) RAND_MAX * sum;
      /* Assign the centroids. Points with larger distances
         have a greater probability of being selected. */
for (j = 0; j < num_pts; j++ ) {
sum -= distances[j];
         if (sum > 0) continue;
         /* Assign the selected point as the center. */
         centroids[cluster] = pts[j];
         break;
      }
   }

   /* Assign each point the index of its nearest cluster centroid. */
   for (j = 0; j < num_pts; j++)
      pts[j].group = nearest(&pts[j], centroids, num_clusters);
 
free(distances);
 
return;
} /* end, kpp */
 
 
/*-------------------------------------------------------
   lloyd

   This function uses the K-Means method (Lloyd's algorithm)
   to group the points into clusters, and returns an array
   holding the cluster centroids.
-------------------------------------------------------*/
POINT * lloyd(POINT * pts, int num_pts, int num_clusters, int maxTimes)
{
int i, clusterIndex;
int changes;
int acceptable = num_pts / 1000;	/* The maximum number of point changes acceptable. */
 
 
if (num_clusters == 1 || num_pts <= 0 || num_clusters > num_pts )
return 0;
 
 
POINT * centroids = (POINT *)malloc(sizeof(POINT) * num_clusters);

   /* The centroids could be picked at random:
   for (i = 0; i < num_clusters; i++) {
      centroids[i] = pts[rand() % num_pts];
}
*/
/* or use the k-Means++ method */
 
/* Original version
kpp(pts, num_pts, centroids, num_clusters);
*/
/* Faster version */
kppFaster(pts, num_pts, centroids, num_clusters);
 
do {
      /* Calculate each cluster's centroid; the group element
         doubles as a point counter. */
      for (i = 0; i < num_clusters; i++) {
         centroids[i].x = 0;
         centroids[i].y = 0;
         centroids[i].group = 0;
      }
      for (i = 0; i < num_pts; i++) {
         clusterIndex = pts[i].group;
         centroids[clusterIndex].x += pts[i].x;
         centroids[clusterIndex].y += pts[i].y;
         centroids[clusterIndex].group++;
      }
      for (i = 0; i < num_clusters; i++) {
         centroids[i].x /= centroids[i].group;
         centroids[i].y /= centroids[i].group;
      }
/* Find each data point's nearest centroid */
changes = 0;
for ( i = 0; i < num_pts; i++ ) {
clusterIndex = nearest(&pts[i], centroids, num_clusters);
if (clusterIndex != pts[i].group) {
pts[i].group = clusterIndex;
changes++;
}
}
maxTimes--;
} while ((changes > acceptable) && (maxTimes > 0));
/* Set each centroid's group index */
   for (i = 0; i < num_clusters; i++)
      centroids[i].group = i;
 
return centroids;
} /* end, lloyd */
}
 
/*-------------------------------------------------------
   print_eps

   This function writes an Encapsulated PostScript rendering
   of the points and cluster centroids to standard output.
-------------------------------------------------------*/
void print_eps(POINT * pts, int num_pts, POINT * centroids, int num_clusters)
{
   double * colors;
   /* ... */
 
free(colors);
	return;
} /* end print_eps */
/*-------------------------------------------------------
main
-------------------------------------------------------*/
int main()
{
int num_pts = NUMBER_OF_POINTS;
int num_clusters = NUMBER_OF_CLUSTERS;
int maxTimes = MAXIMUM_ITERATIONS;
double radius = RADIUS;
POINT * pts;
   POINT * centroids;

   pts = gen_xy(num_pts, radius);
   centroids = lloyd(pts, num_pts, num_clusters, maxTimes);
   print_eps(pts, num_pts, centroids, num_clusters);

   free(centroids);
   free(pts);
 
return 0;
}</syntaxhighlight>
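Note that <code>gen_xy</code> samples the radius uniformly, so the distribution over the disc is not uniform: points bunch up near the centre. If a uniform distribution over the disc is wanted, take the square root of a uniform variate before scaling by the radius. A minimal sketch (not part of the original entry; the function name is hypothetical):

<syntaxhighlight lang="c">#include <math.h>
#include <stdlib.h>

/* Uniform sampling over a disc of the given radius:
   P(r <= t) must equal (t/radius)^2, so r = radius * sqrt(u)
   with u uniform in [0,1]. */
void uniform_disc_point(double radius, double *x, double *y)
{
    double u   = rand() / (double) RAND_MAX;
    double ang = 2.0 * M_PI * (rand() / (double) RAND_MAX);
    double r   = radius * sqrt(u);

    *x = r * cos(ang);
    *y = r * sin(ang);
}</syntaxhighlight>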
 
=={{header|Crystal}}==
{{trans|Racket,Python}}
 
<syntaxhighlight lang="crystal">
 
# kmeans++ clustering in crystal lang
#
# Task :: function that takes two arguments
# k : uint - the number of clusters
# points : [[float]] - the dataset to classify
#
# and returns a list of clusters
 
# The algorithm of kmeans with a specific initialization
#
# k : int - number of clusters
# points : [[float]] - the dataset of n-dimensional points
# distance : ([float],[float])->float - the distance between two points
# convergence_threshold : float - ratio of correctly classified points
# rng : (msg)->float - random number generator
#
# {[[float]],[int]} - returns a tuple of the center of the cluster and an array
# with the cluster-id for each point.
#
def kmeans(k, points,
distance = ->euclidean_distance(Array(Float64),Array(Float64)),
convergence_threshold=0.99,
rng = Random::DEFAULT)
 
# ---------------------------------------------------------------------------
# the k++ method for choosing the initial values ('seeds') for the k-means
# clustering.
# ---------------------------------------------------------------------------
 
# arrays of the clusters centers and the number of elements in each cluster
c_means = points.sample(k,rng).clone
c_cnt = Array.new(k,0)
# arrays for each point distance to nearest cluster and the nearest cluster id
p_dist = Array.new(points.size) do 1/0.0 end
p_cluster = Array.new(points.size) do rng.rand(0 ... k) end
 
# choose one center uniformly at random among data points
c_means = [points.sample.clone]
# to select the k-1 remaining centers
(1 ... k).each do |_|
# For each data point compute the distance (d(x)) and the nearest cluster center.
(0 ... points.size).each do |p_index|
(0 ... c_means.size).each do |c_index|
d = distance.call(points[p_index],c_means[c_index])
if d < p_dist[p_index]
p_dist[p_index] = d
p_cluster[p_index] = c_index
end
end
end
# choose one new data point at random as a new center with a weighted
# probability distribution where a point is chosen with probability
# proportional to its squared distance. (d(x)^2)
sum = 0.0
(0 ... p_dist.size).each do |p_index|
p_dist[p_index] = p_dist[p_index]**2
sum += p_dist[p_index]
end
sum *= rng.rand(0.0 .. 1.0)
(0 ... points.size).each do |p_index|
sum -= p_dist[p_index]
next if sum > 0
c_means.push(points[p_index].clone)
break
end
end
# ---------------------------------------------------------------------------
# kmeans clustering
# ---------------------------------------------------------------------------
# with the previous cluster centers, the kmeans naive algorithm is performed
# until the convergence_threshold is achieved
changes_cnt = points.size
while (changes_cnt.to_f/(1.0-convergence_threshold)) >= points.size
 
changes_cnt = 0
 
# assign each point to the nearest cluster
(0 ... points.size).each do |p_index|
nearest_c_index = (0 ... k).min_by do |c_index|
distance.call(c_means[c_index],points[p_index])
end
changes_cnt += (p_cluster[p_index] != nearest_c_index) ? 1 : 0
p_cluster[p_index] = nearest_c_index
end
 
# use the points of each cluster to calculate its center using the mean
# Reset means
p_dim = points[0].size
(0 ... k).each do |c_index|
(0 ... p_dim).each do |x_index|
c_means[c_index][x_index] = 0.0
c_cnt[c_index] = 0
end
end
 
# calculate the mean of the points of each cluster
(0 ... points.size).each do |p_index|
c_index = p_cluster[p_index]
c_cnt[c_index] += 1
(0 ... p_dim).each do |x_index|
c_means[c_index][x_index] += points[p_index][x_index]
end
end
(0 ... k).each do |c_index|
(0 ... p_dim).each do |x_index|
c_means[c_index][x_index] /= c_cnt[c_index].to_f
end
end
end
 
# return the center of each cluster and the membership of each point
return c_means,p_cluster
end
 
# the euclidean distance is used in the kmeans++-algorithm
def euclidean_distance(pa,pb)
return (0 ... pa.size).each.reduce(0.0) do |s,i| s + (pa[i] - pb[i])**2 end
end
 
# alternative distance
def manhattan_distance(pa,pb)
return (0 ... pa.size).each.reduce(0.0) do |s,i| s + (pa[i] - pb[i]).abs end
end
</syntaxhighlight>
 
 
Extra functions
 
 
<syntaxhighlight lang="crystal">
 
# -----------------------------------------------------------------------------
# Function to exercise the code, that generates a list of random points
# -----------------------------------------------------------------------------
 
# Generates a cluster of random points in a unitary circle
#
def uniform_cluster(num_points,radius = 1.0,center = [0.0,0.0],rng=Random::DEFAULT)
return Array.new(num_points) do |_|
r = radius*Math.sqrt(rng.rand(0.0 .. 1.0))
angle = rng.rand(0.0 .. 2*Math::PI)
[center[0] + r*Math.cos(angle), center[1] + r*Math.sin(angle)]
end
end
 
# Generates an n-dimensional cluster with Gaussian distribution
def gaussian_cluster(num_points,stdev=1.0,center=[0.0,0.0],rng=Random::DEFAULT)
dimensions = center.size
return Array.new(num_points) do
Array.new(dimensions) do |d|
sum = 0.0
12.times do sum += rng.rand(0.0 .. 1.0) end
sum -= 6.0
center[d] + (sum * stdev)
end
end
end
 
# -----------------------------------------------------------------------------
# Visualization of the results
# -----------------------------------------------------------------------------
 
# This functions creates an svg file with the points, the cluster centers and
# the classification of each point.
#
def plot(cluster_means,points,points_cluster,fname="kmeans_output.svg")
# Mapping the points to the interval (0 .. 1)
xmin,xmax = points.minmax_by do |p| p[0] end.map(&.[0])
ymin,ymax = points.minmax_by do |p| p[1] end.map(&.[1])
xspan = (xmax-xmin) + 1e-12
yspan = (ymax-ymin) + 1e-12
points = points.map do |p|
x = (p[0] - xmin) / xspan
y = (ymin - p[1]) / yspan
[x,y]
end
cluster_means = cluster_means.map do |p|
x = (p[0] - xmin) / xspan
y = (ymin - p[1]) / yspan
[x,y]
end
 
# Generate the file
File.open(fname,"w+") do |io|
io.puts(%(<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="-0.05 -1.05 1.1 1.1">))
(0 ... points.size).each do |p_index|
p = points[p_index]
c = points_cluster[p_index] * (330.0 / cluster_means.size)
io.puts(%(<circle cx="#{p[0]}" cy="#{p[1]}" r="0.005" fill="hsl(#{c} 60% 50%)" stroke="none"></circle>))
end
(0 ... cluster_means.size).each do |c_index|
p = cluster_means[c_index]
c = c_index * (330.0 / cluster_means.size)
io.puts(%(<circle cx="#{p[0]}" cy="#{p[1]}" r="0.02" fill="hsl(50 100% 60%)"
stroke-width="0.004" stroke="hsl(52 100% 0%)"></circle>))
end
io.puts(%(</svg>))
end
end
 
</syntaxhighlight>
 
Examples
 
[[File:Clustering-using-k-means-pp-and-euclidean-distance.svg|250x250px|thumb|Clustering of random points in a unitary circle using k-means ++ and euclidean distance]]
 
[[File:Clustering-using-k-means-pp-and-manhattan-distance.svg|250x250px|thumb|Clustering of random points in a unitary circle using k-means ++ and manhattan distance]]
 
[[File:Gaussian-clustering.svg|250x250px|thumb|alt=Kmeans++ algorithm on gaussian clusters|Kmeans++ algorithm on gaussian clusters]]
 
 
<syntaxhighlight lang="crystal">
rngseed = 19
 
# Basic usage
points = uniform_cluster(num_points: 30000,rng: Random.new(rngseed))
cluster_center,point_cluster = kmeans(6, points, rng: Random.new(rngseed))
plot(cluster_center,points,point_cluster,fname: "clustering-using-k-means-pp.svg")
 
# Using another distance
points = uniform_cluster(num_points: 30000,rng: Random.new(rngseed))
cluster_center,point_cluster = kmeans(6, points, rng: Random.new(rngseed),
distance: ->manhattan_distance(Array(Float64),Array(Float64)))
plot(cluster_center,points,point_cluster,fname: "clustering-using-k-means-pp-and-manhattan.svg")
 
# difficult case
points = [] of Array(Float64)
rng = Random.new(rngseed)
points += gaussian_cluster(num_points: 10000,stdev: 0.5,center: [0.0,0.0],rng: rng)
points += gaussian_cluster(num_points: 10000,stdev: 0.5,center: [2.0,3.0],rng: rng)
points += gaussian_cluster(num_points: 10000,stdev: 0.5,center: [2.5,-1.0],rng: rng)
points += gaussian_cluster(num_points: 10000,stdev: 0.5,center: [6.0,0.0],rng: rng)
cluster_center,point_cluster = kmeans(4, points, rng: Random.new(rngseed))
plot(cluster_center,points,point_cluster,fname: "gaussian-clustering.svg")
 
# 5d-data
points = [] of Array(Float64)
rng = Random.new(rngseed)
points += gaussian_cluster(num_points: 5000,stdev: 0.5, center: [2.0,0.0,0.0,0.0,0.0],rng: rng)
points += gaussian_cluster(num_points: 5000,stdev: 0.5, center: [0.0,2.0,0.0,0.0,0.0],rng: rng)
points += gaussian_cluster(num_points: 5000,stdev: 0.5, center: [0.0,0.0,2.0,0.0,0.0],rng: rng)
points += gaussian_cluster(num_points: 5000,stdev: 0.5, center: [0.0,0.0,0.0,2.0,0.0],rng: rng)
points += gaussian_cluster(num_points: 5000,stdev: 0.5, center: [0.0,0.0,0.0,0.0,2.0],rng: rng)
cluster_center,point_cluster = kmeans(5, points, convergence_threshold:0.99999)
puts(cluster_center.map(&.map(&.round(2))).join("\n"))
 
</syntaxhighlight>
 
Output shows that centroids were found correctly.
 
 
<pre>
#output for 5d data
[0.01, -0.0, 2.0, 0.01, 0.0]
[-0.0, 0.0, -0.0, -0.0, 2.0]
[-0.01, 2.01, 0.01, 0.01, -0.01]
[0.0, 0.0, -0.01, 2.01, 0.0]
[2.0, 0.01, -0.0, 0.0, 0.01]
</pre>
 
 
=={{header|D}}==
{{trans|C}}
<langsyntaxhighlight lang="d">import std.stdio, std.math, std.random, std.typecons, std.algorithm;
 
// On Windows this uses the printf from the Microsoft C runtime,
// ...
const clusterCenters = lloyd(points, nClusters, rnd);
printEps(points, clusterCenters);
}</syntaxhighlight>
Compiled with ldc2 it's about as fast as the C entry.
 
=={{header|Delphi}}==
{{works with|Delphi|6.0}}
{{libheader|Types,ExtCtrls,Graphics}}
Translated from XPL0.
 
The XPL0 version was simple, straightforward, and clear. In other words, it was too good to try to develop a different one. For clarity and simplicity, I did convert some separate components into structs.
 
 
<syntaxhighlight lang="Delphi">
 
const N = 30000; {number of points}
const K = 6; {number of clusters}
 
var ScreenSize,Center: TPoint;
var Radius: integer;
 
var Points: array [0..N-1] of TPoint; {coordinates of points and their cluster}
var Pc: array [0..N-1] of TColor;
var Cent: array [0..K-1] of TPoint; {coordinates of centroid of cluster}
 
const Palette: array [0..5] of TColor =(
$00 or ($00 shl 8) or ($AA shl 16),
$00 or ($AA shl 8) or ($00 shl 16),
$00 or ($AA shl 8) or ($AA shl 16),
$AA or ($00 shl 8) or ($00 shl 16),
$AA or ($00 shl 8) or ($AA shl 16),
$AA or ($55 shl 8) or ($00 shl 16));
 
{These would normally be in a separate library}
{Shown here for clarity}
 
 
procedure ClearImage(Image: TImage; Color: TColor);
var R: TRect;
begin
R:=Rect(0,0,Image.Picture.Bitmap.Width,Image.Picture.Bitmap.Height);
Image.Canvas.Brush.Color:=Color;
Image.Canvas.Brush.Style:=bsSolid;
Image.Canvas.Pen.Mode:=pmCopy;
Image.Canvas.Pen.Style:=psSolid;
Image.Canvas.Pen.Color:=Color;
Image.Canvas.Rectangle(R);
Image.Invalidate;
end;
 
 
function PointAdd(V1,V2: TPoint): TPoint;
{Add V1 and V2}
begin
Result.X:= V1.X+V2.X;
Result.Y:= V1.Y+V2.Y;
end;
 
function PointScalarDivide(V: TPoint; S: double): TPoint;
{Divide vector by scalar}
begin
Result.X:=Trunc(V.X/S);
Result.Y:=Trunc(V.Y/S);
end;
 
{--------------- Main Program -------------------------------------------------}
 
function Centroid: boolean;
{Find new centroids of points grouped with current centroids}
var Change: boolean;
var C0: array [0..K-1] of TPoint;
var C, Count, I: integer;
begin
Change:= false;
for C:= 0 to K-1 do {for each centroid...}
begin
C0[C]:= Cent[C]; {save current centroid}
Cent[C]:=Point(0,0); Count:= 0; {find new centroid}
for I:= 0 to N-1 do {for all points}
if Pc[I] = Palette[C] then { grouped with current centroid...}
begin
Cent[C]:=PointAdd(Cent[C],Points[I]);
Count:= Count+1;
end;
Cent[C]:=PointScalarDivide(Cent[C],Count);
if (Cent[C].X<>C0[C].X) or (Cent[C].Y<>C0[C].Y) then Change:= true;
end;
Result:=Change;
end;
 
 
procedure Voronoi;
{Group points with their nearest centroid}
var D2, MinD2, I, C: integer; {distance squared, minimum distance squared}
begin
for I:= 0 to N-1 do {for each point...}
begin
MinD2:= High(Integer); {find closest centroid}
for C:= 0 to K-1 do
begin
D2:= sqr(Points[I].X-Cent[C].X) + sqr(Points[I].Y-Cent[C].Y);
if D2 < MinD2 then
begin
{update closest centroid}
MinD2:= D2;
Pc[I]:= Palette[C];
end;
end;
end;
end;
 
 
procedure KMeans(Image: TImage);
{Group points into K clusters}
var Change: boolean;
var I: integer;
begin
repeat
begin
Voronoi;
Change:= Centroid;
for I:= 0 to N-1 do Image.Canvas.Pixels[Points[I].X, Points[I].Y]:=Pc[I]+1;
Image.Repaint;
for I:= 0 to K-1 do Image.Canvas.Pixels[Cent[I].X, Cent[I].Y]:=clWhite;
Image.Repaint;
end
until Change = false;
end;
 
 
procedure PolarRandom(var P: TPoint);
{Return random X,Y biased for polar coordinates}
var A, D: double;
begin
D:=Random(Radius); {distance: 0..239}
A:=Random(314159*2) / 10000; {angle: 0..2pi}
{rectangular coords centered on screen}
P:=PointAdd(Point(Trunc(D*Cos(A)),Trunc(D*Sin(A))),Center);
end;
 
 
procedure ConfigureScreen(Image: TImage);
{Configure screen constants to match the current state of Image}
begin
ScreenSize:=Point(Image.Width,Image.Height);
Center:=Point(Image.Width div 2,Image.Height div 2);
if Center.X<Center.Y then Radius:=Center.X
else Radius:=Center.Y;
end;
 
 
procedure DoKMeansClustering(Image: TImage);
var I: integer;
begin
ConfigureScreen(Image);
ClearImage(Image,clBlack);
for I:= 0 to N-1 do PolarRandom(Points[I]); {random set of points}
for I:= 0 to K-1 do PolarRandom(Cent[I]); {random set of cluster centroids}
KMeans(Image);
end;
 
 
</syntaxhighlight>
{{out}}
[[File:DelphiKMeanCluster.png|none]]
 
=={{header|Euler Math Toolbox}}==
 
<syntaxhighlight lang="euler math toolbox">
>type kmeanscluster
function kmeanscluster (x: numerical, k: index)
  ...
return j
endfunction
</syntaxhighlight>
 
Let us apply to random data.
 
<syntaxhighlight lang="euler math toolbox">
>load clustering.e
Functions for clustering data.
  ...
> loop 1 to k; plot2d(m[#,1],m[#,2],points=1,style="o#",add=1); end; ...
> insimg;
</syntaxhighlight>
 
[[File:ClusteringEulerMathToolbox.png]]
=={{header|Fortran}}==
{{works with|GNU Fortran |6.1.1 20160802}}
 
<syntaxhighlight lang="fortran">
***********************************************************************
* KMPP - K-Means++ - Traditional data clustering with a special initialization
*     ...
END ! of test program
</syntaxhighlight>
 
 
Uniform random points over the unit circle (compare to the solution in C)
[[File:kmeans_FORTRAN_round_points.gif]]
[http://13olive.net/kmeans_FORTRAN_points.png (External image)]
 
The points with clusters marked by color:
[[File:kmeans_FORTRAN_round_clusters.gif]]
[http://13olive.net/kmeans_FORTRAN_clusters.png (External image)]
 
=={{header|Go}}==
<langsyntaxhighlight lang="go">package main
 
import (
// ...
fmt.Println(err)
}
}</syntaxhighlight>
Text output:
<pre>
Line 1,160 ⟶ 1,746:
Vectors are represented as lists, so the solution could be extended to any space dimension.
 
<langsyntaxhighlight lang="haskell">{-# LANGUAGE Strict,FlexibleInstances #-}
module KMeans where
-- ...
 
mkCluster n s m = take n . transpose <$> mapM randomsAround m
  where randomsAround x0 = map (\x -> x0+s*atanh x) <$> getRandomRs (-1,1)</syntaxhighlight>
 
'''Examples'''
 
<langsyntaxhighlight lang="haskell">module Main where
 
import Graphics.EasyPlot
-- ...
where
listPlot = Data2D [Title "",Style Dots] [] . map (\(x:y:_) -> (x,y))
</syntaxhighlight>
Result: all centroids and clusters are found.
<pre>λ> test
...
</pre>
 
=={{header|Huginn}}==
<langsyntaxhighlight lang="huginn">#! /bin/sh
exec huginn -E "${0}" "${@}"
#! huginn
// ...
}
print( "\n%%%%EOF\n" );
}</syntaxhighlight>
 
=={{header|J}}==
 
'''Solution''':<syntaxhighlight lang="j"> NB. Selection of initial centroids, per K-means++
initialCentroids =: (] , randomCentroid)^:(<:@:]`(,:@:seedCentroid@:[))~
seedCentroid =: {~ ?@#
NB. ...
centroids =: ([ mean/.~ closestCentroid)^:(]`_:`initialCentroids)
closestCentroid =: [: (i.<./)"1 distance/
mean =: +/ % #</syntaxhighlight>
 
'''Extra credit''':<syntaxhighlight lang="j"> randMatrix =: ?@$&0 NB. Extra credit #1
packPoints =: <"1@:|: NB. Extra credit #2: Visualization code due to Max Harms http://j.mp/l8L45V
plotClusters =: dyad define NB. as is the example image in this task
NB. ...
 
NB. Extra credit #4: Polar coordinates are not available in this version
NB. but wouldn't be hard to provide with &.cartToPole .</syntaxhighlight>
 
'''Example''':<syntaxhighlight lang="j"> plotRandomClusters =: 3&$: : (dyad define)
dataset =. randMatrix 2 {. y,2
 
NB. ...
plotRandomClusters 300 NB. 300 points, 3 clusters
6 plotRandomClusters 30000 NB. 3e4 points, 6 clusters
10 plotRandomClusters 17000 5 NB. 17e3 points, 10 clusters, 5 dimensions</syntaxhighlight>
 
=={{header|Java}}==
{{trans|C}}
<langsyntaxhighlight lang="java">
import java.util.Random;
 
// ...
}
</syntaxhighlight>
 
=={{header|JavaScript}}==
{{improve|JavaScript|live demo link broken/404}}
'''Solution'''
 
Live Demo (Extra Credit #2) [https://ezward.github.io/kmeans-javascript/kmeansrandom.html KMeans++ in JavaScript]
 
<langsyntaxhighlight lang="javascript">
/**
* kmeans module
 * ...
 */
 
 
</syntaxhighlight>
 
=={{header|jq}}==
'''Adapted from [[#Wren]]'''
{{works with|jq}}
 
The jq program shown here generates PostScript.
 
Since jq does not currently include a PRNG, we will use /dev/urandom as a source of entropy,
as per the following bash script, which invokes jq to generate 30,000 points
in accordance with the task description. The result of one run of this script is shown at
https://imgur.com/gallery/F0q1yO6
<syntaxhighlight lang="sh">#!/bin/bash
export LC_ALL=C
# Generate $1 pseudo-random integers of width $2.
# The integers may have leading 0s
function prng {
cat /dev/urandom | tr -cd '0-9' | fold -w "$2" | head -n "$1"
}
 
PTS=30000
prng $((4 * PTS)) 3 | jq -nr --argjson PTS $PTS -f k-means++-clustering.jq
</syntaxhighlight>
 
'''k-means++-clustering.jq'''
<syntaxhighlight lang="jq">
# A Point is represented by a JSON array: [x, y, group]
 
def hugeVal: infinite;
def RAND_MAX : 1000;
 
def PTS: $PTS;
def K : 6;
def W : 400;
def H : 400;
def rand: input | tonumber;
def randf(m): m * rand / (RAND_MAX - 1);
 
def pi: 1 | atan * 4;
 
# Pseudo-randomly generate `count` points in a circle with the given radius
def genXY(count; radius):
# note: this is not a uniform 2-d distribution
pi as $pi
| reduce range(0; count) as $i ([];
.[$i] = [0, 0, 0]
| randf(2 * $pi) as $ang
| randf(radius) as $r
| .[$i][0] = $r * ($ang|cos)
| .[$i][1] = $r * ($ang|sin) ) ;
 
def dist2($a; $b):
($a[0] - $b[0]) as $x
| ($a[1] - $b[1]) as $y
| $x * $x + $y * $y;
 
# output: {minD, minI}
def nearest($pt; $cent; $nCluster):
{ minD : hugeVal,
minI : $pt[2] }
| reduce range(0; $nCluster) as $i (.;
dist2($cent[$i]; $pt) as $d
| if .minD > $d
then .minD = $d
| .minI = $i
else .
end ) ;
 
# input: {pts, cent}
# output: ditto
def kpp(len):
(.cent|length) as $nCent
| .cent[0] = .pts[rand % len]
| . + { d: []}
| reduce range(1; $nCent) as $nCluster (.;
.sum = 0
| reduce range(0; len) as $j (.;
.d[$j] = nearest(.pts[$j]; .cent; $nCluster).minD
| .sum += .d[$j] )
| .sum = randf(.sum)
| label $out
| .emit = null
# Be sure to emit something:
| foreach (range(0; len), null) as $j (.;
if $j == null then .emit = .
else .sum -= .d[$j]
| if .sum > 0
then .
else .cent[$nCluster] = .pts[$j]
| .emit = .
end
end;
select(.emit) | (.emit, break $out)
) )
| reduce range(0; len) as $j (.;
.pts[$j][2] = nearest(.pts[$j]; .cent; $nCent).minI ) ;
 
# Input: an array of Point (.pts)
# Output: {pts, cent}
def lloyd(len; nCluster):
{pts: .,
cent: [range(0; nCluster) | [0,0,0]]}
| kpp(len)
 
# stop when > 99.9% of points are good
| until( .changed > (len / 1E4);
 
# group elements of centroids are used as counters
reduce range(0; nCluster) as $i (.; .cent[$i] = [0,0,0])
| reduce range(0; len) as $j (.;
.pts[$j] as $p
| .cent[$p[2]] |= [ .[0]+$p[0], .[1]+$p[1], .[2] + 1] )
| reduce range(0; nCluster) as $i (.;
(.cent[$i][2] | if . == 0 then 0.0001 else . end) as $divisor
| .cent[$i] |= [ (.[0]/$divisor), (.[1]/$divisor), .[2] ] )
| .changed = 0
# find closest centroid of each point
| reduce range(0; len) as $j (.;
.pts[$j] as $p
| nearest($p; .cent; nCluster).minI as $minI
| if $minI != $p[2]
then .changed += 1
| .pts[$j][2] = $minI
else .
end) )
| .cent |= reduce range(0; nCluster) as $i (.; .[$i][2] = $i ) ;
 
def printEps($pts; len; cent; nCluster):
 
def f1($x;$y;$z): "\($x) \($y) \($z) setrgbcolor";
def f2($x;$y) : "\($x) \($y) c";
def f3($x;$y) : "\n0 setgray \($x) \($y) s";
 
def colors:
reduce range(0; nCluster) as $i ([];
.[3 * $i + 0] = (3 * ($i + 1) % 11) / 11
| .[3 * $i + 1] = (7 * $i % 11) / 11
| .[3 * $i + 2] = (9 * $i % 11) / 11 );
 
{colors: colors}
| .minX = hugeVal
| .minY = hugeVal
| .maxX = -hugeVal
| .maxY = -hugeVal
| reduce range(0; len) as $j (.;
$pts[$j] as $p
| if .maxX < $p[0] then .maxX = $p[0] else . end
| if .minX > $p[0] then .minX = $p[0] else . end
| if .maxY < $p[1] then .maxY = $p[1] else . end
| if .minY > $p[1] then .minY = $p[1] else . end )
| ([((W / (.maxX - .minX))), ((H / (.maxY - .minY)))] | min) as $scale
| ( (.maxX + .minX) / 2) as $cx
| ( (.maxY + .minY) / 2) as $cy
| "%!PS-Adobe-3.0\n%%BoundingBox: -5 -5 \(W + 10) \(H + 10)",
"/l {rlineto} def /m {rmoveto} def",
"/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def",
"/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath ",
" gsave 1 setgray fill grestore gsave 3 setlinewidth",
" 1 setgray stroke grestore 0 setgray stroke }def",
(range(0; nCluster) as $i
| cent[$i] as $c
| f1(.colors[3 * $i]; .colors[3 * $i + 1]; .colors[3 * $i + 2]),
( range(0; len) as $j
| $pts[$j] as $p
| if $p[2] == $i
then f2( ($p[0] - $cx) * $scale + W / 2; ($p[1] - $cy) * $scale + H / 2)
else empty
end ),
f3( ($c[0] - $cx) * $scale + W / 2; ($c[1] - $cy) * $scale + H / 2)),
"\n%%EOF"
;
 
# The required clustering function with two arguments.
# It returns {pts, cent}
# where .pts and .cent are arrays of Points,
# with the cluster id as the third item.
#
def cluster($nCluster; $points):
$points
| lloyd( length; $nCluster);
 
# The task:
genXY(PTS; 10)
| lloyd(PTS; K) as { pts: $pts, cent: $cent }
| printEps($pts; PTS; $cent; K)</syntaxhighlight>
 
{{out}}
See https://imgur.com/gallery/F0q1yO6
 
=={{header|Julia}}==
<langsyntaxhighlight lang="julia"># run via Julia REPL
using Clustering, MakiePlots, DataFrames, RDatasets
 
const iris = dataset("datasets", "iris")
plt1 = plot(title= "Species Classification", xlabel = "Sepal Width", ylabel = "Sepal length")
const colors = [:red, :green, :blue]
plt2 = plot(title= "Kmeans++ Classification", xlabel = "Sepal Width", ylabel = "Sepal length")
const plt = Vector{Any}(undef,2)
 
scene1 = Scene()
scene2 = Scene()
 
for (i, sp) in enumerate(unique(iris[!, :Species]))
idx = iris[!, :Species] .== sp
selspwidth, splength = iris[idx, [:SepalWidth], iris[idx, :SepalLength]]
plt[1] = scatter!(scene1plt1, sel[1]spwidth, sel[2]splength, color = colors[:red, :green, :blue][i], legend = false)
limits = FRect(1.5, 4.0, 3.0, 4.0))
end
 
features = permutedimscollect(convertMatrix(Arrayiris[!, iris[1:4]), [2, 1]')
 
# K Means ++
result = kmeans(features, 3, init = :kmppKmppAlg()) # set to 3 clusters with kmeans++ :kmpp
 
for center in unique(result.assignments)
idx = result.assignments .== center
selspwidth, splength = iris[idx, [:SepalWidth], iris[idx, :SepalLength]]
plt[2] = scatter!(scene2plt2, sel[1]spwidth, sel[2]splength, color = colors[:green, :red, :blue][center], legend = false)
limits = FRect(1.5, 4.0, 3.0, 4.0))
end
 
plot(plt1, plt2)
scene2[Axis][:names][:axisnames] = scene1[Axis][:names][:axisnames] =
</syntaxhighlight>
("Sepal Width", "Sepal Length")
t1 = text(Theme(), "Species Classification", camera=campixel!)
t2 = text(Theme(), "Kmeans Classification", camera=campixel!)
vbox(hbox(plt[1], t1), hbox(plt[2], t2))
</lang>
 
=={{header|Kotlin}}==
As in the case of the C example, the data is partitioned into 11 clusters, though unlike C (which doesn't use srand) the output will be different each time the program is run.
<langsyntaxhighlight lang="scala">// version 1.2.21
 
import java.util.Random
// ...
val c = lloyd(v, PTS, K)
printEps(v, PTS, c, K)
}</syntaxhighlight>
 
=={{header|Lua}}==
{{works with|lua|5.3}}
 
<langsyntaxhighlight lang="lua">
 
local function load_data(npoints, radius)
-- ...
-- end
print_eps(data, N_CLUSTERS, centers, cluster)
</syntaxhighlight>
 
=={{header|Mathematica}}/{{header|Wolfram Language}}==
'''Solution''': the initial k-means code comes from http://mathematica.stackexchange.com/questions/7441/k-means-clustering, extended here to k-means++ by introducing the function <code>initM</code>. (Pictures of the result could not be uploaded.)
<syntaxhighlight lang="text">initM[list_List, k_Integer, distFunc_Symbol] :=
Module[{m = {RandomChoice[list]}, n, d},
While[Length[m] < k,
(* ... *)
{clusters, m}
]
];</syntaxhighlight>
 
Extra credit:<br>
=={{header|Nim}}==
{{trans|Python, C}}
<langsyntaxhighlight lang="nim">#
# compile:
# nim c -d:release kmeans.nim
# ...
printEps(points, clusterCentrs)
 
main()</syntaxhighlight>
 
=={{header|Phix}}==
{{trans|Go}}
I nicked the initial dataset creation from Go, as an alternative.
<langsyntaxhighlight Phixlang="phix">-- demo\rosetta\K_means_clustering.exw
-- Press F5 to restart
include pGUI.e
-- ...
end procedure
 
main()</syntaxhighlight>
Probably the hardest part of handling more than 2 dimensions would be deleting all
the GUI code, or modifying it to produce an n-dimensional representation.
=={{header|Python}}==
{{trans|D}}
<langsyntaxhighlight lang="python">from math import pi, sin, cos
from collections import namedtuple
from random import random, choice
# ...
 
 
main()</syntaxhighlight>
 
=={{header|Racket}}==
The k-means clustering:
<langsyntaxhighlight lang="racket">
#lang racket
(require racket/dict
;; ...
(for/and ([a (in-list c1)] [b (in-list c2)])
(< ((metric) a b) (* 0.001 min-distance))))
</syntaxhighlight>
 
Initialization methods
 
<langsyntaxhighlight lang="racket">
;; picks k points from a dataset randomly
(define (random-choice data k)
;; ...
(sample (discrete-dist data weights)))
(cons new-centroid centroids)))
</syntaxhighlight>
 
Different metrics
 
<langsyntaxhighlight lang="racket">
(define (euclidean-distance a b)
(for/sum ([x (in-vector a)] [y (in-vector b)])
;; ...
(define metric (make-parameter euclidean-distance))
(define (distanse-to x) (curry (metric) x))
</syntaxhighlight>
 
The fixed point operator
 
<langsyntaxhighlight lang="racket">
(define (fixed-point f x0 #:same-test [same? equal?])
(let loop ([x x0] [fx (f x0)])
(if (same? x fx) fx (loop fx (f fx)))))
</syntaxhighlight>
 
Creating sample clusters
 
 
<langsyntaxhighlight lang="racket">
(define (gaussian-cluster N
#:stdev (σ 1)
;; ...
(define φ (* 2 pi (sample (uniform-dist))))
(vector-map + r0 (vector (* r (cos φ)) (* r (sin φ))))))
</syntaxhighlight>
 
Visualization
 
<langsyntaxhighlight lang="racket">
(require plot)
 
;; ...
#:line-width 3)))
#:title (format "Initializing by ~a" (object-name method)))))
</syntaxhighlight>
 
Testing
 
<langsyntaxhighlight lang="racket">
(module+ test
(define circle (uniform-cluster 30000))
;; ...
(parameterize ([metric manhattan-distance])
(show-clustering circle 6)))
</syntaxhighlight>
 
The difficult case.
 
<langsyntaxhighlight lang="racket">
(module+ test
(define clouds
;; ...
; using standard k-means method
(show-clustering clouds 4 #:method random-choice))
</syntaxhighlight>
 
Multi-dimensional case.
 
<langsyntaxhighlight lang="racket">
(module+ test
(define 5d-data
;; ...
(map (curry vector-map round) centroids))
</syntaxhighlight>
Output shows that centroids were found correctly.
<pre>
...
</pre>

=={{header|Raku}}==
{{works with|rakudo|2015-09-16}}
We use Complex numbers to represent points in the plane. We feed the algorithm three artificially made clouds of points so we can easily see if the output makes sense.
<syntaxhighlight lang="raku" perl6line>sub postfix:«-means++»(Int $K) {
return sub (@data) {
my @means = @data.pick;
Line 3,731 ⟶ 4,499:
my @data = flat @centers.map: { ($_ + .5 - rand + (.5 - rand) * i) xx 100 }
@data.=pick(*);
.say for 3-means++(@data);</syntaxhighlight>
 
{{out}}
=={{header|Rust}}==
{{trans|Python}} (the initial point selection part)
<langsyntaxhighlight lang="rust">extern crate csv;
extern crate getopts;
extern crate gnuplot;
// ...
}
}
</syntaxhighlight>
[Plots exist but file upload is broken at the moment.]
 
=={{header|Scheme}}==

Nothing special is needed to handle multiple dimensions: all points are represented as lists, which the Euclidean distance function works through in a loop.
 
<langsyntaxhighlight lang="scheme">
(import (scheme base) ; headers for R7RS Scheme
(scheme file)
; ...
(tester-2 30000 6 10)
(tester-3 30000 6 5)
</syntaxhighlight>
 
Images in eps files are output for the 2D unit square and unit circle.
=={{header|SequenceL}}==
{{trans|C}}
<langsyntaxhighlight lang="sequencel">
import <Utilities/Sequence.sl>;
import <Utilities/Random.sl>;
// ...
in
printEPS(points.first, result.first, result.second,k,10.0);
</syntaxhighlight>
 
=={{header|Tcl}}==
{{trans|C}}{{tcllib|math::constants}}
<langsyntaxhighlight lang="tcl">package require Tcl 8.5
package require math::constants
math::constants::constants pi
# ...
}
return $cent
}</syntaxhighlight>
Demonstration/visualization code:
{{libheader|Tk}}
<langsyntaxhighlight lang="tcl">package require Tk
image create photo disp -width 400 -height 400
pack [label .l -image disp]
Line 4,535 ⟶ 5,303:
plot $cx $cy black
}
}}</syntaxhighlight>
 
=={{header|Wren}}==
{{trans|Kotlin}}
{{libheader|Wren-dynamic}}
{{libheader|Wren-fmt}}
<syntaxhighlight lang="wren">import "random" for Random
import "./dynamic" for Struct
import "./fmt" for Fmt
 
var Point = Struct.create("Point", ["x", "y", "group"])
 
var r = Random.new()
var hugeVal = Num.infinity
 
var RAND_MAX = Num.maxSafeInteger
var PTS = 100000
var K = 11
var W = 400
var H = 400
 
var rand = Fn.new { r.int(RAND_MAX) }
var randf = Fn.new { |m| m * rand.call() / (RAND_MAX - 1) }
 
var genXY = Fn.new { |count, radius|
var pts = List.filled(count, null)
 
/* note: this is not a uniform 2-d distribution */
for (i in 0...count) {
pts[i] = Point.new(0, 0, 0)
var ang = randf.call(2 * Num.pi)
var r = randf.call(radius)
pts[i].x = r * ang.cos
pts[i].y = r * ang.sin
}
return pts
}
 
var dist2 = Fn.new { |a, b|
var x = a.x - b.x
var y = a.y - b.y
return x * x + y * y
}
 
var nearest = Fn.new { |pt, cent, nCluster|
var minD = hugeVal
var minI = pt.group
for (i in 0...nCluster) {
var d = dist2.call(cent[i], pt)
if (minD > d) {
minD = d
minI = i
}
}
return [minI, minD]
}
 
var copyPoint = Fn.new { |pt| Point.new(pt.x, pt.y, pt.group) }
 
var kpp = Fn.new { |pts, len, cent|
var nCent = cent.count
var d = List.filled(len, 0)
cent[0] = copyPoint.call(pts[rand.call() % len])
for (nCluster in 1...nCent) {
var sum = 0
for (j in 0...len) {
d[j] = nearest.call(pts[j], cent, nCluster)[1]
sum = sum + d[j]
}
sum = randf.call(sum)
for (j in 0...len) {
sum = sum - d[j]
if (sum > 0) continue
cent[nCluster] = copyPoint.call(pts[j])
break
}
}
for (j in 0...len) pts[j].group = nearest.call(pts[j], cent, nCent)[0]
}
 
var lloyd = Fn.new { |pts, len, nCluster|
var cent = List.filled(nCluster, null)
for (i in 0...nCluster) cent[i] = Point.new(0, 0, 0)
kpp.call(pts, len, cent)
while(true) {
/* group element for centroids are used as counters */
for (i in 0...nCluster) {
cent[i].x = 0
cent[i].y = 0
cent[i].group = 0
}
for (j in 0...len) {
var p = pts[j]
var c = cent[p.group]
c.group = c.group + 1
c.x = c.x + p.x
c.y = c.y + p.y
}
for (i in 0...nCluster) {
var c = cent[i]
c.x = c.x / c.group
c.y = c.y / c.group
}
var changed = 0
 
/* find closest centroid of each point */
for (j in 0...len) {
var p = pts[j]
var minI = nearest.call(p, cent, nCluster)[0]
if (minI != p.group) {
changed = changed + 1
p.group = minI
}
}
 
/* stop when 99.9% of points are good */
if (changed <= (len >> 10)) break
}
for (i in 0...nCluster) cent[i].group = i
return cent
}
 
var printEps = Fn.new { |pts, len, cent, nCluster|
var colors = List.filled(nCluster * 3, 0)
for (i in 0...nCluster) {
colors[3 * i + 0] = (3 * (i + 1) % 11) / 11
colors[3 * i + 1] = (7 * i % 11) / 11
colors[3 * i + 2] = (9 * i % 11) / 11
}
var minX = hugeVal
var minY = hugeVal
var maxX = -hugeVal
var maxY = -hugeVal
for (j in 0...len) {
var p = pts[j]
if (maxX < p.x) maxX = p.x
if (minX > p.x) minX = p.x
if (maxY < p.y) maxY = p.y
if (minY > p.y) minY = p.y
}
var scale = (W / (maxX - minX)).min(H / (maxY - minY))
var cx = (maxX + minX) / 2
var cy = (maxY + minY) / 2
 
System.print("\%!PS-Adobe-3.0\n\%\%BoundingBox: -5 -5 %(W + 10) %(H + 10)")
System.print("/l {rlineto} def /m {rmoveto} def")
System.print("/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def")
System.write("/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath ")
System.write(" gsave 1 setgray fill grestore gsave 3 setlinewidth")
System.print(" 1 setgray stroke grestore 0 setgray stroke }def")
var f1 = "$g $g $g setrgbcolor"
var f2 = "$.3f $.3f c"
var f3 = "\n0 setgray $g $g s"
for (i in 0...nCluster) {
var c = cent[i]
Fmt.print(f1, colors[3 * i], colors[3 * i + 1], colors[3 * i + 2])
for (j in 0...len) {
var p = pts[j]
if (p.group != i) continue
Fmt.print(f2, (p.x - cx) * scale + W / 2, (p.y - cy) * scale + H / 2)
}
Fmt.print(f3, (c.x - cx) * scale + W / 2, (c.y - cy) * scale + H / 2)
}
System.write("\n\%\%EOF")
}
 
var v = genXY.call(PTS, 10)
var c = lloyd.call(v, PTS, K)
printEps.call(v, PTS, c, K)</syntaxhighlight>
 
=={{header|XPL0}}==
...somewhat for these sins. Alas, image uploads appear to be broken.
 
<langsyntaxhighlight XPL0lang="xpl0">include c:\cxpl\codes; \intrinsic 'code' declarations
 
def N = 30000; \number of points
\ ...
I:= ChIn(1); \wait for keystroke
SetVid($03); \restore normal text screen
]</syntaxhighlight>