# IML exercise set 6, problem 3: k-means clustering
# Mika Wahlroos 2016

# Requires loadmnist.R for loading and visualizing the MNIST dataset, and of
# course the dataset itself.

# Note: Using the rdist function requires the "fields" package to be installed,
# e.g. with install.packages("fields")

# a) implement the k-means algorithm

use_rdist = TRUE

if (use_rdist) {
  library(fields)
}

# A couple of implementations for computing the squared distances:
# plain R and rdist.

# Computes the squared Euclidean distances between points (row vectors)
# in the given data and the given exemplars.
sqdists <- function(data, exemplars) {
  if (use_rdist) {
    dists = sqdists.rdist(data, exemplars)
  } else {
    dists = sqdists.direct(data, exemplars)
  }
  dists
}

# Returns the squared Euclidean distance between two given vectors.
sqdist <- function(v1, v2) {
  sum((v1 - v2)^2)
}

# Plain R implementation of computing squared Euclidean distances
# between data points and means.
sqdists.direct <- function(data, exemplars) {
  dists = matrix(nrow=nrow(data), ncol=nrow(exemplars))
  for (i in 1:nrow(exemplars)) {
    dists[,i] = apply(data, MARGIN=1, FUN=sqdist, exemplars[i,])
  }
  dists
}

# Squared distances between data points and means using rdist.
sqdists.rdist <- function(data, exemplars) {
  rdist(data, exemplars)^2
}

# Lloyd's algorithm. Note that this deliberately shadows stats::kmeans.
kmeans <- function(data, k, exemplars) {
  if (missing(exemplars)) {
    # select random points from the data as initial exemplars
    exemplars = data[sample(nrow(data), size=k),]
  }

  # retain the initial exemplars so that they can be returned for analysis
  # along with the actual results
  exemplars.init = exemplars

  p = ncol(data)

  # initialize state for the first iteration
  previous_exemplars = matrix(rep(Inf, times=k*p), ncol=p)
  iter = 0

  # data point cluster memberships
  assignments = integer(nrow(data))

  # iterate until the exemplars no longer move
  while (!all(exemplars == previous_exemplars)) {
    iter = iter + 1
    print(paste("iteration", iter))
    previous_exemplars = exemplars

    dists = sqdists(data, exemplars)

    # get the index of the nearest exemplar for each data point
    assignments = apply(dists, MARGIN=1, FUN=which.min)

    # update exemplar locations to the means of their corresponding clusters
    for (i in 1:k) {
      members = data[assignments == i, , drop=FALSE]
      cluster_mean = colMeans(members)
      if (!any(is.nan(cluster_mean))) {
        exemplars[i,] = cluster_mean
      } else {
        # empty cluster: colMeans returned NaN, so reinitialize the exemplar
        # with a random data point
        exemplars[i,] = data[sample(nrow(data), size=1),]
      }
    }
  }

  output = list()
  output$means = exemplars
  output$assignments = assignments
  output$exemplars.init = exemplars.init
  output
}

set.seed(0)

# parameters for the data (named mu and sigma2 rather than mean and var,
# to avoid shadowing the base R functions)
mu = 0
sigma2 = 1
# covariance matrix
Sigma = matrix(c(sigma2, 0, 0, sigma2), nrow=2)

# generate test data
n = 100
data = cbind(rnorm(n, mu, sqrt(Sigma[1,1])), rnorm(n, mu, sqrt(Sigma[2,2])))

# do the clustering
# (several runs to see the difference with different randomly initialized means;
# see the sketch after part c for a quantitative comparison)
a_clusterings = list()
a_clusterings$k3_1 = kmeans(data, k=3)
a_clusterings$k3_2 = kmeans(data, k=3)
a_clusterings$k3_3 = kmeans(data, k=3)
a_clusterings$k3_4 = kmeans(data, k=3)
a_clusterings$k4 = kmeans(data, k=4)
a_clusterings$k6 = kmeans(data, k=6)
a_clusterings$k8 = kmeans(data, k=8)
a_clusterings$k10 = kmeans(data, k=10)

# visualize the clusters
plotclusters <- function(data, clusters) {
  assignments = clusters$assignments
  k = nrow(clusters$means)
  colors = rainbow(k)
  xlim = c(floor(min(data[,1])), ceiling(max(data[,1])))
  ylim = c(floor(min(data[,2])), ceiling(max(data[,2])))

  # draw the cluster means as triangles
  par(pch=24)
  plot(clusters$means, ylim=ylim, xlim=xlim, xlab="X1", ylab="X2")

  for (i in 1:k) {
    # draw the data points as normal solid circles
    members = data[assignments == i, , drop=FALSE]
    par(new=TRUE, pch=1)
    plot(members, col=colors[i], ylim=ylim, xlim=xlim, axes=FALSE, xlab="", ylab="")
  }

  # as a final curiosity, also draw the initial exemplars of each cluster
  for (i in 1:nrow(clusters$exemplars.init)) {
    x = clusters$exemplars.init[i,1]
    y = clusters$exemplars.init[i,2]
    points(x, y, pch=19, col=colors[i])
  }
}

oldpar = par()
par(mfrow=c(2,4))
par(mar=c(2.5,2.5,1,1))

for (clustering in a_clusterings) {
  plotclusters(data, clustering)
}
#par(oldpar)

# b) try k-means with the MNIST dataset

source("loadmnist.R")
load_mnist()

set.seed(0)

# prepare the data (a 500-instance subset of the training set)
mnist500 = list()
mnist500$x = train$x[1:500,]
mnist500$y = train$y[1:500]

# perform the clustering, using the first 10 data points as initial exemplars
k = 10
exemplars = mnist500$x[1:10,]
mnist500$clustering <- kmeans(mnist500$x, k, exemplars)

# prepare the visualization
oldpar = par()
par(mfrow=c(2,5))
par(mar=c(2.5,2.5,1,1))

# visualize the means
for (i in 1:k) { show_digit(mnist500$clustering$means[i,]) }

# take a look at some of the values in the clusters
mnist500.clusters = list()
for (i in 1:k) {
  mnist500.clusters[[i]] =
    mnist500$x[mnist500$clustering$assignments == i, , drop=FALSE]
}

par(mfrow=c(10,10))
par(mar=c(0,0,0,0))
# visualize the first 10 members of each cluster, with one row per cluster
for (cluster in mnist500.clusters) {
  for (i in 1:10) { show_digit(cluster[i,]) }
}
# ... etc.

# At first glance, some of the clusters seem to correspond somewhat with the
# digits. The only cluster initialized with a 0 does seem to contain 0s. On the
# other hand, for example the only cluster initialized with a 3 contains all
# kinds of digits: at least 5, 3, 8, 2 and 0 appear among its first 9 members.
# One of the clusters initialized with a 1 (there are 3 such clusters) contains
# a bit of everything.
#
# I also tried randomly selected data points as the initial means. The
# resulting clusters don't really correspond very well with the digits: in some
# cases instances of the same digit end up in the same cluster, but not very
# systematically.
#
# One reason might be that, in terms of Euclidean distance, it makes little
# difference whether the pixels in the same general area of the image form a
# line (e.g. in a 7) or a loop (e.g. in a 9).
#
# For example, one of the clusters seemed, at least at first glance, to collect
# different digits that are all significantly slanted to the right.

# c) same thing, but with the first instance of each class as the initial
# cluster mean

exemplars = mnist500$x[1:10,]
for (i in 1:10) {
  # i %% 10 maps i=10 to digit 0
  exemplars[i,] = mnist500$x[mnist500$y == i %% 10,][1,]
}

# show the exemplar data points as images
# for (i in 1:10) { show_digit(exemplars[i,]) }

# do the clustering
mnist500$clustering.unique_means <- kmeans(mnist500$x, k, exemplars)

# visualize the means
par(mfrow=c(2,5))
par(mar=c(2.5,2.5,1,1))
for (i in 1:10) { show_digit(mnist500$clustering.unique_means$means[i,]) }

# take a look at the clusters
mnist500.clusters_unique_means = list()
for (i in 1:k) {
  mnist500.clusters_unique_means[[i]] =
    mnist500$x[mnist500$clustering.unique_means$assignments == i, , drop=FALSE]
}

par(mfrow=c(10,10))
par(mar=c(0,0,0,0))
# visualize the first 10 members of each cluster, with one row per cluster
for (cluster in mnist500.clusters_unique_means) {
  for (i in 1:10) { show_digit(cluster[i,]) }
}
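
# Not part of the original exercise: a small sketch for comparing the
# clusterings above quantitatively instead of just eyeballing the plots.
# The total within-cluster sum of squares is the objective that k-means
# minimizes, so it gives a single number per run. The helper name wcss is
# my own; it reuses sqdists() defined above.
wcss <- function(data, clustering) {
  dists = sqdists(data, clustering$means)
  # for each data point, pick the squared distance to its assigned mean
  sum(dists[cbind(1:nrow(data), clustering$assignments)])
}

# e.g. compare the four randomly initialized k=3 runs on the toy data of part a:
# sapply(a_clusterings[c("k3_1", "k3_2", "k3_3", "k3_4")], wcss, data=data)
# and the two MNIST clusterings of parts b and c:
# wcss(mnist500$x, mnist500$clustering)
# wcss(mnist500$x, mnist500$clustering.unique_means)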
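
# Also my addition, not from the exercise: the impressions above about clusters
# (not) matching the digits can be checked with a contingency table of true
# labels against cluster assignments (base R table()). A clustering that
# matched the digits well would show one dominant label per cluster column.
# table(mnist500$y, mnist500$clustering$assignments)
# table(mnist500$y, mnist500$clustering.unique_means$assignments)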

# Now try clustering with the full training dataset, just to see how long it
# takes. This took a few minutes with rdist, and 15-20 minutes with the plain R
# distance implementation. Timing with the plain R one:
#     user   system  elapsed
# 1028.923   44.620 1077.423
#system.time(mnist.clusters.full <- kmeans(train$x, k, exemplars))

# visualize the means of the full-data clustering
#par(mfrow=c(2,5))
#par(mar=c(2.5,2.5,1,1))
#for (i in 1:10) { show_digit(mnist.clusters.full$means[i,]) }
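
# A possible sanity check (my addition): compare against the built-in
# stats::kmeans, which the kmeans() above shadows. With algorithm="Lloyd" it
# runs the same algorithm as this script, so the objective values on the toy
# data should be in the same ballpark; the exact clusters may differ because
# the initializations differ.
# fit = stats::kmeans(data, centers=3, iter.max=100, nstart=4, algorithm="Lloyd")
# fit$tot.withinss  # comparable to wcss(data, a_clusterings$k3_1) etc.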