# R12 - K Means Clustering
#
# Demonstration script: k-means on a toy data set, on simulated unimodal
# data, on the built-in `swiss` data, and a split-sample validation on `iris`.
#
# NOTE(review): `swiss.pca` and `bimodal` are used below but are not created
# anywhere in this script -- presumably they are left in the workspace by an
# earlier session; confirm before running this file non-interactively.
# The helpers `pseudoF` and `predict.kmeans` are loaded with source() from
# the E:/S5600 directory.

# Toy example: seven points in two dimensions ----
x1 <- c(1, 2, 3, 4, 7, 8, 10)
x2 <- c(8, 2, 3, 1, 11, 8, 10)
X <- cbind(x1, x2)
plot(X, pch = 16)
# identify() waits for mouse clicks on the open graphics device.
identify(X, labels = 1:7)

X.km <- kmeans(X, 2)
X.km
# Overlay the points coded by cluster, then the two cluster centres.
points(X, pch = X.km$cluster + 1, col = X.km$cluster + 1)
points(X.km$centers, col = 2:3, pch = 2:3, cex = 1.5)

# Simulated unimodal data ----
library(MASS)
mu3 <- c(4.5, 4.5)
Sigma2 <- matrix(c(2.25, 1.5, 1.5, 2.25), nrow = 2)
unimodal <- mvrnorm(50, mu = mu3, Sigma = Sigma2)
plot(unimodal, pch = 16)

uni.3m1 <- kmeans(unimodal, 3)  # Random starts -
uni.3m2 <- kmeans(unimodal, 3)  # different answers
plot(unimodal, pch = uni.3m1$cluster, col = uni.3m1$cluster)
points(unimodal, pch = uni.3m2$cluster + 3, col = uni.3m2$cluster)
# Cross-tabulate the two labelings, then compare within-cluster SS.
table(data.frame(km1 = uni.3m1$cluster, km2 = uni.3m2$cluster))
uni.3m1$withinss
sum(uni.3m1$withinss)
uni.3m2$withinss
sum(uni.3m2$withinss)

# Same comparison, but each fit now uses 25 random starts.
uni.3m3 <- kmeans(unimodal, 3, nstart = 25)
uni.3m4 <- kmeans(unimodal, 3, nstart = 25)
plot(unimodal, pch = uni.3m3$cluster, col = uni.3m3$cluster)
points(unimodal, pch = uni.3m4$cluster + 3, col = uni.3m4$cluster)
table(data.frame(km1 = uni.3m3$cluster, km2 = uni.3m4$cluster))
uni.3m3$withinss
sum(uni.3m3$withinss)

# Swiss Canton Data ----
swiss.4m <- kmeans(scale(swiss), 4, nstart = 25)
pairs(swiss, pch = swiss.4m$cluster, col = swiss.4m$cluster)
library(rgl)
# NOTE(review): requires an existing `swiss.pca` object with an `$x` matrix
# of at least three columns (see header note).
plot3d(swiss.pca$x[, 1:3], type = "s", size = .25, col = swiss.4m$cluster)
swiss.4m$withinss
sum(swiss.4m$withinss)

# Number of clusters?  "Pseudo-F" Statistic ----
source("E:/S5600/pseudoF.txt")
pseudoF
pseudoF(scale(swiss), 2:6)

# This fit uses a single random start (no nstart argument).
swiss.3m <- kmeans(scale(swiss), 3)
sum(swiss.3m$withinss)
pairs(swiss, pch = swiss.3m$cluster, col = swiss.3m$cluster)
plot3d(swiss.pca$x[, 1:3], type = "s", size = .25, col = swiss.3m$cluster)

# Compare to hierarchical results
# "ward" was renamed "ward.D" in R 3.1.0; the explicit name gives the same
# result as the old spelling without the rename message.
swiss.hc.w <- hclust(dist(scale(swiss)), "ward.D")
plot(swiss.hc.w)
table(data.frame(km3 = swiss.3m$cluster, hc3 = cutree(swiss.hc.w, 3)))

# Interpretations
pairs(swiss.3m$centers, pch = 1:3, col = 1:3)
swiss.3m$centers

# Predictions ----
# NOTE(review): `bimodal` must already exist in the workspace (see header).
bim.2m <- kmeans(bimodal, 2)
plot(bimodal, pch = bim.2m$cluster, col = bim.2m$cluster)
points(bim.2m$centers, pch = 3, col = 1:2)
points(unimodal, pch = 4, col = 4)
source("E:/S5600/predict.kmeans.txt")
predict.kmeans
pred.uni.bim <- predict.kmeans(bim.2m, unimodal)
pred.uni.bim
points(unimodal, pch = pred.uni.bim + 2, col = pred.uni.bim)

# Validation ----
iris0 <- iris[, 1:4]
s <- sample(150, 75)
iris1 <- iris0[s, ]   # Calibration Set
iris2 <- iris0[-s, ]  # Validation Set
pseudoF(iris1, 2:6)

# The third positional argument of kmeans() is iter.max, not nstart, so the
# number of random starts is passed by name in every call below.
iris1.3m <- kmeans(iris1, 3, nstart = 25)
pairs(iris1, pch = iris1.3m$cluster, col = iris1.3m$cluster)
pred.iris2.iris1 <- predict.kmeans(iris1.3m, iris2)
iris2.3m <- kmeans(iris2, 3, nstart = 25)
pairs(iris2, pch = pred.iris2.iris1, col = iris2.3m$cluster)
table(data.frame(S1 = pred.iris2.iris1, S2 = iris2.3m$cluster))

iris1.4m <- kmeans(iris1, 4, nstart = 25)
pairs(iris1, pch = iris1.4m$cluster, col = iris1.4m$cluster)
pred.iris2.iris1 <- predict.kmeans(iris1.4m, iris2)
iris2.4m <- kmeans(iris2, 4, nstart = 25)
pairs(iris2, pch = pred.iris2.iris1, col = iris2.4m$cluster)
table(data.frame(S1 = pred.iris2.iris1, S2 = iris2.4m$cluster))

iris1.2m <- kmeans(iris1, 2, nstart = 25)
pred.iris2.iris1 <- predict.kmeans(iris1.2m, iris2)
iris2.2m <- kmeans(iris2, 2, nstart = 25)
pairs(iris2, pch = pred.iris2.iris1, col = iris2.2m$cluster)
table(data.frame(S1 = pred.iris2.iris1, S2 = iris2.2m$cluster))