#### Clustering ####
library(MASS)      # for lda
library(cluster)   # for pam

# kmeans on the iris data, asking for 3 clusters
kk<-kmeans(iris[,1:4],3)
table(kk$cluster,iris$Species)
# Check if the cluster indices overlap with the true species labels
# Not great - it mixes up versicolor and virginica

# Can you classify?
ll<-lda(iris[,1:4],iris$Species)
pl<-predict(ll,iris[,1:4])
table(pl$class,iris$Species)
# Supervised learning (classification) can predict the labels -
# class discovery (clustering) is a more difficult problem!

#########################################
# How many clusters in the data?
# Run for K=1 to 10 and summarize the deviation from the cluster means:
# the so-called Within-Sum-of-Squares
W<-rep(0,10)
for (k in 2:10) {
  kk<-kmeans(iris[,1:4],k)
  W[k]<-kk$tot.withinss
}
# For K=1 the Within-SS is just the total sum of squares around the grand mean
mu<-apply(iris[,1:4],2,mean)
iriss<-iris[,1:4]-t(matrix(rep(mu,dim(iris)[1]),4,dim(iris)[1]))
W[1]<-sum(iriss^2)
plot(seq(1,10),W,xlab="K",ylab="Within-SS",type="l")
# This looks a bit like the RSS curve in regression. Remember what to look for?
# When does the curve level off? When does it stop paying off to add clusters?
# No point going beyond 2-3

#############
kk<-kmeans(iris[,1:4],3)
pairs(iris[,1:4],col=as.numeric(iris$Species)+1,pch=kk$cluster)
# 3 clusters for the iris data - check point symbol (cluster index) vs color (true label)
kk<-kmeans(iris[,1:4],2)
pairs(iris[,1:4],col=as.numeric(iris$Species)+1,pch=kk$cluster)

#########################
# PAM - Partitioning Around Medoids - a robust version of kmeans
pp<-pam(iris[,1:4],3)
plot(pp)
# Are the 3 clusters well separated?
pairs(iris[,1:4],col=as.numeric(iris$Species)+1,pch=pp$cluster)

# Pick K by the average silhouette width instead of the Within-SS
# (silhouette width is undefined for K=1, so we start at K=2)
W<-rep(0,10)
for (k in 2:10) {
  kk<-pam(dist(iris[,1:4]),k)
  W[k]<-kk$silinfo$avg.width
}
plot(seq(2,10),W[2:10],xlab="K",ylab="Silhouette Width",type="l")
# For which K is the silhouette width maximized?

##############################
# Hierarchical clustering
hh<-hclust(dist(iris[,1:4]))
plot(hh,labels=iris$Species)
# Pretty good separation

# Try a different joining mechanism (linkage) in the tree
hh<-hclust(dist(iris[,1:4]),"single")
plot(hh,labels=iris$Species)
# Big impact! You get what you ask for in clustering...

# How you measure distance between the observations also has an impact:
# here, correlation distance instead of Euclidean
cc<-cor(t(iris[,1:4]))
hh<-hclust(as.dist(1-cc))
plot(hh,labels=iris$Species)
# Choices: distance metric AND joining mechanism (linkage)

##########################################################
# Let's look at the numbers (handwritten digits) data
library(ElemStatLearn)
data(zip.train)
Numbers<-as.matrix(zip.train)
Nmat<-as.matrix(Numbers[,-1])

iu<-sample(seq(1,7291),500)   # 500 digits chosen at random

kk<-kmeans(Nmat[iu,],10)
table(kk$cluster,Numbers[iu,1])
# Check if the cluster indices overlap with the true digit labels
# Not great...
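# A quick way to put a number on the overlap in the table above: map each
# cluster to its most frequent true digit and compute the fraction of points
# that match. This "purity" score is an illustrative sketch added here, not a
# standard kmeans output; it uses only base R.
tt<-table(kk$cluster,Numbers[iu,1])
purity<-sum(apply(tt,1,max))/sum(tt)
purity
# Near 1: clusters line up with digits; much lower: lots of mixing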
#########################################
# How many clusters in the digits data?
# Run for K=1 to 25 and summarize the deviation from the cluster means:
# the so-called Within-Sum-of-Squares
W<-rep(0,25)
for (k in 2:25) {
  kk<-kmeans(Nmat[iu,],k)
  W[k]<-kk$tot.withinss
}
# For K=1, the total sum of squares around the grand mean
mu<-apply(Nmat[iu,],2,mean)
mss<-Nmat[iu,]-t(matrix(rep(mu,dim(Nmat[iu,])[1]),256,dim(Nmat[iu,])[1]))
W[1]<-sum(mss^2)
plot(seq(1,25),W,xlab="K",ylab="Within-SS",type="l")
# Levels off slowly - no obvious clustering structure

### Clustering after PCA dimension reduction
library(rgl)   # for plot3d
ssn<-svd(Nmat[iu,])
plot3d(ssn$u[,1:3],col=as.numeric(Numbers[iu,1])+1)
# A lot of overlap - only one cluster really stands out: the 1s
W<-rep(0,25)
for (k in 2:25) {
  kk<-kmeans(ssn$u[,1:10],k)
  W[k]<-kk$tot.withinss
}
mu<-apply(ssn$u[,1:10],2,mean)
mss<-ssn$u[,1:10]-t(matrix(rep(mu,dim(ssn$u[,1:10])[1]),10,dim(ssn$u[,1:10])[1]))
W[1]<-sum(mss^2)
plot(seq(1,25),W,xlab="K",ylab="Within-SS",type="l")

#############
# PAM - Partitioning Around Medoids - a robust version of kmeans
pp<-pam(Nmat[iu,],4)
plot(pp)

# Silhouette widths per observation, colored by cluster, with the true digit
# printed underneath
plot(pp$silinfo$widths[,3],col=pp$silinfo$widths[,1],type="h")
points(seq(1,500),rep(-0.1,500),pch=as.character((Numbers[iu,1])[as.numeric(rownames(pp$silinfo$widths))]))
# Looks like an OK cluster separation - but are the same digits grouped in the clusters?
table(pp$clustering,Numbers[iu,1])
# Not great...

##############################
# Hierarchical clustering
hh<-hclust(dist(Nmat[iu,]),"complete")
plot(hh,labels=Numbers[iu,1])

hh<-hclust(dist(Nmat[iu,]),"single")
plot(hh,labels=Numbers[iu,1])
# Single linkage aggressively builds the 1s as a cluster first

hh<-hclust(dist(Nmat[iu,]),"average")
plot(hh,labels=Numbers[iu,1])

cc<-cor(t(Nmat[iu,]))
hh<-hclust(as.dist(1-cc))
plot(hh,labels=Numbers[iu,1])
# One cluster can be found, but not all of them
# Clustering is much more difficult than classification!!!
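# To back up that point with a number: cut one of the dendrograms above into
# 10 groups with cutree() (base R, stats package) and tabulate against the
# true digits. The purity summary at the end is the same illustrative
# heuristic used earlier, not part of the original analysis.
hh<-hclust(dist(Nmat[iu,]),"average")
gg<-cutree(hh,k=10)
tt<-table(gg,Numbers[iu,1])
tt
sum(apply(tt,1,max))/sum(tt)   # fraction matching each cluster's majority digit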