##### DEMO 13 ####
# Cluster prediction strength for selecting the number of clusters.
# The code below is cheating a bit since it is not trying all combos of
# random splits - just for illustration. The package fpc has many cluster
# validation functions in it - see further down.
library(cluster)
library(MASS)  # lda() lives in MASS - it was never loaded in the original script

ii <- sample(seq(1, 150), 100)  # subsample 100 of the 150 iris rows
#
Kvec <- seq(2, 10)             # candidate numbers of clusters
pvec <- 0 * Kvec               # overlap score for each K
ij <- sample(seq(1, length(ii)), round(length(ii) / 2))
Aset <- iris[ii, ][ij, ]       # "training" half
Bset <- iris[ii, ][-ij, ]      # "test" half
for (kk in seq_along(Kvec)) {
  ppA <- pam(Aset[, 1:4], Kvec[kk])
  ppB <- pam(Bset[, 1:4], Kvec[kk])
  if (Kvec[kk] < 6) {
    # Colors = PAM clusters on Aset, plotting symbols = PAM clusters on Bset.
    pairs(rbind(Aset, Bset)[, -5],
          col = c(ppA$clust, rep("grey", dim(Bset)[1])),
          pch = c(rep(1, dim(Aset)[1]), ppB$clust))
    p <- locator()  # interactive pause: click/finish in the plot to continue
  }
  # Check if clusters overlap from training (Aset) to test (Bset) -
  # do colors and symbols overlap? Classify each half with an LDA
  # trained on the other half's cluster labels.
  lA <- lda(Aset[, 1:4], ppA$clust)
  pA <- predict(lA, newdata = Bset[, 1:4], type = "class")
  lB <- lda(Bset[, 1:4], ppB$clust)
  pB <- predict(lB, newdata = Aset[, 1:4], type = "class")
  #
  tAB <- table(pA$class, ppB$clust)
  tBA <- table(pB$class, ppA$clust)
  # Sum of per-cluster best matches in both directions = overlap score.
  pvec[kk] <- sum(apply(tAB, 1, max)) + sum(apply(tBA, 1, max))
}
plot(Kvec, pvec, xlab = "K", ylab = "Overlap")  # where is this maximized?
####
#
### Creating better separation to see if I can find 3 clusters then.
irisnew <- iris
# Shift sepal length of virginica by +2 to pull that species away.
irisnew[iris[, 5] == "virginica", 1] <- irisnew[iris[, 5] == "virginica", 1] + 2
#
library(MASS)  # lda() lives in MASS - it was never loaded in the original script
ii <- sample(seq(1, 150), 100)
Kvec <- seq(2, 10)
pvec <- 0 * Kvec
ij <- sample(seq(1, length(ii)), round(length(ii) / 2))
Aset <- irisnew[ii, ][ij, ]
Bset <- irisnew[ii, ][-ij, ]
for (kk in seq_along(Kvec)) {
  ppA <- pam(Aset[, 1:4], Kvec[kk])
  ppB <- pam(Bset[, 1:4], Kvec[kk])
  if (Kvec[kk] < 6) {
    pairs(rbind(Aset, Bset)[, -5],
          col = c(ppA$clust, rep("grey", dim(Bset)[1])),
          pch = c(rep(1, dim(Aset)[1]), ppB$clust))
    p <- locator()  # interactive pause
  }
  # Cross-classify the two halves via LDA as in the first demo.
  lA <- lda(Aset[, 1:4], ppA$clust)
  pA <- predict(lA, newdata = Bset[, 1:4], type = "class")
  lB <- lda(Bset[, 1:4], ppB$clust)
  pB <- predict(lB, newdata = Aset[, 1:4], type = "class")
  #
  tAB <- table(pA$class, ppB$clust)
  tBA <- table(pB$class, ppA$clust)
  pvec[kk] <- sum(apply(tAB, 1, max)) + sum(apply(tBA, 1, max))
}
plot(Kvec, pvec, xlab = "K", ylab = "Overlap")
#
### Creating 4 separated clusters.
irisnew <- iris
irisnew[iris[, 5] == "virginica", 1] <- irisnew[iris[, 5] == "virginica", 1] + 2
ss <- seq(1, dim(iris)[1])[iris[, 5] == "virginica"]
# Split virginica into two groups by also shifting sepal width of half of it.
irisnew[ss[1:25], 2] <- irisnew[ss[1:25], 2] + 3
#
ii <- sample(seq(1, 150), 100)
Kvec <- seq(2, 10)
pvec <- 0 * Kvec
ij <- sample(seq(1, length(ii)), round(length(ii) / 2))
Aset <- irisnew[ii, ][ij, ]
Bset <- irisnew[ii, ][-ij, ]
for (kk in seq_along(Kvec)) {
  ppA <- pam(Aset[, 1:4], Kvec[kk])
  ppB <- pam(Bset[, 1:4], Kvec[kk])
  if (Kvec[kk] < 6) {
    pairs(rbind(Aset, Bset)[, -5],
          col = c(ppA$clust, rep("grey", dim(Bset)[1])),
          pch = c(rep(1, dim(Aset)[1]), ppB$clust))
    p <- locator()  # interactive pause
  }
  lA <- lda(Aset[, 1:4], ppA$clust)
  pA <- predict(lA, newdata = Bset[, 1:4], type = "class")
  lB <- lda(Bset[, 1:4], ppB$clust)
  pB <- predict(lB, newdata = Aset[, 1:4], type = "class")
  #
  tAB <- table(pA$class, ppB$clust)
  tBA <- table(pB$class, ppA$clust)
  pvec[kk] <- sum(apply(tAB, 1, max)) + sum(apply(tBA, 1, max))
}
plot(Kvec, pvec, xlab = "K", ylab = "Overlap")
####
#
#### Package for cluster validation.
library(fpc)
# Read the helpfile about the package.
library(help = "fpc")  # online manual also
pp <- prediction.strength(iris[, -5], Gmin = 2, Gmax = 5,
                          clustermethod = kmeansCBI,
                          classification = "centroid", M = 10)
# You can try other clustering methods like hierarchical etc and other
# classifiers like kNN etc. Check help(prediction.strength) for more info.
plot(seq(2, 5), pp$mean.pred[-1],
     xlab = "Number of clusters", ylab = "Prediction strength")
#
irisnew <- iris
irisnew[iris[, 5] == "virginica", 1] <- irisnew[iris[, 5] == "virginica", 1] + 2
ss <- seq(1, dim(iris)[1])[iris[, 5] == "virginica"]
irisnew[ss[1:25], 2] <- irisnew[ss[1:25], 2] + 4
pp <- prediction.strength(irisnew[, -5], Gmin = 2, Gmax = 10,
                          clustermethod = kmeansCBI,
                          classification = "centroid", M = 10)
plot(seq(2, 10), pp$mean.pred[-1],
     xlab = "Number of clusters", ylab = "Prediction strength")
# Do a couple of times and check how many clusters are selected.
##
# You can also try different clustering and classification methods...
irisnew <- iris
irisnew[iris[, 5] == "virginica", 1] <- irisnew[iris[, 5] == "virginica", 1] + 2
ss <- seq(1, dim(iris)[1])[iris[, 5] == "virginica"]
irisnew[ss[1:25], 2] <- irisnew[ss[1:25], 2] + 4
pp <- prediction.strength(irisnew[, -5], Gmin = 2, Gmax = 10,
                          clustermethod = hclustCBI, method = "complete",
                          cut = "level", classification = "knn",
                          nnk = 5, M = 10)
plot(seq(2, 10), pp$mean.pred[-1],
     xlab = "Number of clusters", ylab = "Prediction strength")
######
## Cluster comparisons.
# Rand index = (number of pairs that are co-clustered in both +
#   number of pairs that are not co-clustered in both sets) / all pairs.
# vi = variation of information (information theoretic measure -
#   small means lots of overlap).
kk <- kmeans(irisnew[, -5], 3)
hh <- cutree(hclust(dist(irisnew[, -5])), 3)
cluster.stats(clustering = kk$cluster, alt.clustering = hh, compareonly = TRUE)
table(kk$cluster, hh)
#
kk <- kmeans(irisnew[, -5], 2)
hh <- cutree(hclust(dist(irisnew[, -5])), 2)
cluster.stats(clustering = kk$cluster, alt.clustering = hh, compareonly = TRUE)
table(kk$cluster, hh)
######
# Stability of clustering results - using the bootstrap to assess,
# similar to my code above.
par(mfrow = c(2, 2))
pp <- clusterboot(irisnew[, -5], B = 2, showplots = TRUE,
                  clustermethod = kmeansCBI, krange = 3, multipleboot = FALSE)
# Repeat the bootstrap stability check for larger K.
pp <- clusterboot(irisnew[, -5], B = 2, showplots = TRUE,
                  clustermethod = kmeansCBI, krange = 4, multipleboot = FALSE)
pp <- clusterboot(irisnew[, -5], B = 2, showplots = TRUE,
                  clustermethod = kmeansCBI, krange = 5, multipleboot = FALSE)
###
# Summarizing variable distributions for each cluster.
kk <- kmeans(irisnew[, -5], 2)
cluster.varstats(kk$cluster, irisnew[, -5])
# If you run this on a higher dimensional data set, set ask to FALSE and
# then check for each variable separately!
cc <- cluster.varstats(kk$cluster, irisnew[, -5], ask = FALSE)
plot(cc[[1]])  # checking variable 1
###