# Install the packages and activate them.
library(mclust)
library(clustvarsel)
#
# To explore the sensitivity of Mclust, here I run the code 3 times on
# a random subset of observations.
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  mm <- Mclust(iris[ii, -5])
  # This print command tells you how many clusters and how complex
  # (equal or varying volume, shape or orientation) the clusters are.
  print(summary(mm))
  # Checking if the clusters can recover the original class labels.
  # mm$classification is the MAP cluster label per observation — equivalent
  # to (but much clearer than) apply(mm$z, 1, sort.list)[mm$G, ].
  print(table(iris[ii, 5], mm$classification))
  # The plotting command illustrates what the cluster distributions look like.
  # If you only want to plot a subset of the features, use
  # plot(mm, c(subset), what = "classification"), e.g. c(1, 2) if you want
  # to plot only the sepal width and length.
  plot(mm, what = "classification")
  p <- locator()  # pause: click on the plot to continue
}
#
# Below I use Mclust with specific modeling assumptions.
# (Spell out modelNames in full — the original "modelName" only worked via
# R's partial argument matching.)
# VVV: volume, shape and orientation vary between clusters.
ii <- sample(seq_len(nrow(iris)), 100)
mm <- Mclust(iris[ii, -5], G = 3, modelNames = "VVV")
print(summary(mm))
print(table(iris[ii, 5], mm$classification))
plot(mm, what = "classification")
p <- locator()
# EEE: volume, shape and orientation the same for all clusters.
mm <- Mclust(iris[ii, -5], G = 3, modelNames = "EEE")
print(summary(mm))
print(table(iris[ii, 5], mm$classification))
plot(mm, what = "classification")
p <- locator()
# VVI: volume and shape vary, no correlation (diagonal covariances).
mm <- Mclust(iris[ii, -5], G = 3, modelNames = "VVI")
print(summary(mm))
print(table(iris[ii, 5], mm$classification))
plot(mm, what = "classification")
p <- locator()
# EII: equal volume, spherical clusters.
mm <- Mclust(iris[ii, -5], G = 3, modelNames = "EII")
print(summary(mm))
print(table(iris[ii, 5], mm$classification))
plot(mm, what = "classification")
p <- locator()
#
# clustvarsel with 3 clusters: greedy variable selection for clustering.
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  cc <- clustvarsel(iris[ii, -5], G = 3)
  # NOTE(review): $step partial-matches $steps.info in current clustvarsel;
  # confirm against the installed package version.
  print(cc$step)    # summarizes the feature selection procedure
  print(cc$subset)  # final variable set
  p <- locator()
}
#
# Same, but asking for 2 clusters.
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  cc <- clustvarsel(iris[ii, -5], G = 2)
  print(cc$step)
  print(cc$subset)
  p <- locator()
}
#
# You can also restrict the Mclust models considered in the selection.
# BUG FIX: "VEV" is not an mclust option name, so mclust.options("VEV")
# returned NULL and the restriction was silently ignored; emModels2 takes
# the model names directly.
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  cc <- clustvarsel(iris[ii, -5], G = 3, emModels2 = "VEV")
  print(cc$step)    # summarizes the feature selection procedure
  print(cc$subset)  # final variable set
  p <- locator()
}
#
# I add some pure-noise features to the data. Is clustvarsel able to
# handle this?
newiris <- as.data.frame(cbind(matrix(rnorm(150 * 4), 150, 4), iris))
# Use proper names for the noise columns; purely numeric names such as "1"
# would need backticks to reference.
names(newiris) <- c(paste0("noise", 1:4), names(iris))
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  cc <- clustvarsel(newiris[ii, -9], G = 3)  # column 9 is Species
  print(cc$step)    # summarizes the feature selection procedure
  print(cc$subset)  # final variable set
  p <- locator()
}
#
# Here I add a set of features that are duplicates of the iris data but
# with added noise. clustvarsel does a pretty good job at finding the
# noiseless features as cluster related and leaving the noisy features
# as indirectly related to the clustering.
# Try different noise levels by changing the sd in the rnorm().
# Noisy copies of the four iris measurements, appended to the original data.
newiris <- as.data.frame(
  cbind(matrix(rnorm(150 * 4, sd = 0.1), 150, 4) + iris[, 1:4], iris)
)
names(newiris) <- c(paste0("noisy", 1:4), names(iris))
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  cc <- clustvarsel(newiris[ii, -9], G = 3)  # column 9 is Species
  print(cc$step)    # summarizes the feature selection procedure
  print(cc$subset)  # final variable set
  p <- locator()
}
##
###################################
# On the Numbers (zip code digits) data.
library(ElemStatLearn)
data(zip.train)
Numbers <- as.matrix(zip.train)  # column 1 holds the true digit label
library(irlba)
# Leading SVD components of the pixel columns. FIX: irlba was loaded but
# never used — irlba() computes only the requested leading singular vectors,
# which is much faster than a full svd() on this matrix. (Truncated SVD is
# approximate, but the leading components match svd() closely.)
ssn <- irlba(Numbers[, -1], nv = 10)
#
# Takes a while: checks lots of different models and numbers of clusters.
mm <- Mclust(ssn$u[, 1:10])
# Selection 1 in the interactive menu checks the number of clusters for
# the different models.
plot(mm)
print(summary(mm))
print(table(Numbers[, 1], mm$classification))
# When I ran this, mclust selected 3 clusters:
# one cluster with 1s, one with 4, 7 and 9, and the rest in one cluster.
plot(mm, what = "classification")
####
mm <- Mclust(ssn$u[, 1:10], G = 10)
print(summary(mm))
print(table(Numbers[, 1], mm$classification))
# With 10 clusters: one with 2,4,6, one with 0s, one with 1s,
# one with 0s,5s,6s, one with 8s, one with 1s, one with 3s, one with 7s,
# one with 9s and one with 1s (again).
plot(mm, what = "classification")
##########################################