# Model-based clustering (mclust) and variable selection (clustvarsel)
# demonstrated on the iris data.
#
# Install the packages (once) and activate them:
# install.packages(c("mclust", "clustvarsel"))
library(mclust)
library(clustvarsel)

# The sections below are deliberately written out one after another (rather
# than wrapped in helper functions) so each can be run on its own.
# `p <- locator()` is used as a pause: it waits for mouse input on the active
# graphics device before the script continues.

#
# To explore the sensitivity of Mclust, here I run the code 3 times on
# a random subset of observations.
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  mm <- Mclust(iris[ii, -5])

  # This print command tells you how many clusters and how complex
  # (equal or varying volume, shape or orientation) the clusters are.
  print(summary(mm))

  # Checking whether the clusters recover the original class labels.
  # `mm$classification` is the most probable cluster for each observation;
  # unlike indexing into apply(mm$z, 1, sort.list) it also works when the
  # selected number of clusters is 1.
  print(table(iris[ii, 5], mm$classification))

  # The plotting command illustrates what the cluster distributions look like.
  # If you only want to plot a subset of the features use e.g.
  #   plot(mm, what = "classification", dimens = c(1, 2))
  # to plot only the sepal length and width.
  plot(mm, what = "classification")
  p <- locator()
}

#
# Below I use Mclust with specific modeling assumptions.
# VVV: volume, shape and orientation vary between clusters
ii <- sample(seq_len(nrow(iris)), 100)
mm <- Mclust(iris[ii, -5], G = 3, modelNames = "VVV")
print(summary(mm))
print(table(iris[ii, 5], mm$classification))
plot(mm, what = "classification")
p <- locator()

# EEE: volume, shape and orientation are the same for all clusters
mm <- Mclust(iris[ii, -5], G = 3, modelNames = "EEE")
print(summary(mm))
print(table(iris[ii, 5], mm$classification))
plot(mm, what = "classification")
p <- locator()

# VVI: volume and shape vary, no correlation
mm <- Mclust(iris[ii, -5], G = 3, modelNames = "VVI")
print(summary(mm))
print(table(iris[ii, 5], mm$classification))
plot(mm, what = "classification")
p <- locator()

# EII: equal volume, spherical clusters
mm <- Mclust(iris[ii, -5], G = 3, modelNames = "EII")
print(summary(mm))
print(table(iris[ii, 5], mm$classification))
plot(mm, what = "classification")
p <- locator()

#
# Clustvarsel with 3 clusters
# NOTE(review): `cc$step` appears to rely on `$` partial matching of a longer
# component name (`steps.info` in current clustvarsel) -- confirm against the
# installed package version.
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  cc <- clustvarsel(iris[ii, -5], G = 3)
  print(cc$step)    # summarizes the feature selection procedure
  print(cc$subset)  # final variable set
  p <- locator()
}

#
# Clustvarsel with 2 clusters
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  cc <- clustvarsel(iris[ii, -5], G = 2)
  print(cc$step)
  print(cc$subset)
  p <- locator()
}

#
# You can also play with different Mclust models.
# `emModels2` takes the multivariate model name(s) directly; looking up
# mclust.options("VEV") does not yield a model-name vector because "VEV" is
# a model name, not an option name.
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  cc <- clustvarsel(iris[ii, -5], G = 3, emModels2 = "VEV")
  print(cc$step)    # summarizes the feature selection procedure
  print(cc$subset)  # final variable set
  p <- locator()
}

#
# I add some noise features to the data. Is ClustVarSel able to handle this?
# Columns "1".."4" are pure N(0, 1) noise; column 9 is Species.
newiris <- as.data.frame(cbind(matrix(rnorm(150 * 4), 150, 4), iris))
names(newiris) <- c("1", "2", "3", "4", names(iris))
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  cc <- clustvarsel(newiris[ii, -9], G = 3)
  print(cc$step)    # summarizes the feature selection procedure
  print(cc$subset)  # final variable set
  p <- locator()
}

#
# Here I add a set of features that are duplicates of the iris data but
# with added noise. ClustVarSel does a pretty good job at finding the
# noiseless features as cluster related and leaving the noisy features
# as indirectly related to the clustering.
# Try different noise levels by changing the sd in the rnorm().
newiris <- as.data.frame(
  cbind(matrix(rnorm(150 * 4, sd = .1), 150, 4) + iris[, 1:4], iris)
)
names(newiris) <- c("1", "2", "3", "4", names(iris))
for (kk in 1:3) {
  ii <- sample(seq_len(nrow(iris)), 100)
  cc <- clustvarsel(newiris[ii, -9], G = 3)
  print(cc$step)    # summarizes the feature selection procedure
  print(cc$subset)  # final variable set
  p <- locator()
}
##