##### DEMO 13 ####
# Cluster prediction strength for selecting the number of clusters.
# The code below is cheating a bit since it is not trying all combos of
# random splits - just for illustration. The package fpc has many cluster
# validation functions in it - see further down.
library(cluster)
library(MASS)  # lda() lives in MASS - it was never loaded in the original script

ii <- sample(seq(1, 150), 100)  # subsample 100 of the 150 iris rows
#
Kvec <- seq(2, 10)             # candidate numbers of clusters
pvec <- 0 * Kvec               # overlap score for each K
ij <- sample(seq(1, length(ii)), round(length(ii) / 2))
Aset <- iris[ii, ][ij, ]       # "training" half
Bset <- iris[ii, ][-ij, ]      # "test" half
for (kk in seq_along(Kvec)) {
  ppA <- pam(Aset[, 1:4], Kvec[kk])
  ppB <- pam(Bset[, 1:4], Kvec[kk])
  if (Kvec[kk] < 6) {
    # Colors = PAM clusters on Aset, plotting symbols = PAM clusters on Bset.
    pairs(rbind(Aset, Bset)[, -5],
          col = c(ppA$clust, rep("grey", dim(Bset)[1])),
          pch = c(rep(1, dim(Aset)[1]), ppB$clust))
    p <- locator()  # interactive pause: click/finish in the plot to continue
  }
  # Check if clusters overlap from training (Aset) to test (Bset) -
  # do colors and symbols overlap? Classify each half with an LDA
  # trained on the other half's cluster labels.
  lA <- lda(Aset[, 1:4], ppA$clust)
  pA <- predict(lA, newdata = Bset[, 1:4], type = "class")
  lB <- lda(Bset[, 1:4], ppB$clust)
  pB <- predict(lB, newdata = Aset[, 1:4], type = "class")
  #
  tAB <- table(pA$class, ppB$clust)
  tBA <- table(pB$class, ppA$clust)
  # Sum of per-cluster best matches in both directions = overlap score.
  pvec[kk] <- sum(apply(tAB, 1, max)) + sum(apply(tBA, 1, max))
}
plot(Kvec, pvec, xlab = "K", ylab = "Overlap")  # where is this maximized?
####
#
### Creating better separation to see if I can find 3 clusters then.
irisnew <- iris
# Shift sepal length of virginica by +2 to pull that species away.
irisnew[iris[, 5] == "virginica", 1] <- irisnew[iris[, 5] == "virginica", 1] + 2
#
library(MASS)  # lda() lives in MASS - it was never loaded in the original script
ii <- sample(seq(1, 150), 100)
Kvec <- seq(2, 10)
pvec <- 0 * Kvec
ij <- sample(seq(1, length(ii)), round(length(ii) / 2))
Aset <- irisnew[ii, ][ij, ]
Bset <- irisnew[ii, ][-ij, ]
for (kk in seq_along(Kvec)) {
  ppA <- pam(Aset[, 1:4], Kvec[kk])
  ppB <- pam(Bset[, 1:4], Kvec[kk])
  if (Kvec[kk] < 6) {
    pairs(rbind(Aset, Bset)[, -5],
          col = c(ppA$clust, rep("grey", dim(Bset)[1])),
          pch = c(rep(1, dim(Aset)[1]), ppB$clust))
    p <- locator()  # interactive pause
  }
  # Cross-classify the two halves via LDA as in the first demo.
  lA <- lda(Aset[, 1:4], ppA$clust)
  pA <- predict(lA, newdata = Bset[, 1:4], type = "class")
  lB <- lda(Bset[, 1:4], ppB$clust)
  pB <- predict(lB, newdata = Aset[, 1:4], type = "class")
  #
  tAB <- table(pA$class, ppB$clust)
  tBA <- table(pB$class, ppA$clust)
  pvec[kk] <- sum(apply(tAB, 1, max)) + sum(apply(tBA, 1, max))
}
plot(Kvec, pvec, xlab = "K", ylab = "Overlap")
#
### Creating 4 separated clusters.
irisnew <- iris
irisnew[iris[, 5] == "virginica", 1] <- irisnew[iris[, 5] == "virginica", 1] + 2
ss <- seq(1, dim(iris)[1])[iris[, 5] == "virginica"]
# Split virginica into two groups by also shifting sepal width of half of it.
irisnew[ss[1:25], 2] <- irisnew[ss[1:25], 2] + 3
#
ii <- sample(seq(1, 150), 100)
Kvec <- seq(2, 10)
pvec <- 0 * Kvec
ij <- sample(seq(1, length(ii)), round(length(ii) / 2))
Aset <- irisnew[ii, ][ij, ]
Bset <- irisnew[ii, ][-ij, ]
for (kk in seq_along(Kvec)) {
  ppA <- pam(Aset[, 1:4], Kvec[kk])
  ppB <- pam(Bset[, 1:4], Kvec[kk])
  if (Kvec[kk] < 6) {
    pairs(rbind(Aset, Bset)[, -5],
          col = c(ppA$clust, rep("grey", dim(Bset)[1])),
          pch = c(rep(1, dim(Aset)[1]), ppB$clust))
    p <- locator()  # interactive pause
  }
  lA <- lda(Aset[, 1:4], ppA$clust)
  pA <- predict(lA, newdata = Bset[, 1:4], type = "class")
  lB <- lda(Bset[, 1:4], ppB$clust)
  pB <- predict(lB, newdata = Aset[, 1:4], type = "class")
  #
  tAB <- table(pA$class, ppB$clust)
  tBA <- table(pB$class, ppA$clust)
  pvec[kk] <- sum(apply(tAB, 1, max)) + sum(apply(tBA, 1, max))
}
plot(Kvec, pvec, xlab = "K", ylab = "Overlap")
####
#
#### Package for cluster validation.
library(fpc)
# Read the helpfile about the package.
library(help = "fpc")  # online manual also
pp <- prediction.strength(iris[, -5], Gmin = 2, Gmax = 5,
                          clustermethod = kmeansCBI,
                          classification = "centroid", M = 10)
# You can try other clustering methods like hierarchical etc and other
# classifiers like kNN etc. Check help(prediction.strength) for more info.
plot(seq(2, 5), pp$mean.pred[-1],
     xlab = "Number of clusters", ylab = "Prediction strength")
#
irisnew <- iris
irisnew[iris[, 5] == "virginica", 1] <- irisnew[iris[, 5] == "virginica", 1] + 2
ss <- seq(1, dim(iris)[1])[iris[, 5] == "virginica"]
irisnew[ss[1:25], 2] <- irisnew[ss[1:25], 2] + 4
pp <- prediction.strength(irisnew[, -5], Gmin = 2, Gmax = 10,
                          clustermethod = kmeansCBI,
                          classification = "centroid", M = 10)
plot(seq(2, 10), pp$mean.pred[-1],
     xlab = "Number of clusters", ylab = "Prediction strength")
# Do a couple of times and check how many clusters are selected.
##
# You can also try different clustering and classification methods...
irisnew <- iris
irisnew[iris[, 5] == "virginica", 1] <- irisnew[iris[, 5] == "virginica", 1] + 2
ss <- seq(1, dim(iris)[1])[iris[, 5] == "virginica"]
irisnew[ss[1:25], 2] <- irisnew[ss[1:25], 2] + 4
pp <- prediction.strength(irisnew[, -5], Gmin = 2, Gmax = 10,
                          clustermethod = hclustCBI, method = "complete",
                          cut = "level", classification = "knn",
                          nnk = 5, M = 10)
plot(seq(2, 10), pp$mean.pred[-1],
     xlab = "Number of clusters", ylab = "Prediction strength")
######
## Cluster comparisons.
# Rand index = (number of pairs that are co-clustered in both +
#   number of pairs that are not co-clustered in both sets) / all pairs.
# vi = variation of information (information theoretic measure -
#   small means lots of overlap).
kk <- kmeans(irisnew[, -5], 3)
hh <- cutree(hclust(dist(irisnew[, -5])), 3)
cluster.stats(clustering = kk$cluster, alt.clustering = hh, compareonly = TRUE)
table(kk$cluster, hh)
#
kk <- kmeans(irisnew[, -5], 2)
hh <- cutree(hclust(dist(irisnew[, -5])), 2)
cluster.stats(clustering = kk$cluster, alt.clustering = hh, compareonly = TRUE)
table(kk$cluster, hh)
######
# Stability of clustering results - using the bootstrap to assess,
# similar to my code above.
par(mfrow = c(2, 2))
pp <- clusterboot(irisnew[, -5], B = 2, showplots = TRUE,
                  clustermethod = kmeansCBI, krange = 3, multipleboot = FALSE)
# Repeat the bootstrap stability check for larger K.
pp <- clusterboot(irisnew[, -5], B = 2, showplots = TRUE,
                  clustermethod = kmeansCBI, krange = 4, multipleboot = FALSE)
pp <- clusterboot(irisnew[, -5], B = 2, showplots = TRUE,
                  clustermethod = kmeansCBI, krange = 5, multipleboot = FALSE)
###
# Summarizing variable distributions for each cluster.
kk <- kmeans(irisnew[, -5], 2)
cluster.varstats(kk$cluster, irisnew[, -5])
# If you run this on a higher dimensional data set, set ask to FALSE and
# then check for each variable separately!
cc <- cluster.varstats(kk$cluster, irisnew[, -5], ask = FALSE)
plot(cc[[1]])  # checking variable 1
###