#### Clustering ####
library(MASS)      # for lda
library(cluster)   # for pam

# kmeans on the iris data, asking for 3 clusters
kk<-kmeans(iris[,1:4],3)
table(kk$cluster,iris$Species)
# Check if the cluster indices overlap with the true species labels
# Not great - it mixes up versicolor and virginica

# Can you classify?
ll<-lda(iris[,1:4],iris$Species)
pl<-predict(ll,iris[,1:4])
table(pl$class,iris$Species)
# Supervised learning (classification) can predict the labels -
# class discovery (clustering) is a more difficult problem!

#########################################
# How many clusters in the data?
# Run for K=1 to 10 and summarize the deviation from the cluster means:
# the so-called Within-Sum-of-Squares
W<-rep(0,10)
for (k in 2:10) {
  kk<-kmeans(iris[,1:4],k)
  W[k]<-kk$tot.withinss
}
# For K=1 the Within-SS is just the total sum of squares around the grand mean
mu<-apply(iris[,1:4],2,mean)
iriss<-iris[,1:4]-t(matrix(rep(mu,dim(iris)[1]),4,dim(iris)[1]))
W[1]<-sum(iriss^2)
plot(seq(1,10),W,xlab="K",ylab="Within-SS",type="l")
# This looks a bit like the RSS curve in regression. Remember what to look for?
# When does the curve level off? When does it stop paying off to add clusters?
# No point going beyond 2-3

#############
kk<-kmeans(iris[,1:4],3)
pairs(iris[,1:4],col=as.numeric(iris$Species)+1,pch=kk$cluster)
# 3 clusters for the iris data - check point symbol (cluster index) vs color (true label)
kk<-kmeans(iris[,1:4],2)
pairs(iris[,1:4],col=as.numeric(iris$Species)+1,pch=kk$cluster)

#########################
# PAM - Partitioning Around Medoids - a robust version of kmeans
pp<-pam(iris[,1:4],3)
plot(pp)
# Are the 3 clusters well separated?
pairs(iris[,1:4],col=as.numeric(iris$Species)+1,pch=pp$cluster)

# Pick K by the average silhouette width instead of the Within-SS
# (silhouette width is undefined for K=1, so we start at K=2)
W<-rep(0,10)
for (k in 2:10) {
  kk<-pam(dist(iris[,1:4]),k)
  W[k]<-kk$silinfo$avg.width
}
plot(seq(2,10),W[2:10],xlab="K",ylab="Silhouette Width",type="l")
# For which K is the silhouette width maximized?

##############################
# Hierarchical clustering
hh<-hclust(dist(iris[,1:4]))
plot(hh,labels=iris$Species)
# Pretty good separation

# Try a different joining mechanism (linkage) in the tree
hh<-hclust(dist(iris[,1:4]),"single")
plot(hh,labels=iris$Species)
# Big impact! You get what you ask for in clustering...

# How you measure distance between the observations also has an impact:
# here, correlation distance instead of Euclidean
cc<-cor(t(iris[,1:4]))
hh<-hclust(as.dist(1-cc))
plot(hh,labels=iris$Species)
# Choices: distance metric AND joining mechanism (linkage)

##########################################################
# Let's look at the numbers (handwritten digits) data
library(ElemStatLearn)
data(zip.train)
Numbers<-as.matrix(zip.train)
Nmat<-as.matrix(Numbers[,-1])

iu<-sample(seq(1,7291),500)   # 500 digits chosen at random

kk<-kmeans(Nmat[iu,],10)
table(kk$cluster,Numbers[iu,1])
# Check if the cluster indices overlap with the true digit labels
# Not great...
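# A quick way to put a number on the overlap in the table above: map each
# cluster to its most frequent true digit and compute the fraction of points
# that match. This "purity" score is an illustrative sketch added here, not a
# standard kmeans output; it uses only base R.
tt<-table(kk$cluster,Numbers[iu,1])
purity<-sum(apply(tt,1,max))/sum(tt)
purity
# Near 1: clusters line up with digits; much lower: lots of mixing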
#########################################
# How many clusters in the digits data?
# Run for K=1 to 25 and summarize the deviation from the cluster means:
# the so-called Within-Sum-of-Squares
W<-rep(0,25)
for (k in 2:25) {
  kk<-kmeans(Nmat[iu,],k)
  W[k]<-kk$tot.withinss
}
# For K=1, the total sum of squares around the grand mean
mu<-apply(Nmat[iu,],2,mean)
mss<-Nmat[iu,]-t(matrix(rep(mu,dim(Nmat[iu,])[1]),256,dim(Nmat[iu,])[1]))
W[1]<-sum(mss^2)
plot(seq(1,25),W,xlab="K",ylab="Within-SS",type="l")
# Levels off slowly - no obvious clustering structure

### Clustering after PCA dimension reduction
library(rgl)   # for plot3d
ssn<-svd(Nmat[iu,])
plot3d(ssn$u[,1:3],col=as.numeric(Numbers[iu,1])+1)
# A lot of overlap - only one cluster really stands out: the 1s
W<-rep(0,25)
for (k in 2:25) {
  kk<-kmeans(ssn$u[,1:10],k)
  W[k]<-kk$tot.withinss
}
mu<-apply(ssn$u[,1:10],2,mean)
mss<-ssn$u[,1:10]-t(matrix(rep(mu,dim(ssn$u[,1:10])[1]),10,dim(ssn$u[,1:10])[1]))
W[1]<-sum(mss^2)
plot(seq(1,25),W,xlab="K",ylab="Within-SS",type="l")

#############
# PAM - Partitioning Around Medoids - a robust version of kmeans
pp<-pam(Nmat[iu,],4)
plot(pp)

# Silhouette widths per observation, colored by cluster, with the true digit
# printed underneath
plot(pp$silinfo$widths[,3],col=pp$silinfo$widths[,1],type="h")
points(seq(1,500),rep(-0.1,500),pch=as.character((Numbers[iu,1])[as.numeric(rownames(pp$silinfo$widths))]))
# Looks like an OK cluster separation - but are the same digits grouped in the clusters?
table(pp$clustering,Numbers[iu,1])
# Not great...

##############################
# Hierarchical clustering
hh<-hclust(dist(Nmat[iu,]),"complete")
plot(hh,labels=Numbers[iu,1])

hh<-hclust(dist(Nmat[iu,]),"single")
plot(hh,labels=Numbers[iu,1])
# Single linkage aggressively builds the 1s as a cluster first

hh<-hclust(dist(Nmat[iu,]),"average")
plot(hh,labels=Numbers[iu,1])

cc<-cor(t(Nmat[iu,]))
hh<-hclust(as.dist(1-cc))
plot(hh,labels=Numbers[iu,1])
# One cluster can be found, but not all of them
# Clustering is much more difficult than classification!!!
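# To back up that point with a number: cut one of the dendrograms above into
# 10 groups with cutree() (base R, stats package) and tabulate against the
# true digits. The purity summary at the end is the same illustrative
# heuristic used earlier, not part of the original analysis.
hh<-hclust(dist(Nmat[iu,]),"average")
gg<-cutree(hh,k=10)
tt<-table(gg,Numbers[iu,1])
tt
sum(apply(tt,1,max))/sum(tt)   # fraction matching each cluster's majority digit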