library(MASS)
#
library(GGally)
data(iris)
ggpairs(data=iris, # data.frame with variables
        columns=1:4, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color = Species))
# Run kmeans a couple of times ... kmeans starts from a random partition - can fail!
# random starting point is used..
kk<-kmeans(iris[,1:4],3)
table(kk$cluster,iris$Species) # Check if the cluster indices overal with the true species labels
#
mydata<-iris
mydata$km<-as.factor(kk$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:4, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color = Species, shape = km))
#
ll<-lda(iris[,1:4],iris$Species)
pl<-predict(ll,iris[,1:4])
table(pl$class,iris$Species)
# Supervised (classification) can predict the labels - class discovery (clustering) is a more difficult problem!
mydata[,1:4]<-scale(mydata[,1:4])
kk<-kmeans(iris[,1:4],3)
table(kk$cluster,iris$Species) # Check if the cluster indices overal with the true species labels
#
mydata$km<-as.factor(kk$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:4, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color = Species, shape = km)
)
#
#########################################
# How many clusters in the data?
# Run for K=1 to 10 and summarize the deviation from the cluster means
# The so-called Within-Sum-of-Squares
data(iris)
W<-rep(0,10)
for (k in 2:10) {
  kk<-kmeans(iris[,1:4],k)
  W[k]<-kk$tot.with }
mu<-apply(iris[,1:4],2,mean)
iriss<-iris[,1:4]-t(matrix(rep(mu,dim(iris)[1]),4,dim(iris)[1]))
W[1]<-sum(iriss^2)
plot(seq(1,10),W,xlab="K",ylab="Within-SS",type="b")
# This looks a bit like the RSS-curve in regression. 
# Remember what to look for? When does the curve level off? 
# When does it stop paying off to add clusters?
############# 
library(cluster) 
#########################
# PAM - partion around mediods - a robust version of kmeans
pp<-pam(iris[,1:4],3)
plot(pp)
# Are the 3 clusters well separated?
pairs(iris[,1:4],col=as.numeric(iris$Species)+1,pch=pp$cluster)
#
clusplot(iris[,1:4], pp$cluster, color=TRUE, shade=TRUE, 
         labels=2, lines=0)

#
W<-rep(0,10)
for (k in 2:10) {
  kk<-pam(dist(iris[,1:4]),k)
  W[k]<-kk$sil$avg }
mu<-apply(iris[,1:4],2,mean)
iriss<-iris[,1:4]-t(matrix(rep(mu,dim(iris)[1]),4,dim(iris)[1]))
plot(seq(2,10),W[2:10],xlab="K",ylab="Silhouette Width",type="l")
# For which K is the silhouette width maximized?
##############################
# hierarchical clustering
hh<-hclust(dist(iris[,1:4]))
plot(hh,label=iris$Species)
table(cutree(hh,k=3),iris$Species)
# pretty good separation
# Try a different joining mechanism in the tree
hh<-hclust(dist(iris[,1:4]),"single")
plot(hh,label=iris$Species)
table(cutree(hh,k=3),iris$Species)
#
hh<-hclust(dist(iris[,1:4]),"ward.D")
plot(hh,label=iris$Species)
table(cutree(hh,k=3),iris$Species)
# Big impact! You get what you ask for in clustering...
cc<-cor(t(iris[,1:4]))
hh<-hclust(as.dist(1-cc))
plot(hh,label=iris$Species)
table(cutree(hh,k=3),iris$Species)
# Also - how you measure distance between the observations also has an impact.
# Choices: distance metric AND joining meachnism (linkage)
##########################################################
# Let's look at the numbers data
Nmat<-Numbers[,-1]
N<-2000
iu<-sample(seq(1,7291),N)
# N digits chosen at random
kk<-kmeans(Nmat[iu,],10)
table(kk$cluster,Numbers[iu,1])
# Check if the cluster indices overal with the true species labels
#
# How many clusters in the data?
# Run for K=1 to 10 and summarize the deviation from the cluster means
# The so-called Within-Sum-of-Squares
W<-rep(0,25)
for (k in 2:25) {
  kk<-kmeans(Nmat[iu,],k)
  W[k]<-kk$tot.with }
mu<-apply(Nmat[iu,],2,mean)
mss<-Nmat[iu,]-t(matrix(rep(mu,dim(Nmat[iu,])[1]),256,dim(Nmat[iu,])[1]))
W[1]<-sum(mss^2)
plot(seq(1,25),W,xlab="K",ylab="Within-SS",type="l")
### Clustering after PCA dimension reduction
ssn<-svd(Nmat[iu,])
plot3d(ssn$u[,1:3],col=as.numeric(Numbers[iu,1]))
#
W<-rep(0,25)
for (k in 2:25) {
  kk<-kmeans(ssn$u[,1:10],k)
  W[k]<-kk$tot.with }
mu<-apply(ssn$u[,1:10],2,mean)
mss<-ssn$u[,1:10]-t(matrix(rep(mu,dim(ssn$u[,1:10])[1]),10,dim(ssn$u[,1:10])[1]))
W[1]<-sum(mss^2)
plot(seq(1,25),W,xlab="K",ylab="Within-SS",type="l")
############# 
# PAM - partion around mediods - a robust version of kmeans
pp<-pam(Nmat[iu,],4)
plot(pp)
#
plot(pp$silinfo$widths[,3],col=pp$silinfo$widths[,1],type="h")
points(seq(1,N),rep(-0.1,N),pch=as.character((Numbers[iu,1])[as.numeric(rownames(pp$silinfo$widths))]))
# Looks like an OK cluster separation - but are the digits in the clusters?
table(pp$clustering,Numbers[iu,1])
#
pp<-pam(ssn$u[,1:15],4)
plot(pp$silinfo$widths[,3],col=pp$silinfo$widths[,1],type="h")
points(seq(1,N),rep(-0.1,N),pch=as.character((Numbers[iu,1])[as.numeric(rownames(pp$silinfo$widths))]))
# Looks like an OK cluster separation - but are the digits in the clusters?
table(pp$clustering,Numbers[iu,1])
# Not great!!!
##############################
# hierarchical clustering
hh<-hclust(dist(Nmat[iu,]),"complete")
plot(hh,label=Numbers[iu,1])
# 
hh<-hclust(dist(Nmat[iu,]),"single")
plot(hh,label=Numbers[iu,1])
# Single linkage aggressively builds the 1s as a cluster first
hh<-hclust(dist(Nmat[iu,]),"average")
plot(hh,label=Numbers[iu,1])
#
cc<-cor(t(Nmat[iu,]))
hh<-hclust(as.dist(1-cc))
plot(hh,label=Numbers[iu,1])
#####################
library(mclust)
kk<-Mclust(iris[,1:4])
table(kk$class,iris$Species)
summary(kk)
plot(kk) # try density, BIC
#
kk<-Mclust(iris[,1:4],model="EEE")
table(kk$class,iris$Species)
summary(kk)
plot(kk) # try density, BIC
#
kk<-Mclust(iris[,1:4],G=3)
table(kk$class,iris$Species)
summary(kk)
plot(kk) # try density, BIC
###########################
kk<-Mclust(Nmat[iu,]) # REALLY SLOW!!!
table(kk$class,Numbers[iu,1])
#
ssn<-svd(Nmat[iu,])
kk<-Mclust(ssn$u[,1:15]) # reduced dimension
table(kk$class,Numbers[iu,1])
plot(kk)
# Not so easy....
##############################
install.packages("dbscan")
library(dbscan)
##
db<-dbscan(iris[,1:4], eps=1, minPts = 10)
table(db$cluster,iris$Species)
#
db<-dbscan(iris[,1:4], eps=.5, minPts = 10)
table(db$cluster,iris$Species)
#
db<-dbscan(iris[,1:4], eps=2, minPts = 5)
table(db$cluster,iris$Species)
# tuning parameters eps and minPts + choice of distance
kNNdistplot(iris[,1:4], k=10) # eps .5?
#
db<-dbscan(iris[,1:4], eps=.5, minPts = 2)
table(db$cluster,iris$Species)
#
mydata<-iris
mydata$cl<-as.factor(db$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:4, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color = Species, shape=cl))
#
par(mfrow=c(1,1))
hullplot(iris[,1:4],db)
## other types of distances
db<-dbscan(dist(apply(iris[,1:4],2,scale),"manhattan"), eps=1, minPts = 15)
table(db$cluster,iris$Species)
#
mydata<-iris
mydata$cl<-as.factor(db$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:4, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color = Species, shape=cl))
# 
library(kernlab)
data(spirals)
spirs<-as.data.frame(spirals)
names(spirs)<-c("x","y")
ggpairs(data=spirs, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="spirals data", # title of the plot
)
#
kNNdistplot(spirs,k=5) #.17?
#
db<-dbscan(spirs[,1:2], eps=.17, minPts = 5)
mydata<-spirs
mydata$cl<-as.factor(db$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color =cl))

# sensitive to choices...
db<-dbscan(spirs[,1:2], eps=.17, minPts = 10, borderPoints = T)
mydata<-spirs
mydata$cl<-as.factor(db$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color =cl))
#
db<-dbscan(spirs[,1:2], eps=.2, minPts = 5, borderPoints = T)
mydata<-spirs
mydata$cl<-as.factor(db$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color =cl))
#
kk<-kmeans(spirs[,1:2],2)
mydata<-spirs
mydata$cl<-as.factor(kk$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color =cl))
#
kk<-cutree(hclust(dist(spirs[,1:2])),k=2)
mydata<-spirs
mydata$cl<-as.factor(kk)
ggpairs(data=mydata, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color =cl))
#
kk<-cutree(hclust(dist(spirs[,1:2])),k=10)
mydata<-spirs
mydata$cl<-as.factor(kk)
ggpairs(data=mydata, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color =cl))
####################
db<-hdbscan(spirs[,1:2], minPts = 8)
plot(db$hc)
db
#
mydata<-spirs
mydata$cl<-as.factor(db$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Iris Data", # title of the plot
        mapping = ggplot2::aes(color =cl))
###
data(moons)
ggpairs(data=moons, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Moons", # title of the plot
        )
#
kNNdistplot(moons[,1:2],k=5) # .4?
#
db<-dbscan(moons[,1:2], eps=.4, minPts = 5)
mydata<-moons
mydata$cl<-as.factor(db$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Moons", # title of the plot
        mapping = ggplot2::aes(color =cl))
#
db<-dbscan(moons[,1:2], eps=.5, minPts = 5)
mydata<-moons
mydata$cl<-as.factor(db$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Moons", # title of the plot
        mapping = ggplot2::aes(color =cl))
#
db<-hdbscan(moons[,1:2], minPts = 5)
plot(db$hc)
db
#
mydata<-moons
mydata$cl<-as.factor(db$cluster)
ggpairs(data=mydata, # data.frame with variables
        columns=1:2, # columns to plot, default to all.
        title="Moons", # title of the plot
        mapping = ggplot2::aes(color =cl))
# some nice plotting functions
db<-dbscan(moons[,1:2], eps=.5, minPts = 5)
library("factoextra")
fviz_cluster(db, moons, stand = FALSE, frame = FALSE, geom = "point")
########
ssn<-svd(Nmat[iu,])
kNNdistplot(ssn$u[,1:6],k=10) # eps 0.02?
# 
db<-dbscan(ssn$u[,1:6], eps=.015, minPts = 10)
db
table(db$clust,Numbers[iu,1])
#
kNNdistplot(ssn$u[,1:25],k=10) #
# 
db<-dbscan(ssn$u[,1:25], eps=.04, minPts = 10)
db
table(db$clust,Numbers[iu,1])
#