###### DEMO 13b ######
load("TCGAData.RData")
# 
library(irlba)
# fast svd for large data sets
library(plot3D)
library(rgl)
####
# compute the leading K components
K<-10
ss<-irlba(TCGA,K,K)
plot3d(ss$u[,1:3],col=as.numeric(as.factor(TCGAclassstr)))
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
# Kmeans on the leading Principal components
kk<-kmeans(ss$u[,1:10],6)
#
library(mda)
confusion(kk$cluster,TCGAclassstr)
### SVD + kmeans - find the cancer classes?
# Finds subtypes of breast cancer, combines some of the other cancers
# What about pam?
library(cluster)
pp<-pam(ss$u,6)
plot(pp)
# Better silhouette plot
plot(pp$silinfo$widths[,3],col=pp$silinfo$widths[,1],type="h")
confusion(pp$clustering,TCGAclassstr)
### what about PAM
# Not bad!
#
#### consensus clustering 
# I am using a random subset of genes to cluster, repeat 50 times - check overlap
DD<-matrix(0,2887,2887)
for (zz in (1:50)) {
  ss<-irlba(TCGA[,sample(seq(1,20530),2500)],10,10) #random set of 2500 genes from the 20530
  kk<-kmeans(ss$u[,1:10],6) #apply kmeans to leading components
  dd<-as.matrix(dist(kk$cluster)) # check if observations are in the same cluster or not
  dd[dd!=0]<-1 # set distance to 1 if not in same cluster
  DD<-DD+(1-dd) # add all distances together across 50 iterations
  cat(zz)}  # takes a while- cat(zz) prints the iteration number so you see where you are in the loop
#
hc<-hclust(as.dist(1-DD/50)) # clustering based on consensus
plot(hc,labels=TCGAclassstr)
# nicer dendrogram
library(dendextend)
dend<-as.dendrogram(hc)
cols<-as.numeric(as.factor(TCGAclassstr))
labels_colors(dend)<-cols[order.dendrogram(dend)]
plot(dend)
# Not bad! Even picks up the small cancers as mainly co-clustered
k<-cutree(hc,k=6)
confusion(k,TCGAclassstr) # can't find uterine cancer (the smallest class)
#
k<-cutree(hc,k=8)
confusion(k,TCGAclassstr) # if you ask for more clusters, you start splitting breast cancer into smaller clusters
#
k<-cutree(hc,k=7)
confusion(k,TCGAclassstr)
# how reproducible are the clusters?
image(DD)
image(DD[hc$order,hc$order])
### BC is much larger and more variable - need more clusters to summarize 
### Difficult to detect Uterine cancer (small group)
### split of BC into subtypes
#
#
########
# Let's try some filtering of features
vv<-apply(TCGA,2,sd)
vv<-rev(sort.list(vv)) # top most varying genes
# some examples of filtered features
par(mfrow=c(2,2))
hist(TCGA[,vv[2]])
hist(TCGA[,vv[30]])
hist(TCGA[,vv[2530]])
hist(TCGA[,vv[20530]])
# The most varying genes may pick up different cancers..
guse<-vv[1:250]  # use top 250 genes
#
library(gplots)
heatmap.2(as.matrix(TCGA[,guse]),
          col=colorRampPalette(c("red","white","blue"))(64),RowSideColors = as.character(cols),
          hclust=function(x) hclust(x,method="complete"),scale="column",trace="none") #euclidean distance for clustering
#bi-clustering: clustering both observations and variables
heatmap.2(as.matrix(TCGA[,guse]),
          col=colorRampPalette(c("red","white","blue"))(64),RowSideColors = as.character(cols),
          hclust=function(x) hclust(x,method="complete"),scale="column",trace="none",
          distfun=function(x) as.dist((1-cor(t(x),method="pearson",use="pairwise.complete.obs"))/2))
#correlation-based clustering
#
### SVD on filtered data
guse<-vv[1:2000]
# use top 2000 genes
ss<-irlba(TCGA,10,10)
plot3d(ss$u[,1:3],col=as.numeric(as.factor(TCGAclassstr)))
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
# 
plot3d(TCGA[,vv[1:3]],col=as.numeric(as.factor(TCGAclassstr)))
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
# Note that just filtering on variance might lead to a particular class dominating - here it's Lung cancer
plot3d(TCGA[,vv[4:6]],col=as.numeric(as.factor(TCGAclassstr)))
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
# Risk missing a small class this way...
plot3d(TCGA[,vv[7:9]],col=as.numeric(as.factor(TCGAclassstr)))
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
# subtypes of Lung cancer? 
#######################################
# consensus clustering
source("https://bioconductor.org/biocLite.R")
biocLite("ConsensusClusterPlus")
browseVignettes("ConsensusClusterPlus")
# Cannot install directly to Rstudio depending on version - go via Bioconductor
library(ConsensusClusterPlus)
# this can be a bit slot depending on the number of features and observations!
ii<-sample(seq(1,2887),500) # I run on a random set of 500 observations (for speed only - try different numbers of observations and repeat)
cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,clusterAlg="km")
# Check for different number of clusters - tends to be 3-5 selected for this data set depending on the observations you randomly choose
ccu<-cc[[5]]
confusion(ccu$consensusClass,TCGAclassstr[ii])
calcICL(cc)
### looks like we can't find all cancer classes using clustering..
# What about using a different method? correlation and hierarchical clustering
cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,distance="pearson",clusterAlg="hc")
ccu<-cc[[4]]
confusion(ccu$consensusClass,TCGAclassstr[ii])
calcICL(cc)
# correlation and pam
cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,distance="pearson",clusterAlg="pam")
ccu<-cc[[5]]
confusion(ccu$consensusClass,TCGAclassstr[ii])
calcICL(cc)
#### Very different clustering outcome depending on method and distance metric used!!!
#######
#######
#
# subspace clustering of the iris data
library(subspace)
#
irisnew<-iris
irisnew[iris[,5]=="virginica",1]<-irisnew[iris[,5]=="virginica",1]+2
ss<-seq(1,dim(iris)[1])[iris[,5]=="virginica"]
irisnew[ss[1:25],2]<-irisnew[ss[1:25],2]+3
# I create an iris data set with 4 distinct clusters here
cc<-CLIQUE(irisnew[,-5],xi=15,tau=0.15)
plot(cc,irisnew[,-5])
#
#cc<-P3C(irisnew[,-5],0.01,5)
#cc<-SubClu(irisnew[,-5],minSupport=10,epsilon=3)
# other subspace methods
#
library(orclus)
cc<-orclus(as.matrix(iris[,1:4]),k=3,l=2,k0=5,inner.loops=10) # method says how many clusters and subspace dimension used
plot(iris[,3:4],col=cc$cluster,pch=as.numeric(iris[,5]))
cc$subspaces
#
kk<-kmeans(iris[,1:4],3)
plot(iris[,3:4],col=kk$cluster,pch=as.numeric(iris[,5]))
# to try at home: create a data set where clusters really do live in different
# subspace (i.e. mean differences in some but not all dimensions)
####
# on the cancer data
ss<-irlba(as.matrix(TCGA[ii,guse[1:100]]),10,10)
#
cc<-orclus(ss$u,k=6,l=5,k0=10,inner.loops=10)
confusion(cc$cluster,TCGAclassstr[ii])
# try different size subspaces and clusters
# try on feature selected data
cc<-orclus(as.matrix(TCGA[ii,guse[1:100]]),k=6,l=6,k0=10,inner.loops=10)
confusion(cc$cluster,TCGAclassstr[ii])
###
### High-dimensional classification package 
library(HDclassif)
cc<-hddc(irisnew[,-5],K=2:5) # chooses number of clusters and subspace dimension
table(cc$class,irisnew[,5]) 
cc$d # which intrinsic dimension do the clusters live in?
cc$all_results #
### This takes a while.... Run on a smaller set of observations and/or dimensions first 
ccTCGA<-hddc(as.matrix(TCGA[ii,guse[1:100]]),K=3:10)
table(ccTCGA$class,TCGAclassstr[ii])
ccTCGA$all_results
plot(ccTCGA)
# Using same subspace for all clusters
ccTCGA2<-hddc(as.matrix(TCGA[ii,guse[1:100]]),K=3:10,model="AjBQD")
table(ccTCGA2$class,TCGAclassstr[ii])
plot(ccTCGA2)
ccTCGA2$all_results
# compare with just leading SVD ACROSS whole data set = NOT the same thing as common PC for all clusters!
ss<-svd(TCGA[ii,guse[1:100]])
library(mclust)
kk<-Mclust(ss$u[,1:dim(ccTCGA2$Q)[2]])
table(kk$class,TCGAclassstr[ii])
# Notice that filtering based on variance and using model based dimension reduction works much better than svd filtering on all
#
# Even more generous filtering
ccTCGA3<-hddc(as.matrix(TCGA[ii,guse[1:250]]),K=3:10,model="AjBQD")
table(ccTCGA3$class,TCGAclassstr[ii])
plot(ccTCGA3)
#
ss<-svd(TCGA[ii,guse[1:250]])
kk<-Mclust(ss$u[,1:dim(ccTCGA3$Q)[2]])
table(kk$class,TCGAclassstr[ii])
#####
# Conclusion: How you filter matters - different clusters are detected...
kk<-Mclust(TCGA[ii,guse[1:100]])
table(kk$class,TCGAclassstr[ii])
#
kk<-Mclust(TCGA[ii,guse[1:250]])
table(kk$class,TCGAclassstr[ii])
#
kk<-Mclust(TCGA[ii,guse[1:250]],G=8)
table(kk$class,TCGAclassstr[ii])
# increasing the number of clusters if we use just filtering does not help...
### SVD filtering only 
kk<-Mclust(ss$u[,1:50],G=8)
table(kk$class,TCGAclassstr[ii])
# 
kk<-Mclust(ss$u[,1:25])
table(kk$class,TCGAclassstr[ii])