library(irlba)
# fast svd for large data sets
library(plot3D)
library(rgl)
####
ss<-irlba(TCGA,10,10)
plot3d(ss$u[,1:3],col=as.numeric(as.factor(TCGAclassstr)))
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
#
kk<-kmeans(ss$u[,1:10],6)
#
library(mda)
confusion(kk$cluster,TCGAclassstr)
### SVD + kmeans - find the cancer classes?
pp<-pam(ss$u,6)
plot(pp)
plot(pp$silinfo$widths[,3],col=pp$silinfo$widths[,1],type="h")
confusion(pp$clustering,TCGAclassstr)
### what about PAM
#
#### consensus clustering - using a random subset of genes to cluster
DD<-matrix(0,2887,2887)
for (zz in (1:50)) {
  ss<-irlba(TCGA[,sample(seq(1,20530),2500)],10,10)
  kk<-kmeans(ss$u[,1:10],6)
  dd<-as.matrix(dist(kk$cluster))
  dd[dd!=0]<-1
  DD<-DD+(1-dd)
  cat(zz)} 
hc<-hclust(as.dist(1-DD/50))
plot(hc,labels=TCGAclassstr)
# nicer dendrogram
library(dendextend)
dend<-as.dendrogram(hc)
cols<-as.numeric(as.factor(TCGAclassstr))
labels_colors(dend)<-cols[order.dendrogram(dend)]
plot(dend)
#
k<-cutree(hc,k=6)
confusion(k,TCGAclassstr)
#
k<-cutree(hc,k=8)
confusion(k,TCGAclassstr)
#
k<-cutree(hc,k=7)
confusion(k,TCGAclassstr)
# how reproducible are the clusters?
image(DD)
image(DD[hc$order,hc$order])
### BC is much larger and more variable - need more clusters to summarize 
### Difficult to detect Uterine cancer (small group)
### split of BC into subtypes (in fact there are recognized subtypes of BC)
#
#
########
# Let's try some filtering of features
vv<-apply(TCGA,2,sd)
vv<-rev(sort.list(vv))
# some examples of filtered features
par(mfrow=c(2,2))
hist(TCGA[,vv[2]])
hist(TCGA[,vv[30]])
hist(TCGA[,vv[2530]])
hist(TCGA[,vv[20500]])
#
guse<-vv[1:250]
# use top 250 genes
library(gplots)
heatmap.2(as.matrix(TCGA[,guse]),
          col=colorRampPalette(c("red","white","blue"))(64),RowSideColors = as.character(cols),
          hclust=function(x) hclust(x,method="complete"),scale="column",trace="none")
#euclidean distance
heatmap.2(as.matrix(TCGA[,guse]),
          col=colorRampPalette(c("red","white","blue"))(64),RowSideColors = as.character(cols),
          hclust=function(x) hclust(x,method="complete"),scale="column",trace="none",
          distfun=function(x) as.dist((1-cor(t(x),method="pearson",use="pairwise.complete.obs"))/2))
#correlation 
### SVD on filtered data
guse<-vv[1:2000]
# use top 2000 genes
ss<-irlba(TCGA,10,10)
plot3d(ss$u[,1:3],col=as.numeric(as.factor(TCGAclassstr)))
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
# 
plot3d(TCGA[,vv[1:3]],col=as.numeric(as.factor(TCGAclassstr)))
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
# Not that just filtering on variance might lead to a particular class dominating - here it's Lung cancer
plot3d(TCGA[,vv[4:6]],col=as.numeric(as.factor(TCGAclassstr)))
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
# Risk missing a small class this way...
plot3d(TCGA[,vv[7:9]],col=as.numeric(as.factor(TCGAclassstr)))
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))
# subtypes of Lung cancer? 
#######################################
# consensus clustering
source("https://bioconductor.org/biocLite.R")
biocLite("ConsensusClusterPlus")
browseVignettes("ConsensusClusterPlus")
# Cannot install directly to Rstudio depending on version - go via Bioconductor
library(ConsensusClusterPlus)
# this can be a bit slot depending on the number of features and observations!
ii<-sample(seq(1,2887),500)
cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,clusterAlg="km")
# Check for different number of clusters - tends to be 3-5 selected
ccu<-cc[[5]]
confusion(ccu$consensusClass,TCGAclassstr[ii])
calcICL(cc)
### looks like we can't find all cancer classes using clustering..
# What about using a different method? correlation and hierarchical clustering
cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,distance="pearson",clusterAlg="hc")
ccu<-cc[[4]]
confusion(ccu$consensusClass,TCGAclassstr[ii])
calcICL(cc)
#
cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,distance="pearson",clusterAlg="pam")
ccu<-cc[[5]]
confusion(ccu$consensusClass,TCGAclassstr[ii])
calcICL(cc)
#### Very different clustering outcome depending on method and distance metric used!!!
#######
#######
#
# subspace clustering of the iris data
library(subspace)
irisnew<-iris # here you can try with different cluster separations etc
# SubClu - bottom up method based on dbscan
cc<-SubClu(irisnew[,-5],minSupport=10,epsilon=3) # Also CLIQUE and other methods
plot(cc,irisnew[,-5])
head(cc)
cc[[1]]$subspace
cc[[1]]$objects

#
library(orclus)
cc<-orclus(as.matrix(iris[,1:4]),k=3,l=2,k0=5,inner.loops=10)
plot(iris[,3:4],col=cc$cluster,pch=as.numeric(iris[,5]))
cc$subspaces
#
kk<-kmeans(iris[,1:4],3)
plot(iris[,3:4],col=kk$cluster,pch=as.numeric(iris[,5]))
# 
# Simulated data
x<-rnorm(150)+c(rep(0,50),rep(0,50),rep(10,50))
x<-cbind(x,rnorm(150)+c(rep(-10,50),rep(0,50),rep(0,50)))
x<-data.frame(cbind(x,rnorm(150)))
names(x)<-c("a","b","c")
cc<-SubClu(x,minSupport=10,epsilon=3) # Also CLIQUE and other methods
plot(cc,x)
head(cc)
cc[[4]]$subspace
cc[[4]]$objects
#
cc<-orclus(as.matrix(x),k=3,l=1,k0=5,inner.loops=10) # l is the dimension for subspaces for each cluster
plot(x[,1:2],col=cc$cluster,pch=c(rep("a",50),rep("b",50),rep("c",50)))
cc$subspaces
#
plot(as.matrix(x)%*%as.matrix(cc$subspaces[[1]]),col=cc$cluster,pch=c(rep("a",50),rep("b",50),rep("c",50)))
plot(as.matrix(x)%*%as.matrix(cc$subspaces[[2]]),col=cc$cluster,pch=c(rep("a",50),rep("b",50),rep("c",50)))
plot(as.matrix(x)%*%as.matrix(cc$subspaces[[3]]),col=cc$cluster,pch=c(rep("a",50),rep("b",50),rep("c",50)))

# to try at home: create a richer data set where clusters really do live in different
# subspace (i.e. mean differences in some but not all dimensions)
####
# on the cancer data
ii<-sample(seq(1,2887),500)
ss<-irlba(as.matrix(TCGA[ii,guse[1:100]]),10,10)
#
cc<-orclus(ss$u,k=6,l=5,k0=10,inner.loops=10)
confusion(cc$cluster,TCGAclassstr[ii])
# try different size subspaces and clusters
# try on feature selected data
cc<-orclus(as.matrix(TCGA[ii,guse[1:100]]),k=6,l=6,k0=10,inner.loops=10)
confusion(cc$cluster,TCGAclassstr[ii])
###
### 
library(HDclassif)
cc<-hddc(irisnew[,-5],K=2:5)
table(cc$class,irisnew[,5])
cc$d
cc$all_results
### This takes a while.... Run on a smaller set of observations and/or dimensions first 
ccTCGA<-hddc(as.matrix(TCGA[ii,guse[1:100]]),K=3:10)
table(ccTCGA$class,TCGAclassstr[ii])
ccTCGA$all_results
plot(ccTCGA)
# Same subspace for all clusters
ccTCGA2<-hddc(as.matrix(TCGA[ii,guse[1:100]]),K=3:10,model="AjBQD")
table(ccTCGA2$class,TCGAclassstr[ii])
plot(ccTCGA2)
ccTCGA2$all_results
# compare with just leading SVD ACROSS whole data set = NOT the same thing as common PC for all clusters!
ss<-svd(TCGA[ii,guse[1:100]])
kk<-Mclust(ss$u[,1:dim(ccTCGA2$Q)[2]])
table(kk$class,TCGAclassstr[ii])
# Even more generous filtering
ccTCGA3<-hddc(as.matrix(TCGA[ii,guse[1:250]]),K=3:10,model="AjBQD")
table(ccTCGA3$class,TCGAclassstr[ii])
plot(ccTCGA3)
#
ss<-svd(TCGA[ii,guse[1:250]])
kk<-Mclust(ss$u[,1:dim(ccTCGA3$Q)[2]])
table(kk$class,TCGAclassstr[ii])
#####
# Conclusion: How you filter matters - different clusters are detected...
kk<-Mclust(TCGA[ii,guse[1:100]])
table(kk$class,TCGAclassstr[ii])
#
kk<-Mclust(TCGA[ii,guse[1:250]])
table(kk$class,TCGAclassstr[ii])
#
kk<-Mclust(TCGA[ii,guse[1:250]],G=8)
table(kk$class,TCGAclassstr[ii])
# increasing the number of clusters if we use just filtering does not help...
### SVD filtering only 
kk<-Mclust(ss$u[,1:50],G=8)
table(kk$class,TCGAclassstr[ii])
# 
kk<-Mclust(ss$u[,1:25])
table(kk$class,TCGAclassstr[ii])