library(irlba)
# fast svd for large data sets
library(plot3D)
library(rgl)



ss<-irlba(TCGA,10,10)
plot3d(ss$u[,1:3],col=TCGAclass+1)
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))

kk<-kmeans(ss$u[,1:10],6)

library(mda)
confusion(kk$cluster,TCGAclassstr)
### SVD + kmeans - find the cancer classes?


pp<-pam(ss$u,6)
plot(pp)
plot(pp$silinfo$widths[,3],type="h",col=sort(pp$clustering))
confusion(pp$clustering,TCGAclassstr)
### what about PAM

#### consensus clustering
DD<-matrix(0,2887,2887)
for (zz in (1:50)) {
  ss<-irlba(TCGA[,sample(seq(1,20530),2500)],10,10)
  kk<-kmeans(ss$u[,1:10],6)
  dd<-as.matrix(dist(kk$cluster))
  dd[dd!=0]<-1
  DD<-DD+(1-dd)
  cat(zz)} 
hc<-hclust(as.dist(1-DD/50))
plot(hc)
k<-cutree(hc,k=6)
confusion(k,TCGAclassstr)

DDb<-matrix(0,2887,2887)
for (zz in (1:50)) {
  ss<-as.matrix(TCGA[,sample(seq(1,20530),250)])
  kk<-kmeans(ss,6)
  dd<-as.matrix(dist(kk$cluster))
  dd[dd!=0]<-1
  DDb<-DDb+(1-dd)} 
#
hcb<-hclust(as.dist(1-DDb/50))
plot(hcb)
kb<-cutree(hcb,k=6)
confusion(kb,TCGAclassstr)


DDc<-matrix(0,2887,2887)
for (zz in (1:50)) {
  ss<-irlba(as.matrix(TCGA[,sample(seq(1,20530),250)]),10,10)
  kk<-kmeans(ss$u,6)
  dd<-as.matrix(dist(kk$cluster))
  dd[dd!=0]<-1
  DDc<-DDc+(1-dd)} 
#
hcc<-hclust(as.dist(1-DDc/50))
plot(hcc)
kc<-cutree(hcc,k=6)
confusion(kc,TCGAclassstr)
#####
### Difficult to detect Uterine cancer (small group)
### split of BC into subtypes

# filtering using the diptest
library(diptest)
dipval<-apply(TCGA,2,dip)
hist(dipval)
ds<-sort.list(dipval)
par(mfrow=c(2,2))
hist(TCGA[,ds[2000]])
hist(TCGA[,ds[3000]])
hist(TCGA[,ds[20530]])
hist(TCGA[,ds[20529]])

guse<-seq(1,20530)[rev(ds)[1:2000]]
# use top 2000 genes
ss<-irlba(TCGA[,guse],10,10)
plot3d(ss$u[,1:3],col=TCGAclass+1)
legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))), pch = 5, col=seq(1,6), cex=1, inset=c(0.02))

library(ConsensusClusterPlus)
library(caret)
ii<-createDataPartition(TCGAclassstr,p=.25)$Res

# this is rather slow!
cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:100]])),maxK=6,reps=100,pItem=.6,pFeature=.6,clusterAlg="km")
cc3<-cc[[3]]
confusion(cc3$consensusClass,TCGAclassstr[ii])

calcICL(cc)
### looks like we can't find all cancer classes using clustering..
#######
# subspace clustering of the iris data
library(subspace)
cc<-CLIQUE(iris[,1:4],xi=2,tau=.5)
plot(cc,iris[,1:4])


library(orclus)
cc<-orclus(as.matrix(iris[,1:4]),k=3,l=3,k0=5,inner.loops=10)
plot(iris[,3:4],col=cc$cluster,pch=as.numeric(iris[,5]))
cc$subspaces

kk<-kmeans(iris[,1:4],3)
plot(iris[,3:4],col=kk$cluster,pch=as.numeric(iris[,5]))
# to try at home: create a data set where clusters really do live in different
# subspace (i.e. mean differences in some but not all dimensions)
####
# on the cancer data
itc<-createDataPartition(TCGAclass,p=.9)
ss<-irlba(as.matrix(TCGA[itc$Re,guse[1:100]]),10,10)
#
cc<-orclus(ss$u,k=6,l=5,k0=10,inner.loops=10)
confusion(cc$cluster,TCGAclassstr[itc$Re])
# try different size subspaces and clusters
# try on feature selected data
cc<-orclus(as.matrix(TCGA[itc$Re,guse[1:100]]),k=6,l=6,k0=10,inner.loops=10)
confusion(cc$cluster,TCGAclassstr[itc$Re])
###
### Feature transformation seems a better choice here