library(irlba) # fast svd for large data sets library(plot3D) library(rgl) ss<-irlba(TCGA,10,10) plot3d(ss$u[,1:3],col=TCGAclass+1) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) kk<-kmeans(ss$u[,1:10],6) library(mda) confusion(kk$cluster,TCGAclassstr) ### SVD + kmeans - find the cancer classes? pp<-pam(ss$u,6) plot(pp) plot(pp$silinfo$widths[,3],type="h",col=sort(pp$clustering)) confusion(pp$clustering,TCGAclassstr) ### what about PAM #### consensus clustering DD<-matrix(0,2887,2887) for (zz in (1:50)) { ss<-irlba(TCGA[,sample(seq(1,20530),2500)],10,10) kk<-kmeans(ss$u[,1:10],6) dd<-as.matrix(dist(kk$cluster)) dd[dd!=0]<-1 DD<-DD+(1-dd) cat(zz)} hc<-hclust(as.dist(1-DD/50)) plot(hc) k<-cutree(hc,k=6) confusion(k,TCGAclassstr) DDb<-matrix(0,2887,2887) for (zz in (1:50)) { ss<-as.matrix(TCGA[,sample(seq(1,20530),250)]) kk<-kmeans(ss,6) dd<-as.matrix(dist(kk$cluster)) dd[dd!=0]<-1 DDb<-DDb+(1-dd)} # hcb<-hclust(as.dist(1-DDb/50)) plot(hcb) kb<-cutree(hcb,k=6) confusion(kb,TCGAclassstr) DDc<-matrix(0,2887,2887) for (zz in (1:50)) { ss<-irlba(as.matrix(TCGA[,sample(seq(1,20530),250)]),10,10) kk<-kmeans(ss$u,6) dd<-as.matrix(dist(kk$cluster)) dd[dd!=0]<-1 DDc<-DDc+(1-dd)} # hcc<-hclust(as.dist(1-DDc/50)) plot(hcc) kc<-cutree(hcc,k=6) confusion(kc,TCGAclassstr) ##### ### Difficult to detect Uterine cancer (small group) ### split of BC into subtypes # filtering using the diptest library(diptest) dipval<-apply(TCGA,2,dip) hist(dipval) ds<-sort.list(dipval) par(mfrow=c(2,2)) hist(TCGA[,ds[2000]]) hist(TCGA[,ds[3000]]) hist(TCGA[,ds[20530]]) hist(TCGA[,ds[20529]]) guse<-seq(1,20530)[rev(ds)[1:2000]] # use top 2000 genes ss<-irlba(TCGA[,guse],10,10) plot3d(ss$u[,1:3],col=TCGAclass+1) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) library(ConsensusClusterPlus) library(caret) ii<-createDataPartition(TCGAclassstr,p=.25)$Res # this is rather slow! cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:100]])),maxK=6,reps=100,pItem=.6,pFeature=.6,clusterAlg="km") cc3<-cc[[3]] confusion(cc3$consensusClass,TCGAclassstr[ii]) calcICL(cc) ### looks like we can't find all cancer classes using clustering.. ####### # subspace clustering of the iris data library(subspace) cc<-CLIQUE(iris[,1:4],xi=2,tau=.5) plot(cc,iris[,1:4]) library(orclus) cc<-orclus(as.matrix(iris[,1:4]),k=3,l=3,k0=5,inner.loops=10) plot(iris[,3:4],col=cc$cluster,pch=as.numeric(iris[,5])) cc$subspaces kk<-kmeans(iris[,1:4],3) plot(iris[,3:4],col=kk$cluster,pch=as.numeric(iris[,5])) # to try at home: create a data set where clusters really do live in different # subspace (i.e. mean differences in some but not all dimensions) #### # on the cancer data itc<-createDataPartition(TCGAclass,p=.9) ss<-irlba(as.matrix(TCGA[itc$Re,guse[1:100]]),10,10) # cc<-orclus(ss$u,k=6,l=5,k0=10,inner.loops=10) confusion(cc$cluster,TCGAclassstr[itc$Re]) # try different size subspaces and clusters # try on feature selected data cc<-orclus(as.matrix(TCGA[itc$Re,guse[1:100]]),k=6,l=6,k0=10,inner.loops=10) confusion(cc$cluster,TCGAclassstr[itc$Re]) ### ### Feature transformation seems a better choice here