library(irlba) # fast svd for large data sets library(plot3D) library(rgl) #### ss<-irlba(TCGA,10,10) plot3d(ss$u[,1:3],col=as.numeric(as.factor(TCGAclassstr))) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) # kk<-kmeans(ss$u[,1:10],6) # library(mda) confusion(kk$cluster,TCGAclassstr) ### SVD + kmeans - find the cancer classes? pp<-pam(ss$u,6) plot(pp) plot(pp$silinfo$widths[,3],col=pp$silinfo$widths[,1],type="h") confusion(pp$clustering,TCGAclassstr) ### what about PAM # #### consensus clustering - using a random subset of genes to cluster DD<-matrix(0,2887,2887) for (zz in (1:50)) { ss<-irlba(TCGA[,sample(seq(1,20530),2500)],10,10) kk<-kmeans(ss$u[,1:10],6) dd<-as.matrix(dist(kk$cluster)) dd[dd!=0]<-1 DD<-DD+(1-dd) cat(zz)} hc<-hclust(as.dist(1-DD/50)) plot(hc,labels=TCGAclassstr) # nicer dendrogram library(dendextend) dend<-as.dendrogram(hc) cols<-as.numeric(as.factor(TCGAclassstr)) labels_colors(dend)<-cols[order.dendrogram(dend)] plot(dend) # k<-cutree(hc,k=6) confusion(k,TCGAclassstr) # k<-cutree(hc,k=8) confusion(k,TCGAclassstr) # k<-cutree(hc,k=7) confusion(k,TCGAclassstr) # how reproducible are the clusters? image(DD) image(DD[hc$order,hc$order]) ### BC is much larger and more variable - need more clusters to summarize ### Difficult to detect Uterine cancer (small group) ### split of BC into subtypes (in fact there are recognized subtypes of BC) # # ######## # Let's try some filtering of features vv<-apply(TCGA,2,sd) vv<-rev(sort.list(vv)) # some examples of filtered features par(mfrow=c(2,2)) hist(TCGA[,vv[2]]) hist(TCGA[,vv[30]]) hist(TCGA[,vv[2530]]) hist(TCGA[,vv[20500]]) # guse<-vv[1:250] # use top 250 genes library(gplots) heatmap.2(as.matrix(TCGA[,guse]), col=colorRampPalette(c("red","white","blue"))(64),RowSideColors = as.character(cols), hclust=function(x) hclust(x,method="complete"),scale="column",trace="none") #euclidean distance heatmap.2(as.matrix(TCGA[,guse]), col=colorRampPalette(c("red","white","blue"))(64),RowSideColors = as.character(cols), hclust=function(x) hclust(x,method="complete"),scale="column",trace="none", distfun=function(x) as.dist((1-cor(t(x),method="pearson",use="pairwise.complete.obs"))/2)) #correlation ### SVD on filtered data guse<-vv[1:2000] # use top 2000 genes ss<-irlba(TCGA,10,10) plot3d(ss$u[,1:3],col=as.numeric(as.factor(TCGAclassstr))) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) # plot3d(TCGA[,vv[1:3]],col=as.numeric(as.factor(TCGAclassstr))) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) # Not that just filtering on variance might lead to a particular class dominating - here it's Lung cancer plot3d(TCGA[,vv[4:6]],col=as.numeric(as.factor(TCGAclassstr))) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) # Risk missing a small class this way... plot3d(TCGA[,vv[7:9]],col=as.numeric(as.factor(TCGAclassstr))) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) # subtypes of Lung cancer? ####################################### # consensus clustering source("https://bioconductor.org/biocLite.R") biocLite("ConsensusClusterPlus") browseVignettes("ConsensusClusterPlus") # Cannot install directly to Rstudio depending on version - go via Bioconductor library(ConsensusClusterPlus) # this can be a bit slot depending on the number of features and observations! ii<-sample(seq(1,2887),500) cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,clusterAlg="km") # Check for different number of clusters - tends to be 3-5 selected ccu<-cc[[5]] confusion(ccu$consensusClass,TCGAclassstr[ii]) calcICL(cc) ### looks like we can't find all cancer classes using clustering.. # What about using a different method? correlation and hierarchical clustering cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,distance="pearson",clusterAlg="hc") ccu<-cc[[4]] confusion(ccu$consensusClass,TCGAclassstr[ii]) calcICL(cc) # cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,distance="pearson",clusterAlg="pam") ccu<-cc[[5]] confusion(ccu$consensusClass,TCGAclassstr[ii]) calcICL(cc) #### Very different clustering outcome depending on method and distance metric used!!! ####### ####### # # subspace clustering of the iris data library(subspace) irisnew<-iris # here you can try with different cluster separations etc # SubClu - bottom up method based on dbscan cc<-SubClu(irisnew[,-5],minSupport=10,epsilon=3) # Also CLIQUE and other methods plot(cc,irisnew[,-5]) head(cc) cc[[1]]$subspace cc[[1]]$objects # library(orclus) cc<-orclus(as.matrix(iris[,1:4]),k=3,l=2,k0=5,inner.loops=10) plot(iris[,3:4],col=cc$cluster,pch=as.numeric(iris[,5])) cc$subspaces # kk<-kmeans(iris[,1:4],3) plot(iris[,3:4],col=kk$cluster,pch=as.numeric(iris[,5])) # # Simulated data x<-rnorm(150)+c(rep(0,50),rep(0,50),rep(10,50)) x<-cbind(x,rnorm(150)+c(rep(-10,50),rep(0,50),rep(0,50))) x<-data.frame(cbind(x,rnorm(150))) names(x)<-c("a","b","c") cc<-SubClu(x,minSupport=10,epsilon=3) # Also CLIQUE and other methods plot(cc,x) head(cc) cc[[4]]$subspace cc[[4]]$objects # cc<-orclus(as.matrix(x),k=3,l=1,k0=5,inner.loops=10) # l is the dimension for subspaces for each cluster plot(x[,1:2],col=cc$cluster,pch=c(rep("a",50),rep("b",50),rep("c",50))) cc$subspaces # plot(as.matrix(x)%*%as.matrix(cc$subspaces[[1]]),col=cc$cluster,pch=c(rep("a",50),rep("b",50),rep("c",50))) plot(as.matrix(x)%*%as.matrix(cc$subspaces[[2]]),col=cc$cluster,pch=c(rep("a",50),rep("b",50),rep("c",50))) plot(as.matrix(x)%*%as.matrix(cc$subspaces[[3]]),col=cc$cluster,pch=c(rep("a",50),rep("b",50),rep("c",50))) # to try at home: create a richer data set where clusters really do live in different # subspace (i.e. mean differences in some but not all dimensions) #### # on the cancer data ii<-sample(seq(1,2887),500) ss<-irlba(as.matrix(TCGA[ii,guse[1:100]]),10,10) # cc<-orclus(ss$u,k=6,l=5,k0=10,inner.loops=10) confusion(cc$cluster,TCGAclassstr[ii]) # try different size subspaces and clusters # try on feature selected data cc<-orclus(as.matrix(TCGA[ii,guse[1:100]]),k=6,l=6,k0=10,inner.loops=10) confusion(cc$cluster,TCGAclassstr[ii]) ### ### library(HDclassif) cc<-hddc(irisnew[,-5],K=2:5) table(cc$class,irisnew[,5]) cc$d cc$all_results ### This takes a while.... Run on a smaller set of observations and/or dimensions first ccTCGA<-hddc(as.matrix(TCGA[ii,guse[1:100]]),K=3:10) table(ccTCGA$class,TCGAclassstr[ii]) ccTCGA$all_results plot(ccTCGA) # Same subspace for all clusters ccTCGA2<-hddc(as.matrix(TCGA[ii,guse[1:100]]),K=3:10,model="AjBQD") table(ccTCGA2$class,TCGAclassstr[ii]) plot(ccTCGA2) ccTCGA2$all_results # compare with just leading SVD ACROSS whole data set = NOT the same thing as common PC for all clusters! ss<-svd(TCGA[ii,guse[1:100]]) kk<-Mclust(ss$u[,1:dim(ccTCGA2$Q)[2]]) table(kk$class,TCGAclassstr[ii]) # Even more generous filtering ccTCGA3<-hddc(as.matrix(TCGA[ii,guse[1:250]]),K=3:10,model="AjBQD") table(ccTCGA3$class,TCGAclassstr[ii]) plot(ccTCGA3) # ss<-svd(TCGA[ii,guse[1:250]]) kk<-Mclust(ss$u[,1:dim(ccTCGA3$Q)[2]]) table(kk$class,TCGAclassstr[ii]) ##### # Conclusion: How you filter matters - different clusters are detected... kk<-Mclust(TCGA[ii,guse[1:100]]) table(kk$class,TCGAclassstr[ii]) # kk<-Mclust(TCGA[ii,guse[1:250]]) table(kk$class,TCGAclassstr[ii]) # kk<-Mclust(TCGA[ii,guse[1:250]],G=8) table(kk$class,TCGAclassstr[ii]) # increasing the number of clusters if we use just filtering does not help... ### SVD filtering only kk<-Mclust(ss$u[,1:50],G=8) table(kk$class,TCGAclassstr[ii]) # kk<-Mclust(ss$u[,1:25]) table(kk$class,TCGAclassstr[ii])