###### DEMO 13b ###### load("TCGAData.RData") # library(irlba) # fast svd for large data sets library(plot3D) library(rgl) #### # compute the leading K components K<-10 ss<-irlba(TCGA,K,K) plot3d(ss$u[,1:3],col=as.numeric(as.factor(TCGAclassstr))) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) # Kmeans on the leading Principal components kk<-kmeans(ss$u[,1:10],6) # library(mda) confusion(kk$cluster,TCGAclassstr) ### SVD + kmeans - find the cancer classes? # Finds subtypes of breast cancer, combines some of the other cancers # What about pam? library(cluster) pp<-pam(ss$u,6) plot(pp) # Better silhouette plot plot(pp$silinfo$widths[,3],col=pp$silinfo$widths[,1],type="h") confusion(pp$clustering,TCGAclassstr) ### what about PAM # Not bad! # #### consensus clustering # I am using a random subset of genes to cluster, repeat 50 times - check overlap DD<-matrix(0,2887,2887) for (zz in (1:50)) { ss<-irlba(TCGA[,sample(seq(1,20530),2500)],10,10) #random set of 2500 genes from the 20530 kk<-kmeans(ss$u[,1:10],6) #apply kmeans to leading components dd<-as.matrix(dist(kk$cluster)) # check if observations are in the same cluster or not dd[dd!=0]<-1 # set distance to 1 if not in same cluster DD<-DD+(1-dd) # add all distances together across 50 iterations cat(zz)} # takes a while- cat(zz) prints the iteration number so you see where you are in the loop # hc<-hclust(as.dist(1-DD/50)) # clustering based on consensus plot(hc,labels=TCGAclassstr) # nicer dendrogram library(dendextend) dend<-as.dendrogram(hc) cols<-as.numeric(as.factor(TCGAclassstr)) labels_colors(dend)<-cols[order.dendrogram(dend)] plot(dend) # Not bad! Even picks up the small cancers as mainly co-clustered k<-cutree(hc,k=6) confusion(k,TCGAclassstr) # can't find uterine cancer (the smallest class) # k<-cutree(hc,k=8) confusion(k,TCGAclassstr) # if you ask for more clusters, you start splitting breast cancer into smaller clusters # k<-cutree(hc,k=7) confusion(k,TCGAclassstr) # how reproducible are the clusters? image(DD) image(DD[hc$order,hc$order]) ### BC is much larger and more variable - need more clusters to summarize ### Difficult to detect Uterine cancer (small group) ### split of BC into subtypes # # ######## # Let's try some filtering of features vv<-apply(TCGA,2,sd) vv<-rev(sort.list(vv)) # top most varying genes # some examples of filtered features par(mfrow=c(2,2)) hist(TCGA[,vv[2]]) hist(TCGA[,vv[30]]) hist(TCGA[,vv[2530]]) hist(TCGA[,vv[20530]]) # The most varying genes may pick up different cancers.. guse<-vv[1:250] # use top 250 genes # library(gplots) heatmap.2(as.matrix(TCGA[,guse]), col=colorRampPalette(c("red","white","blue"))(64),RowSideColors = as.character(cols), hclust=function(x) hclust(x,method="complete"),scale="column",trace="none") #euclidean distance for clustering #bi-clustering: clustering both observations and variables heatmap.2(as.matrix(TCGA[,guse]), col=colorRampPalette(c("red","white","blue"))(64),RowSideColors = as.character(cols), hclust=function(x) hclust(x,method="complete"),scale="column",trace="none", distfun=function(x) as.dist((1-cor(t(x),method="pearson",use="pairwise.complete.obs"))/2)) #correlation-based clustering # ### SVD on filtered data guse<-vv[1:2000] # use top 2000 genes ss<-irlba(TCGA,10,10) plot3d(ss$u[,1:3],col=as.numeric(as.factor(TCGAclassstr))) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) # plot3d(TCGA[,vv[1:3]],col=as.numeric(as.factor(TCGAclassstr))) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) # Note that just filtering on variance might lead to a particular class dominating - here it's Lung cancer plot3d(TCGA[,vv[4:6]],col=as.numeric(as.factor(TCGAclassstr))) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) # Risk missing a small class this way... plot3d(TCGA[,vv[7:9]],col=as.numeric(as.factor(TCGAclassstr))) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))[sort.list(as.numeric(as.factor(unique(TCGAclassstr))))]), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) # subtypes of Lung cancer? ####################################### # consensus clustering source("https://bioconductor.org/biocLite.R") biocLite("ConsensusClusterPlus") browseVignettes("ConsensusClusterPlus") # Cannot install directly to Rstudio depending on version - go via Bioconductor library(ConsensusClusterPlus) # this can be a bit slot depending on the number of features and observations! ii<-sample(seq(1,2887),500) # I run on a random set of 500 observations (for speed only - try different numbers of observations and repeat) cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,clusterAlg="km") # Check for different number of clusters - tends to be 3-5 selected for this data set depending on the observations you randomly choose ccu<-cc[[5]] confusion(ccu$consensusClass,TCGAclassstr[ii]) calcICL(cc) ### looks like we can't find all cancer classes using clustering.. # What about using a different method? correlation and hierarchical clustering cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,distance="pearson",clusterAlg="hc") ccu<-cc[[4]] confusion(ccu$consensusClass,TCGAclassstr[ii]) calcICL(cc) # correlation and pam cc<-ConsensusClusterPlus(as.matrix(t(TCGA[ii,guse[1:500]])),maxK=8,reps=100,pItem=.6,pFeature=.6,distance="pearson",clusterAlg="pam") ccu<-cc[[5]] confusion(ccu$consensusClass,TCGAclassstr[ii]) calcICL(cc) #### Very different clustering outcome depending on method and distance metric used!!! ####### ####### # # subspace clustering of the iris data library(subspace) # irisnew<-iris irisnew[iris[,5]=="virginica",1]<-irisnew[iris[,5]=="virginica",1]+2 ss<-seq(1,dim(iris)[1])[iris[,5]=="virginica"] irisnew[ss[1:25],2]<-irisnew[ss[1:25],2]+3 # I create an iris data set with 4 distinct clusters here cc<-CLIQUE(irisnew[,-5],xi=15,tau=0.15) plot(cc,irisnew[,-5]) # #cc<-P3C(irisnew[,-5],0.01,5) #cc<-SubClu(irisnew[,-5],minSupport=10,epsilon=3) # other subspace methods # library(orclus) cc<-orclus(as.matrix(iris[,1:4]),k=3,l=2,k0=5,inner.loops=10) # method says how many clusters and subspace dimension used plot(iris[,3:4],col=cc$cluster,pch=as.numeric(iris[,5])) cc$subspaces # kk<-kmeans(iris[,1:4],3) plot(iris[,3:4],col=kk$cluster,pch=as.numeric(iris[,5])) # to try at home: create a data set where clusters really do live in different # subspace (i.e. mean differences in some but not all dimensions) #### # on the cancer data ss<-irlba(as.matrix(TCGA[ii,guse[1:100]]),10,10) # cc<-orclus(ss$u,k=6,l=5,k0=10,inner.loops=10) confusion(cc$cluster,TCGAclassstr[ii]) # try different size subspaces and clusters # try on feature selected data cc<-orclus(as.matrix(TCGA[ii,guse[1:100]]),k=6,l=6,k0=10,inner.loops=10) confusion(cc$cluster,TCGAclassstr[ii]) ### ### High-dimensional classification package library(HDclassif) cc<-hddc(irisnew[,-5],K=2:5) # chooses number of clusters and subspace dimension table(cc$class,irisnew[,5]) cc$d # which intrinsic dimension do the clusters live in? cc$all_results # ### This takes a while.... Run on a smaller set of observations and/or dimensions first ccTCGA<-hddc(as.matrix(TCGA[ii,guse[1:100]]),K=3:10) table(ccTCGA$class,TCGAclassstr[ii]) ccTCGA$all_results plot(ccTCGA) # Using same subspace for all clusters ccTCGA2<-hddc(as.matrix(TCGA[ii,guse[1:100]]),K=3:10,model="AjBQD") table(ccTCGA2$class,TCGAclassstr[ii]) plot(ccTCGA2) ccTCGA2$all_results # compare with just leading SVD ACROSS whole data set = NOT the same thing as common PC for all clusters! ss<-svd(TCGA[ii,guse[1:100]]) library(mclust) kk<-Mclust(ss$u[,1:dim(ccTCGA2$Q)[2]]) table(kk$class,TCGAclassstr[ii]) # Notice that filtering based on variance and using model based dimension reduction works much better than svd filtering on all # # Even more generous filtering ccTCGA3<-hddc(as.matrix(TCGA[ii,guse[1:250]]),K=3:10,model="AjBQD") table(ccTCGA3$class,TCGAclassstr[ii]) plot(ccTCGA3) # ss<-svd(TCGA[ii,guse[1:250]]) kk<-Mclust(ss$u[,1:dim(ccTCGA3$Q)[2]]) table(kk$class,TCGAclassstr[ii]) ##### # Conclusion: How you filter matters - different clusters are detected... kk<-Mclust(TCGA[ii,guse[1:100]]) table(kk$class,TCGAclassstr[ii]) # kk<-Mclust(TCGA[ii,guse[1:250]]) table(kk$class,TCGAclassstr[ii]) # kk<-Mclust(TCGA[ii,guse[1:250]],G=8) table(kk$class,TCGAclassstr[ii]) # increasing the number of clusters if we use just filtering does not help... ### SVD filtering only kk<-Mclust(ss$u[,1:50],G=8) table(kk$class,TCGAclassstr[ii]) # kk<-Mclust(ss$u[,1:25]) table(kk$class,TCGAclassstr[ii])