# SVD of the mnist data library(ElemStatLearn) library(nsprcomp) data(zip.train) Numbers<-zip.train[sample(seq(1,7291),2000),] ssn<-svd(Numbers[,-1]) plot(ssn$d) # most of variance explained by 50 components (of 256) plot(ssn$u[,1:2]) pp<-identify(ssn$u[,1:2]) #[1] 1241, 2564, 3819, 4214 image(t(matrix(as.numeric(Numbers[pp[3],-1]),16,16,byrow=T))[,16:1]) # highligthing the structure that the 1st components identify and separate plot(ssn$u[,1:2],pch=as.character(Numbers[,1]),col=as.numeric(Numbers[,1])+1) plot(ssn$u[,2:3],pch=as.character(Numbers[,1]),col=as.numeric(Numbers[,1])+1) ####### par(mfrow=c(3,3)) for (zz in (1:9)) { iz<-sample(seq(1,dim(Numbers)[1])[Numbers[,1]==zz],1) image(t(matrix(as.numeric(Numbers[iz,-1]),16,16,byrow=T))[,16:1]) } #### svd decomposition library(svdvis) svd.obj <- svd(Numbers[,-1]) svd.scree(svd.obj, subr=5, axis.title.x="Full scree plot", axis.title.y="% Var Explained") #### 3D plot of the scores library(plot3D) library(rgl) library(irlba) plot3d(svd.obj$u[,1:3],col=as.numeric(Numbers[,1])+1) legend3d("topright", legend = paste('Type', c(unique(Numbers[,1]))), pch = 5, col=seq(1,10), cex=1, inset=c(0.02)) plot3d(Numbers[,sample(seq(2,257),3)],col=as.numeric(Numbers[,1])+1) legend3d("topright", legend = paste('Type', c(unique(Numbers[,1]))), pch = 5, col=seq(1,10), cex=1, inset=c(0.02)) ####################### # Approximation of an image using svd Nmat<-as.matrix(Numbers[,-1]) iz<-sample(seq(1,dim(Numbers)[1]),1) ss<-svd(t(matrix(Nmat[iz,],16,16,byrow=T))[,16:1]) par(mfrow=c(1,2)) image(ss$u[,1:2]%*%diag(ss$d[1:2])%*%t(ss$v[,1:2])) image(t(matrix(Nmat[iz,],16,16,byrow=T))[,16:1]) #### either sparse and many or dense and few components to approximate library(nsprcomp) Im<-t(matrix(Nmat[iz,],16,16,byrow=T))[,16:1] ss<-nsprcomp(Im,k=c(rep(10,4)),ncomp=4,tol=.1,scale=F,center=F) #10 loadings in each of 4 components UU<-Im%*%ss$rotation ImA<-UU%*%diag(ss$sdev)%*%t(ss$rotation) par(mfrow=c(1,2)) image(ImA) image(Im) # Im<-t(matrix(Nmat[iz,],16,16,byrow=T))[,16:1] ss<-nsprcomp(Im,k=c(rep(5,8)),ncomp=8,tol=.1,scale=F,center=F) # only 5 loadings in each of 8 components UU<-Im%*%ss$rotation ImA<-UU%*%diag(ss$sdev)%*%t(ss$rotation) par(mfrow=c(1,2)) image(ImA) image(Im) ##### pp<-prcomp(Nmat) Nbr.svd.sparse<-nsprcomp(Nmat,k=c(rep(25,10))) # This step takes quite some time to run image(t(matrix(Nbr.svd.sparse$rotation[,1],16,16,byrow=T))[,16:1]) image(t(matrix(Nbr.svd.sparse$rotation[,2],16,16,byrow=T))[,16:1]) image(t(matrix(Nbr.svd.sparse$rotation[,3],16,16,byrow=T))[,16:1]) plot(cumsum(pp$sdev)/sum(pp$sdev),ylim=c(0,1),xlim=c(1,10)) lines(cumsum(Nbr.svd.sparse$sdev)/sum(pp$sdev)) # losing some information by going sparse... but more interpretable UU<-as.matrix(Nmat)%*%Nbr.svd.sparse$rotation plot(UU[,1:2],col=as.numeric(Numbers[,1]),pch=as.character(Numbers[,1])) plot(UU[,3:4],col=as.numeric(Numbers[,1]),pch=as.character(Numbers[,1])) plot(UU[,5:6],col=as.numeric(Numbers[,1]),pch=as.character(Numbers[,1])) plot(UU[,7:8],col=as.numeric(Numbers[,1]),pch=as.character(Numbers[,1])) ################# heart disease data pp<-prcomp(SAheart,scale=T) plot(cumsum(pp$sdev)/sum(pp$sdev),ylim=c(0,1)) sp<-nsprcomp(SAheart,k=c(10,rep(10,9)),scale=T) lines(cumsum(sp$sdev)/sum(sp$sdev)) sp<-nsprcomp(SAheart,k=c(3,rep(2,9)),scale=T) lines(cumsum(sp$sdev)/sum(sp$sdev),col=2) sp<-nsprcomp(SAheart,k=c(2,rep(2,9)),scale=T) lines(cumsum(sp$sdev)/sum(sp$sdev),col=3) plot(sp$rotation[,1]) #adiposity,obseity plot(sp$rotation[,2]) #tobacco, age plot(pp$rotation[,1]) ############################################# #SOM library(kohonen) # data(crabs) crabs2<-crabs head(crabs) crabs2[,1]<-as.numeric(crabs2[,1]) crabs2[,2]<-as.numeric(crabs2[,1]) pp<-prcomp(crabs2[,-3],scale=T) CC<-as.matrix(crabs2[,-3])%*%pp$rot plot(CC[,1],CC[,2],col=crabs2$sp,pch=crabs2$sex) plot(CC[,3],CC[,4],col=crabs2$sp,pch=crabs2$sex) # PCA can fail miserably for mixed data ss<-som(as.matrix(crabs2[,-3]),grid=somgrid(5,5)) plot(ss,type="mapping",col=crabs2$sp,pch=crabs2$sex) # where observations are co-located plot(ss,type="codes") # radius of wedge is the magnitude of that feature in this grid. som_model <- som(as.matrix(crabs2[,-3]), grid=somgrid(5,5), rlen=100, alpha=c(0.05,0.01), keep.data = TRUE) plot(som_model,type="count") # where are the objects located plot(som_model,type="dist.neighbours") plot(som_model,type="codes") # # visualize one variable across the map plot(som_model, type = "property", property = som_model$codes[[1]][,4], main=names(som_model$data)[4]) plot(som_model, type = "property", property = som_model$codes[[1]][,1], main=names(som_model$data)[1]) som_cluster <- cutree(hclust(dist(som_model$codes[[1]])), 6) plot(som_model, type="mapping", bgcol = som_cluster, main = "Clusters") add.cluster.boundaries(som_model, som_cluster) # cluster the prototypes ###### # iz<-sample(seq(1,7291),2000) Nmat<-as.matrix(zip.train[iz,-1]) Nlab<-zip.train[iz,1] # this takes some time... som_modelN <- som(as.matrix(Nmat), grid=somgrid(25,25), rlen=100, alpha=c(0.05,0.01), keep.data = TRUE) plot(som_modelN,type="changes") plot(som_modelN,type="count") plot(som_modelN,type="dist.neighbours") som_cluster <- cutree(hclust(dist(som_modelN$codes[[1]])), 10) plot(som_modelN, type="mapping", bgcol = som_cluster, main = "Clusters") add.cluster.boundaries(som_model, som_cluster) plot(som_modelN,type="mapping",pch=as.character(Nlab),col=as.numeric(Nlab)+1) plot(som_modelN,type="mapping",pch=as.character(Nlab),bgcol = som_cluster) # 625 images to look at: space it out by 5 par(mfrow=c(5,5)) kvec<-sort(sample(seq(1,625),25)) for (kk in (1:25)) { image(t(matrix(som_modelN$codes[[1]][kvec[kk],],16,16,byrow=T))[,16:1]) } ###################### ####### supervised SOM Nbr.som2<-xyf(Nmat,Y=as.factor(Nlab),grid=somgrid(5,5)) plot(Nbr.som2) plot(Nbr.som2,type="mapping",pch=as.character(Nlab)) plot(Nbr.som2,type="codes") # 25 code images to look at. par(mfrow=c(5,5)) for (kk in (1:25)) { image(t(matrix(Nbr.som2$codes[[1]][kk,],16,16,byrow=T))[,16:1]) } ######## # Heart disease data sh<-som(as.matrix(SAheart[,-10]),grid=somgrid(5,5)) plot(sh) plot(sh,type="count") plot(sh,type="mapping",pch=as.character(SAheart[,10])) plot(sh,type="dist.neighbours") sh_cluster <- cutree(hclust(dist(sh$codes[[1]])), 6) plot(sh, type="mapping", bgcol = som_cluster, main = "Clusters") add.cluster.boundaries(sh, sh_cluster) plot(sh,type="mapping",pch=as.character(SAheart[,10]),col=as.numeric(SAheart[,10])+1) sh2<-xyf(X=as.matrix(SAheart[,-10]),Y=as.factor(SAheart[,10]),grid=somgrid(5,5)) plot(sh2) plot(sh2,type="mapping",pch=as.character(SAheart[,10]),col=as.numeric(SAheart[,10])+1) ####### When data is really big! #irlba library(irlba) # first SVD components only load("TCGAdata.RData") dim(TCGA) TCGAclass<-as.numeric(as.factor(TCGAclassstr)) ssTCGA<-irlba(TCGA,25,25) plot3d(ssTCGA$u[,1:3],col=TCGAclass+1) legend3d("topright", legend = paste('Type', c(unique(TCGAclassstr))), pch = 5, col=seq(1,6), cex=1, inset=c(0.02)) plot(ssTCGA$d) ############################