library("NMF") # a subset of handwritten digits N<-500 # sample 500 digits to try NMF on it<-sample(seq(1,7291),N) Nmat<-zip.train[it,-1] Nlab<-zip.train[it,1] K<-10 aa<-nmf(Nmat-min(Nmat),K) coefmap(aa) # this is what the codebook looks like (each is a raster scan of a 16*16 image) plot(basis(fit(aa))[,1:2],col=gray(Nlab/10), ,pch=as.character(Nlab)) # plotting basis functions 1:2 - separates which digits? # # Try a few of the digits and look what the K-rank rep captures par(mfrow=c(1,2)) ss<-sample(seq(1,length(it)),1) image(matrix(Nmat[ss,],16,16,byrow=T)) image(matrix(fitted(aa)[ss,],16,16,byrow=T)) ##### # Try with different K # summary(aa) # percent of variance explained 1-sum(((Nmat-min(Nmat))-fitted(aa))^2)/sum((Nmat-min(Nmat))^2) # K<-3 aa<-nmf(Nmat-min(Nmat),K) par(mfrow=c(1,1)) coefmap(aa) summary(aa) 1-sum(((Nmat-min(Nmat))-fitted(aa))^2)/sum((Nmat-min(Nmat))^2) # ### different algorithms K<-3 aa<-nmf(Nmat-min(Nmat),K,"lee") coefmap(aa) summary(aa) 1-sum(((Nmat-min(Nmat))-fitted(aa))^2)/sum((Nmat-min(Nmat))^2) # K<-3 aa<-nmf(Nmat-min(Nmat),K,"brunet") coefmap(aa) summary(aa) 1-sum(((Nmat-min(Nmat))-fitted(aa))^2)/sum((Nmat-min(Nmat))^2) # sparse NMF K<-3 aa<-nmf(Nmat-min(Nmat),K,"snmf/r") coefmap(aa) summary(aa) 1-sum(((Nmat-min(Nmat))-fitted(aa))^2)/sum((Nmat-min(Nmat))^2) # K<-3 aa<-nmf(Nmat-min(Nmat),K,"snmf/l") coefmap(aa) summary(aa) 1-sum(((Nmat-min(Nmat))-fitted(aa))^2)/sum((Nmat-min(Nmat))^2) ######## ####### can we find the digits based on the low-dim space? kk<-kmeans(fitted(aa),10) table(kk$cluster,Nlab) # library(dendextend) hc <- hclust(dist(fitted(aa)),method="complete") dend <- as.dendrogram(hc) labels_colors(dend) <- (as.numeric(Nlab))[order.dendrogram(dend)] plot(dend) hcc<-cutree(hc,10) table(hcc,Nlab) ######## ####### Check which NMF method and K works best for finding the digits #### ######## K<-10 VarExpl<-matrix(0,K,2) ss<-svd(Nmat-min(Nmat)) VarExpl[,1]<-cumsum(ss$d[1:K]^2)/sum(ss$d^2) for (kk in (1:K)) { aa<-nmf(Nmat-min(Nmat),kk) VarExpl[kk,2]<-1-sum(((Nmat-min(Nmat))-fitted(aa))^2)/sum((Nmat-min(Nmat))^2) } plot(seq(1,K),VarExpl[,1],ymin=c(min(VarExpl),max(VarExpl)),type="l") lines(seq(1,K),VarExpl[,2],col=2) #### #### SVD vs NMF #### % explained vs interpretability #### You loose some % var explained with NMF, gain some interpretability ###### library(plot3D) library(rgl) library(svdvis) # puppy<-as.matrix(read.table("puppy.txt")) image(puppy,col=gray.colors(256,0,1)) ssp<-svd(puppy-mean(puppy)) svd.scree(ssp, subr=15, axis.title.x="Full scree plot", axis.title.y="% Var Explained") plot3d(ssp$v[,1:3]) #### Low-rank approximations par(mfrow=c(2,2)) k<-2 image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1)) k<-5 image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1)) k<-15 image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1)) k<-25 image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1)) #### Missing values (corrupted image) aa<-sample(seq(1,322),100,replace=T) bb<-sample(seq(1,300),100,replace=T) puppy2<-puppy puppy2[aa,bb]<-NA par(mfrow=c(1,1)) image(puppy2,col=gray.colors(256,0,1)) library(softImpute) #### Using low-rank approximations to fill in missing values pcomp<-softImpute(puppy2,rank=10,lambda=.85) image(pcomp$u%*%diag(pcomp$d)%*%t(pcomp$v),col=gray.colors(256,0,1)) pcomp<-softImpute(puppy2,rank=35,lambda=.9) image(pcomp$u%*%diag(pcomp$d)%*%t(pcomp$v),col=gray.colors(256,0,1)) #### or denoising puppy3<-puppy puppy3<-(puppy+matrix(rnorm(322*300,sd=.12),322,300)) image(puppy3,col=gray.colors(256,0,1)) ssp<-svd(puppy3) k<-15 image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1)) k<-35 image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1)) ####### ### the low rank approximations of the puppy image ssp<-svd(puppy-mean(puppy)) par(mfrow=c(4,4)) for (k in 1:16) { if (k>1) { image(ssp$u[,1:k]%*%diag(ssp$d[1:k])%*%t(ssp$v[,1:k]),col=gray.colors(256,0,1)) } if (k==1) { image(ssp$d[1]*matrix(ssp$u[,1:k],322,1)%*%matrix(ssp$v[,1:k],1,300),col=gray.colors(256,0,1)) } } # The SVD layers par(mfrow=c(4,4)) for (k in 1:16) { image(ssp$d[k]*matrix(ssp$u[,k],322,1)%*%matrix(ssp$v[,k],1,300),col=gray.colors(256,0,1)) } ##################### ### And now NMF K<-16 aa<-nmf(puppy,K) par(mfrow=c(1,1)) image(fitted(aa),col=gray.colors(256,0,1)) # the NMF layers par(mfrow=c(4,4)) for (k in (1:16)) { image(matrix((basis(aa))[,k],322,1)%*%matrix((coef(aa)[k,]),1,300),col=gray.colors(256,0,1)) } # adding the layers up par(mfrow=c(4,4)) for (k in (1:16)) { if (k>1) { image((basis(aa))[,1:k]%*%(coef(aa)[1:k,]),col=gray.colors(256,0,1)) } if (k==1) { image(matrix((basis(aa))[,k],322,1)%*%matrix((coef(aa)[k,]),1,300),col=gray.colors(256,0,1)) } } ########################### #### on a big data set of digits # this takes a while to run! N<-500 itNbr<-sample(seq(1,7291),N) Nmat<-zip.train[itNbr,-1] Nlab<-zip.train[itNbr,1] K<-16 aaNbr<-nmf(Nmat-min(Nmat),K) # par(mfrow=c(1,1)) coefmap(aaNbr) basismap(aaNbr) ###############################3 par(mfrow=c(4,4)) for (k in (1:16)) { image(matrix(coef(aaNbr)[k,],16,16,byrow=T),col=gray.colors(256,0,1)) } # how good is the approximation? Look at observation u (try different ones) # here I try the a couple of observations su<-sample(seq(1,length(itNbr)),8) for (zz in (1:length(su))) { u<-su[zz] s1<-(basis(aaNbr))[u,]%*%(coef(aaNbr)) image(matrix(s1,16,16,byrow=T),col=gray.colors(256,0,1)) image(matrix(Nmat[u,],16,16,byrow=T),col=gray.colors(256,0,1)) } ###### svdN<-svd(Nmat) par(mfrow=c(4,4)) for (k in 1:16) { image(matrix(svdN$v[,k],16,16,byrow=T),col=gray.colors(256,0,1)) } for (zz in (1:length(su))) { u<-su[zz] s1<-svdN$u[u,1:16]%*%diag(svdN$d[1:16])%*%t(svdN$v[,1:16]) image(matrix(s1,16,16,byrow=T),col=gray.colors(256,0,1)) image(matrix(Nmat[u,],16,16,byrow=T),col=gray.colors(256,0,1)) } # ########################### # sOM vs MDS library(kohonen) # par(mfrow=c(1,1)) data(crabs) crabs2<-crabs head(crabs) crabs2[,1]<-as.numeric(crabs2[,1]) crabs2[,2]<-as.numeric(crabs2[,2]) pp<-prcomp(crabs2[,-3],scale=T) CC<-as.matrix(crabs2[,-3])%*%pp$rot plot(CC[,1],CC[,2],col=crabs2$sp,pch=crabs2$sex) plot(CC[,3],CC[,4],col=crabs2$sp,pch=crabs2$sex) ## PCA of craps data ## It really performs poorly since the binary (categorical) data ## is not well summarized in the SVD ss<-som(as.matrix(crabs2[,-3]),grid=somgrid(5,5)) plot(ss,type="mapping",col=crabs2$sp,pch=crabs2$sex) plot(ss,type="codes") ### SOMS som_model <- som(as.matrix(crabs2[,-3]), grid=somgrid(5,5), rlen=100, alpha=c(0.05,0.01), keep.data = TRUE) plot(som_model,type="count") plot(som_model,type="dist.neighbours") plot(som_model,type="codes") # #visualize one variable across the map plot(som_model, type = "property", property = som_model$codes[[1]][,4], main=names(som_model$data)[4]) som_cluster <- cutree(hclust(dist(som_model$codes[[1]])), 6) plot(som_model, type="mapping", bgcol = som_cluster, main = "Clusters") add.cluster.boundaries(som_model, som_cluster) #### MDS #### library(cluster) dd<-daisy(crabs[,-3]) # daisy understands that some variables are categorical! ff<-cmdscale(dd,2) plot(ff,col=crabs2[,1],pch=crabs2[,2]) # library(plot3D) library(rgl) ff<-cmdscale(dd,3) plot3d(ff,col=crabs2[,1],pch=crabs2[,2]) # In 3D we get a nice separation of species and gender! ###### ## The number data (takes a while to run) par(mfrow=c(1,1)) ddNbr<-daisy(Nmat) ffNbr3<-cmdscale(ddNbr,3) plot3d(ffNbr3,col=gray(Nlab/10)) # ffNbr2<-cmdscale(ddNbr,2) par(mfrow=c(1,1)) plot(ffNbr2,pch=as.character(Nlab),col=gray(Nlab/10)) # ssNbr<-som(Nmat,grid=somgrid(5,5)) plot(ssNbr,type="mapping",pch=as.character(Nlab),col=gray(Nlab/10)) #### slow on numbers data - try on a smaller subset library(vegan) dd<-daisy(crabs[,-3]) isoc<-isomap(dd,ndim=2,k=25)) plot(isoc$points,col=crabs2[,1],pch=crabs2[,2]) isoc<-isomap(dd,ndim=2,k=10) plot(isoc$points,col=crabs2[,1],pch=crabs2[,2]) # Take a really long time isoNbr<-isomap(dist(Nmat),ndim=2,k=10) plot(isoNbr$points,col=gray(Nlab/10),pch=as.character(Nlab)) isoNbr<-isomap(dist(Nmat),ndim=2,k=5) plot(isoNbr$points,col=gray(Nlab/10),pch=as.character(Nlab)) # tSNE library(tsne) colors = rainbow(length(unique(iris$Species))) names(colors) = unique(iris$Species) ecb = function(x,y){ plot(x,t='n'); text(x,labels=iris$Species, col=colors[iris$Species]) } tsne_iris = tsne(iris[,1:4], epoch_callback = ecb, perplexity=30) tsne_iris = tsne(iris[,1:4], whiten=F,epoch_callback = ecb, perplexity=30) # standardize or no? iris2<-iris iris2[,1:4]<-apply(iris[,1:4],2,standardize) tsne_iris = tsne(iris2[,1:4], whiten=T,epoch_callback = ecb, perplexity=30) # standardize or not? #### iz<-sample(seq(1,7291),2000) Nmat<-zip.train[iz,-1] Nlab<-zip.train[iz,1] ecb = function(x,y){ plot(x,t='n'); text(x,labels=Nlab, col=gray(Nlab/10)) } tsne_N = tsne(Nmat[,-1], epoch_callback = ecb, perplexity=25,max_iter=50) # takes a while to run if you run many iterations # perplexity - bandwidth choice par(mfrow=c(1,1)) plot(tsne_N,col="white") text(tsne_N,labels=Nlab,col=gray(Nlab/10))