## ----setup, include=FALSE------------------------------------------------ # General purpose packages (data handling, pretty plotting, ...) library(tidyverse) library(latex2exp) # Latex in ggplot2 labels cbPalette <- c( "#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette bgCol <- "#FAFAFA" theme_set(theme_minimal()) # Set a less verbose standard theme # Packages for actual computation ## ----nmf-example, echo=FALSE, cache=TRUE--------------------------------- set.seed(291828827) data_train <- ElemStatLearn::zip.train colnames(data_train) <- c("digit", sprintf("x%d", 1:256)) # Sub-sample 50 examples from each class data_subsample <- as_tibble(data_train) %>% group_by(digit) %>% sample_n(100) X_train <- t(as.matrix(data_subsample[,-1])) # # Check the rank of the feature matrix # Matrix::rankMatrix(X_train) # -> 256, i.e. full-rank # Number of plotted components M <- 10 # Perform PCA by scaling the data and performing SVD X_svd <- svd(scale(X_train)) # # Projection on first two PCs # X_pca <- t(X_svd$u[,1:2]) %*% scale(X_train) # Five M vectors in PCA basis Y_pca <- X_svd$u[,1:M] # Run a rank 50 NMF fit_nmf <- NMF::nmf((X_train + 1) / 2, rank = 50) # Five M vectors in NMF basis Y_nmf <- fit_nmf@fit@W[,1:M] # Reconstruction from 50 first PCs X_reco_svd <- X_svd$u[,1:50] %*% diag(X_svd$d[1:50]) %*% t(X_svd$v[,1:50]) # ...and from 50 NMF components X_reco_nmf <- fit_nmf@fit@W %*% fit_nmf@fit@H ## ----nmf-prep-plots, echo=FALSE, cache=TRUE------------------------------ basis_vecs <- cbind(Y_pca, Y_nmf) colnames(basis_vecs) <- c(sprintf("PCA %d", 1:M), sprintf("NMF %d", 1:M)) # Scaling is only to get all pictures on the same scale and make them # comparable visually data_plot <- as_tibble(basis_vecs) %>% mutate_all(function(x) x / max(abs(x))) %>% bind_cols(r = (1:dim(basis_vecs)[1]) - 1, .) %>% gather(component, value, -r) %>% mutate( component = factor( component, levels = c(sprintf("PCA %d", 1:M), sprintf("NMF %d", 1:M)), ordered = TRUE), x = r %% 16, y = 16 - r %/% 16) %>% select(component, x, y, value) reco <- cbind( X_reco_svd[,((1:5) - 1) * 100 + 1], X_reco_nmf[,((1:5) - 1) * 100 + 1]) colnames(reco) <- c(sprintf("PCA %d", 1:5), sprintf("NMF %d", 1:5)) data_reco_plot <- reco %>% as_tibble %>% bind_cols(r = (1:dim(reco)[1]) - 1, .) %>% gather(image, value, -r) %>% mutate( image = factor( image, levels = c(sprintf("PCA %d", 1:5), sprintf("NMF %d", 1:5)), ordered = TRUE), x = r %% 16, y = 16 - r %/% 16) %>% select(image, x, y, value) ## ----nmf-reconstruction, fig.width=4.5, fig.height=1.8, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- ggplot(data_reco_plot, aes(x, y)) + geom_tile(aes(fill = value), width = 1, height = 1, colour = bgCol) + scale_fill_gradient2( low = "red", mid = "white", high = "black", guide = FALSE) + facet_wrap(~ image, ncol = 5) + theme_void() + coord_fixed() + theme( strip.text = element_text(size = 7)) ## ----nmf-components, fig.width=3, fig.height=3.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- ggplot(data_plot, aes(x, y)) + geom_tile(aes(fill = value), width = 1, height = 1, colour = bgCol) + scale_fill_gradient2( low = "red", mid = "white", high = "black", guide = FALSE) + facet_wrap(~ component, ncol = 5) + theme_void() + coord_fixed() + theme( strip.text = element_text(size = 7)) ## ----nmf-coefficients, fig.width=4.5, fig.height=3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- # Coefficients for PCA H <- diag(X_svd$d[1:50]) %*% t(X_svd$v[,1:50]) colnames(H) <- 1:dim(H)[2] data_plot <- bind_cols(y = 1:dim(H)[1], as_tibble(H)) %>% gather(x, value, -y) %>% mutate(x = as.integer(x)) p1 <- ggplot(data_plot, aes(x, y)) + geom_tile(aes(fill = value), width = 1, height = 1) + scale_fill_gradient2( low = "red", mid = "white", high = "black", guide = FALSE) + theme_void() + ggtitle("SVD coefficients") + theme( plot.title = element_text(size = 7)) # Coefficients for NMF H <- fit_nmf@fit@H colnames(H) <- 1:dim(H)[2] data_plot <- bind_cols(y = 1:dim(H)[1], as_tibble(H)) %>% gather(x, value, -y) %>% mutate(x = as.integer(x)) p2 <- ggplot(data_plot, aes(x, y)) + geom_tile(aes(fill = value), width = 1, height = 1) + scale_fill_gradient2( low = "red", mid = "white", high = "black", guide = FALSE) + theme_void() + ggtitle("NMF coefficients") + theme( plot.title = element_text(size = 7)) ggpubr::ggarrange(p1, p2, nrow = 2, ncol = 1) ## ----pca-no-help, fig.width=3.5, fig.height=1.4, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- set.seed(2019) n <- 250 moon1 <- tibble( moon = 1, r = rnorm(n, 3, .25), theta = runif(n, 0, pi), x = r * cos(theta), y = -0.5 + r * sin(theta)) %>% select(moon, x, y) moon2 <- tibble( moon = 2, r = rnorm(n, 3, .25), theta = runif(n, pi, 2 * pi), x = 3 + r * cos(theta), y = 0.5 + r * sin(theta)) %>% select(moon, x, y) data_moons <- bind_rows(moon1, moon2) %>% mutate(moon = as.factor(moon)) p1 <- ggplot(data_moons, aes(x, y)) + geom_point(aes(colour = moon), size = 0.6) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + ggtitle("Raw data") + theme( plot.title = element_text(size = 8), axis.title = element_text(size = 7), axis.text = element_text(size =7)) data_moons_pca <- dimRed::embed(data_moons[,c("x", "y")], "PCA")@data@data colnames(data_moons_pca) <- c("x", "y") p2 <- ggplot( bind_cols(moon = data_moons$moon, as_tibble(data_moons_pca)), aes(x, y)) + geom_point(aes(colour = moon), size = 0.6) + scale_x_continuous("PC1") + scale_y_continuous("PC2") + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + ggtitle("Transformed with PCA") + theme( plot.title = element_text(size = 8), axis.title = element_text(size = 7), axis.text = element_text(size =7)) ggpubr::ggarrange(p1, p2, ncol = 2) ## ----pca-aug-help, fig.width=4.5, fig.height=1.4, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- set.seed(2019) n <- 250 c1 <- tibble(x = rnorm(n), y = rnorm(n), cluster = 1) c2 <- tibble(r = rnorm(n, 5, .25), theta = runif(n, 0, 2 * pi), x = r * cos(theta), y = r * sin(theta), cluster = 2) %>% select(x, y, cluster) data_plot <- bind_rows(c1, c2) %>% mutate(cluster = as.factor(cluster)) %>% mutate( z = x ^ 2 + y ^ 2, colour = case_when( cluster == 1 ~ cbPalette[2], TRUE ~ cbPalette[3])) par( mfrow = c(1, 3), mar = c(2, 2, 1, 1), cex.main = 0.7) plot3D::scatter2D( data_plot$x, data_plot$y, colkey = FALSE, col = data_plot$colour, cex = 0.8, pch = 16, main = "Raw data", xaxt = "n", yaxt = "n", xlab = "", ylab = "") title(ylab = "y", xlab = "x", line = 0.25) plot3D::scatter3D( data_plot$x, data_plot$y, data_plot$z, colkey = FALSE, col = data_plot$colour, cex = 0.8, phi = 10, theta = -45, pch = 16, main = "Augmented data with x² + y²") circles_pca <- dimRed::embed( data_plot[,c("x", "y", "z")], "PCA")@data@data plot3D::scatter2D( circles_pca[,1], circles_pca[,2], colkey = FALSE, col = data_plot$colour, cex = 0.8, pch = 16, main = "PCA of augmented data", xaxt = "n", yaxt = "n", xlab = "", ylab = "") title(ylab = "y", xlab = "x", line = 0.25) ## ----kpca-example-setup, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- set.seed(2019) n <- 250 moon1 <- tibble( moon = 1, r = rnorm(n, 3, .25), theta = runif(n, 0, pi), x = r * cos(theta), y = -0.5 + r * sin(theta)) %>% select(moon, x, y) moon2 <- tibble( moon = 2, r = rnorm(n, 3, .25), theta = runif(n, pi, 2 * pi), x = 3 + r * cos(theta), y = 0.5 + r * sin(theta)) %>% select(moon, x, y) data_moons <- bind_rows(moon1, moon2) %>% mutate(moon = as.factor(moon)) p1 <- ggplot(data_moons, aes(x, y)) + geom_point(aes(colour = moon), size = 0.6) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + ggtitle("Raw data") + theme( plot.title = element_text(size = 8), axis.title = element_text(size = 7), axis.text = element_text(size =7)) data_moons_pca <- dimRed::embed(data_moons[,c("x", "y")], "PCA")@data@data colnames(data_moons_pca) <- c("x", "y") p2 <- ggplot( bind_cols(moon = data_moons$moon, as_tibble(data_moons_pca)), aes(x, y)) + geom_point(aes(colour = moon), size = 0.6) + scale_x_continuous("PC1") + scale_y_continuous("PC2") + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + ggtitle("Transformed with PCA") + theme( plot.title = element_text(size = 8), axis.title = element_text(size = 7), axis.text = element_text(size =7)) data_moons_kpca <- dimRed::embed( data_moons[,c("x", "y")], "kPCA", kernel = "rbfdot", kpar = list(sigma = 0.7))@data@data colnames(data_moons_kpca) <- c("x", "y") p3 <- ggplot( bind_cols(moon = data_moons$moon, as_tibble(data_moons_kpca)), aes(x, y)) + geom_point(aes(colour = moon), size = 0.6) + scale_x_continuous("PC1") + scale_y_continuous("PC2") + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + labs( title = "Transformed with kPCA", subtitle = TeX("RBF kernel, $\\sigma = 0.7$")) + theme( plot.title = element_text(size = 8), plot.subtitle = element_text(size = 7), axis.title = element_text(size = 7), axis.text = element_text(size = 7)) ## ----data-pca-plot, fig.width=4, fig.height=1.6, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- ggpubr::ggarrange(p1, p2, ncol = 2) ## ----kpca-plot, fig.width=2, fig.height=1.8, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- p3