## ----setup, include=FALSE------------------------------------------------ # General purpose packages (data handling, pretty plotting, ...) library(tidyverse) library(latex2exp) # Latex in ggplot2 labels cbPalette <- c( "#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette theme_set(theme_minimal()) # Set a less verbose standard theme # Packages for actual computation # library(dimRed) ## ----swiss-role-setup, echo=FALSE, cache=TRUE---------------------------- set.seed(2019294729) # Number of samples n <- 1000 # Create swiss roll x <- matrix(runif(2 * n), nrow = 2) v <- 3 * pi / 2 * (0.1 + 2 * x[1,]) X <- matrix(0, ncol = n, nrow = 3) X[2,] <- 20 * x[2,] X[1,] <- -cos(v) * v X[3,] <- sin(v) * v ## ----problematic-geometry, fig.width=4.5, fig.height=3.25, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE---- par(mfrow = c(2, 3), mar = c(1, 1, 1, 1), cex.main = 0.7) # Show only the data plot3D::scatter3D( X[1,], X[2,], X[3,], colvar = v, phi = 20, theta = 25, pch = 16, cex = 0.5, colkey = FALSE, bty = "b", main = "Swiss roll (n = 1000)") # Ideal unrolled shape data_points <- tibble( x = v, y = X[2,], colour = v) plot3D::scatter2D( data_points$x, data_points$y, colvar = data_points$colour, pch = 16, cex = 0.5, colkey = FALSE, main = "Ideal unrolled graph", xlab = "", ylab = "", xaxt = "n", yaxt = "n") # Very naively apply PCA on data X_svd <- svd(scale(X, scale = FALSE)) Y_pca <- t(t(X_svd$u[,1:2]) %*% scale(X, scale = FALSE)) data_points <- tibble( x = Y_pca[,1], y = Y_pca[,2], colour = v) plot3D::scatter2D( data_points$x, data_points$y, colvar = data_points$colour, pch = 16, cex = 0.5, colkey = FALSE, main = "PCA", xlab = "", ylab = "", xaxt = "n", yaxt = "n") # Apply kernel-PCA to data dr_kpca <- dimRed::embed(t(X), "kPCA", kpar = list(sigma = 0.13)) Y_kpca <- dr_kpca@data@data data_points <- tibble( x = Y_kpca[,1], y = Y_kpca[,2], colour = v) plot3D::scatter2D( data_points$x, data_points$y, colvar = data_points$colour, pch = 16, cex = 0.5, colkey = FALSE, main = "kPCA (RBF kernel, sigma = 0.13)", xlab = "", ylab = "", xaxt = "n", yaxt = "n") # Calculate the distance matrix D1 <- matrix(rep.int(colSums(X ^ 2), n), nrow = n, byrow = TRUE) D2 <- D1 + t(D1) - 2 * t(X) %*% X D <- sqrt(D2 - diag(D2)) # To illustrate classical scaling, apply MDS directly to distances J <- diag(rep(1, n)) - matrix(1, ncol = n, nrow = n) / n K <- -0.5 * J %*% (D ^ 2) %*% J K_decomp <- eigen(K) V <- K_decomp$vectors Delta <- K_decomp$values Yraw <- V[,1:2] %*% diag(sqrt(Delta[1:2])) data_points <- tibble( x = Yraw[,1], y = Yraw[,2], colour = v) plot3D::scatter2D( data_points$x, data_points$y, colvar = data_points$colour, pch = 16, cex = 0.5, colkey = FALSE, main = "Classical scaling", xlab = "", ylab = "", xaxt = "n", yaxt = "n") # # Classical scaling as implemented in R # cls <- cmdscale(dist(t(X))) # data_points <- tibble( # x = cls[,1], # y = cls[,2], # colour = v) ## ----graph-knn-distances, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE---- #### From here, isomap is effectively ignorant to how the distance matrix #### was created. The isomap algorithm could be completely implemented on #### distance matrices. # Sort the distances... DNN <- apply(D, 2, sort) NN <- apply(D, 2, order) # and find nearest neighbours k <- 6 NN <- NN[2:(k + 1),] DNN <- DNN[2:(k + 1),] # Create the graph adjacency matrix... B <- rep(1:n, each = k) A <- Matrix::sparseMatrix(i = B, j = as.vector(NN), x = 1) A <- as(A, "dgTMatrix") # ... and distance weighted adjacency matrix W <- Matrix::sparseMatrix(i = B, j = as.vector(NN), x = as.vector(DNN)) ## ----isomap-problem-and-graph, fig.width=4.5, fig.height=1.8, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE---- par(mfrow = c(1, 2), mar = c(0, 0, 1, 0), cex.main = 0.7) # Show only the data plot3D::scatter3D( X[1,], X[2,], X[3,], colvar = v, phi = 20, theta = 25, pch = 16, cex = 0.5, colkey = FALSE, bty = "b", main = "Swiss roll (n = 1000)") # Show the data and the respective neighbourhood graph plot3D::segments3D( X[1,A@i + 1], X[2,A@i + 1], X[3,A@i + 1], X[1,A@j + 1], X[2,A@j + 1], X[3,A@j + 1], alpha = 0.3, phi = 20, theta = 25, main = "Nearest neighbours (k = 6)") plot3D::scatter3D( X[1,], X[2,], X[3,], colvar = v, pch = 16, cex = 0.5, colkey = FALSE, bty = "b", add = TRUE) ## ----isomap2, fig.width=4.5, fig.height=2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE---- # Since only a finite set of nearest neighbours is used the adjacency matrix # is not necessarily symmetric GD <- as.matrix(W) # Make GD symmetric GD <- (GD + t(GD)) / 2 # Use the igraph package for an efficient implementation of graph # distance calculation g <- igraph::graph_from_adjacency_matrix(GD, "undirected", weighted = TRUE) # Creates a matrix of shortest distances (weighted with edge weight) SPS <- igraph::shortest.paths(g, igraph::V(g), igraph::V(g)) # # Floyd's algorithm would be a simple but very slow alternative # # to calculate geodesic distances # sum(igraph::edge_attr(g, "weight", sps$epath[[2]])) # sps$vpath # # Finish algorithm initialisation # GD[GD == 0] <- Inf # # Add connections between points and themselves # diag(GD) <- 0 # # Determine the geodesic distance matrix # for (k in 1:n) { # GD <- pmin( # GD, # matrix(rep(GD[,k], n), ncol = n) + # matrix(rep(GD[k,], each = n), ncol = n)) # } # SPS <- GD # Remove points that were unconnected SPS[is.infinite(SPS)] <- 0 # Perform multidimensional scaling for embedding J <- diag(rep(1, n)) - matrix(1, ncol = n, nrow = n) / n K <- -0.5 * J %*% (SPS ^ 2) %*% J K_decomp <- eigen(K) V <- K_decomp$vectors Delta <- K_decomp$values Y <- V[,1:2] %*% diag(sqrt(Delta[1:2])) data_custom_isomap <- tibble( x = Y[,1], y = Y[,2], colour = v) data_edges <- tibble( x = Y[A@i + 1, 1], y = Y[A@i + 1, 2], xend = Y[A@j + 1, 1], yend = Y[A@j + 1, 2]) p_custom_isomap <- ggplot(mapping = aes(x, y)) + geom_segment( aes(xend = xend, yend = yend), data = data_edges, size = 0.1) + geom_point( aes(colour = colour), data = data_custom_isomap, size = 1, alpha = 0.6) + scale_colour_gradientn( colours = plot3D::jet.col(10, alpha = 1), guide = FALSE) + ggtitle("Isomap (knn = 6)") + theme_void() # In practice, all of the above is implemented in e.g. the dimRed package # Run with knn = 20 to show a caveat of the method dr_isomap <- dimRed::embed(t(X), "Isomap", knn = 20) Y_dr_isomap <- dr_isomap@data@data # Used to plot segments DNN <- apply(D, 2, sort) NN <- apply(D, 2, order) # and find nearest neighbours k <- 20 NN <- NN[2:(k + 1),] DNN <- DNN[2:(k + 1),] # Create the graph adjacency matrix... B <- rep(1:n, each = k) A <- Matrix::sparseMatrix(i = B, j = as.vector(NN), x = 1) A <- as(A, "dgTMatrix") data_points <- tibble( x = Y_dr_isomap[,1], y = Y_dr_isomap[,2], colour = v) data_edges <- tibble( x = Y_dr_isomap[A@i + 1, 1], y = Y_dr_isomap[A@i + 1, 2], xend = Y_dr_isomap[A@j + 1, 1], yend = Y_dr_isomap[A@j + 1, 2]) p_dimred_isomap <- ggplot(mapping = aes(x, y)) + geom_segment( aes(xend = xend, yend = yend), data = data_edges, size = 0.1) + geom_point(aes(colour = colour), data = data_points, size = 1, alpha = 0.6) + scale_colour_gradientn( colours = plot3D::jet.col(10, alpha = 1), guide = FALSE) + ggtitle("Isomap (knn = 20)") + theme_void() ggpubr::ggarrange( p_custom_isomap, p_dimred_isomap, ncol = 2) ## ----tsne-simple-example, fig.width=4.5, fig.height=1.6, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE---- data_tsne <- dimRed::embed(t(X), "tSNE", perplexity = 30)@data@data par(mfrow = c(1, 3), mar = c(1, 1, 1, 1), cex.main = 0.7) plot3D::scatter3D( X[1,], X[2,], X[3,], colvar = v, phi = 20, theta = 25, pch = 16, cex = 0.5, colkey = FALSE, bty = "b", main = "Swiss roll (n = 1000)") plot3D::scatter2D(data_custom_isomap$x, data_custom_isomap$y, colvar = v, colkey = FALSE, pch = 16, cex = 0.8, main = "Isomap (knn = 6)") plot3D::scatter2D(data_tsne[,1], data_tsne[,2], colvar = v, colkey = FALSE, pch = 16, cex = 0.8, main = "tSNE (perplexity = 30)") ## ----tsne-bigger-example, fig.width=4.5, fig.height=3.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE---- data_train <- ElemStatLearn::zip.train mnist_isomap <- dimRed::embed(data_train[,-1], "Isomap", knn = 20) mnist_tsne <- dimRed::embed(data_train[,-1], "tSNE", perplexity = 20) p_mnist_isomap <- ggplot( cbind(digit = data_train[,1], mnist_isomap@data@data) %>% as_tibble, aes(`iso 1`, `iso 2`)) + geom_point(aes(colour = as.factor(digit)), size = 0.4) + scale_x_continuous("Iso1", breaks = NULL) + scale_y_continuous("Iso2", breaks = NULL) + scale_colour_discrete("Digit") + ggtitle("Isomap (knn = 20)") + theme_void() + theme( plot.title = element_text(size = 8), axis.title = element_text(size = 7), axis.text = element_text(size = 7), legend.title = element_text(size = 7), legend.text = element_text(size = 7)) p_mnist_tsne <- ggplot( cbind(digit = data_train[,1], mnist_tsne@data@data) %>% as_tibble, aes(tSNE1, tSNE2)) + geom_point(aes(colour = as.factor(digit)), size = 0.4) + scale_colour_discrete("Digit") + ggtitle("tSNE (perplexity = 20)") + scale_x_continuous("tSNE1", breaks = NULL) + scale_y_continuous("tSNE2", breaks = NULL) + theme_void() + theme( plot.title = element_text(size = 8), axis.title = element_text(size = 7), axis.text = element_text(size = 7), legend.title = element_text(size = 7), legend.text = element_text(size = 7)) ggpubr::ggarrange( p_mnist_isomap, p_mnist_tsne, ncol = 2, common.legend = TRUE, legend = "bottom") ## ----tsne-complexities-prep, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE---- set.seed(2019) n <- 250 moon1 <- tibble( moon = 1, r = rnorm(n, 3, .25), theta = runif(n, 0, pi), x = r * cos(theta), y = -0.5 + r * sin(theta)) %>% select(moon, x, y) moon2 <- tibble( moon = 2, r = rnorm(n, 3, .25), theta = runif(n, pi, 2 * pi), x = 3 + r * cos(theta), y = 0.5 + r * sin(theta)) %>% select(moon, x, y) data_moons <- bind_rows(moon1, moon2) %>% mutate(moon = as.factor(moon)) p_moons_raw <- ggplot(data_moons, aes(x, y)) + geom_point(aes(colour = moon), size = 0.6) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + ggtitle("Raw data") + theme( plot.title = element_text(size = 8), axis.title = element_text(size = 7), axis.text = element_text(size =7)) data_moons_tsne <- do.call(rbind, lapply(1:10, function(i) cbind( run = i, dimRed::embed( data_moons[,c("x", "y")], "tSNE", perplexity = 20)@data@data))) p_tsne_reps <- ggplot( bind_cols(moon = rep(data_moons$moon, 10), as_tibble(data_moons_tsne)), aes(tSNE1, tSNE2)) + geom_point(aes(colour = moon), size = 0.4) + facet_wrap(~ as.factor(run), ncol = 5) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + labs( title = "Transformed with tSNE", subtitle = TeX("Perplexity = 20, multiple runs")) + theme( plot.title = element_text(size = 8), plot.subtitle = element_text(size = 7), axis.title = element_text(size = 7), axis.text = element_text(size = 7)) data_moons_tsne_perp <- do.call(rbind, lapply(c(2, 5, 15, 30, 50, 100), function(i) cbind( perplexity = i, dimRed::embed( data_moons[,c("x", "y")], "tSNE", perplexity = i)@data@data))) p_tsne_perp <- ggplot( bind_cols(moon = rep(data_moons$moon, 6), as_tibble(data_moons_tsne_perp)), aes(tSNE1, tSNE2)) + geom_point(aes(colour = moon), size = 0.4) + facet_wrap(~ as.factor(perplexity), ncol = 3) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + labs( title = "Transformed with tSNE", subtitle = TeX("Varying perplexity")) + theme( plot.title = element_text(size = 8), plot.subtitle = element_text(size = 7), axis.title = element_text(size = 7), axis.text = element_text(size = 7)) ## ----moons-plot, fig.width=2, fig.height=1.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- ggplot(data_moons, aes(x, y)) + geom_point(aes(colour = moon), size = 0.8) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + theme( axis.title = element_text(size = 7), axis.text = element_text(size = 7)) ## ----tsne-perplexity, fig.width=4.5, fig.height=3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- p_tsne_perp ## ----tsne-multiple-runs, fig.width=4.5, fig.height=3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- p_tsne_reps