## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp) # Latex in ggplot2 labels
cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette
theme_set(theme_minimal()) # Set a less verbose standard theme

# Packages for actual computation
# library(dimRed)


## ----swiss-role-setup, echo=FALSE, cache=TRUE----------------------------
set.seed(2019294729)
# Number of samples
n <- 1000
# Create swiss roll
x <- matrix(runif(2 * n), nrow = 2)
v <- 3 * pi / 2 * (0.1 + 2 * x[1,])
X <- matrix(0, ncol = n, nrow = 3)
X[2,] <- 20 * x[2,]
X[1,] <- -cos(v) * v
X[3,] <- sin(v) * v


## ----problematic-geometry, fig.width=4.5, fig.height=3.25, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
par(mfrow = c(2, 3), mar = c(1, 1, 1, 1), cex.main = 0.7)
# Show only the data
plot3D::scatter3D(
  X[1,], X[2,], X[3,], colvar = v,
  phi = 20, theta = 25, pch = 16, cex = 0.5,
  colkey = FALSE, bty = "b", main = "Swiss roll (n = 1000)")

# Ideal unrolled shape
data_points <- tibble(
  x = v,
  y = X[2,],
  colour = v)

plot3D::scatter2D(
  data_points$x,
  data_points$y,
  colvar = data_points$colour,
  pch = 16, cex = 0.5,
  colkey = FALSE, main = "Ideal unrolled graph",
  xlab = "", ylab = "", xaxt = "n", yaxt = "n")

# Very naively apply PCA on data
X_svd <- svd(scale(X, scale = FALSE))
Y_pca <- t(t(X_svd$u[,1:2]) %*% scale(X, scale = FALSE))

data_points <- tibble(
  x = Y_pca[,1],
  y = Y_pca[,2],
  colour = v)

plot3D::scatter2D(
  data_points$x,
  data_points$y,
  colvar = data_points$colour,
  pch = 16, cex = 0.5,
  colkey = FALSE, main = "PCA",
  xlab = "", ylab = "", xaxt = "n", yaxt = "n")

# Apply kernel-PCA to data
dr_kpca <- dimRed::embed(t(X), "kPCA", kpar = list(sigma = 0.13))
Y_kpca <- dr_kpca@data@data

data_points <- tibble(
  x = Y_kpca[,1],
  y = Y_kpca[,2],
  colour = v)

plot3D::scatter2D(
  data_points$x,
  data_points$y,
  colvar = data_points$colour,
  pch = 16, cex = 0.5,
  colkey = FALSE, main = "kPCA (RBF kernel, sigma = 0.13)",
  xlab = "", ylab = "", xaxt = "n", yaxt = "n")

# Calculate the distance matrix
D1 <- matrix(rep.int(colSums(X ^ 2), n), nrow =  n, byrow = TRUE)
D2 <- D1 + t(D1) - 2 * t(X) %*% X
D <- sqrt(D2 - diag(D2))

# To illustrate classical scaling, apply MDS directly to distances
J <- diag(rep(1, n)) - matrix(1, ncol = n, nrow = n) / n
K <- -0.5 * J %*% (D ^ 2) %*% J
K_decomp <- eigen(K)
V <- K_decomp$vectors
Delta <- K_decomp$values
Yraw <- V[,1:2] %*% diag(sqrt(Delta[1:2]))

data_points <- tibble(
  x = Yraw[,1],
  y = Yraw[,2],
  colour = v)

plot3D::scatter2D(
  data_points$x,
  data_points$y,
  colvar = data_points$colour,
  pch = 16, cex = 0.5,
  colkey = FALSE, main = "Classical scaling",
  xlab = "", ylab = "", xaxt = "n", yaxt = "n")

# # Classical scaling as implemented in R
# cls <- cmdscale(dist(t(X)))
# data_points <- tibble(
#   x = cls[,1],
#   y = cls[,2],
#   colour = v)


## ----graph-knn-distances, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
#### From here, isomap is effectively ignorant to how the distance matrix
#### was created. The isomap algorithm could be completely implemented on
#### distance matrices.
# Sort the distances...
DNN <- apply(D, 2, sort)
NN <- apply(D, 2, order)
# and find nearest neighbours
k <- 6
NN <- NN[2:(k + 1),]
DNN <- DNN[2:(k + 1),]
# Create the graph adjacency matrix...
B <- rep(1:n, each = k)
A <- Matrix::sparseMatrix(i = B, j = as.vector(NN), x = 1)
A <- as(A, "dgTMatrix")
# ... and distance weighted adjacency matrix
W <- Matrix::sparseMatrix(i = B, j = as.vector(NN), x = as.vector(DNN))


## ----isomap-problem-and-graph, fig.width=4.5, fig.height=1.8, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
par(mfrow = c(1, 2), mar = c(0, 0, 1, 0), cex.main = 0.7)
# Show only the data
plot3D::scatter3D(
  X[1,], X[2,], X[3,], colvar = v,
  phi = 20, theta = 25, pch = 16, cex = 0.5,
  colkey = FALSE, bty = "b", main = "Swiss roll (n = 1000)")

# Show the data and the respective neighbourhood graph
plot3D::segments3D(
  X[1,A@i + 1], X[2,A@i + 1], X[3,A@i + 1],
  X[1,A@j + 1], X[2,A@j + 1], X[3,A@j + 1],
  alpha = 0.3, phi = 20, theta = 25, main = "Nearest neighbours (k = 6)")
plot3D::scatter3D(
  X[1,], X[2,], X[3,],
  colvar = v, pch = 16, cex = 0.5,
  colkey = FALSE, bty = "b", add = TRUE)


## ----isomap2, fig.width=4.5, fig.height=2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
# Since only a finite set of nearest neighbours is used the adjacency matrix
# is not necessarily symmetric
GD <- as.matrix(W)
# Make GD symmetric
GD <- (GD + t(GD)) / 2

# Use the igraph package for an efficient implementation of graph
# distance calculation
g <- igraph::graph_from_adjacency_matrix(GD, "undirected", weighted = TRUE)
# Creates a matrix of shortest distances (weighted with edge weight)
SPS <- igraph::shortest.paths(g, igraph::V(g), igraph::V(g))

# # Floyd's algorithm would be a simple but very slow alternative
# # to calculate geodesic distances
# sum(igraph::edge_attr(g, "weight", sps$epath[[2]]))

# sps$vpath

# # Finish algorithm initialisation
# GD[GD == 0] <- Inf

# # Add connections between points and themselves
# diag(GD) <- 0

# # Determine the geodesic distance matrix
# for (k in 1:n) {
#   GD <- pmin(
#     GD,
#     matrix(rep(GD[,k], n), ncol = n) +
#     matrix(rep(GD[k,], each = n), ncol = n))
# }

# SPS <- GD

# Remove points that were unconnected
SPS[is.infinite(SPS)] <- 0

# Perform multidimensional scaling for embedding
J <- diag(rep(1, n)) - matrix(1, ncol = n, nrow = n) / n
K <- -0.5 * J %*% (SPS ^ 2) %*% J
K_decomp <- eigen(K)
V <- K_decomp$vectors
Delta <- K_decomp$values
Y <- V[,1:2] %*% diag(sqrt(Delta[1:2]))

data_custom_isomap <- tibble(
  x = Y[,1],
  y = Y[,2],
  colour = v)

data_edges <- tibble(
  x = Y[A@i + 1, 1],
  y = Y[A@i + 1, 2],
  xend = Y[A@j + 1, 1],
  yend = Y[A@j + 1, 2])

p_custom_isomap <- ggplot(mapping = aes(x, y)) +
  geom_segment(
    aes(xend = xend, yend = yend),
    data = data_edges,
    size = 0.1) +
  geom_point(
    aes(colour = colour),
    data = data_custom_isomap,
    size = 1, alpha = 0.6) +
  scale_colour_gradientn(
    colours = plot3D::jet.col(10, alpha = 1), guide = FALSE) +
  ggtitle("Isomap (knn = 6)") +
  theme_void()

# In practice, all of the above is implemented in e.g. the dimRed package
# Run with knn = 20 to show a caveat of the method
dr_isomap <- dimRed::embed(t(X), "Isomap", knn = 20)
Y_dr_isomap <- dr_isomap@data@data

# Used to plot segments
DNN <- apply(D, 2, sort)
NN <- apply(D, 2, order)
# and find nearest neighbours
k <- 20
NN <- NN[2:(k + 1),]
DNN <- DNN[2:(k + 1),]
# Create the graph adjacency matrix...
B <- rep(1:n, each = k)
A <- Matrix::sparseMatrix(i = B, j = as.vector(NN), x = 1)
A <- as(A, "dgTMatrix")

data_points <- tibble(
  x = Y_dr_isomap[,1],
  y = Y_dr_isomap[,2],
  colour = v)

data_edges <- tibble(
  x = Y_dr_isomap[A@i + 1, 1],
  y = Y_dr_isomap[A@i + 1, 2],
  xend = Y_dr_isomap[A@j + 1, 1],
  yend = Y_dr_isomap[A@j + 1, 2])

p_dimred_isomap <- ggplot(mapping = aes(x, y)) +
  geom_segment(
    aes(xend = xend, yend = yend),
    data = data_edges,
    size = 0.1) +
  geom_point(aes(colour = colour), data = data_points, size = 1, alpha = 0.6) +
  scale_colour_gradientn(
    colours = plot3D::jet.col(10, alpha = 1), guide = FALSE) +
  ggtitle("Isomap (knn = 20)") +
  theme_void()

ggpubr::ggarrange(
  p_custom_isomap, p_dimred_isomap, ncol = 2)


## ----tsne-simple-example, fig.width=4.5, fig.height=1.6, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
data_tsne <- dimRed::embed(t(X), "tSNE", perplexity = 30)@data@data

par(mfrow = c(1, 3), mar = c(1, 1, 1, 1), cex.main = 0.7)
plot3D::scatter3D(
  X[1,], X[2,], X[3,], colvar = v,
  phi = 20, theta = 25, pch = 16, cex = 0.5,
  colkey = FALSE, bty = "b", main = "Swiss roll (n = 1000)")
plot3D::scatter2D(data_custom_isomap$x, data_custom_isomap$y,
  colvar = v, colkey = FALSE,
  pch = 16, cex = 0.8, main = "Isomap (knn = 6)")
plot3D::scatter2D(data_tsne[,1], data_tsne[,2],
  colvar = v, colkey = FALSE,
  pch = 16, cex = 0.8, main = "tSNE (perplexity = 30)")


## ----tsne-bigger-example, fig.width=4.5, fig.height=3.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
data_train <- ElemStatLearn::zip.train
mnist_isomap <- dimRed::embed(data_train[,-1], "Isomap", knn = 20)
mnist_tsne <- dimRed::embed(data_train[,-1], "tSNE", perplexity = 20)

p_mnist_isomap <- ggplot(
  cbind(digit = data_train[,1], mnist_isomap@data@data) %>% as_tibble,
  aes(`iso 1`, `iso 2`)) +
  geom_point(aes(colour = as.factor(digit)), size = 0.4) +
  scale_x_continuous("Iso1", breaks = NULL) +
  scale_y_continuous("Iso2", breaks = NULL) +
  scale_colour_discrete("Digit") +
  ggtitle("Isomap (knn = 20)") +
  theme_void() +
  theme(
    plot.title = element_text(size = 8),
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7))

p_mnist_tsne <- ggplot(
  cbind(digit = data_train[,1], mnist_tsne@data@data) %>% as_tibble,
  aes(tSNE1, tSNE2)) +
  geom_point(aes(colour = as.factor(digit)), size = 0.4) +
  scale_colour_discrete("Digit") +
  ggtitle("tSNE (perplexity = 20)") +
  scale_x_continuous("tSNE1", breaks = NULL) +
  scale_y_continuous("tSNE2", breaks = NULL) +
  theme_void() +
  theme(
    plot.title = element_text(size = 8),
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7))

ggpubr::ggarrange(
  p_mnist_isomap, p_mnist_tsne, ncol = 2,
  common.legend = TRUE, legend = "bottom")


## ----tsne-complexities-prep, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
set.seed(2019)
n <- 250
moon1 <- tibble(
  moon = 1,
  r = rnorm(n, 3, .25), theta = runif(n, 0, pi),
  x = r * cos(theta), y = -0.5 + r * sin(theta)) %>%
  select(moon, x, y)
moon2 <- tibble(
  moon = 2,
  r = rnorm(n, 3, .25), theta = runif(n, pi, 2 * pi),
  x = 3 + r * cos(theta), y = 0.5 + r * sin(theta)) %>%
  select(moon, x, y)

data_moons <- bind_rows(moon1, moon2) %>%
  mutate(moon = as.factor(moon))

p_moons_raw <- ggplot(data_moons, aes(x, y)) +
  geom_point(aes(colour = moon), size = 0.6) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  ggtitle("Raw data") +
  theme(
    plot.title = element_text(size = 8),
    axis.title = element_text(size = 7),
    axis.text = element_text(size =7))

data_moons_tsne <- do.call(rbind, lapply(1:10, function(i) cbind(
  run = i, dimRed::embed(
  data_moons[,c("x", "y")], "tSNE",
  perplexity = 20)@data@data)))

p_tsne_reps <- ggplot(
  bind_cols(moon = rep(data_moons$moon, 10), as_tibble(data_moons_tsne)),
  aes(tSNE1, tSNE2)) +
  geom_point(aes(colour = moon), size = 0.4) +
  facet_wrap(~ as.factor(run), ncol = 5) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  labs(
    title = "Transformed with tSNE",
    subtitle = TeX("Perplexity = 20, multiple runs")) +
  theme(
    plot.title = element_text(size = 8),
    plot.subtitle = element_text(size = 7),
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7))

data_moons_tsne_perp <- do.call(rbind,
  lapply(c(2, 5, 15, 30, 50, 100), function(i) cbind(
  perplexity = i, dimRed::embed(
  data_moons[,c("x", "y")], "tSNE",
  perplexity = i)@data@data)))

p_tsne_perp <- ggplot(
  bind_cols(moon = rep(data_moons$moon, 6), as_tibble(data_moons_tsne_perp)),
  aes(tSNE1, tSNE2)) +
  geom_point(aes(colour = moon), size = 0.4) +
  facet_wrap(~ as.factor(perplexity), ncol = 3) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  labs(
    title = "Transformed with tSNE",
    subtitle = TeX("Varying perplexity")) +
  theme(
    plot.title = element_text(size = 8),
    plot.subtitle = element_text(size = 7),
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7))


## ----moons-plot, fig.width=2, fig.height=1.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
ggplot(data_moons, aes(x, y)) +
  geom_point(aes(colour = moon), size = 0.8) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme(
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7))


## ----tsne-perplexity, fig.width=4.5, fig.height=3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
p_tsne_perp


## ----tsne-multiple-runs, fig.width=4.5, fig.height=3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
p_tsne_reps