## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(grid)
library(latex2exp) # Latex in ggplot2 labels

cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette

theme_set(theme_minimal() + theme(
  axis.title = element_text(size = 7),
  axis.text = element_text(size = 7),
  plot.title = element_text(size = 8),
  plot.subtitle = element_text(size = 7),
  legend.text = element_text(size = 7),
  legend.title = element_text(size = 7))) # Set a less verbose standard theme

# Packages for actual computation
library(subspace)   # Kind of an annoying package. Hard to install and has to
                    # be loaded globally. However, this seems to be the only
                    # reasonable package for subspace clustering in R
library(ggraph)     # Used for plotting of graphs
library(data.table) # Used for reading data from an URL, but this is actually
                    # a very good implementation of data frames that is highly
                    # optimized for large scale computation on a single machine

## ----subspace-clustering, fig.width=3.5, fig.height=3.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
data(subspace_dataset)
GGally::ggpairs(
  data = subspace_dataset,
  upper = list(continuous = GGally::wrap("points", size = 0.05)),
  lower = list(continuous = "blank"),
  diag = list(continuous = "densityDiag"),
  axisLabels = "none") +
  theme_minimal() +
  theme(strip.text = element_text(size = 7))

## ----clique-example, fig.width=3.5, fig.height=3.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
fit_clique <- CLIQUE(subspace_dataset, xi = 40, tau = 0.06)
cl_data <- bind_cols(
  as_tibble(subspace_dataset),
  cluster = rep(NA, dim(subspace_dataset)[1]))
for (i in 1:length(fit_clique)) {
  cl <- fit_clique[[i]]
  cl_data[cl$objects, "cluster"] <- i
}
GGally::ggpairs(
  data = cl_data,
  mapping = aes(colour = factor(cluster, exclude = NULL)),
  columns = 1:5,
  upper = list(continuous = GGally::wrap("points", size = 0.05)),
  lower = list(continuous = "blank"),
  diag = list(continuous = "densityDiag"),
  axisLabels = "none") +
  theme_minimal() +
  theme(strip.text = element_text(size = 7))

## ----proclus-example, fig.width=3.5, fig.height=3.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
fit_proclus <- ProClus(subspace_dataset, k = 12, d = 2.5)
cl_data <- bind_cols(
  as_tibble(subspace_dataset),
  cluster = rep(NA, dim(subspace_dataset)[1]))
for (i in 1:length(fit_proclus)) {
  cl <- fit_proclus[[i]]
  cl_data[cl$objects, "cluster"] <- i
}
GGally::ggpairs(
  data = cl_data,
  mapping = aes(colour = factor(cluster, exclude = NULL)),
  columns = 1:5,
  upper = list(continuous = GGally::wrap("points", size = 0.05)),
  lower = list(continuous = "blank"),
  diag = list(continuous = "densityDiag"),
  axisLabels = "none") +
  theme_minimal() +
  theme(strip.text = element_text(size = 7))

## ----intro-sc, fig.width=4.5, fig.height=1.6, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
# A simpler example for visualisation
X <- matrix(
  c(-1.5, -1, -1, 1, 1, 1.5,
    0, 0.25, -0.25, 0.25, -0.25, 0),
  ncol = 2) +
  matrix(rnorm(12, sd = 0.05), ncol = 2) # Random perturbation
colnames(X) <- c("x", "y")
n <- nrow(X)

# Squared Euclidean distances via ||x_i||^2 + ||x_j||^2 - 2 <x_i, x_j>
D1 <- matrix(rep.int(rowSums(X ^ 2), n), ncol = n, byrow = TRUE)
D <- D1 + t(D1) - 2 * X %*% t(X)
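# Quick sanity check (editor's addition, not in the original): the vectorised
# expansion above should agree with base R's dist(), which computes the same
# pairwise Euclidean distances directly.
stopifnot(all.equal(D, as.matrix(dist(X)) ^ 2, check.attributes = FALSE))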
c <- 0.16
W <- exp(-D / c)
# No entries on diagonal since points have no loop to themselves
diag(W) <- 0

p_weights <- tibble(
  y = rep(n + 1 - 1:n, n),
  x = rep(1:n, each = n),
  w = as.vector(W)) %>%
  ggplot(aes(x, y)) +
  geom_tile(aes(fill = w), colour = cbPalette[1]) +
  scale_fill_gradient2(
    low = "red", mid = "white", high = cbPalette[3],
    guide = guide_colourbar(barheight = unit(0.2, "cm"))) +
  coord_fixed() +
  scale_x_continuous(name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_y_continuous(name = NULL, breaks = NULL, expand = c(0, 0)) +
  ggtitle("Graph edge weights")

data_lines <- tibble(
  x = X[rep(1:n, n), 1],
  y = X[rep(1:n, n), 2],
  xend = X[rep(1:n, each = n), 1],
  yend = X[rep(1:n, each = n), 2],
  w = as.vector(W))

p_graph <- ggplot(as_tibble(X), aes(x, y)) +
  geom_segment(
    aes(xend = xend, yend = yend, alpha = w),
    data = data_lines) +
  geom_point(size = 0.8) +
  scale_x_continuous(name = NULL, breaks = NULL, limits = c(-2, 2)) +
  scale_y_continuous(name = NULL, breaks = NULL, limits = c(-2, 2)) +
  scale_alpha_continuous(guide = FALSE) +
  coord_fixed() +
  ggtitle("Raw data and graph")

# Using a random-walk normalized Laplacian
L <- diag(rep(1, n)) - diag(1 / rowSums(W)) %*% W
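# Editor's check (not in the original): by construction D^{-1} W is
# row-stochastic, i.e. each row of the random-walk transition matrix sums to
# one, so the rows of L sum to zero. Cheap to verify:
stopifnot(all.equal(rowSums(diag(1 / rowSums(W)) %*% W), rep(1, n)))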
# Calculate eigenvalues and eigenvectors
L_eigen <- eigen(L)
# Determine near-zero indices
idx <- which(abs(L_eigen$values) <= sqrt(.Machine$double.eps))
# And select only the corresponding eigenvectors
V <- L_eigen$vectors[, idx, drop = FALSE]
# These are only non-zero for the k-th component and should therefore
# lead to a perfect clustering.
# However, I found randomly initialised kmeans to be quite unstable
# with respect to clustering outcome. Since we know that theoretically each
# eigenvector is only non-zero wherever the different components are,
# we can choose one example from each component and start kmeans there.
# In my experiments, this turned out to be much more stable.
init_cl <- diag(apply(V, 2, function(x) {
  idx <- which.max(abs(x))
  x[idx]
}), nrow = length(idx))
cl <- kmeans(V, init_cl)
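# A more conventional alternative (editor's sketch, not used for the figures):
# many random restarts also tame the instability, at the cost of determinism.
# cl <- kmeans(V, centers = length(idx), nstart = 50)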
# Check clustering vs ground-truth
p_cluster <- tibble(
  pred = cl$cluster,
  x = X[, 1],
  y = X[, 2],
  cluster = c(1, 1, 1, 2, 2, 2)) %>%
  ggplot(aes(x, y)) +
  geom_point(
    aes(colour = as.factor(pred), shape = as.factor(cluster)),
    size = 0.8) +
  scale_x_continuous(name = NULL, breaks = NULL, limits = c(-2, 2)) +
  scale_y_continuous(name = NULL, breaks = NULL, limits = c(-2, 2)) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  scale_shape_discrete(guide = FALSE) +
  coord_fixed() +
  ggtitle("Clustering result")

ggpubr::ggarrange(
  p_graph, p_weights, p_cluster,
  ncol = 3, common.legend = TRUE, legend = "bottom")

## ----spectral-clustering, fig.width=3.5, fig.height=1.6, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
# Three noisy concentric circles with different radii and sizes
clusters <- do.call(
  bind_rows,
  lapply(list(c(1, 1, 100), c(2, 3, 150), c(3, 5, 250)), function(x) {
    tibble(
      cluster = rep(x[1], x[3]),
      r = rnorm(x[3], x[2], 0.1),
      theta = runif(x[3], 0, 2 * pi),
      x = r * cos(theta),
      y = r * sin(theta)) %>%
      select(cluster, x, y)
  }))

# Visualise the data
p_data <- ggplot(clusters, aes(x, y)) +
  geom_point(size = 0.6) +
  scale_x_continuous(name = NULL, breaks = NULL) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  coord_fixed() +
  ggtitle("Raw data (n = 500)")

X <- as.matrix(clusters[, c("x", "y")])
n <- nrow(X)

# Calculate the distance matrix
D1 <- matrix(rep.int(rowSums(X ^ 2), n), ncol = n, byrow = TRUE)
D <- D1 + t(D1) - 2 * X %*% t(X)
c <- 0.25
W <- exp(-D / (c ^ 2))
# No entries on diagonal since points do not have a loop to themselves
diag(W) <- 0

# Calculate the (unnormalised) Laplacian
L <- diag(rowSums(W)) - W
L_eigen <- eigen(L)
# Determine near-zero indices
idx <- which(abs(L_eigen$values) <= sqrt(.Machine$double.eps))
# And select only the corresponding eigenvectors.
# Scale to get entries into a reasonable range. Not necessarily needed for
# every application, but doesn't hurt either
V <- scale(L_eigen$vectors[, idx])
# These are only non-zero for the k-th component and should therefore
# lead to a perfect clustering.
# Initialisation is done as described above
init_cl <- diag(apply(V, 2, function(x) {
  idx <- which.max(abs(x))
  x[idx]
}))
cl <- kmeans(V, init_cl)
# # Clustering was even stable when initial vectors were randomly perturbed
# cl <- kmeans(V, init_cl + matrix(rnorm(9, sd = 4), ncol = 3))

# Check clustering vs ground-truth
p_cluster <- bind_cols(pred = cl$cluster, clusters) %>%
  ggplot(aes(x, y)) +
  geom_point(
    aes(colour = as.factor(pred), shape = as.factor(cluster)),
    size = 0.6) +
  scale_x_continuous(name = NULL, breaks = NULL) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  scale_shape_discrete(guide = FALSE) +
  coord_fixed() +
  ggtitle("Spectral clustering")

# # Using a random-walk normalized Laplacian
# L <- diag(rep(1, n)) - diag(1 / rowSums(W)) %*% W
# # Calculate eigenvalues and eigenvectors
# L_eigen <- eigen(L)
# # Determine near-zero indices
# idx <- which(abs(L_eigen$values) <= sqrt(.Machine$double.eps))
# # And select only the corresponding eigenvectors
# V <- L_eigen$vectors[, idx]
# # These are only non-zero for the k-th component and should therefore
# # lead to a perfect clustering
# # Initialisation is done as described above
# init_cl <- diag(apply(V, 2, function(x) {
#   idx <- which.max(abs(x))
#   x[idx]
# }))
# cl <- kmeans(V, init_cl)
# # Check clustering vs ground-truth
# bind_cols(pred = cl$cluster, clusters) %>%
#   ggplot(aes(x, y)) +
#   geom_point(aes(colour = as.factor(pred), shape = as.factor(cluster)))

ggpubr::ggarrange(p_data, p_cluster, ncol = 2)
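# For contrast (editor's sketch, not part of the original figures): k-means on
# the raw coordinates cannot separate concentric rings, since its clusters are
# convex regions of the input space.
# cl_raw <- kmeans(X, centers = 3, nstart = 20)
# table(cl_raw$cluster, clusters$cluster)  # each k-means cluster mixes rings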
## ----leim-pre, echo=FALSE------------------------------------------------
# Keep the drawing device open for knitr to draw everything into one
# plot and not create one for each step
knitr::opts_knit$set(global.device = TRUE)

## ----laplacian-dimred, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, fig.show="hide"----
define_region <- function(row, col) {
  viewport(layout.pos.row = row, layout.pos.col = col)
}

X <- t(dimRed::loadDataSet("3D S Curve")@data)
# X <- t(dimRed::loadDataSet("Broken Swiss Roll")@data)

grid.newpage()
layout <- grid.layout(1, 3)
pushViewport(
  viewport(
    layout = layout,
    height = unit(1.75, "in"),
    width = unit(4.5, "in")))
# # Show viewport for easier manual scaling
# grid.rect(gp = gpar(fill = "green", alpha = 0.5))

pfun <- function() {
  par(mar = c(0, 1, 1, 1), cex = 0.7)
  plot3D::scatter3D(
    X[1, ], X[2, ], X[3, ],
    colvar = X[3, ],
    phi = 20, theta = 25,
    pch = 16, cex = 0.5,
    colkey = FALSE, bty = "b",
    main = paste0("S curve (n = ", ncol(X), ")"))
}
pushViewport(define_region(1, 1))
gridGraphics::grid.echo(pfun, newpage = FALSE)
grid.edit(
  "graphics-plot-2-main-1",
  x = unit(.35, "npc"), y = unit(1.0125, "npc"),
  gp = gpar(fontsize = 8, cex = 1, fontface = "plain"))
grid.edit(
  "graphics-plot-2-persp-xlab-1",
  gp = gpar(fontsize = 7, cex = 1))
grid.edit(
  "graphics-plot-2-persp-ylab-1",
  gp = gpar(fontsize = 7, cex = 1))
grid.edit(
  "graphics-plot-2-persp-zlab-1",
  gp = gpar(fontsize = 7, cex = 1))

# Return back up and start on the second plot
upViewport()
leim <- dimRed::embed(
  t(X),
  "LaplacianEigenmaps",
  sparse = "knn",
  knn = 6,
  t = Inf,
  norm = TRUE)
data_leim <- bind_cols(
  as_tibble(leim@data@data),
  colour = X[3, ])
p_leim <- ggplot(data_leim, aes(LEIM1, LEIM2)) +
  geom_point(aes(colour = colour), size = 0.2) +
  scale_colour_gradientn(
    colours = plot3D::jet.col(10, alpha = 1),
    guide = FALSE) +
  labs(
    title = "Laplacian Eigenmaps",
    subtitle = TeX("knn = 6, c = $\\infty$")) +
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(breaks = NULL)
print(p_leim, vp = define_region(row = 1, col = 2))

# Printing does not enter the viewport, so we do not need to go back.
# Start on the third plot
isomap <- dimRed::embed(
  t(X),
  "Isomap",
  knn = 6)
data_isomap <- bind_cols(
  as_tibble(isomap@data@data) %>%
    rename(ISO1 = `iso 1`, ISO2 = `iso 2`),
  colour = X[3, ])
p_isomap <- ggplot(data_isomap, aes(ISO1, ISO2)) +
  geom_point(aes(colour = colour), size = 0.2) +
  scale_colour_gradientn(
    colours = plot3D::jet.col(10, alpha = 1),
    guide = FALSE) +
  labs(title = "Isomap", subtitle = "knn = 6") +
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(breaks = NULL)
print(p_isomap, vp = define_region(row = 1, col = 3))
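# Editor's sketch (not in the original): dimRed ships quality criteria that
# would make the two embeddings quantitatively comparable, e.g.
# dimRed::quality(leim, "Q_local")
# dimRed::quality(isomap, "Q_local")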
## ----leim-post, fig.width=4.5, fig.height=1.75, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
# Turn the global device off again
knitr::opts_knit$set(global.device = FALSE)

## ----glasso, fig.width=4.5, fig.height=1.8, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
# Given an empirical covariance matrix
s <- c(10, 1, 5, 4, 10, 2, 6, 10, 3, 10)
S <- matrix(0, nrow = 4, ncol = 4)
S[row(S) >= col(S)] <- s
S <- S + t(S)
diag(S) <- 10

plot_graph_from_prec <- function(
  O, ns = 1:dim(O)[1], mult = 0.2, size = 4, horizontal = TRUE) {
  # Partial correlations from the precision matrix; note the sign flip, since
  # rho_ij = -omega_ij / sqrt(omega_ii * omega_jj)
  W <- -diag(1 / sqrt(diag(O))) %*% O %*% diag(1 / sqrt(diag(O)))
  # Nodes should not have connections with themselves
  diag(W) <- 0
  # Graph representation
  g <- igraph::graph_from_adjacency_matrix(
    W, mode = "undirected", weighted = TRUE)
  igraph::V(g)$name <- ns
  # Plot the graph and adjust colourbar (very wide otherwise)
  if (horizontal) {
    cb <- guide_edge_colourbar(barheight = unit(0.1, "cm"))
  } else {
    cb <- guide_edge_colourbar(barwidth = unit(0.1, "cm"))
  }
  ggraph(g, layout = "linear", circular = TRUE) +
    geom_edge_link(aes(colour = weight)) +
    geom_node_label(aes(label = name), size = size) +
    scale_x_continuous(name = NULL, breaks = NULL,
                       expand = expand_scale(mult = mult)) +
    scale_y_continuous(name = NULL, breaks = NULL,
                       expand = expand_scale(mult = mult)) +
    scale_edge_colour_gradient2(
      name = ifelse(horizontal, "Partial correlation", "Partial\ncorrelation"),
      low = "red", mid = "white", high = "blue",
      limits = c(-1, 1),
      guide = cb) +
    coord_fixed() +
    theme(legend.position = "bottom")
}

# Here we can compute the precision matrix, and with it the partial
# correlations, directly
O <- solve(S)
p_g1 <- plot_graph_from_prec(O, mult = 0.25, size = 2) +
  labs(title = "Direct estimate", subtitle = "")

# Assume we know the graph structure
zero <- matrix(c(1, 3, 2, 4), ncol = 2, byrow = TRUE)
# Estimate the unpenalised MLE for the covariance and precision matrices,
# constrained to the known zero pattern
fit_glasso <- glasso::glasso(S, 0, zero = zero)
p_g2 <- plot_graph_from_prec(fit_glasso$wi, mult = 0.25, size = 2) +
  labs(title = "Known graph structure", subtitle = "")

# # Estimate the lasso regularised MLE for the covariance and precision
# # matrices for an unknown network structure
# glassoFast::glassoFast(S, 1.5) # Alternative package
# glasso::glasso(S, 1.5)
fit_glasso <- glasso::glasso(
  S, rho = 1, penalize.diagonal = FALSE, approx = FALSE)
p_g3 <- plot_graph_from_prec(fit_glasso$wi, mult = 0.25, size = 2) +
  labs(
    title = "Estimated graph structure",
    subtitle = TeX("$\\lambda = 1$"))

ggpubr::ggarrange(
  p_g1, p_g2, p_g3,
  common.legend = TRUE, legend = "bottom", ncol = 3)

## ----glasso-larger, fig.width=3.5, fig.height=3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
# sachs <- fread("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/sachs.data")
Ssachs <- as.matrix(fread(
  "https://web.stanford.edu/~hastie/ElemStatLearn/datasets/sachs.covmatrix"))
# Scaled covariance matrix, var(sachs) / 1000
sachsNames <- c(
  "praf", "pmek", "plcg", "PIP2", "PIP3", "p44/42",
  "pakts473", "PKA", "PKC", "P38", "pjnk")
# names(sachs) <- sachsNames
# colnames(Ssachs) <- sachsNames
# rownames(Ssachs) <- sachsNames

fit_glasso <- glasso::glasso(
  Ssachs, rho = 0, penalize.diagonal = FALSE, approx = FALSE)
p0 <- plot_graph_from_prec(
  fit_glasso$wi, ns = sachsNames, mult = 0.1, size = 1,
  horizontal = FALSE) +
  labs(title = TeX("$\\lambda = 0$"))

fit_glasso <- glasso::glasso(
  Ssachs, rho = 5, penalize.diagonal = FALSE, approx = FALSE)
p1 <- plot_graph_from_prec(
  fit_glasso$wi, ns = sachsNames, mult = 0.1, size = 1,
  horizontal = FALSE) +
  labs(title = TeX("$\\lambda = 5$"))

fit_glasso <- glasso::glasso(
  Ssachs, rho = 20, penalize.diagonal = FALSE, approx = FALSE)
p2 <- plot_graph_from_prec(
  fit_glasso$wi, ns = sachsNames, mult = 0.1, size = 1,
  horizontal = FALSE) +
  labs(title = TeX("$\\lambda = 20$"))

fit_glasso <- glasso::glasso(
  Ssachs, rho = 50, penalize.diagonal = FALSE, approx = FALSE)
p3 <- plot_graph_from_prec(
  fit_glasso$wi, ns = sachsNames, mult = 0.1, size = 1,
  horizontal = FALSE) +
  labs(title = TeX("$\\lambda = 50$"))

ggpubr::ggarrange(
  p0, p1, p2, p3,
  ncol = 2, nrow = 2, common.legend = TRUE, legend = "right")
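# Editor's sketch (not in the original): a rough way to see how sparsity
# reacts to the penalty is to count non-zero off-diagonal entries of the
# estimated precision matrix across the same grid of lambda values.
# sapply(c(0, 5, 20, 50), function(rho) {
#   wi <- glasso::glasso(
#     Ssachs, rho = rho, penalize.diagonal = FALSE, approx = FALSE)$wi
#   sum(wi[upper.tri(wi)] != 0)
# })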