## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(grid)
library(latex2exp) # Latex in ggplot2 labels

cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette

theme_set(theme_minimal() + theme(
  axis.title = element_text(size = 7),
  axis.text = element_text(size = 7),
  plot.title = element_text(size = 8),
  plot.subtitle = element_text(size = 7),
  legend.text = element_text(size = 7),
  legend.title = element_text(size = 7))) # Set a less verbose standard theme

# Packages for actual computation
library(subspace)   # Kind of an annoying package. Hard to install and has to
                    # be loaded globally. However, this seems to be the only
                    # reasonable package for subspace clustering in R
library(ggraph)     # Used for plotting of graphs
library(data.table) # Used for reading data from an URL, but this is actually
                    # a very good implementation of data frames that is highly
                    # optimized for large scale computation on a single machine

## ----subspace-clustering, fig.width=3.5, fig.height=3.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
data(subspace_dataset)
GGally::ggpairs(
  data = subspace_dataset,
  upper = list(continuous = GGally::wrap("points", size = 0.05)),
  lower = list(continuous = "blank"),
  diag = list(continuous = "densityDiag"),
  axisLabels = "none") +
  theme_minimal() +
  theme(strip.text = element_text(size = 7))

## ----clique-example, fig.width=3.5, fig.height=3.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
fit_clique <- CLIQUE(subspace_dataset, xi = 40, tau = 0.06)
cl_data <- bind_cols(
  as_tibble(subspace_dataset),
  cluster = rep(NA, dim(subspace_dataset)[1]))
for (i in 1:length(fit_clique)) {
  cl <- fit_clique[[i]]
  cl_data[cl$objects, "cluster"] <- i
}
GGally::ggpairs(
  data = cl_data,
  mapping = aes(colour = factor(cluster, exclude = NULL)),
  columns = 1:5,
  upper = list(continuous = GGally::wrap("points", size = 0.05)),
  lower = list(continuous = "blank"),
  diag = list(continuous = "densityDiag"),
  axisLabels = "none") +
  theme_minimal() +
  theme(strip.text = element_text(size = 7))

## ----proclus-example, fig.width=3.5, fig.height=3.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
fit_proclus <- ProClus(subspace_dataset, k = 12, d = 2.5)
cl_data <- bind_cols(
  as_tibble(subspace_dataset),
  cluster = rep(NA, dim(subspace_dataset)[1]))
for (i in 1:length(fit_proclus)) {
  cl <- fit_proclus[[i]]
  cl_data[cl$objects, "cluster"] <- i
}
GGally::ggpairs(
  data = cl_data,
  mapping = aes(colour = factor(cluster, exclude = NULL)),
  columns = 1:5,
  upper = list(continuous = GGally::wrap("points", size = 0.05)),
  lower = list(continuous = "blank"),
  diag = list(continuous = "densityDiag"),
  axisLabels = "none") +
  theme_minimal() +
  theme(strip.text = element_text(size = 7))

## ----intro-sc, fig.width=4.5, fig.height=1.6, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
# A simpler example for visualisation
X <- matrix(
  c(-1.5, -1, -1, 1, 1, 1.5,
    0, 0.25, -0.25, 0.25, -0.25, 0),
  ncol = 2) +
  matrix(rnorm(12, sd = 0.05), ncol = 2) # Random perturbation
colnames(X) <- c("x", "y")
n <- nrow(X)

# Squared Euclidean distances via ||x_i||^2 + ||x_j||^2 - 2 <x_i, x_j>
D1 <- matrix(rep.int(rowSums(X ^ 2), n), ncol = n, byrow = TRUE)
D <- D1 + t(D1) - 2 * X %*% t(X)
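# Quick sanity check (editor's addition, not in the original): the vectorised
# expansion above should agree with base R's dist(), which computes the same
# pairwise Euclidean distances directly.
stopifnot(all.equal(D, as.matrix(dist(X)) ^ 2, check.attributes = FALSE))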
c <- 0.16
W <- exp(-D / c)
# No entries on diagonal since points have no loop to themselves
diag(W) <- 0

p_weights <- tibble(
  y = rep(n + 1 - 1:n, n),
  x = rep(1:n, each = n),
  w = as.vector(W)) %>%
  ggplot(aes(x, y)) +
  geom_tile(aes(fill = w), colour = cbPalette[1]) +
  scale_fill_gradient2(
    low = "red", mid = "white", high = cbPalette[3],
    guide = guide_colourbar(barheight = unit(0.2, "cm"))) +
  coord_fixed() +
  scale_x_continuous(name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_y_continuous(name = NULL, breaks = NULL, expand = c(0, 0)) +
  ggtitle("Graph edge weights")

data_lines <- tibble(
  x = X[rep(1:n, n), 1],
  y = X[rep(1:n, n), 2],
  xend = X[rep(1:n, each = n), 1],
  yend = X[rep(1:n, each = n), 2],
  w = as.vector(W))

p_graph <- ggplot(as_tibble(X), aes(x, y)) +
  geom_segment(
    aes(xend = xend, yend = yend, alpha = w),
    data = data_lines) +
  geom_point(size = 0.8) +
  scale_x_continuous(name = NULL, breaks = NULL, limits = c(-2, 2)) +
  scale_y_continuous(name = NULL, breaks = NULL, limits = c(-2, 2)) +
  scale_alpha_continuous(guide = FALSE) +
  coord_fixed() +
  ggtitle("Raw data and graph")

# Using a random-walk normalized Laplacian
L <- diag(rep(1, n)) - diag(1 / rowSums(W)) %*% W
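# Editor's check (not in the original): by construction D^{-1} W is
# row-stochastic, i.e. each row of the random-walk transition matrix sums to
# one, so the rows of L sum to zero. Cheap to verify:
stopifnot(all.equal(rowSums(diag(1 / rowSums(W)) %*% W), rep(1, n)))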
# Calculate eigenvalues and eigenvectors
L_eigen <- eigen(L)
# Determine near-zero indices
idx <- which(abs(L_eigen$values) <= sqrt(.Machine$double.eps))
# And select only the corresponding eigenvectors
V <- L_eigen$vectors[, idx, drop = FALSE]
# These are only non-zero for the k-th component and should therefore
# lead to a perfect clustering.
# However, I found randomly initialised kmeans to be quite unstable
# with respect to clustering outcome. Since we know that theoretically each
# eigenvector is only non-zero wherever the different components are,
# we can choose one example from each component and start kmeans there.
# In my experiments, this turned out to be much more stable.
init_cl <- diag(apply(V, 2, function(x) {
  idx <- which.max(abs(x))
  x[idx]
}), nrow = length(idx))
cl <- kmeans(V, init_cl)
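# A more conventional alternative (editor's sketch, not used for the figures):
# many random restarts also tame the instability, at the cost of determinism.
# cl <- kmeans(V, centers = length(idx), nstart = 50)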
# Check clustering vs ground-truth
p_cluster <- tibble(
  pred = cl$cluster,
  x = X[, 1],
  y = X[, 2],
  cluster = c(1, 1, 1, 2, 2, 2)) %>%
  ggplot(aes(x, y)) +
  geom_point(
    aes(colour = as.factor(pred), shape = as.factor(cluster)),
    size = 0.8) +
  scale_x_continuous(name = NULL, breaks = NULL, limits = c(-2, 2)) +
  scale_y_continuous(name = NULL, breaks = NULL, limits = c(-2, 2)) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  scale_shape_discrete(guide = FALSE) +
  coord_fixed() +
  ggtitle("Clustering result")

ggpubr::ggarrange(
  p_graph, p_weights, p_cluster,
  ncol = 3, common.legend = TRUE, legend = "bottom")

## ----spectral-clustering, fig.width=3.5, fig.height=1.6, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
# Three noisy concentric circles with different radii and sizes
clusters <- do.call(
  bind_rows,
  lapply(list(c(1, 1, 100), c(2, 3, 150), c(3, 5, 250)), function(x) {
    tibble(
      cluster = rep(x[1], x[3]),
      r = rnorm(x[3], x[2], 0.1),
      theta = runif(x[3], 0, 2 * pi),
      x = r * cos(theta),
      y = r * sin(theta)) %>%
      select(cluster, x, y)
  }))

# Visualise the data
p_data <- ggplot(clusters, aes(x, y)) +
  geom_point(size = 0.6) +
  scale_x_continuous(name = NULL, breaks = NULL) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  coord_fixed() +
  ggtitle("Raw data (n = 500)")

X <- as.matrix(clusters[, c("x", "y")])
n <- nrow(X)

# Calculate the distance matrix
D1 <- matrix(rep.int(rowSums(X ^ 2), n), ncol = n, byrow = TRUE)
D <- D1 + t(D1) - 2 * X %*% t(X)
c <- 0.25
W <- exp(-D / (c ^ 2))
# No entries on diagonal since points do not have a loop to themselves
diag(W) <- 0

# Calculate the (unnormalised) Laplacian
L <- diag(rowSums(W)) - W
L_eigen <- eigen(L)
# Determine near-zero indices
idx <- which(abs(L_eigen$values) <= sqrt(.Machine$double.eps))
# And select only the corresponding eigenvectors.
# Scale to get entries into a reasonable range. Not necessarily needed for
# every application, but doesn't hurt either
V <- scale(L_eigen$vectors[, idx])
# These are only non-zero for the k-th component and should therefore
# lead to a perfect clustering.
# Initialisation is done as described above
init_cl <- diag(apply(V, 2, function(x) {
  idx <- which.max(abs(x))
  x[idx]
}))
cl <- kmeans(V, init_cl)
# # Clustering was even stable when initial vectors were randomly perturbed
# cl <- kmeans(V, init_cl + matrix(rnorm(9, sd = 4), ncol = 3))

# Check clustering vs ground-truth
p_cluster <- bind_cols(pred = cl$cluster, clusters) %>%
  ggplot(aes(x, y)) +
  geom_point(
    aes(colour = as.factor(pred), shape = as.factor(cluster)),
    size = 0.6) +
  scale_x_continuous(name = NULL, breaks = NULL) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  scale_shape_discrete(guide = FALSE) +
  coord_fixed() +
  ggtitle("Spectral clustering")

# # Using a random-walk normalized Laplacian
# L <- diag(rep(1, n)) - diag(1 / rowSums(W)) %*% W
# # Calculate eigenvalues and eigenvectors
# L_eigen <- eigen(L)
# # Determine near-zero indices
# idx <- which(abs(L_eigen$values) <= sqrt(.Machine$double.eps))
# # And select only the corresponding eigenvectors
# V <- L_eigen$vectors[, idx]
# # These are only non-zero for the k-th component and should therefore
# # lead to a perfect clustering
# # Initialisation is done as described above
# init_cl <- diag(apply(V, 2, function(x) {
#   idx <- which.max(abs(x))
#   x[idx]
# }))
# cl <- kmeans(V, init_cl)
# # Check clustering vs ground-truth
# bind_cols(pred = cl$cluster, clusters) %>%
#   ggplot(aes(x, y)) +
#   geom_point(aes(colour = as.factor(pred), shape = as.factor(cluster)))

ggpubr::ggarrange(p_data, p_cluster, ncol = 2)
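# For contrast (editor's sketch, not part of the original figures): k-means on
# the raw coordinates cannot separate concentric rings, since its clusters are
# convex regions of the input space.
# cl_raw <- kmeans(X, centers = 3, nstart = 20)
# table(cl_raw$cluster, clusters$cluster)  # each k-means cluster mixes rings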
## ----leim-pre, echo=FALSE------------------------------------------------
# Keep the drawing device open for knitr to draw everything into one
# plot and not create one for each step
knitr::opts_knit$set(global.device = TRUE)

## ----laplacian-dimred, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, fig.show="hide"----
define_region <- function(row, col) {
  viewport(layout.pos.row = row, layout.pos.col = col)
}

X <- t(dimRed::loadDataSet("3D S Curve")@data)
# X <- t(dimRed::loadDataSet("Broken Swiss Roll")@data)

grid.newpage()
layout <- grid.layout(1, 3)
pushViewport(
  viewport(
    layout = layout,
    height = unit(1.75, "in"),
    width = unit(4.5, "in")))
# # Show viewport for easier manual scaling
# grid.rect(gp = gpar(fill = "green", alpha = 0.5))

pfun <- function() {
  par(mar = c(0, 1, 1, 1), cex = 0.7)
  plot3D::scatter3D(
    X[1, ], X[2, ], X[3, ],
    colvar = X[3, ],
    phi = 20, theta = 25,
    pch = 16, cex = 0.5,
    colkey = FALSE, bty = "b",
    main = paste0("S curve (n = ", ncol(X), ")"))
}
pushViewport(define_region(1, 1))
gridGraphics::grid.echo(pfun, newpage = FALSE)
grid.edit(
  "graphics-plot-2-main-1",
  x = unit(.35, "npc"), y = unit(1.0125, "npc"),
  gp = gpar(fontsize = 8, cex = 1, fontface = "plain"))
grid.edit(
  "graphics-plot-2-persp-xlab-1",
  gp = gpar(fontsize = 7, cex = 1))
grid.edit(
  "graphics-plot-2-persp-ylab-1",
  gp = gpar(fontsize = 7, cex = 1))
grid.edit(
  "graphics-plot-2-persp-zlab-1",
  gp = gpar(fontsize = 7, cex = 1))

# Return back up and start on the second plot
upViewport()
leim <- dimRed::embed(
  t(X),
  "LaplacianEigenmaps",
  sparse = "knn",
  knn = 6,
  t = Inf,
  norm = TRUE)
data_leim <- bind_cols(
  as_tibble(leim@data@data),
  colour = X[3, ])
p_leim <- ggplot(data_leim, aes(LEIM1, LEIM2)) +
  geom_point(aes(colour = colour), size = 0.2) +
  scale_colour_gradientn(
    colours = plot3D::jet.col(10, alpha = 1),
    guide = FALSE) +
  labs(
    title = "Laplacian Eigenmaps",
    subtitle = TeX("knn = 6, c = $\\infty$")) +
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(breaks = NULL)
print(p_leim, vp = define_region(row = 1, col = 2))

# Printing does not enter the viewport, so we do not need to go back.
# Start on the third plot
isomap <- dimRed::embed(
  t(X),
  "Isomap",
  knn = 6)
data_isomap <- bind_cols(
  as_tibble(isomap@data@data) %>%
    rename(ISO1 = `iso 1`, ISO2 = `iso 2`),
  colour = X[3, ])
p_isomap <- ggplot(data_isomap, aes(ISO1, ISO2)) +
  geom_point(aes(colour = colour), size = 0.2) +
  scale_colour_gradientn(
    colours = plot3D::jet.col(10, alpha = 1),
    guide = FALSE) +
  labs(title = "Isomap", subtitle = "knn = 6") +
  scale_x_continuous(breaks = NULL) +
  scale_y_continuous(breaks = NULL)
print(p_isomap, vp = define_region(row = 1, col = 3))
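# Editor's sketch (not in the original): dimRed ships quality criteria that
# would make the two embeddings quantitatively comparable, e.g.
# dimRed::quality(leim, "Q_local")
# dimRed::quality(isomap, "Q_local")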
## ----leim-post, fig.width=4.5, fig.height=1.75, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
# Turn the global device off again
knitr::opts_knit$set(global.device = FALSE)

## ----glasso, fig.width=4.5, fig.height=1.8, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
# Given an empirical covariance matrix
s <- c(10, 1, 5, 4, 10, 2, 6, 10, 3, 10)
S <- matrix(0, nrow = 4, ncol = 4)
S[row(S) >= col(S)] <- s
S <- S + t(S)
diag(S) <- 10

plot_graph_from_prec <- function(
  O, ns = 1:dim(O)[1], mult = 0.2, size = 4, horizontal = TRUE) {
  # Partial correlations from the precision matrix; note the sign flip, since
  # rho_ij = -omega_ij / sqrt(omega_ii * omega_jj)
  W <- -diag(1 / sqrt(diag(O))) %*% O %*% diag(1 / sqrt(diag(O)))
  # Nodes should not have connections with themselves
  diag(W) <- 0
  # Graph representation
  g <- igraph::graph_from_adjacency_matrix(
    W, mode = "undirected", weighted = TRUE)
  igraph::V(g)$name <- ns
  # Plot the graph and adjust colourbar (very wide otherwise)
  if (horizontal) {
    cb <- guide_edge_colourbar(barheight = unit(0.1, "cm"))
  } else {
    cb <- guide_edge_colourbar(barwidth = unit(0.1, "cm"))
  }
  ggraph(g, layout = "linear", circular = TRUE) +
    geom_edge_link(aes(colour = weight)) +
    geom_node_label(aes(label = name), size = size) +
    scale_x_continuous(name = NULL, breaks = NULL,
                       expand = expand_scale(mult = mult)) +
    scale_y_continuous(name = NULL, breaks = NULL,
                       expand = expand_scale(mult = mult)) +
    scale_edge_colour_gradient2(
      name = ifelse(horizontal, "Partial correlation", "Partial\ncorrelation"),
      low = "red", mid = "white", high = "blue",
      limits = c(-1, 1),
      guide = cb) +
    coord_fixed() +
    theme(legend.position = "bottom")
}

# Here we can compute the precision matrix, and with it the partial
# correlations, directly
O <- solve(S)
p_g1 <- plot_graph_from_prec(O, mult = 0.25, size = 2) +
  labs(title = "Direct estimate", subtitle = "")

# Assume we know the graph structure
zero <- matrix(c(1, 3, 2, 4), ncol = 2, byrow = TRUE)
# Estimate the unpenalised MLE for the covariance and precision matrices,
# constrained to the known zero pattern
fit_glasso <- glasso::glasso(S, 0, zero = zero)
p_g2 <- plot_graph_from_prec(fit_glasso$wi, mult = 0.25, size = 2) +
  labs(title = "Known graph structure", subtitle = "")

# # Estimate the lasso regularised MLE for the covariance and precision
# # matrices for an unknown network structure
# glassoFast::glassoFast(S, 1.5) # Alternative package
# glasso::glasso(S, 1.5)
fit_glasso <- glasso::glasso(
  S, rho = 1, penalize.diagonal = FALSE, approx = FALSE)
p_g3 <- plot_graph_from_prec(fit_glasso$wi, mult = 0.25, size = 2) +
  labs(
    title = "Estimated graph structure",
    subtitle = TeX("$\\lambda = 1$"))

ggpubr::ggarrange(
  p_g1, p_g2, p_g3,
  common.legend = TRUE, legend = "bottom", ncol = 3)

## ----glasso-larger, fig.width=3.5, fig.height=3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
# sachs <- fread("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/sachs.data")
Ssachs <- as.matrix(fread(
  "https://web.stanford.edu/~hastie/ElemStatLearn/datasets/sachs.covmatrix"))
# Scaled covariance matrix, var(sachs) / 1000
sachsNames <- c(
  "praf", "pmek", "plcg", "PIP2", "PIP3", "p44/42",
  "pakts473", "PKA", "PKC", "P38", "pjnk")
# names(sachs) <- sachsNames
# colnames(Ssachs) <- sachsNames
# rownames(Ssachs) <- sachsNames

fit_glasso <- glasso::glasso(
  Ssachs, rho = 0, penalize.diagonal = FALSE, approx = FALSE)
p0 <- plot_graph_from_prec(
  fit_glasso$wi, ns = sachsNames, mult = 0.1, size = 1,
  horizontal = FALSE) +
  labs(title = TeX("$\\lambda = 0$"))

fit_glasso <- glasso::glasso(
  Ssachs, rho = 5, penalize.diagonal = FALSE, approx = FALSE)
p1 <- plot_graph_from_prec(
  fit_glasso$wi, ns = sachsNames, mult = 0.1, size = 1,
  horizontal = FALSE) +
  labs(title = TeX("$\\lambda = 5$"))

fit_glasso <- glasso::glasso(
  Ssachs, rho = 20, penalize.diagonal = FALSE, approx = FALSE)
p2 <- plot_graph_from_prec(
  fit_glasso$wi, ns = sachsNames, mult = 0.1, size = 1,
  horizontal = FALSE) +
  labs(title = TeX("$\\lambda = 20$"))

fit_glasso <- glasso::glasso(
  Ssachs, rho = 50, penalize.diagonal = FALSE, approx = FALSE)
p3 <- plot_graph_from_prec(
  fit_glasso$wi, ns = sachsNames, mult = 0.1, size = 1,
  horizontal = FALSE) +
  labs(title = TeX("$\\lambda = 50$"))

ggpubr::ggarrange(
  p0, p1, p2, p3,
  ncol = 2, nrow = 2, common.legend = TRUE, legend = "right")
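# Editor's sketch (not in the original): a rough way to see how sparsity
# reacts to the penalty is to count non-zero off-diagonal entries of the
# estimated precision matrix across the same grid of lambda values.
# sapply(c(0, 5, 20, 50), function(rho) {
#   wi <- glasso::glasso(
#     Ssachs, rho = rho, penalize.diagonal = FALSE, approx = FALSE)$wi
#   sum(wi[upper.tri(wi)] != 0)
# })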