## ----setup, include=FALSE------------------------------------------------ # General purpose packages (data handling, pretty plotting, ...) library(tidyverse) library(latex2exp) # Latex in ggplot2 labels cbPalette <- c( "#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette # Packages for actual computation # library(MASS) # Contains mvrnorm to simulate multivariate normal # # Use as MASS::mvrnorm is recommended since MASS::select # # clashes with dplyr::select # library(mclust) # Used for GMM clustering # library(mda) # Used for Mixture DA classification # library(dbscan) # Used for density-based clustering ## ----gmm-clust-example, fig.width=3, fig.height = 3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- set.seed(2025) n <- 400 R <- matrix( c(cos(pi / 4), -sin(pi / 4), sin(pi / 4), cos(pi / 4)), ncol = 2) D1 <- diag(c(0.8, 0.2)) D2 <- diag(c(1.8, 0.6)) Sigma1 <- t(R) %*% D1 %*% D1 %*% R Sigma2 <- R %*% D2 %*% D2 %*% t(R) mu <- matrix(c(-2, 0, 0, 0, 2, 0), ncol = 2, byrow = T) c1 <- MASS::mvrnorm(n, mu[1,], Sigma1) c2 <- MASS::mvrnorm(n, mu[2,], Sigma2) c3 <- MASS::mvrnorm(n, mu[3,], Sigma1) mat_gmm_clust <- cbind(rbind(c1, c2, c3), rep(c(1, 2, 3), each = n)) colnames(mat_gmm_clust) <- c("x", "y", "cluster") data_gmm_clust <- as_tibble(mat_gmm_clust) %>% mutate(cluster = as.factor(cluster)) library(mclust) gmm_clust <- Mclust(data_gmm_clust[,1:2], 3) factoextra::fviz_mclust( gmm_clust, geom = "point", legend = "none", main = NULL) + scale_x_continuous(name = NULL, breaks = NULL) + scale_y_continuous(name = NULL, breaks = NULL) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + scale_fill_manual(values = cbPalette[-1], guide = FALSE) + theme( plot.subtitle = element_blank(), plot.background = element_rect(fill = "transparent")) ## ----mda-example, fig.width=5, fig.height = 2.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- # Example from # https://www.r-bloggers.com/a-brief-look-at-mixture-discriminant-analysis/ set.seed(20128) n <- 500 # Randomly sample data x11 <- MASS::mvrnorm(n, c(-4, -4), diag(c(1, 1))) x12 <- MASS::mvrnorm(n, c(0, 4), diag(c(1, 1))) x13 <- MASS::mvrnorm(n, c(4, -4), diag(c(1, 1))) x21 <- MASS::mvrnorm(n, c(-4, 4), diag(c(1, 1))) x22 <- MASS::mvrnorm(n, c(4, 4), diag(c(1, 1))) x23 <- MASS::mvrnorm(n, c(0, 0), diag(c(1, 1))) x31 <- MASS::mvrnorm(n, c(-4, 0), diag(c(1, 1))) x32 <- MASS::mvrnorm(n, c(0, -4), diag(c(1, 1))) x33 <- MASS::mvrnorm(n, c(4, 0), diag(c(1, 1))) x <- rbind(x11, x12, x13, x21, x22, x23, x31, x32, x33) train_data <- data.frame(x, y = gl(3, 3 * n)) # Train classifiers lda_out <- MASS::lda(y ~ ., data = train_data) qda_out <- MASS::qda(y ~ ., data = train_data) mda_out <- mda::mda(y ~ ., data = train_data, subclasses = 3) # Generates test data that will be used to generate the decision # boundaries via contours contour_data <- expand.grid(X1 = seq(-8, 8, length = 300), X2 = seq(-8, 8, length = 300)) # Classifies the test data lda_predict <- data.frame(contour_data, y = as.numeric(predict(lda_out, contour_data)$class)) qda_predict <- data.frame(contour_data, y = as.numeric(predict(qda_out, contour_data)$class)) mda_predict <- data.frame(contour_data, y = as.numeric(predict(mda_out, contour_data))) # Generates plots p <- ggplot(train_data, aes(x = X1, y = X2, color = y)) + geom_point(size = 0.5) + scale_x_continuous(name = NULL, breaks = NULL, limits = c(-8, 8)) + scale_y_continuous(name = NULL, breaks = NULL, limits = c(-8, 8)) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + theme_minimal() + theme( plot.title = element_text(size = 8)) + coord_fixed() p1 <- p + stat_contour( aes(x = X1, y = X2, z = y), data = lda_predict, colour = "black") + ggtitle("LDA Decision Boundaries") p2 <- p + stat_contour( aes(x = X1, y = X2, z = y), data = qda_predict, colour = "black") + ggtitle("QDA Decision Boundaries") p3 <- p + stat_contour( aes(x = X1, y = X2, z = y), data = mda_predict, colour = "black") + ggtitle("MDA Decision Boundaries") ggpubr::ggarrange( p1, p2, p3, ncol = 3) ## ----db-clustering-setup, echo=FALSE------------------------------------- set.seed(2019) n <- 250 moon1 <- tibble( r = rnorm(n, 3, .25), theta = runif(n, 0, pi), x = r * cos(theta), y = -1 + r * sin(theta)) %>% select(x, y) moon2 <- tibble( r = rnorm(n, 3, .25), theta = runif(n, pi, 2 * pi), x = 3 + r * cos(theta), y = 1 + r * sin(theta)) %>% select(x, y) data_moons <- bind_rows(moon1, moon2) pred_km <- kmeans(data_moons, 2)$cluster p1 <- ggplot( bind_cols(data_moons, tibble(pred = as.factor(pred_km))), aes(x = x, y = y, colour = pred)) + geom_point(size = 0.6) + scale_x_continuous(name = NULL, breaks = NULL) + scale_y_continuous(name = NULL, breaks = NULL) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + coord_fixed() + theme_void() + theme( axis.title = element_text(size = 7), axis.text = element_text(size = 7), plot.title = element_text(size = 8)) + ggtitle("k-means") hc_moons <- hclust(dist(data_moons), method = "single") pred_hc_moons <- cutree(hc_moons, k = 2) p2 <- ggplot( bind_cols(data_moons, tibble(pred = as.factor(pred_hc_moons))), aes(x = x, y = y, colour = pred)) + geom_point(size = 0.6) + scale_x_continuous(name = NULL, breaks = NULL) + scale_y_continuous(name = NULL, breaks = NULL) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + coord_fixed() + theme_void() + theme( axis.title = element_text(size = 7), axis.text = element_text(size = 7), plot.title = element_text(size = 8)) + ggtitle("Single linkage") n_signal <- 200 n_noise <- 120 mu <- matrix(c(-1, 2, 2, 0, -2, -1.5), ncol = 2, byrow = TRUE) Sigma <- diag(c(0.2, 0.2)) data_ellipsoids <- do.call( rbind, lapply(1:3, function(i) MASS::mvrnorm(n_signal, mu[i,], Sigma))) colnames(data_ellipsoids) <- c("x", "y") data_noisy_ellipsoids <- bind_rows( as_tibble(data_ellipsoids), tibble(x = runif(n_noise, -5, 5), y = runif(n_noise, -5, 5))) pred_km_noisy_ell <- kmeans(data_noisy_ellipsoids, 3)$cluster p3 <- ggplot( bind_cols( data_noisy_ellipsoids, tibble(pred = as.factor(pred_km_noisy_ell))), aes(x = x, y = y, colour = pred)) + geom_point(size = 0.6) + scale_x_continuous(name = NULL, breaks = NULL) + scale_y_continuous(name = NULL, breaks = NULL) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + theme_void() + theme( axis.title = element_text(size = 7), axis.text = element_text(size = 7), plot.title = element_text(size = 8)) + coord_fixed() + ggtitle("k-means") pred_dbscan_moons <- dbscan::dbscan(data_moons, eps = 0.4, minPts = 5) p4 <- ggplot( bind_cols(data_moons, tibble(pred = as.factor(pred_dbscan_moons$cluster))), aes(x = x, y = y, colour = pred)) + geom_point(size = 0.6) + scale_x_continuous(name = NULL, breaks = NULL) + scale_y_continuous(name = NULL, breaks = NULL) + scale_colour_manual(values = cbPalette, guide = FALSE) + coord_fixed() + theme_void() + theme( axis.title = element_text(size = 7), axis.text = element_text(size = 7), plot.title = element_text(size = 8)) + ggtitle(TeX("DBSCAN ($\\epsilon = 0.4$, $n_{\\min} = 5$)")) pred_dbscan_noisy_ell <- dbscan::dbscan( data_noisy_ellipsoids, eps = 0.5, minPts = 5) p5 <- ggplot( bind_cols( data_noisy_ellipsoids, tibble(pred = as.factor(pred_dbscan_noisy_ell$cluster))), aes(x = x, y = y, colour = pred)) + geom_point(size = 0.6) + scale_x_continuous(name = NULL, breaks = NULL) + scale_y_continuous(name = NULL, breaks = NULL) + scale_colour_manual(values = cbPalette, guide = FALSE) + coord_fixed() + theme_void() + theme( axis.title = element_text(size = 7), axis.text = element_text(size = 7), plot.title = element_text(size = 8)) + ggtitle(TeX("DBSCAN ($\\epsilon = 0.4$, $n_{\\min} = 5$)")) ## ----kmeans-single-linkage-moons, fig.width=2, fig.height = 2.8, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- ggpubr::ggarrange(p1, p2, nrow = 2) ## ----kmeans-noisy, fig.width=2, fig.height = 2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- p3 ## ----dbscan-eps, fig.width=2, fig.height = 2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- par(mar = c(4,4,1,1), cex = 0.5) dbscan::kNNdistplot(data_moons, 4) ## ----dbscan-moons, fig.width=2, fig.height = 1.4, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- p4 ## ----dbscan-noisy, fig.width=2, fig.height = 2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- p5