## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp) # Latex in ggplot2 labels
cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette

# Packages for actual computation
# library(MASS)    # Contains mvrnorm to simulate multivariate normal
#                  # Use as MASS::mvrnorm is recommended since MASS::select
#                  # clashes with dplyr::select
# library(mclust)  # Used for GMM clustering
# library(mda)     # Used for Mixture DA classification
# library(dbscan)  # Used for density-based clustering


## ----gmm-clust-example, fig.width=3, fig.height = 3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
set.seed(2025)
n <- 400
R <- matrix(
  c(cos(pi / 4), -sin(pi / 4), sin(pi / 4), cos(pi / 4)), ncol = 2)
D1 <- diag(c(0.8, 0.2))
D2 <- diag(c(1.8, 0.6))
Sigma1 <- t(R) %*% D1 %*% D1 %*% R
Sigma2 <- R %*% D2 %*% D2 %*% t(R)
mu <- matrix(c(-2, 0, 0, 0, 2, 0), ncol = 2, byrow = T)

c1 <- MASS::mvrnorm(n, mu[1,], Sigma1)
c2 <- MASS::mvrnorm(n, mu[2,], Sigma2)
c3 <- MASS::mvrnorm(n, mu[3,], Sigma1)

mat_gmm_clust <- cbind(rbind(c1, c2, c3), rep(c(1, 2, 3), each = n))
colnames(mat_gmm_clust) <- c("x", "y", "cluster")

data_gmm_clust <- as_tibble(mat_gmm_clust) %>%
  mutate(cluster = as.factor(cluster))

library(mclust)
gmm_clust <- Mclust(data_gmm_clust[,1:2], 3)
factoextra::fviz_mclust(
  gmm_clust,
  geom = "point",
  legend = "none",
  main = NULL) +
  scale_x_continuous(name = NULL, breaks = NULL) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  scale_fill_manual(values = cbPalette[-1], guide = FALSE) +
  theme(
    plot.subtitle = element_blank(),
    plot.background = element_rect(fill = "transparent"))


## ----mda-example, fig.width=5, fig.height = 2.5, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
# Example from
# https://www.r-bloggers.com/a-brief-look-at-mixture-discriminant-analysis/
set.seed(20128)
n <- 500

# Randomly sample data
x11 <- MASS::mvrnorm(n, c(-4, -4), diag(c(1, 1)))
x12 <- MASS::mvrnorm(n, c(0, 4), diag(c(1, 1)))
x13 <- MASS::mvrnorm(n, c(4, -4), diag(c(1, 1)))

x21 <- MASS::mvrnorm(n, c(-4, 4), diag(c(1, 1)))
x22 <- MASS::mvrnorm(n, c(4, 4), diag(c(1, 1)))
x23 <- MASS::mvrnorm(n, c(0, 0), diag(c(1, 1)))

x31 <- MASS::mvrnorm(n, c(-4, 0), diag(c(1, 1)))
x32 <- MASS::mvrnorm(n, c(0, -4), diag(c(1, 1)))
x33 <- MASS::mvrnorm(n, c(4, 0), diag(c(1, 1)))

x <- rbind(x11, x12, x13, x21, x22, x23, x31, x32, x33)
train_data <- data.frame(x, y = gl(3, 3 * n))

# Train classifiers
lda_out <- MASS::lda(y ~ ., data = train_data)
qda_out <- MASS::qda(y ~ ., data = train_data)
mda_out <- mda::mda(y ~ ., data = train_data, subclasses = 3)

# Generates test data that will be used to generate the decision
# boundaries via contours
contour_data <- expand.grid(X1 = seq(-8, 8, length = 300),
X2 = seq(-8, 8, length = 300))

# Classifies the test data
lda_predict <- data.frame(contour_data,
y = as.numeric(predict(lda_out, contour_data)$class))
qda_predict <- data.frame(contour_data,
y = as.numeric(predict(qda_out, contour_data)$class))
mda_predict <- data.frame(contour_data,
y = as.numeric(predict(mda_out, contour_data)))

# Generates plots
p <- ggplot(train_data, aes(x = X1, y = X2, color = y)) +
  geom_point(size = 0.5) +
  scale_x_continuous(name = NULL, breaks = NULL, limits = c(-8, 8)) +
  scale_y_continuous(name = NULL, breaks = NULL, limits = c(-8, 8)) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 8)) +
  coord_fixed()
p1 <- p +
  stat_contour(
    aes(x = X1, y = X2, z = y),
    data = lda_predict,
    colour = "black") +
  ggtitle("LDA Decision Boundaries")
p2 <- p +
  stat_contour(
    aes(x = X1, y = X2, z = y),
    data = qda_predict,
    colour = "black") +
  ggtitle("QDA Decision Boundaries")
p3 <- p +
  stat_contour(
    aes(x = X1, y = X2, z = y),
    data = mda_predict,
    colour = "black") +
  ggtitle("MDA Decision Boundaries")

ggpubr::ggarrange(
  p1, p2, p3, ncol = 3)


## ----db-clustering-setup, echo=FALSE-------------------------------------
set.seed(2019)
n <- 250
moon1 <- tibble(
  r = rnorm(n, 3, .25), theta = runif(n, 0, pi),
  x = r * cos(theta), y = -1 + r * sin(theta)) %>%
  select(x, y)
moon2 <- tibble(
  r = rnorm(n, 3, .25), theta = runif(n, pi, 2 * pi),
  x = 3 + r * cos(theta), y = 1 + r * sin(theta)) %>%
  select(x, y)

data_moons <- bind_rows(moon1, moon2)

pred_km <- kmeans(data_moons, 2)$cluster

p1 <- ggplot(
  bind_cols(data_moons, tibble(pred = as.factor(pred_km))),
  aes(x = x, y = y, colour = pred)) +
  geom_point(size = 0.6) +
  scale_x_continuous(name = NULL, breaks = NULL) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  coord_fixed() +
  theme_void() +
  theme(
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    plot.title = element_text(size = 8)) +
  ggtitle("k-means")

hc_moons <- hclust(dist(data_moons), method = "single")
pred_hc_moons <- cutree(hc_moons, k = 2)

p2 <- ggplot(
  bind_cols(data_moons, tibble(pred = as.factor(pred_hc_moons))),
  aes(x = x, y = y, colour = pred)) +
  geom_point(size = 0.6) +
  scale_x_continuous(name = NULL, breaks = NULL) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  coord_fixed() +
  theme_void() +
  theme(
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    plot.title = element_text(size = 8)) +
  ggtitle("Single linkage")

n_signal <- 200
n_noise <- 120
mu <- matrix(c(-1, 2, 2, 0, -2, -1.5), ncol = 2, byrow = TRUE)
Sigma <- diag(c(0.2, 0.2))
data_ellipsoids <- do.call(
  rbind, lapply(1:3, function(i) MASS::mvrnorm(n_signal, mu[i,], Sigma)))
colnames(data_ellipsoids) <- c("x", "y")

data_noisy_ellipsoids <- bind_rows(
  as_tibble(data_ellipsoids),
  tibble(x = runif(n_noise, -5, 5), y = runif(n_noise, -5, 5)))

pred_km_noisy_ell <- kmeans(data_noisy_ellipsoids, 3)$cluster

p3 <- ggplot(
  bind_cols(
    data_noisy_ellipsoids,
    tibble(pred = as.factor(pred_km_noisy_ell))),
  aes(x = x, y = y, colour = pred)) +
  geom_point(size = 0.6) +
  scale_x_continuous(name = NULL, breaks = NULL) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_void() +
  theme(
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    plot.title = element_text(size = 8)) +
  coord_fixed() +
  ggtitle("k-means")

pred_dbscan_moons <- dbscan::dbscan(data_moons, eps = 0.4, minPts = 5)

p4 <- ggplot(
  bind_cols(data_moons, tibble(pred = as.factor(pred_dbscan_moons$cluster))),
  aes(x = x, y = y, colour = pred)) +
  geom_point(size = 0.6) +
  scale_x_continuous(name = NULL, breaks = NULL) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  scale_colour_manual(values = cbPalette, guide = FALSE) +
  coord_fixed() +
  theme_void() +
  theme(
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    plot.title = element_text(size = 8)) +
  ggtitle(TeX("DBSCAN ($\\epsilon = 0.4$, $n_{\\min} = 5$)"))

pred_dbscan_noisy_ell <- dbscan::dbscan(
  data_noisy_ellipsoids, eps = 0.5, minPts = 5)

p5 <- ggplot(
  bind_cols(
    data_noisy_ellipsoids,
    tibble(pred = as.factor(pred_dbscan_noisy_ell$cluster))),
  aes(x = x, y = y, colour = pred)) +
  geom_point(size = 0.6) +
  scale_x_continuous(name = NULL, breaks = NULL) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  scale_colour_manual(values = cbPalette, guide = FALSE) +
  coord_fixed() +
  theme_void() +
  theme(
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    plot.title = element_text(size = 8)) +
  ggtitle(TeX("DBSCAN ($\\epsilon = 0.4$, $n_{\\min} = 5$)"))


## ----kmeans-single-linkage-moons, fig.width=2, fig.height = 2.8, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
ggpubr::ggarrange(p1, p2, nrow = 2)

## ----kmeans-noisy, fig.width=2, fig.height = 2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
p3


## ----dbscan-eps, fig.width=2, fig.height = 2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
par(mar = c(4,4,1,1), cex = 0.5)
dbscan::kNNdistplot(data_moons, 4)


## ----dbscan-moons, fig.width=2, fig.height = 1.4, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
p4

## ----dbscan-noisy, fig.width=2, fig.height = 2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
p5