## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp) # Latex in ggplot2 labels
cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette

# Packages for actual computation
# library(MASS)    # Contains mvrnorm to simulate multivariate normal
#                  # Use as MASS::mvrnorm is recommended since MASS::select
#                  # clashes with dplyr::select
# library(RCurl)   # Helps to retrieve online datasets
# library(cluster) # Provides clustering algorithms and tools for analysis
#                  # (e.g silhouette width)
library(dendextend)  # Many more possibilities for the customization of
                     # dendrograms


## ----retrieve-wine-dataset, echo=FALSE-----------------------------------
# Retrieve Wine dataset
data_wine <- read_csv(RCurl::getURL(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"),
  col_names = c(
    "Wine", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium",
    "Total phenols", "Flavanoids", "Nonflavanoid phenols",
    "Proanthocyanins", "Color intensity", "Hue",
    "OD280/OD315 of diluted wines", "Proline"),
  col_types = cols(
    .default = col_double(),
    Wine = col_factor())
  )


## ----kmeans-spherical, fig.width=3, fig.height = 3.2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
# Example borrowed from http://varianceexplained.org/r/kmeans-free-lunch/
set.seed(2019)
n <- 250
c1 <- tibble(x = rnorm(n), y = rnorm(n), cluster = 1)
c2 <- tibble(r = rnorm(n, 5, .25), theta = runif(n, 0, 2 * pi),
                 x = r * cos(theta), y = r * sin(theta), cluster = 2) %>%
    select(x, y, cluster)

data_plot <- bind_rows(c1, c2) %>%
  mutate(cluster = as.factor(cluster))

p1 <- ggplot(
  as_tibble(data_plot %>% select(x, y) %>% scale),
  aes(x = x, y = y)) +
  geom_point(size = 0.5) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 7)) +
  ggtitle("Simulated")

# Cluster on data as is
clust <- kmeans(scale(data_plot %>% select(x, y)), 2)
data_res_plot <- bind_cols(data_plot, tibble(pred = as.factor(clust$cluster)))

p2 <- ggplot(data_res_plot, aes(x = x, y = y)) +
  geom_point(aes(colour = pred), size = 0.5) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 7)) +
  ggtitle("k-means directly on data")

# Transform to polar coordinates
data_polar_plot <- data_plot %>%
  mutate(
    r = sqrt(x ^ 2 + y ^ 2),
    theta = atan2(y, x))

p3 <- ggplot(
  as_tibble(data_polar_plot %>% select(r, theta) %>% scale),
  aes(x = r, y = theta)) +
  geom_point(size = 0.5) +
  scale_y_continuous(TeX("$\\theta$")) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 7)) +
  ggtitle("Polar-coordinates")

# Cluster on polar coordinates
clust_polar <- kmeans(
  scale(data_polar_plot[,c("r", "theta")]),
  centers = matrix(c(-1, 1, 0, 0), ncol = 2))
data_polar_res_plot <- bind_cols(
  data_polar_plot, tibble(pred = as.factor(clust_polar$cluster)))

p4 <- ggplot(
  data_polar_res_plot %>%
    mutate_at(vars(-pred, -cluster), function(x) as.numeric(scale(x))),
  aes(x = x, y = y)) +
  geom_point(aes(colour = pred), size = 0.5) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 7),
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 7)) +
  ggtitle("k-means on polar coord")

ggpubr::ggarrange(
  p1, p2, p3, p4, ncol = 2, nrow = 2)


## ----number-of-clusters, fig.width=4, fig.height = 1.6, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
# Extract features and classes
X <- as.matrix(data_wine[,-1])
classes <- as_vector(data_wine[,1]) %>% unname

# PCA on features
X_svd <- svd(scale(X))
X_proj <- scale(X) %*% X_svd$v[,1:2]

data_plot <- tibble(
  wine = classes,
  PC1 = X_proj[,1],
  PC2 = X_proj[,2])

p1 <- ggplot(data_plot) +
  geom_point(aes(x = PC1, y = PC2, colour = wine), size = 0.5) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 10),
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7)) +
  ggtitle("Actual classes")

# Run kmeans repeatedly for different number of clusters but also to
# capture variability with different staring centres
B <- 200
Kmax <- 10
within_scatter <- lapply(1:Kmax, function(K) {
  sapply(1:B, function(i) {
    kmeans(scale(X_proj), centers = K)$tot.withinss
  })
}) %>% do.call(rbind, .) %>%
  cbind(1:Kmax, .)
colnames(within_scatter) <- c("K", sprintf("rep%d", 1:B))

data_scatter_plot <- as_tibble(within_scatter) %>%
  gather(rep, value, -K) %>%
  mutate(rep = as.integer(str_replace(rep, "rep", "")))

p2 <- ggplot(data_scatter_plot) +
  geom_jitter(aes(x = K, y = value), size = 0.1, height = 25, width = 0.25) +
  scale_x_continuous(breaks = 1:10) +
  scale_y_continuous(TeX("$W(C)$")) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 10),
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 7)) +
  ggtitle("Within cluster scatter")

ggpubr::ggarrange(
  p1, p2, ncol = 2, widths = c(1, 1.2))


## ----silhouette-width, fig.width=4, fig.height = 1.6, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
# PAM like this uses the euclidean metric by default, i.e. k-means
# Use l1 norm here instead
clust <- cluster::pam(scale(X), 3, metric = "manhattan")
s <- cluster::silhouette(clust)

data_plot <- tibble(
  obs = as.factor(rownames(s)),
  cluster = as.factor(s[,1]),
  sw = s[,3]) %>%   # Silhouette width
  arrange(cluster, sw) %>%
  mutate(obs = factor(obs, levels = obs))

p1 <- ggplot(data_plot) +
  geom_bar(aes(x = obs, y = sw, fill = cluster), stat = "identity") +
  coord_flip() +
  scale_x_discrete("Observation") +
  scale_y_continuous(
    "Silhouette Width", breaks = seq(-1, 1, by = 0.2)) +
  scale_fill_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 7),
    axis.title = element_text(size = 7))

# Average silhouette width
Kmax <- 10
avg_sws <- sapply(2:Kmax, function(K) {
  mean(cluster::silhouette(
    cluster::pam(scale(X), K, metric = "manhattan"))[,3])
})

data_plot <- tibble(
  K = 2:Kmax,
  avg_sw = avg_sws)

p2 <- ggplot(data_plot) +
  geom_line(aes(x = K, y = avg_sw)) +
  scale_y_continuous("Avg. Silhouette Width") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 7))

ggpubr::ggarrange(
  p1, p2, ncol = 2, widths = c(1, 1.2))


## ----iris-dendro-complete, fig.width=5, fig.height = 1.8, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
# Try hierarchical clustering on wine dataset
D <- dist(scale(iris[,1:4]))
hc_complete <- hclust(D, method = "complete")

plot_dendro <- function(hc, title) {
  dend <- as.dendrogram(hc)
  labels(dend) <- NULL   # The functionality to customize the dendrograms is
  dend <- dend %>%       # available in package dendextend
    set("leaves_pch", 19) %>%
    set("leaves_cex", 0.25) %>%
    set("leaves_col", cbPalette[
    as.numeric(iris[order.dendrogram(dend), 5]) + 1])
  par(mar=c(.5,4,1,.5), bg = "transparent",
    cex.lab = 0.8, cex.axis = 0.8, cex.main = 0.8)
  plot(dend, ylab = "Height", main = title)
  abline(h = 6, lty = 5)
  abline(h = 5, lty = 2)
}

plot_dendro(hc_complete, "Complete Linkage")


## ----iris-dendro-average-single, fig.width=5, fig.height = 1.8, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
hc_average <- hclust(D, method = "average")
hc_single <- hclust(D, method = "single")

plot_dendro(hc_average, "Average Linkage")
plot_dendro(hc_single, "Single Linkage")