## ----setup, include=FALSE------------------------------------------------ # General purpose packages (data handling, pretty plotting, ...) library(tidyverse) library(latex2exp) # Latex in ggplot2 labels cbPalette <- c( "#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette # Packages for actual computation # library(MASS) # Contains mvrnorm to simulate multivariate normal # # Use as MASS::mvrnorm is recommended since MASS::select # # clashes with dplyr::select # library(RCurl) # Helps to retrieve online datasets # library(cluster) # Provides clustering algorithms and tools for analysis # # (e.g silhouette width) library(dendextend) # Many more possibilities for the customization of # dendrograms ## ----retrieve-wine-dataset, echo=FALSE----------------------------------- # Retrieve Wine dataset data_wine <- read_csv(RCurl::getURL( "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"), col_names = c( "Wine", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium", "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins", "Color intensity", "Hue", "OD280/OD315 of diluted wines", "Proline"), col_types = cols( .default = col_double(), Wine = col_factor()) ) ## ----kmeans-spherical, fig.width=3, fig.height = 3.2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- # Example borrowed from http://varianceexplained.org/r/kmeans-free-lunch/ set.seed(2019) n <- 250 c1 <- tibble(x = rnorm(n), y = rnorm(n), cluster = 1) c2 <- tibble(r = rnorm(n, 5, .25), theta = runif(n, 0, 2 * pi), x = r * cos(theta), y = r * sin(theta), cluster = 2) %>% select(x, y, cluster) data_plot <- bind_rows(c1, c2) %>% mutate(cluster = as.factor(cluster)) p1 <- ggplot( as_tibble(data_plot %>% select(x, y) %>% scale), aes(x = x, y = y)) + geom_point(size = 0.5) + theme_minimal() + theme( plot.title = element_text(size = 7), axis.text = element_text(size = 7), axis.title = element_text(size = 7)) + ggtitle("Simulated") # Cluster on data as is clust <- kmeans(scale(data_plot %>% select(x, y)), 2) data_res_plot <- bind_cols(data_plot, tibble(pred = as.factor(clust$cluster))) p2 <- ggplot(data_res_plot, aes(x = x, y = y)) + geom_point(aes(colour = pred), size = 0.5) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + theme_minimal() + theme( plot.title = element_text(size = 7), axis.text = element_text(size = 7), axis.title = element_text(size = 7)) + ggtitle("k-means directly on data") # Transform to polar coordinates data_polar_plot <- data_plot %>% mutate( r = sqrt(x ^ 2 + y ^ 2), theta = atan2(y, x)) p3 <- ggplot( as_tibble(data_polar_plot %>% select(r, theta) %>% scale), aes(x = r, y = theta)) + geom_point(size = 0.5) + scale_y_continuous(TeX("$\\theta$")) + theme_minimal() + theme( plot.title = element_text(size = 7), axis.text = element_text(size = 7), axis.title = element_text(size = 7)) + ggtitle("Polar-coordinates") # Cluster on polar coordinates clust_polar <- kmeans( scale(data_polar_plot[,c("r", "theta")]), centers = matrix(c(-1, 1, 0, 0), ncol = 2)) data_polar_res_plot <- bind_cols( data_polar_plot, tibble(pred = as.factor(clust_polar$cluster))) p4 <- ggplot( data_polar_res_plot %>% mutate_at(vars(-pred, -cluster), function(x) as.numeric(scale(x))), aes(x = x, y = y)) + geom_point(aes(colour = pred), size = 0.5) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + theme_minimal() + theme( plot.title = element_text(size = 7), axis.text = element_text(size = 7), axis.title = element_text(size = 7)) + ggtitle("k-means on polar coord") ggpubr::ggarrange( p1, p2, p3, p4, ncol = 2, nrow = 2) ## ----number-of-clusters, fig.width=4, fig.height = 1.6, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- # Extract features and classes X <- as.matrix(data_wine[,-1]) classes <- as_vector(data_wine[,1]) %>% unname # PCA on features X_svd <- svd(scale(X)) X_proj <- scale(X) %*% X_svd$v[,1:2] data_plot <- tibble( wine = classes, PC1 = X_proj[,1], PC2 = X_proj[,2]) p1 <- ggplot(data_plot) + geom_point(aes(x = PC1, y = PC2, colour = wine), size = 0.5) + scale_colour_manual(values = cbPalette[-1], guide = FALSE) + theme_minimal() + theme( plot.title = element_text(size = 10), axis.title = element_text(size = 7), axis.text = element_text(size = 7)) + ggtitle("Actual classes") # Run kmeans repeatedly for different number of clusters but also to # capture variability with different staring centres B <- 200 Kmax <- 10 within_scatter <- lapply(1:Kmax, function(K) { sapply(1:B, function(i) { kmeans(scale(X_proj), centers = K)$tot.withinss }) }) %>% do.call(rbind, .) %>% cbind(1:Kmax, .) colnames(within_scatter) <- c("K", sprintf("rep%d", 1:B)) data_scatter_plot <- as_tibble(within_scatter) %>% gather(rep, value, -K) %>% mutate(rep = as.integer(str_replace(rep, "rep", ""))) p2 <- ggplot(data_scatter_plot) + geom_jitter(aes(x = K, y = value), size = 0.1, height = 25, width = 0.25) + scale_x_continuous(breaks = 1:10) + scale_y_continuous(TeX("$W(C)$")) + theme_minimal() + theme( plot.title = element_text(size = 10), axis.title = element_text(size = 7), axis.text = element_text(size = 7)) + ggtitle("Within cluster scatter") ggpubr::ggarrange( p1, p2, ncol = 2, widths = c(1, 1.2)) ## ----silhouette-width, fig.width=4, fig.height = 1.6, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- # PAM like this uses the euclidean metric by default, i.e. k-means # Use l1 norm here instead clust <- cluster::pam(scale(X), 3, metric = "manhattan") s <- cluster::silhouette(clust) data_plot <- tibble( obs = as.factor(rownames(s)), cluster = as.factor(s[,1]), sw = s[,3]) %>% # Silhouette width arrange(cluster, sw) %>% mutate(obs = factor(obs, levels = obs)) p1 <- ggplot(data_plot) + geom_bar(aes(x = obs, y = sw, fill = cluster), stat = "identity") + coord_flip() + scale_x_discrete("Observation") + scale_y_continuous( "Silhouette Width", breaks = seq(-1, 1, by = 0.2)) + scale_fill_manual(values = cbPalette[-1], guide = FALSE) + theme_minimal() + theme( panel.grid = element_blank(), axis.text.y = element_blank(), axis.text.x = element_text(size = 7), axis.title = element_text(size = 7)) # Average silhouette width Kmax <- 10 avg_sws <- sapply(2:Kmax, function(K) { mean(cluster::silhouette( cluster::pam(scale(X), K, metric = "manhattan"))[,3]) }) data_plot <- tibble( K = 2:Kmax, avg_sw = avg_sws) p2 <- ggplot(data_plot) + geom_line(aes(x = K, y = avg_sw)) + scale_y_continuous("Avg. Silhouette Width") + theme_minimal() + theme( axis.text = element_text(size = 7), axis.title = element_text(size = 7)) ggpubr::ggarrange( p1, p2, ncol = 2, widths = c(1, 1.2)) ## ----iris-dendro-complete, fig.width=5, fig.height = 1.8, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- # Try hierarchical clustering on wine dataset D <- dist(scale(iris[,1:4])) hc_complete <- hclust(D, method = "complete") plot_dendro <- function(hc, title) { dend <- as.dendrogram(hc) labels(dend) <- NULL # The functionality to customize the dendrograms is dend <- dend %>% # available in package dendextend set("leaves_pch", 19) %>% set("leaves_cex", 0.25) %>% set("leaves_col", cbPalette[ as.numeric(iris[order.dendrogram(dend), 5]) + 1]) par(mar=c(.5,4,1,.5), bg = "transparent", cex.lab = 0.8, cex.axis = 0.8, cex.main = 0.8) plot(dend, ylab = "Height", main = title) abline(h = 6, lty = 5) abline(h = 5, lty = 2) } plot_dendro(hc_complete, "Complete Linkage") ## ----iris-dendro-average-single, fig.width=5, fig.height = 1.8, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE---- hc_average <- hclust(D, method = "average") hc_single <- hclust(D, method = "single") plot_dendro(hc_average, "Average Linkage") plot_dendro(hc_single, "Single Linkage")