## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp)   # Latex in ggplot2 labels
library(kableExtra)  # Pretty printing for tables

cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7")  # colour-blind friendly palette

# Packages for actual computation
# library(MASS)   # Contains mvrnorm to simulate multivariate normal
#                 # Calling it as MASS::mvrnorm is recommended since
#                 # MASS::select clashes with dplyr::select
# library(FNN)    # Fast Nearest Neighbor Search Algorithms and Applications
# library(caret)  # A huge package useful for all kinds of statistical tasks.
#                 # Can be used for the whole data analysis process from
#                 # data exploration, to model fitting, prediction and analysis
#                 # Here: Used to create folds for cross-validation

# Download breast cancer data
UCI_data_URL <- RCurl::getURL('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data')
names <- c(
  'id_number', 'diagnosis', 'radius_mean', 'texture_mean',
  'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean',
  'concavity_mean', 'concave_points_mean', 'symmetry_mean',
  'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se',
  'area_se', 'smoothness_se', 'compactness_se', 'concavity_se',
  'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
  'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
  'smoothness_worst', 'compactness_worst', 'concavity_worst',
  'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst')
breast_cancer <- as_tibble(read.table(
  textConnection(UCI_data_URL), sep = ',', col.names = names)) %>%
  mutate(diagnosis = as.factor(case_when(
    diagnosis == "M" ~ "Malignant",
    diagnosis == "B" ~ "Benign")))

# # After downloading and preparing the data we can save it for future use
# saveRDS(breast_cancer, "breast-cancer.RDa")
# # And then load from the hard disk again
# breast_cancer <- readRDS("breast-cancer.RDa")

## ----choose-k-in-knn, fig.width=5, fig.height=2.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
data_train <- breast_cancer %>%
  select(diagnosis, compactness_mean, symmetry_mean) %>%
  rename(class = diagnosis, x1 = compactness_mean, x2 = symmetry_mean)

# Create a grid where we predict the class
h <- 0.005  # Step-width
x1_arr <- seq(0, 0.4, by = h)
x2_arr <- seq(0.1, 0.32, by = h)
data_test <- expand.grid(x1_arr, x2_arr)

ks <- c(1, 10, 100)
class_pred <- lapply(ks, function(k) {
  FNN::knn(
    data_train[, 2:3],  # training data (variables)
    data_test,          # test data (variables)
    data_train$class,   # training data (classes)
    k = k)              # k
})

# Predictions for plotting
data_pred <- tibble(
  class = as.factor(do.call(c, class_pred)),
  x1 = rep(data_test[, 1], times = length(ks)),
  x2 = rep(data_test[, 2], times = length(ks)),
  k = factor(
    rep(sprintf("k = %d", ks), each = dim(data_test)[1]),
    levels = sprintf("k = %d", ks)))

# Show data together with decision boundary
p_knn <- ggplot() +
  geom_tile(
    aes(x = x1, y = x2, fill = class),
    data = data_pred,
    alpha = 0.5, width = h, height = h, colour = "transparent") +
  geom_point(
    aes(x = x1, y = x2, colour = class, fill = class),
    data = data_train, size = .8) +
  facet_wrap(~ k, ncol = 3) +
  scale_colour_manual("Diagnosis", values = cbPalette[-1]) +
  scale_fill_manual("Diagnosis", values = cbPalette[-1], guide = FALSE) +
  scale_x_continuous(
    "Compactness", expand = c(0, 0),
    breaks = c(0.0, 0.1, 0.2, 0.3, NULL)) +
  scale_y_continuous("Symmetry", expand = c(0, 0)) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    panel.grid = element_blank())

p_knn
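
# Added illustration (not part of the original analysis): a quick sketch that
# quantifies what the three panels above suggest by computing the
# resubstitution (training) error of kNN for each k. With k = 1 every training
# point is its own nearest neighbour, so the training error is 0 even though
# the decision regions are visibly overfit.
knn_train_errors <- sapply(ks, function(k) {
  class_pred_train <- FNN::knn(
    data_train[, 2:3],  # training data (variables)
    data_train[, 2:3],  # predict on the training data itself
    data_train$class,   # training data (classes)
    k = k)
  mean(class_pred_train != data_train$class)
})
names(knn_train_errors) <- sprintf("k = %d", ks)
knn_train_errors
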
## ----global-vs-local-rules-plot-function, include=FALSE------------------
# Create two plots to illustrate global vs local rules
# Or equivalently: the bias vs variance trade-off

# This function applies LDA and kNN to a two-class classification problem.
# The true boundary of the data is described by the input function `boundary`,
# which takes a vector of x2 values and returns x1 values (for visual reasons
# boundaries go from top to bottom).
# In addition the function simulates data and classifies it according to the
# true boundary. The data is then sub-sampled into M training samples, where
# each training sample randomly contains frac * 100% of all simulated data.
# In addition the `k` for kNN can be specified.
# LDA and kNN are applied to each training sample and the resulting
# decision boundaries are plotted (grey lines). In addition they are
# averaged and plotted as a thicker black line.
generate_data <- function(boundary) {
  data_boundary <- tibble(
    x2 = seq(-6, 6, by = 0.1)) %>%
    mutate(x1 = boundary(x2))

  # Simulate data points in [-2, 2] x [-6, 6] and classify as 0 if left
  # of the wiggly line and 1 if on the right
  X_all <- matrix(runif(2 * 800), ncol = 2)
  X_all[, 1] <- 4 * (X_all[, 1] - 0.5)
  X_all[, 2] <- 12 * (X_all[, 2] - 0.5)
  colnames(X_all) <- c("x1", "x2")
  data_all <- X_all %>%
    as_tibble %>%
    mutate(class = as.factor(case_when(
      x1 < boundary(x2) ~ 0,
      TRUE ~ 1)))

  # Return the true boundary and the full simulated data set
  list(boundary = data_boundary, all = data_all)
}

plot_global_vs_local_rules <- function(boundary, M = 10, frac = 0.2, k = 3) {
  data_gen <- generate_data(boundary)
  # Create M different training data sets, each a random fraction `frac`
  # of the original data
  data_train <- lapply(1:M, function(i) sample_frac(data_gen$all, frac))

  p_base <- ggplot(mapping = aes(x = x1, y = x2)) +
    geom_point(
      aes(colour = class),
      data = data_gen$all, size = 0.6, alpha = 0.5) +
    geom_path(data = data_gen$boundary, size = 0.6, colour = "red") +
    scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
    scale_x_continuous(name = TeX("x_1"), expand = c(0, 0)) +
    scale_y_continuous(name = TeX("x_2"), expand = c(0, 0)) +
    theme_minimal() +
    theme(panel.grid = element_blank())

  #### Classify points with LDA, to see a global rule at work
  h <- 0.01
  x1s <- seq(-2, 2, by = h)
  x2s <- c(-6, 6)  # We have a linear decision boundary somewhat parallel
                   # to the x2 axis, so we need to investigate all the x1
                   # values but only the x2 value in the first and last row
  X_pred <- expand.grid(x1s, x2s)
  colnames(X_pred) <- c("x1", "x2")

  # Use LDA to construct a global decision boundary per training sample
  data_pred_lda <- do.call(bind_rows, lapply(data_train, function(train) {
    classes <- as.numeric(
      predict(MASS::lda(class ~ x1 + x2, train), X_pred)$class)

    # Find decision boundaries numerically
    # Create a matrix with the same dimensions as the whole grid and
    # put the predicted class at each grid point.
    # Note that the lower left corner of what is in the plot will be
    # the upper left corner.
    # The class on the left of the decision boundary has a lower index than
    # the one on the right and which.max finds the first index at which
    # the higher class appears
    x1_indices <- apply(
      matrix(classes, ncol = length(x1s), byrow = TRUE),
      1, which.max)
    # Since we are finding the first index of x1 where the second class
    # is already winning, let's take the midpoint between that one and the
    # x1 value before.
    # Also, since we are dealing with a linear decision boundary, only the
    # first and last x1 and x2 values are needed
    x1 <- (x1s[x1_indices] + x1s[x1_indices - 1]) / 2
    x2 <- x2s

    c(x1 = x1[1], x1end = x1[2], x2 = x2[1], x2end = x2[2])
  }))

  p1 <- p_base +
    geom_segment(
      aes(xend = x1end, yend = x2end),
      data = data_pred_lda,
      colour = cbPalette[1], size = 0.4, alpha = 0.8) +
    geom_segment(
      aes(xend = x1end, yend = x2end),
      data = data_pred_lda %>% summarize_all(mean),
      size = 0.6) +
    ggtitle("LDA")

  #### Classify the points with kNN, to see a local rule at work
  h <- 0.05
  x1s <- seq(-2, 2, by = h)
  x2s <- seq(-6, 6, by = h)  # The decision boundary could be wild so we need
                             # to look at many x2s
  X_pred <- expand.grid(x1s, x2s)
  colnames(X_pred) <- c("x1", "x2")

  # Use kNN to construct a local decision boundary per training sample
  data_pred_knn <- lapply(data_train, function(train) {
    classes <- as.numeric(
      FNN::knn(as.matrix(train[, 1:2]), X_pred, train$class, k = k))

    # As above
    x1_indices <- apply(
      matrix(classes, ncol = length(x1s), byrow = TRUE),
      1, which.max)
    # Since we are finding the first index of x1 where the second class
    # is already winning, let's take the midpoint between that one and the
    # x1 value before.
    # Take all x2 values here since kNN boundaries are not linear
    x1 <- (x1s[x1_indices] + x1s[x1_indices - 1]) / 2
    x2 <- x2s

    tibble(x1, x2)
  })
  data_pred_knn <- bind_cols(
    do.call(bind_rows, data_pred_knn),
    tibble(m = as.factor(rep(1:M, each = length(x2s)))))

  data_pred_knn_overall <- data_pred_knn %>%
    select(-m) %>%
    group_by(x2) %>%
    summarize(x1 = mean(x1))

  p2 <- p_base +
    geom_path(
      aes(group = m),
      data = data_pred_knn,
      colour = cbPalette[1], size = 0.4, alpha = 0.8) +
    geom_path(data = data_pred_knn_overall, size = 0.6) +
    ggtitle(TeX(paste0("kNN ($k = ", k, "$)")))

  list(lda_plot = p1, knn_plot = p2)
}
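
# Added usage sketch (not part of the original document): the helper above is
# exercised below with two specific boundaries, but its knobs can be varied
# freely. The boundary function here is purely hypothetical; print
# example_plots$lda_plot or example_plots$knn_plot to inspect the result.
example_plots <- plot_global_vs_local_rules(
  function(x2) 0.1 * sin(x2),  # hypothetical true boundary, for illustration
  M = 20, frac = 0.5, k = 15)
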
## ----global-rule-simple-boundary, fig.width=3, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
set.seed(2636472)
# Create a straight and simple boundary; chosen arbitrarily
boundary_simple <- function(x2) {
  -0.075 + 0.35 / 6 * x2
}
plots_simple <- plot_global_vs_local_rules(boundary_simple)
plots_simple$lda_plot

## ----local-rule-simple-boundary, fig.width=3, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
plots_simple$knn_plot

## ----global-rule-complex-boundary, fig.width=3, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
set.seed(2636472)
# Create a wiggly true decision boundary; chosen arbitrarily
boundary_complex <- function(x2) {
  3.5 / (x2 + 9) * sin((x2 + 9) / 0.7) + 0.1 * sin((x2 + 7) / 0.2)
}
plots_complex <- plot_global_vs_local_rules(boundary_complex)
plots_complex$lda_plot

## ----local-rule-complex-boundary, fig.width=3, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
plots_complex$knn_plot

## ----local-vs-global-performance, echo=FALSE, warning=FALSE, message=FALSE----
set.seed(2636472)
data_gen_simple <- generate_data(boundary_simple)
set.seed(2636472)
data_gen_complex <- generate_data(boundary_complex)

# Caret creates stratified folds
fold_indices <- caret::createFolds(data_gen_simple$all$class, k = 10)

avg_cv_errs <- sapply(
  list(data_gen_simple, data_gen_complex),
  function(data_gen) {
    cv_errs <- sapply(fold_indices, function(fold) {
      # Separate into training and test data per fold
      train <- data_gen$all[-fold, ]
      X_test <- data_gen$all[fold, 1:2]
      class_test <- data_gen$all[fold, ]$class

      # Run LDA and predict classes
      fit <- MASS::lda(class ~ x1 + x2, train)
      class_pred_lda <- predict(fit, X_test)$class

      # Run kNN and predict classes
      class_pred_knn <- FNN::knn(train[, 1:2], X_test, train$class, k = 3)

      c(
        lda = sum(class_pred_lda != class_test),
        knn = sum(class_pred_knn != class_test))
    })

    apply(cv_errs, 1, function(errs) {
      sum(errs) / dim(data_gen$all)[1]
    })
  })

colnames(avg_cv_errs) <- c("simple", "complex")
rownames(avg_cv_errs) <- c("LDA", "kNN (k = 3)")
knitr::kable(
  avg_cv_errs, "latex",
  digits = 3,
  booktabs = TRUE,
  caption = "Average cross-validation errors for ten folds") %>%
  add_header_above(c(" ", "Boundary" = 2))
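
# Added sketch (not part of the original analysis): the table reports one
# averaged error per method, but keeping the per-fold error rates gives a
# feel for the variability of the cross-validation estimate, e.g. for
# kNN (k = 3) on the complex boundary:
knn_fold_rates <- sapply(fold_indices, function(fold) {
  train <- data_gen_complex$all[-fold, ]
  test <- data_gen_complex$all[fold, ]
  class_pred <- FNN::knn(train[, 1:2], test[, 1:2], train$class, k = 3)
  mean(class_pred != test$class)
})
c(mean = mean(knn_fold_rates), sd = sd(knn_fold_rates))
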
## ----nc-lda-qda-plots, fig.width=5, fig.height=2.5, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
# Keep all classes this time
data_train <- as_tibble(datasets::iris) %>%
  select(Species, Sepal.Length, Sepal.Width) %>%
  rename(class = Species, x1 = Sepal.Length, x2 = Sepal.Width)

# Calculate centroids per class
data_centroid <- data_train %>%
  group_by(class) %>%
  summarise(
    x1 = mean(x1),
    x2 = mean(x2))

# Classify with nearest centroid method
h <- 0.03
x1s <- seq(4, 8, by = h)
x2s <- seq(1.8, 4.8, by = h)
X_pred <- expand.grid(x1s, x2s)
colnames(X_pred) <- c("x1", "x2")
n_pred <- dim(X_pred)[1]

centroids <- as.matrix(data_centroid[, 2:3])
y_pred <- apply(
  apply(centroids, 1, function(c) {
    # Calculate squared Euclidean norm for each vector in the grid
    # for the current centroid c
    rowSums(
      (X_pred - matrix(
        rep.int(c, n_pred),
        nrow = n_pred, byrow = TRUE)) ^ 2)
  }),
  1, which.min)  # Determine if the vector is closest to centroid 1, 2, or 3

data_pred <- tibble(
  x1 = X_pred[, 1],
  x2 = X_pred[, 2],
  class = y_pred %>%
    as.factor %>%
    (function(f) { levels(f) <- levels(data_train$class); f })(.))

# Same data as for nearest centroid example
fit_lda <- MASS::lda(class ~ x1 + x2, data_train)
fit_qda <- MASS::qda(class ~ x1 + x2, data_train)

y_pred_lda <- predict(fit_lda, X_pred)$class
y_pred_qda <- predict(fit_qda, X_pred)$class

data_pred_da <- cbind(
  data_pred,
  tibble(class_lda = y_pred_lda, class_qda = y_pred_qda)) %>%
  rename(class_centroid = class)

p_centroid <- ggplot() +
  geom_tile(
    aes(x = x1, y = x2, colour = class, fill = class_centroid),
    colour = "transparent",
    data = data_pred_da,
    width = h, height = h, alpha = 0.4) +
  ggtitle("Nearest Centroids")

p_lda <- ggplot() +
  geom_tile(
    aes(x = x1, y = x2, colour = class, fill = class_lda),
    colour = "transparent",
    data = data_pred_da,
    width = h, height = h, alpha = 0.4) +
  ggtitle("LDA")

p_qda <- ggplot() +
  geom_tile(
    aes(x = x1, y = x2, colour = class, fill = class_qda),
    colour = "transparent",
    data = data_pred_da,
    width = h, height = h, alpha = 0.4) +
  ggtitle("QDA")

plots <- lapply(list(p_centroid, p_lda, p_qda), function(p) {
  p +
    geom_jitter(
      aes(x = x1, y = x2, colour = class, shape = class),
      data = data_train,
      height = 0.1, width = 0.1, size = 1) +
    scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
    scale_fill_manual(values = cbPalette[-1], guide = FALSE) +
    scale_shape_discrete(guide = FALSE) +
    guides(
      # Repeatedly using the class factor messes up the legend
      # Manual setup needed
      colour = guide_legend(
        title = "Species",
        override.aes = list(
          fill = "transparent",
          colour = cbPalette[2:4],
          shape = c(16, 17, 15),
          size = 2,
          linetype = 0))) +
    scale_x_continuous(name = "Sepal Length", expand = c(0, 0)) +
    scale_y_continuous(name = "Sepal Width", expand = c(0, 0)) +
    theme_minimal() +
    theme(
      panel.grid = element_blank())
})

ggpubr::ggarrange(
  plotlist = plots, ncol = 3, common.legend = TRUE, legend = "bottom")
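
# Added illustration (not part of the original document): the nested apply()
# above implements the nearest-centroid rule; the same rule can be written as
# a small helper that, for each row of X, returns the index of the centroid
# with the smallest squared Euclidean distance.
predict_nearest_centroid <- function(X, centroids) {
  apply(X, 1, function(x) which.min(colSums((t(centroids) - x) ^ 2)))
}
# This should reproduce y_pred from above:
# all(predict_nearest_centroid(as.matrix(X_pred), centroids) == y_pred)
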
## ----nc-lda-qda-performance, echo=FALSE, warning=FALSE, message=FALSE----
fold_indices <- caret::createFolds(data_train$class, k = 10)

cv_errs <- sapply(fold_indices, function(fold) {
  X_train <- data_train[-fold, 2:3]
  class_train <- data_train[-fold, ]$class
  X_test <- data_train[fold, 2:3]
  class_test <- data_train[fold, ]$class

  centroids <- data_train[-fold, ] %>%
    group_by(class) %>%
    summarise(
      x1 = mean(x1),
      x2 = mean(x2)) %>%
    select(x1, x2) %>%
    as.matrix

  class_pred_nc <- apply(
    apply(centroids, 1, function(c) {
      # Calculate squared Euclidean norm for each vector in the grid
      # for the current centroid c
      rowSums(
        (X_test - matrix(
          rep.int(c, dim(X_test)[1]),
          nrow = dim(X_test)[1], byrow = TRUE)) ^ 2)
    }),
    1, which.min)  # Determine closest centroid

  fit_lda <- MASS::lda(class ~ x1 + x2, data_train[-fold, ])
  fit_qda <- MASS::qda(class ~ x1 + x2, data_train[-fold, ])

  class_pred_lda <- predict(fit_lda, X_test)$class
  class_pred_qda <- predict(fit_qda, X_test)$class

  c(
    nc = sum(as.integer(class_test) != class_pred_nc),
    lda = sum(class_test != class_pred_lda),
    qda = sum(class_test != class_pred_qda))
})

avg_cv_errs <- matrix(rowSums(cv_errs) / dim(data_train)[1], nrow = 1)
colnames(avg_cv_errs) <- c("NC", "LDA", "QDA")

knitr::kable(
  avg_cv_errs, "latex",
  digits = 3,
  booktabs = TRUE,
  caption = "Average cross-validation errors for ten folds")

## ----choose-k-in-knn-revisited-reminder, fig.width=5, fig.height=2.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
p_knn

## ----choose-k-in-knn-revisited-roc, fig.width=3.5, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
data_train <- breast_cancer %>%
  select(diagnosis, compactness_mean, symmetry_mean) %>%
  rename(class = diagnosis, x1 = compactness_mean, x2 = symmetry_mean) %>%
  # Order the classes to be able to create a ROC curve later
  mutate(class = ordered(class, levels = c("Benign", "Malignant")))

n_fold <- 5
fold_indices <- caret::createFolds(data_train$class, k = n_fold)

ks <- c(1, 3, 5, 10, 100)
fpr_grid <- seq(0, 1, by = 0.01)
cv_outcomes <- lapply(ks, function(k) {
  cv_outcome <- lapply(fold_indices, function(fold) {
    # Split data into training and test sets
    X_train <- as.matrix(data_train[-fold, 2:3])
    class_train <- data_train[-fold, ]$class
    X_test <- as.matrix(data_train[fold, 2:3])
    class_test <- data_train[fold, ]$class

    # Determine predictions
    class_pred <- FNN::knn(
      X_train,      # training data (variables)
      X_test,       # test data (variables)
      class_train,  # training data (classes)
      k = k,        # k
      prob = TRUE)  # Return class probabilities
    class_pred_train <- FNN::knn(
      X_train,      # training data (variables)
      X_train,      # training data again
      class_train,  # training data (classes)
      k = k)        # k

    # Retrieve probabilities for the predicted class (always at least 50%)
    prob <- attr(class_pred, "prob")
    # FNN::knn returns the probability for the predicted class
    # Here we need the probability of a positive outcome (Malignant)
    prob <- ifelse(class_pred == "Benign", 1 - prob, prob)

    roc_obj <- pROC::roc(class_test, prob)

    tibble(
      tpr = approx(
        1 - roc_obj$specificities,
        roc_obj$sensitivities,
        fpr_grid)$y,
      acc = rep(
        sum(factor(class_test, ordered = FALSE) != class_pred),
        times = length(fpr_grid)),
      acc_train = rep(
        mean(
          factor(class_train, ordered = FALSE) != class_pred_train)))
  }) %>%
    do.call(bind_rows, .)

  bind_cols(
    tibble(
      k = rep.int(k, n_fold * length(fpr_grid)),
      fold = rep(1:n_fold, each = length(fpr_grid))),
    cv_outcome)
}) %>%
  do.call(bind_rows, .)

cv_outcomes <- bind_cols(
  cv_outcomes,
  tibble(fpr = rep(fpr_grid, times = n_fold * length(ks))))

data_roc <- cv_outcomes %>%
  group_by(k, fpr) %>%
  summarize(tpr = mean(tpr)) %>%
  arrange(k, fpr)

ggplot(data_roc, aes(x = fpr, y = tpr)) +
  geom_line(aes(colour = as.factor(k))) +
  geom_line(
    data = tibble(fpr = c(0, 1), tpr = c(0, 1)),
    linetype = 2, colour = "red") +
  scale_x_continuous(
    "False Positive Rate",
    lim = c(0, 1), expand = c(0, 0)) +
  scale_y_continuous(
    "True Positive Rate",
    lim = c(0, 1), expand = c(0, 0)) +
  scale_colour_manual("k", values = cbPalette) +
  theme_minimal() +
  theme(legend.position = "bottom")
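
# Added sketch (not part of the original analysis): the averaged ROC curves can
# be condensed into one number per k by approximating the area under each
# curve (AUC) with the trapezoidal rule on the FPR grid.
data_roc %>%
  filter(!is.na(tpr)) %>%
  group_by(k) %>%
  summarize(auc = sum(diff(fpr) * (head(tpr, -1) + tail(tpr, -1)) / 2))
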
## ----choose-k-in-knn-revisited-table, echo=FALSE, warning=FALSE, message=FALSE----
data_acc <- cv_outcomes %>%
  select(k, fold, acc, acc_train) %>%
  unique %>%
  group_by(k) %>%
  summarize(
    acc_train = mean(acc_train),
    acc = sum(acc) / dim(data_train)[1])

colnames(data_acc) <- c("\\(k\\)", "\\(R^{tr}\\)", "\\(R^{cv}\\)")

kable(
  data_acc, "latex",
  digits = c(0, 3, 3),
  booktabs = TRUE,
  caption = "Average training and cross-validation errors for five folds",
  escape = FALSE)
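
# Added sketch (not part of the original document): the k with the smallest
# cross-validation error can also be read off programmatically. The columns of
# `data_acc` carry LaTeX names at this point, so they are indexed by position
# (column 1 = k, column 3 = cross-validation error).
best_k <- data_acc[[1]][which.min(data_acc[[3]])]
best_k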