## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp)  # LaTeX in ggplot2 labels
library(kableExtra) # Pretty printing for tables

cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette

# Packages for actual computation
# library(MASS)            # Contains mvrnorm to simulate multivariate normal
#                          # Use as MASS::mvrnorm is recommended since
#                          # MASS::select clashes with dplyr::select
# library(randomForest)    # Fitting of random forests
# library(randomForestSRC) # Faster implementation of random forests with
#                          # almost the same interface as the randomForest
#                          # package
# library(ggRandomForests) # Offers functionality to extract easily plottable
#                          # data from a randomForestSRC fit
# library(DAAG)            # Contains the `monica` dataset
# library(ElemStatLearn)   # Contains the `SAheart` dataset

## ----random-forest-toy-example, fig.width=4, fig.height=1.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(28246210)

# Simulate data: strongly correlated predictors, only x1 carries signal
mu <- rep.int(0, 5)
Sigma <- diag(rep.int(1, 5))
Sigma[Sigma == 0] <- 0.98
sigma_err <- 1

simulate_data <- function(n, mu, Sigma, sigma_err) {
  X <- MASS::mvrnorm(n, mu, Sigma)
  colnames(X) <- sprintf("x%d", 1:length(mu))
  y <- X[,1] ^ 2 + rnorm(n, sd = sigma_err)
  bind_cols(
    tibble(y = y),
    as_tibble(X))
}

# Create training sample
n_train <- 50
data_train <- simulate_data(n_train, mu, Sigma, sigma_err)

# Create test sample
n_test <- 100
data_test <- simulate_data(n_test, mu, Sigma, sigma_err)

# Baseline: a single regression tree
single_tree <- rpart::rpart(
  y ~ x1 + x2 + x3 + x4 + x5,
  data_train,
  method = "anova")
test_error_single <- mean((predict(single_tree, data_test) - data_test$y) ^ 2)

# Test error of a random forest and of bagging for ensembles of 1 to B trees
B <- 300
test_error <- do.call(cbind, parallel::mclapply(1:B, function(b) {
  rf <- randomForest::randomForest(
    y ~ x1 + x2 + x3 + x4 + x5,
    data_train,
    ntree = b)
  pred_bag <- apply(
    do.call(cbind, parallel::mclapply(1:b, function(j) {
      t <- rpart::rpart(
        y ~ x1 + x2 + x3 + x4 + x5,
        data_train[sample(n_train, replace = TRUE),],
        method = "anova")
      predict(t, data_test)
    })), 1, mean)
  c(
    mean((predict(rf, data_test) - data_test$y) ^ 2), # RF test error
    mean((pred_bag - data_test$y) ^ 2))               # Bagging test error
}))

data_plot <- tibble(
  x = rep.int(1:B, 2),
  y = c(test_error[1,], test_error[2,]),
  type = as.factor(c(
    rep.int("rf", B),
    rep.int("bagging", B))))

ggplot() +
  geom_point(
    aes(x = x, y = y, colour = type),
    data = data_plot, size = 1) +
  geom_hline(
    aes(yintercept = test_error_single),
    linetype = 2, colour = cbPalette[4]) +
  scale_x_continuous("Number of trees") +
  scale_y_continuous("Test error") +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal()
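
## ----random-forest-toy-oob-sketch, echo=FALSE-----------------------------
# A minimal sketch (chunk name illustrative): for regression forests,
# randomForest records the out-of-bag MSE after each tree in `$mse` (the
# SAheart example below relies on this as well), so the test error curve
# above can be approximated without a held-out sample.
rf_oob <- randomForest::randomForest(
  y ~ x1 + x2 + x3 + x4 + x5,
  data_train,
  ntree = B)
tail(rf_oob$mse, 1) # OOB error estimate after all B trees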
## ----random-forest-monica-example, fig.width=7, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Data set monitoring different risk factors associated with cardiovascular
# disease. The outcome is whether or not patients were alive after a 10-year
# period.
n_tree <- 200
data_all <- as_tibble(DAAG::monica)
rf <- randomForest::randomForest(
  outcome ~ .,
  data_all,
  na.action = na.omit,
  importance = TRUE,
  ntree = n_tree)

data_error <- tibble(
  x = rep.int(1:n_tree, 3),
  y = as.vector(rf$err.rate),
  type = as.factor(rep(c("OOB", "Alive", "Dead"), each = n_tree)))

p1 <- ggplot(data_error, mapping = aes(x = x, y = y, colour = type)) +
  geom_point(size = 1) +
  scale_x_continuous("Number of Trees") +
  scale_y_continuous("Out-of-bag error") +
  scale_colour_manual("Type", values = cbPalette[-1]) +
  theme_minimal() +
  theme(
    legend.position = c(0.85, 0.75),
    legend.title = element_text(size = 9, vjust = 0.6),
    legend.text = element_text(size = 8),
    legend.key.size = unit(8, "pt"),
    legend.background = element_rect(fill = "white", size = 0.1)) +
  ggtitle("Error estimate")

data_importance <- tibble(
  var = as.factor(rep.int(rownames(rf$importance), 4)),
  decrease = as.vector(scale(rf$importance, center = FALSE)),
  type = as.factor(rep(
    c("Alive Acc.", "Dead Acc.", "Mean Acc.", "Mean Gini"),
    each = length(rownames(rf$importance)))))

p2 <- ggplot(data_importance, aes(x = var, y = decrease, fill = type)) +
  geom_bar(
    position = "dodge",
    stat = "identity") +
  coord_flip() +
  scale_x_discrete(name = NULL) +
  scale_y_continuous(name = "Decrease") +
  scale_fill_manual("Type", values = cbPalette[c(2, 3, 1, 4)]) +
  theme_minimal() +
  theme(
    legend.position = c(0.85, 0.75),
    panel.grid = element_blank(),
    legend.title = element_text(size = 9, vjust = 0.6),
    legend.text = element_text(size = 8),
    legend.key.size = unit(8, "pt"),
    legend.background = element_rect(fill = "white", size = 0.1)) +
  ggtitle("Variable importance")

ggpubr::ggarrange(
  p1, p2,
  ncol = 2)
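
## ----random-forest-monica-importance-sketch, echo=FALSE-------------------
# A minimal sketch (chunk name illustrative): the raw importance measures
# plotted in p2 can also be queried directly with randomForest::importance().
# type = 1 gives the permutation-based mean decrease in accuracy, type = 2
# the mean decrease in Gini impurity; both require importance = TRUE above.
imp_acc <- randomForest::importance(rf, type = 1)
imp_gini <- randomForest::importance(rf, type = 2)
# Rank the risk factors by permutation importance
imp_acc[order(imp_acc[,1], decreasing = TRUE), , drop = FALSE]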
## ----random-forest-SAheart-example-fit, fig.width=3, fig.height=4.3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Data set monitoring different risk factors associated with coronary heart
# disease. The outcome taken here is ldl, which encodes cholesterol values.
data_all <- as_tibble(ElemStatLearn::SAheart) %>%
  mutate(chd = as.factor(case_when(
    chd == 0 ~ "No CHD",
    TRUE ~ "CHD")))

# Fit random forest
n_tree <- 300
rf <- randomForest::randomForest(
  ldl ~ .,
  data = data_all,
  na.action = na.omit,
  importance = TRUE,
  ntree = n_tree)

data_error <- tibble(
  x = 1:n_tree,
  y = rf$mse)

p1 <- ggplot(data_error, mapping = aes(x = x, y = y)) +
  geom_point(size = 0.8) +
  scale_x_continuous("Number of Trees") +
  scale_y_continuous("Out-of-bag error (MSE)") +
  theme_minimal() +
  theme(axis.title = element_text(size = 8)) +
  ggtitle("Error estimate")

data_importance <- tibble(
  var = as.factor(rep.int(rownames(rf$importance), 2)),
  decrease = as.vector(scale(rf$importance, center = FALSE)),
  type = as.factor(rep(
    c("Mean accuracy", "Mean MSE"),
    each = length(rownames(rf$importance)))))

p2 <- ggplot(data_importance, aes(x = var, y = decrease, fill = type)) +
  geom_bar(
    position = "dodge",
    stat = "identity") +
  coord_flip() +
  scale_x_discrete(name = NULL) +
  scale_y_continuous(name = "Decrease") +
  scale_fill_manual("Type", values = cbPalette[c(2, 3, 1, 4)]) +
  theme_minimal() +
  theme(
    legend.position = c(0.75, 0.85),
    panel.grid = element_blank(),
    legend.title = element_text(size = 9, vjust = 0.6),
    legend.text = element_text(size = 8),
    legend.key.size = unit(8, "pt"),
    legend.background = element_rect(fill = "white", size = 0.1),
    axis.title = element_text(size = 8)) +
  ggtitle("Variable importance")

ggpubr::ggarrange(
  p1, p2,
  ncol = 1, nrow = 2,
  heights = c(1, 2))

## ----random-forest-SAheart-example-pred, fig.width=4, fig.height=4, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Out-of-bag predictions from the fitted forest
yhat <- rf$predicted
data_out <- bind_cols(data_all, tibble(yhat = yhat)) %>%
  select(
    sbp, tobacco, adiposity, typea, obesity,
    alcohol, age, famhist, chd, ldl, yhat)

n_var <- dim(data_out)[2] - 2
var_types <- data_out %>%
  summarise_all(class) %>%
  gather(variable, class) %>%
  mutate(factor = class == "factor")

ps <- lapply(1:n_var, function(i) {
  p <- ggplot(
    data = data_out,
    aes(y = yhat, x = !!(sym(var_types[[i, "variable"]])))) +
    geom_point(aes(y = ldl), size = 0.8, alpha = 0.2, colour = cbPalette[2])
  if (var_types[[i, "factor"]]) {
    p <- p + geom_boxplot()
  } else {
    p <- p + geom_smooth(colour = cbPalette[3], size = 0.8)
  }
  p +
    scale_y_continuous(TeX("$\\widehat{y}$")) +
    theme_minimal() +
    theme(
      axis.text = element_text(size = 6),
      axis.title = element_text(size = 8))
})
ggpubr::ggarrange(plotlist = ps, nrow = 3, ncol = 3)

## ----projection-example, fig.width=2, fig.height=1, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(2903078)

# Direction of axis onto which points are going to be projected
p <- matrix(c(1, 0.5) / (sqrt(1 ^ 2 + 0.5 ^ 2)), ncol = 1)

# Data points about this axis
n <- 8
x <- seq(1, n) + rnorm(n, sd = 0.15)
y <- 0.5 * x + rt(n, 4)
X <- matrix(c(x, y), ncol = 2)

# Projections onto the axis spanned by p
Xproj <- t(sapply(X %*% p, function(c) c * p))

data_plot <- tibble(
  x = X[,1],
  y = X[,2],
  xp = Xproj[,1],
  yp = Xproj[,2])

ggplot(data_plot) +
  geom_abline(slope = 0.5, intercept = 0) +
  geom_segment(
    aes(x = x, y = y, xend = xp, yend = yp),
    colour = cbPalette[1]) +
  geom_point(aes(x = x, y = y), size = 1) +
  geom_point(aes(x = xp, y = yp), colour = cbPalette[2], size = 1) +
  coord_fixed() +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    axis.text = element_blank(),
    panel.grid = element_blank(),
    plot.margin = margin(0, 0, 0, 0, "cm"))
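
## ----projection-matrix-sketch, echo=FALSE---------------------------------
# A minimal check (chunk name illustrative): since p has unit length, the
# projection onto its span is given by the rank-one matrix P = p p^T, so
# Xproj above should equal X P up to floating point error.
P <- p %*% t(p)
max(abs(X %*% P - Xproj)) # should be (numerically) zero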
## ----pca-example, fig.width=1.8, fig.height=2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(290307272)

# Postulate the principal components I want to get
R <- matrix(c(1, 1, -1, 1), ncol = 2)
R <- scale(R, center = FALSE) # Normalise the columns to unit length
# Postulate the standard deviation I want to get in the direction of each
# principal component (the variances are the squared diagonal entries)
D <- diag(c(0.2, 1))
# Construct covariance matrix
Sigma <- t(R) %*% D %*% D %*% R

# Simulate data
n_sample <- 100
xs <- MASS::mvrnorm(n = n_sample, mu = c(1, 1), Sigma = Sigma)

# Centre and scale data
xs <- scale(xs)
colnames(xs) <- c("x", "y")
data_plot <- as_tibble(xs)

# Determine PCA from data
xs_svd <- svd(xs)
pcs <- xs_svd$v

# Construct axes
centre <- c(1, -1)
data_axes <- tibble(
  x = rep.int(centre[1], 4),
  xend = c(centre[1] + 1, centre[1],
           centre[1] + pcs[1, 1], centre[1] + pcs[1, 2]),
  y = rep.int(centre[2], 4),
  yend = c(centre[2], centre[2] + 1,
           centre[2] + pcs[2, 1], centre[2] + pcs[2, 2]),
  type = as.factor(
    c(rep.int("Cartesian", 2), rep.int("Principal Component", 2))))

ggplot(mapping = aes(x = x, y = y)) +
  geom_point(
    data = data_plot,
    colour = cbPalette[1], fill = cbPalette[1], size = 0.8) +
  geom_segment(
    data = data_axes,
    aes(xend = xend, yend = yend, color = type),
    arrow = arrow(length = unit(1.5, "mm"), angle = 35),
    size = 0.8) +
  scale_x_continuous(
    name = NULL, breaks = NULL,
    limits = c(-2.1, 2.1), expand = c(0, 0)) +
  scale_y_continuous(
    name = NULL, breaks = NULL,
    limits = c(-2.1, 2.1), expand = c(0, 0)) +
  scale_color_manual(
    name = "Axes",
    values = cbPalette[-1],
    guide = guide_legend(title.position = "top", title.hjust = 0.5)) +
  coord_fixed(ratio = 1) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    plot.margin = unit(c(0, 0, 0, 0), "lines"),
    legend.position = "bottom",
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 7),
    legend.key.size = unit(7, "pt"))
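
## ----pca-prcomp-sketch, echo=FALSE----------------------------------------
# A minimal cross-check (chunk name illustrative): stats::prcomp computes
# the same decomposition. As `xs` is already centred and scaled, its
# rotation matrix should agree with svd(xs)$v up to the sign of each column.
pca_fit <- prcomp(xs, center = FALSE, scale. = FALSE)
max(abs(abs(pca_fit$rotation) - abs(pcs))) # should be (numerically) zero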
## ----load-digits, echo=FALSE----------------------------------------------
# These are greyscale images of handwritten digits. Each image has dimensions
# 16 x 16 and therefore results in a 256-dimensional predictor space.
# The goal is to predict the digit from the image.
data_train <- ElemStatLearn::zip.train
colnames(data_train) <- c("digit", sprintf("x%d", 1:256))
y_train <- data_train[,1]
X_train <- data_train[,-1]

## ----digit-freq-table, echo=FALSE-----------------------------------------
data_freq <- matrix(
  c(0:9, as.vector(table(y_train)) / length(y_train)),
  ncol = 2)
colnames(data_freq) <- c("Digit", "Frequency")
kable(
  data_freq, "latex",
  booktabs = TRUE,
  digits = c(0, 2),
  align = rep.int('c', 2),
  linesep = "")

## ----some-digits, fig.width=2.5, fig.height=2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Plot some of the training data: get the data into a good format for
# ggplot, then plot with geom_tile and facet_wrap
m <- 6 # Number of digits to plot
data_plot <- do.call(bind_rows,
  apply(cbind(1:m, data_train[1:m,]), 1, function(img) {
    img_data <- cbind(
      rep.int(img[1], 16),            # Image ID
      rep.int(img[2], 16),            # Actual digit
      1:16,                           # x coordinate
      matrix(img[-(1:2)], ncol = 16)) # Image data
    colnames(img_data) <- c("id", "digit", "x", seq(16, 1))
    # y coordinates are backwards here since ggplot has zero in the lower
    # left instead of the upper left.
    # ggplot expects data in a narrow, long format.
    as_tibble(img_data) %>%
      gather(y, value, -id, -digit, -x) %>%
      mutate(y = as.numeric(y))
  }))

ggplot(data_plot, aes(x = x, y = y)) +
  geom_tile(
    aes(fill = value),
    width = 1, height = 1, colour = "transparent") +
  facet_wrap(
    ~ id, ncol = 3,
    labeller = as_labeller(function(labels) {
      y_train[as.numeric(labels)]
    })) +
  scale_x_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_y_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_fill_gradient(
    limits = c(-1, 1),
    low = "black", high = "white",
    guide = FALSE) +
  coord_fixed() +
  theme(
    strip.background = element_rect(
      fill = "transparent", colour = cbPalette[1]),
    strip.text.x = element_text(face = "bold"),
    panel.background = element_blank(),
    plot.margin = margin(.1, .1, .1, .1, "cm"))

## ----svd-on-digits, echo=FALSE---------------------------------------------
# Perform SVD on pre-processed (centred and scaled) data
X_svd <- svd(scale(X_train))

## ----pca-on-digits-scree, fig.width=2.5, fig.height=1.25, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# PCA and SVD are related through
#   Sigma = X^T X / (n - 1)  and  X = U D V^T.
# Therefore Sigma = V D U^T U D V^T / (n - 1) = V D^2 V^T / (n - 1),
# and the eigenvalues of Sigma are d_ii^2 / (n - 1).
data_pca <- tibble(
  pc = 1:dim(X_train)[2],
  ev = X_svd$d^2 / (dim(X_train)[1] - 1))

ggplot(data_pca, aes(x = pc, y = ev)) +
  geom_point(size = 0.5, colour = cbPalette[1]) +
  geom_line(
    data = tibble(x = c(0, 256), y = c(1, 1)),
    aes(x = x, y = y), colour = cbPalette[2]) +
  scale_x_continuous(name = "Principal Component") +
  scale_y_log10(name = "Eigenvalue") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 10),
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 7),
    plot.margin = margin(.1, .1, .1, .1, "cm")) +
  ggtitle("Scree plot")
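
## ----pca-svd-check-sketch, echo=FALSE--------------------------------------
# A minimal numerical check of the PCA/SVD relation above (chunk name
# illustrative): the squared standard deviations reported by prcomp should
# equal the eigenvalues d_ii^2 / (n - 1) computed from the SVD.
pca_digits <- prcomp(scale(X_train))
max(abs(pca_digits$sdev^2 - data_pca$ev)) # should be (numerically) zero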
## ----pca-on-digits-components, fig.width=1.6, fig.height=2.2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Visualise the first m principal components
m <- 4
data_pc_plot <- do.call(bind_rows,
  apply(cbind(1:m, t(X_svd$v)[1:m,]), 1, function(img) {
    img_data <- cbind(
      rep.int(img[1], 16),        # Number of the PC
      1:16,                       # x coordinate
      matrix(img[-1], ncol = 16)) # PC data / image data
    colnames(img_data) <- c("pc", "x", seq(16, 1))
    as_tibble(img_data) %>%
      gather(y, value, -pc, -x) %>%
      mutate(y = as.numeric(y))
  }))

ggplot(data_pc_plot, aes(x = x, y = y)) +
  geom_tile(
    aes(fill = value),
    width = 1, height = 1, colour = "transparent") +
  facet_wrap(~ as.factor(pc), ncol = 2) +
  scale_x_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_y_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_fill_gradient(
    low = "black", high = "white",
    guide = FALSE) +
  coord_fixed() +
  theme(
    strip.background = element_rect(
      fill = "transparent", colour = cbPalette[1]),
    strip.text.x = element_text(face = "bold"),
    plot.margin = margin(.1, .1, .1, .1, "cm"))

## ----pca-on-digits-partial-reconstruction, echo=FALSE----------------------
# What information does this subspace of relevant PCs capture?
# Look at projections of the same images as above onto the
# reduced data subspace
pcs_indices <- 1:(data_pca %>% filter(ev >= 1) %>% count %>% as.numeric)
red_v <- X_svd$v[,pcs_indices]
m <- 4
# Reconstruct on the original scale here for visual purposes
data_reconstruct <- do.call(bind_rows,
  apply(cbind(
    1:m, y_train[1:m],
    X_train[1:m,] %*% red_v %*% t(red_v)), 1, function(img) {
      img_data <- cbind(
        rep.int(img[1], 16),
        rep.int(img[2], 16),
        1:16,
        matrix(img[-(1:2)], ncol = 16))
      colnames(img_data) <- c("id", "digit", "x", seq(16, 1))
      as_tibble(img_data) %>%
        gather(y, value, -id, -digit, -x) %>%
        mutate(y = as.numeric(y))
    }))

## ----pca-on-digits-partial-reconstruction-plot, fig.width=1.6, fig.height=2.2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
ggplot(data_reconstruct, aes(x = x, y = y)) +
  geom_tile(
    aes(fill = value),
    width = 1, height = 1, colour = "transparent") +
  facet_wrap(
    ~ id, ncol = 2,
    labeller = as_labeller(function(labels) {
      y_train[as.numeric(labels)]
    })) +
  scale_x_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_y_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_fill_gradient(
    low = "black", high = "white",
    guide = FALSE) +
  coord_fixed() +
  theme(
    strip.background = element_rect(
      fill = "transparent", colour = cbPalette[1]),
    strip.text.x = element_text(face = "bold"),
    plot.margin = margin(.1, .1, .1, .1, "cm"))
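
## ----pca-variance-captured-sketch, echo=FALSE------------------------------
# A minimal sketch (chunk name illustrative): the proportion of total
# variance captured by the retained principal components is the sum of
# their eigenvalues over the sum of all eigenvalues.
sum(data_pca$ev[pcs_indices]) / sum(data_pca$ev)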
## ----pca-on-digits-first-two-prep-and-cv, echo=FALSE-----------------------
# Project all data on the first two principal components
two_v <- X_svd$v[,1:2]
X_proj <- scale(X_train) %*% two_v
data_pc_proj <- tibble(
  digit = as.factor(y_train),
  pc1 = X_proj[,1],
  pc2 = X_proj[,2])

# Consider only 0s and 1s in the projected data
data_pca <- data_pc_proj %>%
  filter(digit %in% c(0, 1)) %>%
  mutate(digit = factor(digit, levels = c(0, 1)))

# Consider only 0s and 1s in the full, unprojected data
data_full <- as_tibble(data_train) %>%
  filter(digit %in% c(0, 1)) %>%
  mutate(digit = as.factor(digit))

# Find the two most variable features (pixels) across both classes, matching
# the two retained principal components. This makes sense here since all
# features are on the same scale.
most_variance_idx <- tibble(
  idx = 1:256,
  var = apply(data_full[,-1], 2, var)) %>%
  top_n(2, var) %>%
  select(idx) %>%
  as_vector %>%
  unname
# Restrict the dataset to these features
data_maxvar <- data_full[,c(1, 1 + most_variance_idx)]

# Since PCA and maximum variance selection are class-unrelated, they can be
# applied before cross-validation. Note that if you want to stay completely
# true to the principle that test data should not be seen before actually
# testing, then PCA and maximum variance selection should be done inside
# the CV loop (see the sketch at the end of this script):
# 1.  For PCA: Scale and centre only the training variables and save the
#     necessary parameters to do so
# 2.  Perform PCA/max var selection on the training set
# 3a. For PCA: Scale and centre the test data with the parameters saved
#     from step 1, then transform the test data with the PCA projection
#     matrix from step 2
# 3b. For max var selection: Choose the same variables in the test set
#     that were selected in the training set
folds <- caret::createFolds(data_pca$digit, k = 20)
cv_res <- do.call(bind_rows, lapply(folds, function(fold) {
  data_train_maxvar <- data_maxvar[-fold,]
  data_train_pca <- data_pca[-fold,]
  data_test_maxvar <- data_maxvar[fold,]
  data_test_pca <- data_pca[fold,]

  fit_qda_pca <- MASS::qda(digit ~ ., data_train_pca)
  fit_lda_pca <- MASS::lda(digit ~ ., data_train_pca)
  fit_qda_maxvar <- MASS::qda(digit ~ ., data_train_maxvar)
  fit_lda_maxvar <- MASS::lda(digit ~ ., data_train_maxvar)

  pred_qda_pca <- predict(fit_qda_pca, data_test_pca)$class
  pred_lda_pca <- predict(fit_lda_pca, data_test_pca)$class
  pred_qda_maxvar <- predict(fit_qda_maxvar, data_test_maxvar)$class
  pred_lda_maxvar <- predict(fit_lda_maxvar, data_test_maxvar)$class

  tibble(
    digit = data_test_pca$digit, # The same for both datasets
    qda_pca = data_test_pca$digit == pred_qda_pca,
    lda_pca = data_test_pca$digit == pred_lda_pca,
    qda_maxvar = data_test_pca$digit == pred_qda_maxvar,
    lda_maxvar = data_test_pca$digit == pred_lda_maxvar)
}))

overall_mis <- cv_res %>%
  select(-digit) %>%
  summarize_all(function(x) 1 - mean(x))
individual_mis <- cv_res %>%
  group_by(digit) %>%
  summarize_all(function(x) 1 - mean(x))

## ----pca-on-digits-first-two-plot, fig.width=2, fig.height=2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
ggplot(data_pca) +
  geom_point(aes(x = pc1, y = pc2, colour = digit), size = 0.8) +
  scale_x_continuous(TeX("$r_1^T x$ = PC1")) +
  scale_y_continuous(TeX("$r_2^T x$ = PC2")) +
  scale_colour_manual("Digit", values = cbPalette[-1]) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.box.margin = margin(0, 0, 0, 0, "cm"),
    legend.margin = margin(0, 0, 0, 0, "cm"),
    legend.spacing = unit(0, "cm"),
    plot.margin = margin(.1, .1, 0, .1, "cm"))

## ----pca-on-digits-first-two-error-table, echo=FALSE-----------------------
err_summary <- as.matrix(rbind(individual_mis[,-1], overall_mis))
rownames(err_summary) <- c("0", "1", "Overall")
colnames(err_summary) <- c(
  "QDA + PCA", "LDA + PCA", "QDA + max var", "LDA + max var")
kable(
  t(err_summary), "latex",
  booktabs = TRUE,
  digits = 3,
  caption = "Misclassification rate (20-fold CV)")
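
## ----pca-inside-cv-sketch, echo=FALSE---------------------------------------
# A minimal sketch of the stricter scheme described in steps 1-3a above,
# with centring, scaling, and PCA estimated on the training fold only.
# All names introduced here (X_full, y_full, cv_strict, ...) are
# illustrative and not part of the analysis above; only LDA + PCA is shown.
X_full <- as.matrix(data_full[,-1])
y_full <- data_full$digit
cv_strict <- do.call(bind_rows, lapply(folds, function(fold) {
  # Step 1: centring and scaling parameters from the training fold only
  ctr <- colMeans(X_full[-fold,])
  scl <- apply(X_full[-fold,], 2, sd)
  scl[scl == 0] <- 1 # guard against pixels that are constant within a fold
  X_tr <- scale(X_full[-fold,], center = ctr, scale = scl)
  # Step 2: PCA on the training fold
  v2 <- svd(X_tr)$v[, 1:2]
  Z_tr <- X_tr %*% v2
  # Step 3a: transform the test fold with the saved parameters
  Z_te <- scale(X_full[fold,], center = ctr, scale = scl) %*% v2
  fit <- MASS::lda(
    digit ~ .,
    tibble(digit = y_full[-fold], pc1 = Z_tr[,1], pc2 = Z_tr[,2]))
  pred <- predict(fit, tibble(pc1 = Z_te[,1], pc2 = Z_te[,2]))$class
  tibble(correct = y_full[fold] == pred)
}))
1 - mean(cv_strict$correct) # strict CV misclassification rate (LDA + PCA)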