## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp) # Latex in ggplot2 labels

cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette

# Packages for actual computation
# library(rpart)        # A package for building CART trees
# library(rpart.plot)   # Nice plotting of CART trees built with rpart
# library(randomForest) # A package to fit a random forest

## ----tree-data-example, fig.width=2.2, fig.height=1.9, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(2374633)

# Simulate some "perfect" data for a CART tree
# Create a two-dim grid and at each spot simulate some slightly randomized
# data points
per_spot <- 2
data_train <- do.call(bind_rows, apply(
  expand.grid(seq(1, 6), seq(1, 4, by = 0.75)), 1, function(c) {
    m <- matrix(rep.int(c, per_spot), nrow = per_spot, byrow = TRUE) +
      matrix(rnorm(2 * per_spot, sd = 0.15), nrow = per_spot)
    colnames(m) <- c("x1", "x2")
    as_tibble(m)
  })) %>%
  mutate(class = case_when( # Create ideal separation lines between the classes
    x1 > 3.5 ~ 0,
    x1 <= 3.5 & x2 >= 2.3 ~ 0,
    TRUE ~ 1) %>%
    as.factor)

# Create a classification tree using
# - Gini impurity
# - No minimum decrease in impurity measure (cp = 0)
# - Attempt a split on any node, however small (minsplit = 1)
# - Minimum leaf size is 1 (minbucket = 1)
cart_tree <- rpart::rpart(
  class ~ x1 + x2, data_train,
  parms = list(split = "gini"),
  control = rpart::rpart.control(cp = 0, minsplit = 1, minbucket = 1))

ggplot(data_train, aes(x = x1, y = x2)) +
  geom_point(aes(shape = class, colour = class), size = 3) +
  geom_hline(aes(yintercept = 2.2)) +
  geom_segment(aes(x = 3.5, y = 0.8, xend = 3.5, yend = 2.2)) +
  scale_x_continuous(TeX("$x_1$")) +
  scale_y_continuous(TeX("$x_2$"), expand = expand_scale(add = c(0, 0.1))) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  scale_shape_manual(values = c(48, 49), guide = FALSE) +
  theme_minimal()

## ----tree-plot-example, fig.width=2.75, fig.height=2.1, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Interpolate between the colours of the two classes
cfun <- colorRampPalette(c(cbPalette[2], cbPalette[3]))
rpart.plot::prp(cart_tree, extra = 104, box.palette = cfun(5))

## ----node-impurity-two-class, fig.width=5, fig.height=2.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
pi0 <- seq(0, 1, by = 0.01) # Probability for class 0
pi1 <- 1 - pi0              # Probability for class 1

mis <- 1 - pmax(pi0, pi1)
gini <- pi1 * (1 - pi1) + pi0 * (1 - pi0)
entropy <- -(ifelse(pi0 > 0, pi0 * log(pi0), 0) +
  ifelse(pi1 > 0, pi1 * log(pi1), 0))

data_plot <- tibble(
  x = rep.int(pi0, 3),
  y = c(mis, gini, entropy),
  type = rep(c("Misclassification", "Gini", "Entropy"), each = length(pi0)))

ggplot() +
  geom_line(aes(x = x, y = y, colour = type), data = data_plot) +
  scale_x_continuous(TeX("$\\widehat{\\pi}_{0m}$")) +
  scale_y_continuous("Impurity") +
  scale_colour_manual("Impurity Measure", values = cbPalette[-1]) +
  theme_minimal() +
  theme(legend.position = "bottom")
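
## ----gini-split-by-hand-sketch----
# Illustrative sketch of how CART scores a candidate split: compute the
# weighted Gini impurity after splitting the simulated training data at
# x1 <= 3.5 and compare it with the impurity of the root node; rpart picks
# the candidate split with the largest decrease. The object names in this
# chunk (gini_node, left, right, ...) are made up for the sketch; only
# data_train from the tree-data-example chunk above is assumed.
gini_node <- function(y) {
  p <- table(y) / length(y) # Class proportions within the node
  sum(p * (1 - p))          # Gini impurity: sum_k p_k * (1 - p_k)
}

gini_root <- gini_node(data_train$class)

left  <- data_train$class[data_train$x1 <= 3.5]
right <- data_train$class[data_train$x1 > 3.5]
gini_split <- (length(left) * gini_node(left) +
  length(right) * gini_node(right)) / nrow(data_train)

# Decrease in impurity achieved by this split
gini_root - gini_split
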
## ----bootstrap-example, fig.width=5, fig.height=2.25, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(472472434)
n_sample <- 200
mu <- 3
X_train <- rexp(n_sample, rate = 1 / mu)
theta_hat <- mean(X_train)
quant_99_hat <- quantile(X_train, probs = 0.99)

data_train <- tibble(
  x = X_train)

ggplot() +
  geom_histogram( # Histogram of actual training data
    aes(x = x, y = ..density..),
    data = data_train,
    fill = cbPalette[2], alpha = 0.8) +
  geom_histogram( # Histogram of a bootstrap sample
    aes(x = x, y = ..density..),
    data = tibble(
      x = data_train$x[sample(n_sample, replace = TRUE)]),
    fill = "transparent", colour = "black") +
  geom_line( # True data density
    aes(x = x, y = y),
    data = tibble(
      x = seq(0, 18.5, by = 0.1),
      y = dexp(seq(0, 18.5, by = 0.1), rate = 1 / mu)),
    colour = cbPalette[3]) +
  geom_line( # Empirical mean from data
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(theta_hat, 2),
      y = c(0, 0.4)),
    colour = "red", linetype = 2) +
  geom_line( # True mean of distribution
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(mu, 2),
      y = c(0, 0.4)),
    colour = cbPalette[3], linetype = 2) +
  geom_line( # Empirical 99% quantile
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(quant_99_hat, 2),
      y = c(0, 0.4)),
    colour = "red", linetype = 3) +
  geom_line( # True 99% quantile
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(qexp(0.99, rate = 1 / mu), 2),
      y = c(0, 0.4)),
    colour = cbPalette[3], linetype = 3) +
  scale_y_continuous("Frequency") +
  theme_minimal() +
  theme(plot.margin = margin(0.25, 0.25, 0, 0, "lines"))

## ----bootstrap-ci-example, fig.width=3, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Bootstrap 1000 times
B <- 1000
theta <- vector(mode = "numeric", length = B)
prec <- vector(mode = "numeric", length = B)
for (i in 1:B) {
  boot_idx <- sample(1:n_sample, replace = TRUE)
  theta[i] <- mean(data_train$x[boot_idx])
  prec[i] <- quantile(data_train$x[boot_idx], probs = 0.99)
}

p_mean <- ggplot() +
  geom_histogram( # Histogram of bootstrapped values
    aes(x = x, y = ..density..),
    data = tibble(x = theta),
    fill = cbPalette[2], colour = "transparent", bins = 30) +
  geom_line( # Empirical mean
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(theta_hat, 2),
      y = c(0, 1.7)),
    colour = "red", linetype = 2) +
  geom_line( # True mean
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(mu, 2),
      y = c(0, 1.7)),
    colour = cbPalette[3], linetype = 2) +
  scale_x_continuous(TeX("$\\widehat{\\theta}_{b, \\mathrm{mean}}$")) +
  scale_y_continuous("Frequency", expand = c(0, 0)) +
  theme_minimal()

p_prec <- ggplot() +
  geom_histogram( # Histogram of bootstrapped values
    aes(x = x, y = ..density..),
    data = tibble(x = prec),
    fill = cbPalette[2], colour = "transparent", bins = 30) +
  geom_line( # Empirical 99% quantile
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(quantile(data_train$x, probs = 0.99), 2),
      y = c(0, 1.7)),
    colour = "red", linetype = 2) +
  geom_line( # True 99% quantile
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(qexp(0.99, rate = 1 / mu), 2),
      y = c(0, 1.7)),
    colour = cbPalette[3], linetype = 2) +
  scale_x_continuous(TeX("$\\widehat{\\theta}_{b, 0.99}$")) +
  scale_y_continuous("Frequency", expand = c(0, 0)) +
  theme_minimal()

ggpubr::ggarrange(
  p_mean, p_prec,
  ncol = 1, nrow = 2)

# Calculate CIs
sigma_bs <- sqrt(sum((theta - mean(theta)) ^ 2) / (B - 1))
z <- qnorm(0.975)
normal_approx_quantiles <- c(
  theta_hat - z * sigma_bs,
  theta_hat + z * sigma_bs)
perc_method_quantiles <- quantile(theta, probs = c(0.025, 0.975))
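
## ----bootstrap-ci-quantile-sketch----
# Illustrative sketch: the chunk above only turns the bootstrapped means in
# `theta` into confidence intervals. The same two constructions can be applied
# to the bootstrapped 99% quantiles in `prec`; the object names ending in
# `_prec` are made up for this sketch, everything else is reused from the
# chunks above.
sigma_bs_prec <- sqrt(sum((prec - mean(prec)) ^ 2) / (B - 1))
normal_approx_quantiles_prec <- c( # Normal approximation interval
  quant_99_hat - z * sigma_bs_prec,
  quant_99_hat + z * sigma_bs_prec)
perc_method_quantiles_prec <- quantile(prec, probs = c(0.025, 0.975)) # Percentile method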