## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp) # Latex in ggplot2 labels

cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette

# Packages for actual computation
# library(rpart)        # A package for building CART trees
# library(rpart.plot)   # Nice plotting of CART trees built with rpart
# library(randomForest) # A package to fit a random forest

## ----tree-data-example, fig.width=2.2, fig.height=1.9, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(2374633)

# Simulate some "perfect" data for a CART tree
# Create a two-dim grid and at each spot simulate some slightly randomized
# data points
per_spot <- 2
data_train <- do.call(bind_rows, apply(
  expand.grid(seq(1, 6), seq(1, 4, by = 0.75)), 1, function(c) {
    m <- matrix(rep.int(c, per_spot), nrow = per_spot, byrow = TRUE) +
      matrix(rnorm(2 * per_spot, sd = 0.15), nrow = per_spot)
    colnames(m) <- c("x1", "x2")
    as_tibble(m)
  })) %>%
  mutate(class = case_when( # Create ideal separation lines between the classes
    x1 > 3.5 ~ 0,
    x1 <= 3.5 & x2 >= 2.3 ~ 0,
    TRUE ~ 1) %>%
    as.factor)

# Create a classification tree using
# - Gini impurity
# - No minimum decrease in impurity measure (cp = 0)
# - Attempt a split on any node, however small (minsplit = 1)
# - Minimum leaf size is 1 (minbucket = 1)
cart_tree <- rpart::rpart(
  class ~ x1 + x2, data_train,
  parms = list(split = "gini"),
  control = rpart::rpart.control(cp = 0, minsplit = 1, minbucket = 1))

ggplot(data_train, aes(x = x1, y = x2)) +
  geom_point(aes(shape = class, colour = class), size = 3) +
  geom_hline(aes(yintercept = 2.2)) +
  geom_segment(aes(x = 3.5, y = 0.8, xend = 3.5, yend = 2.2)) +
  scale_x_continuous(TeX("$x_1$")) +
  scale_y_continuous(TeX("$x_2$"), expand = expand_scale(add = c(0, 0.1))) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  scale_shape_manual(values = c(48, 49), guide = FALSE) +
  theme_minimal()

## ----tree-plot-example, fig.width=2.75, fig.height=2.1, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Interpolate between the colours of the two classes
cfun <- colorRampPalette(c(cbPalette[2], cbPalette[3]))
rpart.plot::prp(cart_tree, extra = 104, box.palette = cfun(5))

## ----node-impurity-two-class, fig.width=5, fig.height=2.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
pi0 <- seq(0, 1, by = 0.01) # Probability for class 0
pi1 <- 1 - pi0              # Probability for class 1

mis <- 1 - pmax(pi0, pi1)
gini <- pi1 * (1 - pi1) + pi0 * (1 - pi0)
entropy <- -(ifelse(pi0 > 0, pi0 * log(pi0), 0) +
  ifelse(pi1 > 0, pi1 * log(pi1), 0))

data_plot <- tibble(
  x = rep.int(pi0, 3),
  y = c(mis, gini, entropy),
  type = rep(c("Misclassification", "Gini", "Entropy"), each = length(pi0)))

ggplot() +
  geom_line(aes(x = x, y = y, colour = type), data = data_plot) +
  scale_x_continuous(TeX("$\\widehat{\\pi}_{0m}$")) +
  scale_y_continuous("Impurity") +
  scale_colour_manual("Impurity Measure", values = cbPalette[-1]) +
  theme_minimal() +
  theme(legend.position = "bottom")
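
## ----gini-split-by-hand-sketch----
# Illustrative sketch of how CART scores a candidate split: compute the
# weighted Gini impurity after splitting the simulated training data at
# x1 <= 3.5 and compare it with the impurity of the root node; rpart picks
# the candidate split with the largest decrease. The object names in this
# chunk (gini_node, left, right, ...) are made up for the sketch; only
# data_train from the tree-data-example chunk above is assumed.
gini_node <- function(y) {
  p <- table(y) / length(y) # Class proportions within the node
  sum(p * (1 - p))          # Gini impurity: sum_k p_k * (1 - p_k)
}

gini_root <- gini_node(data_train$class)

left  <- data_train$class[data_train$x1 <= 3.5]
right <- data_train$class[data_train$x1 > 3.5]
gini_split <- (length(left) * gini_node(left) +
  length(right) * gini_node(right)) / nrow(data_train)

# Decrease in impurity achieved by this split
gini_root - gini_split
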
## ----bootstrap-example, fig.width=5, fig.height=2.25, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(472472434)
n_sample <- 200
mu <- 3
X_train <- rexp(n_sample, rate = 1 / mu)
theta_hat <- mean(X_train)
quant_99_hat <- quantile(X_train, probs = 0.99)

data_train <- tibble(
  x = X_train)

ggplot() +
  geom_histogram( # Histogram of actual training data
    aes(x = x, y = ..density..),
    data = data_train,
    fill = cbPalette[2], alpha = 0.8) +
  geom_histogram( # Histogram of a bootstrap sample
    aes(x = x, y = ..density..),
    data = tibble(
      x = data_train$x[sample(n_sample, replace = TRUE)]),
    fill = "transparent", colour = "black") +
  geom_line( # True data density
    aes(x = x, y = y),
    data = tibble(
      x = seq(0, 18.5, by = 0.1),
      y = dexp(seq(0, 18.5, by = 0.1), rate = 1 / mu)),
    colour = cbPalette[3]) +
  geom_line( # Empirical mean from data
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(theta_hat, 2),
      y = c(0, 0.4)),
    colour = "red", linetype = 2) +
  geom_line( # True mean of distribution
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(mu, 2),
      y = c(0, 0.4)),
    colour = cbPalette[3], linetype = 2) +
  geom_line( # Empirical 99% quantile
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(quant_99_hat, 2),
      y = c(0, 0.4)),
    colour = "red", linetype = 3) +
  geom_line( # True 99% quantile
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(qexp(0.99, rate = 1 / mu), 2),
      y = c(0, 0.4)),
    colour = cbPalette[3], linetype = 3) +
  scale_y_continuous("Frequency") +
  theme_minimal() +
  theme(plot.margin = margin(0.25, 0.25, 0, 0, "lines"))

## ----bootstrap-ci-example, fig.width=3, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Bootstrap 1000 times
B <- 1000
theta <- vector(mode = "numeric", length = B)
prec <- vector(mode = "numeric", length = B)
for (i in 1:B) {
  boot_idx <- sample(1:n_sample, replace = TRUE)
  theta[i] <- mean(data_train$x[boot_idx])
  prec[i] <- quantile(data_train$x[boot_idx], probs = 0.99)
}

p_mean <- ggplot() +
  geom_histogram( # Histogram of bootstrapped values
    aes(x = x, y = ..density..),
    data = tibble(x = theta),
    fill = cbPalette[2], colour = "transparent", bins = 30) +
  geom_line( # Empirical mean
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(theta_hat, 2),
      y = c(0, 1.7)),
    colour = "red", linetype = 2) +
  geom_line( # True mean
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(mu, 2),
      y = c(0, 1.7)),
    colour = cbPalette[3], linetype = 2) +
  scale_x_continuous(TeX("$\\widehat{\\theta}_{b, \\mathrm{mean}}$")) +
  scale_y_continuous("Frequency", expand = c(0, 0)) +
  theme_minimal()

p_prec <- ggplot() +
  geom_histogram( # Histogram of bootstrapped values
    aes(x = x, y = ..density..),
    data = tibble(x = prec),
    fill = cbPalette[2], colour = "transparent", bins = 30) +
  geom_line( # Empirical 99% quantile
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(quantile(data_train$x, probs = 0.99), 2),
      y = c(0, 1.7)),
    colour = "red", linetype = 2) +
  geom_line( # True 99% quantile
    aes(x = x, y = y),
    data = tibble(
      x = rep.int(qexp(0.99, rate = 1 / mu), 2),
      y = c(0, 1.7)),
    colour = cbPalette[3], linetype = 2) +
  scale_x_continuous(TeX("$\\widehat{\\theta}_{b, 0.99}$")) +
  scale_y_continuous("Frequency", expand = c(0, 0)) +
  theme_minimal()

ggpubr::ggarrange(
  p_mean, p_prec,
  ncol = 1, nrow = 2)

# Calculate CIs
sigma_bs <- sqrt(sum((theta - mean(theta)) ^ 2) / (B - 1))
z <- qnorm(0.975)
normal_approx_quantiles <- c(
  theta_hat - z * sigma_bs,
  theta_hat + z * sigma_bs)
perc_method_quantiles <- quantile(theta, probs = c(0.025, 0.975))
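
## ----bootstrap-ci-quantile-sketch----
# Illustrative sketch: the chunk above only turns the bootstrapped means in
# `theta` into confidence intervals. The same two constructions can be applied
# to the bootstrapped 99% quantiles in `prec`; the object names ending in
# `_prec` are made up for this sketch, everything else is reused from the
# chunks above.
sigma_bs_prec <- sqrt(sum((prec - mean(prec)) ^ 2) / (B - 1))
normal_approx_quantiles_prec <- c( # Normal approximation interval
  quant_99_hat - z * sigma_bs_prec,
  quant_99_hat + z * sigma_bs_prec)
perc_method_quantiles_prec <- quantile(prec, probs = c(0.025, 0.975)) # Percentile method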