## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp)  # LaTeX in ggplot2 labels
library(kableExtra) # Pretty printing for tables

cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette

# Packages for actual computation
# library(MASS)            # Contains mvrnorm to simulate multivariate normal
#                          # Use as MASS::mvrnorm is recommended since
#                          # MASS::select clashes with dplyr::select
# library(randomForest)    # Fitting of random forests
# library(randomForestSRC) # Faster implementation of random forests with
#                          # almost the same interface as the randomForest
#                          # package
# library(ggRandomForests) # Offers functionality to extract easily plottable
#                          # data from a randomForestSRC fit
# library(DAAG)            # Contains the `monica` dataset
# library(ElemStatLearn)   # Contains the `SAheart` dataset

## ----random-forest-toy-example, fig.width=4, fig.height=1.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(28246210)

# Simulate data: strongly correlated predictors, only x1 carries signal
mu <- rep.int(0, 5)
Sigma <- diag(rep.int(1, 5))
Sigma[Sigma == 0] <- 0.98
sigma_err <- 1

simulate_data <- function(n, mu, Sigma, sigma_err) {
  X <- MASS::mvrnorm(n, mu, Sigma)
  colnames(X) <- sprintf("x%d", 1:length(mu))
  y <- X[,1] ^ 2 + rnorm(n, sd = sigma_err)
  bind_cols(
    tibble(y = y),
    as_tibble(X))
}

# Create training sample
n_train <- 50
data_train <- simulate_data(n_train, mu, Sigma, sigma_err)

# Create test sample
n_test <- 100
data_test <- simulate_data(n_test, mu, Sigma, sigma_err)

# Baseline: a single regression tree
single_tree <- rpart::rpart(
  y ~ x1 + x2 + x3 + x4 + x5,
  data_train,
  method = "anova")
test_error_single <- mean((predict(single_tree, data_test) - data_test$y) ^ 2)

# Test error of a random forest and of bagging for ensembles of 1 to B trees
B <- 300
test_error <- do.call(cbind, parallel::mclapply(1:B, function(b) {
  rf <- randomForest::randomForest(
    y ~ x1 + x2 + x3 + x4 + x5,
    data_train,
    ntree = b)
  pred_bag <- apply(
    do.call(cbind, parallel::mclapply(1:b, function(j) {
      t <- rpart::rpart(
        y ~ x1 + x2 + x3 + x4 + x5,
        data_train[sample(n_train, replace = TRUE),],
        method = "anova")
      predict(t, data_test)
    })), 1, mean)
  c(
    mean((predict(rf, data_test) - data_test$y) ^ 2), # RF test error
    mean((pred_bag - data_test$y) ^ 2))               # Bagging test error
}))

data_plot <- tibble(
  x = rep.int(1:B, 2),
  y = c(test_error[1,], test_error[2,]),
  type = as.factor(c(
    rep.int("rf", B),
    rep.int("bagging", B))))

ggplot() +
  geom_point(
    aes(x = x, y = y, colour = type),
    data = data_plot, size = 1) +
  geom_hline(
    aes(yintercept = test_error_single),
    linetype = 2, colour = cbPalette[4]) +
  scale_x_continuous("Number of trees") +
  scale_y_continuous("Test error") +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal()
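
## ----random-forest-toy-oob-sketch, echo=FALSE-----------------------------
# A minimal sketch (chunk name illustrative): for regression forests,
# randomForest records the out-of-bag MSE after each tree in `$mse` (the
# SAheart example below relies on this as well), so the test error curve
# above can be approximated without a held-out sample.
rf_oob <- randomForest::randomForest(
  y ~ x1 + x2 + x3 + x4 + x5,
  data_train,
  ntree = B)
tail(rf_oob$mse, 1) # OOB error estimate after all B trees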
## ----random-forest-monica-example, fig.width=7, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Data set monitoring different risk factors associated with cardiovascular
# disease. The outcome is whether or not patients were alive after a 10-year
# period.
n_tree <- 200
data_all <- as_tibble(DAAG::monica)
rf <- randomForest::randomForest(
  outcome ~ .,
  data_all,
  na.action = na.omit,
  importance = TRUE,
  ntree = n_tree)

data_error <- tibble(
  x = rep.int(1:n_tree, 3),
  y = as.vector(rf$err.rate),
  type = as.factor(rep(c("OOB", "Alive", "Dead"), each = n_tree)))

p1 <- ggplot(data_error, mapping = aes(x = x, y = y, colour = type)) +
  geom_point(size = 1) +
  scale_x_continuous("Number of Trees") +
  scale_y_continuous("Out-of-bag error") +
  scale_colour_manual("Type", values = cbPalette[-1]) +
  theme_minimal() +
  theme(
    legend.position = c(0.85, 0.75),
    legend.title = element_text(size = 9, vjust = 0.6),
    legend.text = element_text(size = 8),
    legend.key.size = unit(8, "pt"),
    legend.background = element_rect(fill = "white", size = 0.1)) +
  ggtitle("Error estimate")

data_importance <- tibble(
  var = as.factor(rep.int(rownames(rf$importance), 4)),
  decrease = as.vector(scale(rf$importance, center = FALSE)),
  type = as.factor(rep(
    c("Alive Acc.", "Dead Acc.", "Mean Acc.", "Mean Gini"),
    each = length(rownames(rf$importance)))))

p2 <- ggplot(data_importance, aes(x = var, y = decrease, fill = type)) +
  geom_bar(
    position = "dodge",
    stat = "identity") +
  coord_flip() +
  scale_x_discrete(name = NULL) +
  scale_y_continuous(name = "Decrease") +
  scale_fill_manual("Type", values = cbPalette[c(2, 3, 1, 4)]) +
  theme_minimal() +
  theme(
    legend.position = c(0.85, 0.75),
    panel.grid = element_blank(),
    legend.title = element_text(size = 9, vjust = 0.6),
    legend.text = element_text(size = 8),
    legend.key.size = unit(8, "pt"),
    legend.background = element_rect(fill = "white", size = 0.1)) +
  ggtitle("Variable importance")

ggpubr::ggarrange(
  p1, p2,
  ncol = 2)
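
## ----random-forest-monica-importance-sketch, echo=FALSE-------------------
# A minimal sketch (chunk name illustrative): the raw importance measures
# plotted in p2 can also be queried directly with randomForest::importance().
# type = 1 gives the permutation-based mean decrease in accuracy, type = 2
# the mean decrease in Gini impurity; both require importance = TRUE above.
imp_acc <- randomForest::importance(rf, type = 1)
imp_gini <- randomForest::importance(rf, type = 2)
# Rank the risk factors by permutation importance
imp_acc[order(imp_acc[,1], decreasing = TRUE), , drop = FALSE]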
## ----random-forest-SAheart-example-fit, fig.width=3, fig.height=4.3, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Data set monitoring different risk factors associated with coronary heart
# disease. The outcome taken here is ldl, which encodes cholesterol values.
data_all <- as_tibble(ElemStatLearn::SAheart) %>%
  mutate(chd = as.factor(case_when(
    chd == 0 ~ "No CHD",
    TRUE ~ "CHD")))

# Fit random forest
n_tree <- 300
rf <- randomForest::randomForest(
  ldl ~ .,
  data = data_all,
  na.action = na.omit,
  importance = TRUE,
  ntree = n_tree)

data_error <- tibble(
  x = 1:n_tree,
  y = rf$mse)

p1 <- ggplot(data_error, mapping = aes(x = x, y = y)) +
  geom_point(size = 0.8) +
  scale_x_continuous("Number of Trees") +
  scale_y_continuous("Out-of-bag error (MSE)") +
  theme_minimal() +
  theme(axis.title = element_text(size = 8)) +
  ggtitle("Error estimate")

data_importance <- tibble(
  var = as.factor(rep.int(rownames(rf$importance), 2)),
  decrease = as.vector(scale(rf$importance, center = FALSE)),
  type = as.factor(rep(
    c("Mean accuracy", "Mean MSE"),
    each = length(rownames(rf$importance)))))

p2 <- ggplot(data_importance, aes(x = var, y = decrease, fill = type)) +
  geom_bar(
    position = "dodge",
    stat = "identity") +
  coord_flip() +
  scale_x_discrete(name = NULL) +
  scale_y_continuous(name = "Decrease") +
  scale_fill_manual("Type", values = cbPalette[c(2, 3, 1, 4)]) +
  theme_minimal() +
  theme(
    legend.position = c(0.75, 0.85),
    panel.grid = element_blank(),
    legend.title = element_text(size = 9, vjust = 0.6),
    legend.text = element_text(size = 8),
    legend.key.size = unit(8, "pt"),
    legend.background = element_rect(fill = "white", size = 0.1),
    axis.title = element_text(size = 8)) +
  ggtitle("Variable importance")

ggpubr::ggarrange(
  p1, p2,
  ncol = 1, nrow = 2,
  heights = c(1, 2))

## ----random-forest-SAheart-example-pred, fig.width=4, fig.height=4, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Out-of-bag predictions from the fitted forest
yhat <- rf$predicted
data_out <- bind_cols(data_all, tibble(yhat = yhat)) %>%
  select(
    sbp, tobacco, adiposity, typea, obesity,
    alcohol, age, famhist, chd, ldl, yhat)

n_var <- dim(data_out)[2] - 2
var_types <- data_out %>%
  summarise_all(class) %>%
  gather(variable, class) %>%
  mutate(factor = class == "factor")

ps <- lapply(1:n_var, function(i) {
  p <- ggplot(
    data = data_out,
    aes(y = yhat, x = !!(sym(var_types[[i, "variable"]])))) +
    geom_point(aes(y = ldl), size = 0.8, alpha = 0.2, colour = cbPalette[2])
  if (var_types[[i, "factor"]]) {
    p <- p + geom_boxplot()
  } else {
    p <- p + geom_smooth(colour = cbPalette[3], size = 0.8)
  }
  p +
    scale_y_continuous(TeX("$\\widehat{y}$")) +
    theme_minimal() +
    theme(
      axis.text = element_text(size = 6),
      axis.title = element_text(size = 8))
})
ggpubr::ggarrange(plotlist = ps, nrow = 3, ncol = 3)

## ----projection-example, fig.width=2, fig.height=1, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(2903078)

# Direction of axis onto which points are going to be projected
p <- matrix(c(1, 0.5) / (sqrt(1 ^ 2 + 0.5 ^ 2)), ncol = 1)

# Data points about this axis
n <- 8
x <- seq(1, n) + rnorm(n, sd = 0.15)
y <- 0.5 * x + rt(n, 4)
X <- matrix(c(x, y), ncol = 2)

# Projections onto the axis spanned by p
Xproj <- t(sapply(X %*% p, function(c) c * p))

data_plot <- tibble(
  x = X[,1],
  y = X[,2],
  xp = Xproj[,1],
  yp = Xproj[,2])

ggplot(data_plot) +
  geom_abline(slope = 0.5, intercept = 0) +
  geom_segment(
    aes(x = x, y = y, xend = xp, yend = yp),
    colour = cbPalette[1]) +
  geom_point(aes(x = x, y = y), size = 1) +
  geom_point(aes(x = xp, y = yp), colour = cbPalette[2], size = 1) +
  coord_fixed() +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    axis.text = element_blank(),
    panel.grid = element_blank(),
    plot.margin = margin(0, 0, 0, 0, "cm"))
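
## ----projection-matrix-sketch, echo=FALSE---------------------------------
# A minimal check (chunk name illustrative): since p has unit length, the
# projection onto its span is given by the rank-one matrix P = p p^T, so
# Xproj above should equal X P up to floating point error.
P <- p %*% t(p)
max(abs(X %*% P - Xproj)) # should be (numerically) zero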
## ----pca-example, fig.width=1.8, fig.height=2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(290307272)

# Postulate the principal components I want to get
R <- matrix(c(1, 1, -1, 1), ncol = 2)
R <- scale(R, center = FALSE) # Normalise the columns to unit length
# Postulate the standard deviation I want to get in the direction of each
# principal component (the variances are the squared diagonal entries)
D <- diag(c(0.2, 1))
# Construct covariance matrix
Sigma <- t(R) %*% D %*% D %*% R

# Simulate data
n_sample <- 100
xs <- MASS::mvrnorm(n = n_sample, mu = c(1, 1), Sigma = Sigma)

# Centre and scale data
xs <- scale(xs)
colnames(xs) <- c("x", "y")
data_plot <- as_tibble(xs)

# Determine PCA from data
xs_svd <- svd(xs)
pcs <- xs_svd$v

# Construct axes
centre <- c(1, -1)
data_axes <- tibble(
  x = rep.int(centre[1], 4),
  xend = c(centre[1] + 1, centre[1],
           centre[1] + pcs[1, 1], centre[1] + pcs[1, 2]),
  y = rep.int(centre[2], 4),
  yend = c(centre[2], centre[2] + 1,
           centre[2] + pcs[2, 1], centre[2] + pcs[2, 2]),
  type = as.factor(
    c(rep.int("Cartesian", 2), rep.int("Principal Component", 2))))

ggplot(mapping = aes(x = x, y = y)) +
  geom_point(
    data = data_plot,
    colour = cbPalette[1], fill = cbPalette[1], size = 0.8) +
  geom_segment(
    data = data_axes,
    aes(xend = xend, yend = yend, color = type),
    arrow = arrow(length = unit(1.5, "mm"), angle = 35),
    size = 0.8) +
  scale_x_continuous(
    name = NULL, breaks = NULL,
    limits = c(-2.1, 2.1), expand = c(0, 0)) +
  scale_y_continuous(
    name = NULL, breaks = NULL,
    limits = c(-2.1, 2.1), expand = c(0, 0)) +
  scale_color_manual(
    name = "Axes",
    values = cbPalette[-1],
    guide = guide_legend(title.position = "top", title.hjust = 0.5)) +
  coord_fixed(ratio = 1) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    plot.margin = unit(c(0, 0, 0, 0), "lines"),
    legend.position = "bottom",
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 7),
    legend.key.size = unit(7, "pt"))
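
## ----pca-prcomp-sketch, echo=FALSE----------------------------------------
# A minimal cross-check (chunk name illustrative): stats::prcomp computes
# the same decomposition. As `xs` is already centred and scaled, its
# rotation matrix should agree with svd(xs)$v up to the sign of each column.
pca_fit <- prcomp(xs, center = FALSE, scale. = FALSE)
max(abs(abs(pca_fit$rotation) - abs(pcs))) # should be (numerically) zero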
## ----load-digits, echo=FALSE----------------------------------------------
# These are greyscale images of handwritten digits. Each image has dimensions
# 16 x 16 and therefore results in a 256-dimensional predictor space.
# The goal is to predict the digit from the image.
data_train <- ElemStatLearn::zip.train
colnames(data_train) <- c("digit", sprintf("x%d", 1:256))
y_train <- data_train[,1]
X_train <- data_train[,-1]

## ----digit-freq-table, echo=FALSE-----------------------------------------
data_freq <- matrix(
  c(0:9, as.vector(table(y_train)) / length(y_train)),
  ncol = 2)
colnames(data_freq) <- c("Digit", "Frequency")
kable(
  data_freq, "latex",
  booktabs = TRUE,
  digits = c(0, 2),
  align = rep.int('c', 2),
  linesep = "")

## ----some-digits, fig.width=2.5, fig.height=2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Plot some of the training data: get the data into a good format for
# ggplot, then plot with geom_tile and facet_wrap
m <- 6 # Number of digits to plot
data_plot <- do.call(bind_rows,
  apply(cbind(1:m, data_train[1:m,]), 1, function(img) {
    img_data <- cbind(
      rep.int(img[1], 16),            # Image ID
      rep.int(img[2], 16),            # Actual digit
      1:16,                           # x coordinate
      matrix(img[-(1:2)], ncol = 16)) # Image data
    colnames(img_data) <- c("id", "digit", "x", seq(16, 1))
    # y coordinates are backwards here since ggplot has zero in the lower
    # left instead of the upper left.
    # ggplot expects data in a narrow, long format.
    as_tibble(img_data) %>%
      gather(y, value, -id, -digit, -x) %>%
      mutate(y = as.numeric(y))
  }))

ggplot(data_plot, aes(x = x, y = y)) +
  geom_tile(
    aes(fill = value),
    width = 1, height = 1, colour = "transparent") +
  facet_wrap(
    ~ id, ncol = 3,
    labeller = as_labeller(function(labels) {
      y_train[as.numeric(labels)]
    })) +
  scale_x_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_y_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_fill_gradient(
    limits = c(-1, 1),
    low = "black", high = "white",
    guide = FALSE) +
  coord_fixed() +
  theme(
    strip.background = element_rect(
      fill = "transparent", colour = cbPalette[1]),
    strip.text.x = element_text(face = "bold"),
    panel.background = element_blank(),
    plot.margin = margin(.1, .1, .1, .1, "cm"))

## ----svd-on-digits, echo=FALSE---------------------------------------------
# Perform SVD on pre-processed (centred and scaled) data
X_svd <- svd(scale(X_train))

## ----pca-on-digits-scree, fig.width=2.5, fig.height=1.25, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# PCA and SVD are related through
#   Sigma = X^T X / (n - 1)  and  X = U D V^T.
# Therefore Sigma = V D U^T U D V^T / (n - 1) = V D^2 V^T / (n - 1),
# and the eigenvalues of Sigma are d_ii^2 / (n - 1).
data_pca <- tibble(
  pc = 1:dim(X_train)[2],
  ev = X_svd$d^2 / (dim(X_train)[1] - 1))

ggplot(data_pca, aes(x = pc, y = ev)) +
  geom_point(size = 0.5, colour = cbPalette[1]) +
  geom_line(
    data = tibble(x = c(0, 256), y = c(1, 1)),
    aes(x = x, y = y), colour = cbPalette[2]) +
  scale_x_continuous(name = "Principal Component") +
  scale_y_log10(name = "Eigenvalue") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 10),
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 7),
    plot.margin = margin(.1, .1, .1, .1, "cm")) +
  ggtitle("Scree plot")
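
## ----pca-svd-check-sketch, echo=FALSE--------------------------------------
# A minimal numerical check of the PCA/SVD relation above (chunk name
# illustrative): the squared standard deviations reported by prcomp should
# equal the eigenvalues d_ii^2 / (n - 1) computed from the SVD.
pca_digits <- prcomp(scale(X_train))
max(abs(pca_digits$sdev^2 - data_pca$ev)) # should be (numerically) zero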
## ----pca-on-digits-components, fig.width=1.6, fig.height=2.2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Visualise the first m principal components
m <- 4
data_pc_plot <- do.call(bind_rows,
  apply(cbind(1:m, t(X_svd$v)[1:m,]), 1, function(img) {
    img_data <- cbind(
      rep.int(img[1], 16),        # Number of the PC
      1:16,                       # x coordinate
      matrix(img[-1], ncol = 16)) # PC data / image data
    colnames(img_data) <- c("pc", "x", seq(16, 1))
    as_tibble(img_data) %>%
      gather(y, value, -pc, -x) %>%
      mutate(y = as.numeric(y))
  }))

ggplot(data_pc_plot, aes(x = x, y = y)) +
  geom_tile(
    aes(fill = value),
    width = 1, height = 1, colour = "transparent") +
  facet_wrap(~ as.factor(pc), ncol = 2) +
  scale_x_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_y_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_fill_gradient(
    low = "black", high = "white",
    guide = FALSE) +
  coord_fixed() +
  theme(
    strip.background = element_rect(
      fill = "transparent", colour = cbPalette[1]),
    strip.text.x = element_text(face = "bold"),
    plot.margin = margin(.1, .1, .1, .1, "cm"))

## ----pca-on-digits-partial-reconstruction, echo=FALSE----------------------
# What information does this subspace of relevant PCs capture?
# Look at projections of the same images as above onto the
# reduced data subspace
pcs_indices <- 1:(data_pca %>% filter(ev >= 1) %>% count %>% as.numeric)
red_v <- X_svd$v[,pcs_indices]
m <- 4
# Reconstruct on the original scale here for visual purposes
data_reconstruct <- do.call(bind_rows,
  apply(cbind(
    1:m, y_train[1:m],
    X_train[1:m,] %*% red_v %*% t(red_v)), 1, function(img) {
      img_data <- cbind(
        rep.int(img[1], 16),
        rep.int(img[2], 16),
        1:16,
        matrix(img[-(1:2)], ncol = 16))
      colnames(img_data) <- c("id", "digit", "x", seq(16, 1))
      as_tibble(img_data) %>%
        gather(y, value, -id, -digit, -x) %>%
        mutate(y = as.numeric(y))
    }))

## ----pca-on-digits-partial-reconstruction-plot, fig.width=1.6, fig.height=2.2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE----
ggplot(data_reconstruct, aes(x = x, y = y)) +
  geom_tile(
    aes(fill = value),
    width = 1, height = 1, colour = "transparent") +
  facet_wrap(
    ~ id, ncol = 2,
    labeller = as_labeller(function(labels) {
      y_train[as.numeric(labels)]
    })) +
  scale_x_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_y_continuous(
    name = NULL, breaks = NULL, expand = c(0, 0)) +
  scale_fill_gradient(
    low = "black", high = "white",
    guide = FALSE) +
  coord_fixed() +
  theme(
    strip.background = element_rect(
      fill = "transparent", colour = cbPalette[1]),
    strip.text.x = element_text(face = "bold"),
    plot.margin = margin(.1, .1, .1, .1, "cm"))
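
## ----pca-variance-captured-sketch, echo=FALSE------------------------------
# A minimal sketch (chunk name illustrative): the proportion of total
# variance captured by the retained principal components is the sum of
# their eigenvalues over the sum of all eigenvalues.
sum(data_pca$ev[pcs_indices]) / sum(data_pca$ev)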
## ----pca-on-digits-first-two-prep-and-cv, echo=FALSE-----------------------
# Project all data on the first two principal components
two_v <- X_svd$v[,1:2]
X_proj <- scale(X_train) %*% two_v
data_pc_proj <- tibble(
  digit = as.factor(y_train),
  pc1 = X_proj[,1],
  pc2 = X_proj[,2])

# Consider only 0s and 1s in the projected data
data_pca <- data_pc_proj %>%
  filter(digit %in% c(0, 1)) %>%
  mutate(digit = factor(digit, levels = c(0, 1)))

# Consider only 0s and 1s in the full, unprojected data
data_full <- as_tibble(data_train) %>%
  filter(digit %in% c(0, 1)) %>%
  mutate(digit = as.factor(digit))

# Find the two most variable features (pixels) across both classes, matching
# the two retained principal components. This makes sense here since all
# features are on the same scale.
most_variance_idx <- tibble(
  idx = 1:256,
  var = apply(data_full[,-1], 2, var)) %>%
  top_n(2, var) %>%
  select(idx) %>%
  as_vector %>%
  unname
# Restrict the dataset to these features
data_maxvar <- data_full[,c(1, 1 + most_variance_idx)]

# Since PCA and maximum variance selection are class-unrelated, they can be
# applied before cross-validation. Note that if you want to stay completely
# true to the principle that test data should not be seen before actually
# testing, then PCA and maximum variance selection should be done inside
# the CV loop (see the sketch at the end of this script):
# 1.  For PCA: Scale and centre only the training variables and save the
#     necessary parameters to do so
# 2.  Perform PCA/max var selection on the training set
# 3a. For PCA: Scale and centre the test data with the parameters saved
#     from step 1, then transform the test data with the PCA projection
#     matrix from step 2
# 3b. For max var selection: Choose the same variables in the test set
#     that were selected in the training set
folds <- caret::createFolds(data_pca$digit, k = 20)
cv_res <- do.call(bind_rows, lapply(folds, function(fold) {
  data_train_maxvar <- data_maxvar[-fold,]
  data_train_pca <- data_pca[-fold,]
  data_test_maxvar <- data_maxvar[fold,]
  data_test_pca <- data_pca[fold,]

  fit_qda_pca <- MASS::qda(digit ~ ., data_train_pca)
  fit_lda_pca <- MASS::lda(digit ~ ., data_train_pca)
  fit_qda_maxvar <- MASS::qda(digit ~ ., data_train_maxvar)
  fit_lda_maxvar <- MASS::lda(digit ~ ., data_train_maxvar)

  pred_qda_pca <- predict(fit_qda_pca, data_test_pca)$class
  pred_lda_pca <- predict(fit_lda_pca, data_test_pca)$class
  pred_qda_maxvar <- predict(fit_qda_maxvar, data_test_maxvar)$class
  pred_lda_maxvar <- predict(fit_lda_maxvar, data_test_maxvar)$class

  tibble(
    digit = data_test_pca$digit, # The same for both datasets
    qda_pca = data_test_pca$digit == pred_qda_pca,
    lda_pca = data_test_pca$digit == pred_lda_pca,
    qda_maxvar = data_test_pca$digit == pred_qda_maxvar,
    lda_maxvar = data_test_pca$digit == pred_lda_maxvar)
}))

overall_mis <- cv_res %>%
  select(-digit) %>%
  summarize_all(function(x) 1 - mean(x))
individual_mis <- cv_res %>%
  group_by(digit) %>%
  summarize_all(function(x) 1 - mean(x))

## ----pca-on-digits-first-two-plot, fig.width=2, fig.height=2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
ggplot(data_pca) +
  geom_point(aes(x = pc1, y = pc2, colour = digit), size = 0.8) +
  scale_x_continuous(TeX("$r_1^T x$ = PC1")) +
  scale_y_continuous(TeX("$r_2^T x$ = PC2")) +
  scale_colour_manual("Digit", values = cbPalette[-1]) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.box.margin = margin(0, 0, 0, 0, "cm"),
    legend.margin = margin(0, 0, 0, 0, "cm"),
    legend.spacing = unit(0, "cm"),
    plot.margin = margin(.1, .1, 0, .1, "cm"))

## ----pca-on-digits-first-two-error-table, echo=FALSE-----------------------
err_summary <- as.matrix(rbind(individual_mis[,-1], overall_mis))
rownames(err_summary) <- c("0", "1", "Overall")
colnames(err_summary) <- c(
  "QDA + PCA", "LDA + PCA", "QDA + max var", "LDA + max var")
kable(
  t(err_summary), "latex",
  booktabs = TRUE,
  digits = 3,
  caption = "Misclassification rate (20-fold CV)")
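
## ----pca-inside-cv-sketch, echo=FALSE---------------------------------------
# A minimal sketch of the stricter scheme described in steps 1-3a above,
# with centring, scaling, and PCA estimated on the training fold only.
# All names introduced here (X_full, y_full, cv_strict, ...) are
# illustrative and not part of the analysis above; only LDA + PCA is shown.
X_full <- as.matrix(data_full[,-1])
y_full <- data_full$digit
cv_strict <- do.call(bind_rows, lapply(folds, function(fold) {
  # Step 1: centring and scaling parameters from the training fold only
  ctr <- colMeans(X_full[-fold,])
  scl <- apply(X_full[-fold,], 2, sd)
  scl[scl == 0] <- 1 # guard against pixels that are constant within a fold
  X_tr <- scale(X_full[-fold,], center = ctr, scale = scl)
  # Step 2: PCA on the training fold
  v2 <- svd(X_tr)$v[, 1:2]
  Z_tr <- X_tr %*% v2
  # Step 3a: transform the test fold with the saved parameters
  Z_te <- scale(X_full[fold,], center = ctr, scale = scl) %*% v2
  fit <- MASS::lda(
    digit ~ .,
    tibble(digit = y_full[-fold], pc1 = Z_tr[,1], pc2 = Z_tr[,2]))
  pred <- predict(fit, tibble(pc1 = Z_te[,1], pc2 = Z_te[,2]))$class
  tibble(correct = y_full[fold] == pred)
}))
1 - mean(cv_strict$correct) # strict CV misclassification rate (LDA + PCA)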