## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp) # Latex in ggplot2 labels

cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette

# Packages for actual computation
# library(MASS)   # Contains mvrnorm to simulate multivariate normal
#                 # As well as functionality for LDA and QDA
#                 # Use as MASS::mvrnorm is recommended since MASS::select
#                 # clashes with dplyr::select
# library(glmnet) # Here: Can be used for (multi-class) logistic regression
#                 # In general: Provides lots of functionality for regularized
#                 # regression. It will show up in later lectures again
# library(nnet)   # Package for fitting neural networks. Contains a function
#                 # to fit multi-class logistic regression

## ----0-1-regression-example, fig.width=5, fig.height=3, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Use the infamous iris data set for illustration purposes
data_train <- as_tibble(datasets::iris) %>%
  filter(Species %in% c("setosa", "versicolor")) %>% # Focus on two classes
  mutate(
    class = case_when(
      Species == "setosa" ~ 0,
      TRUE ~ 1)) # Set 'setosa' to 0 and 'versicolor' to 1

# Fit with only one predictor (e.g. Sepal.Length)
fit1 <- lm(class ~ Sepal.Length, data_train)

X_pred <- matrix(c(1, 1, 4.4, 6.6), ncol = 2)
y_pred <- X_pred %*% matrix(coef(fit1), ncol = 1)
data_pred <- tibble(
  x = X_pred[,2],
  y = y_pred)

# Decision boundary (regression line intersects horizontal line at 0.5)
decision_bd1 <- (0.5 - coef(fit1)[1]) / coef(fit1)[2]

p1 <- ggplot(data_train) +
  geom_jitter(
    aes(x = Sepal.Length, y = class, colour = Species),
    height = 0, width = 0.1) +
  geom_line(
    aes(x = x, y = y),
    data = tibble(x = c(4.25, 7.05), y = c(0.5, 0.5)),
    colour = cbPalette[1], linetype = 2) +
  geom_line(
    aes(x = x, y = y),
    data = data_pred,
    colour = cbPalette[1], linetype = 2) +
  geom_line(
    aes(x = x, y = y),
    data = tibble(
      x = c(decision_bd1, decision_bd1),
      y = c(1.15, -0.15))) +
  scale_x_continuous(name = "Sepal Length", expand = c(0, 0)) +
  scale_y_continuous(
    name = "Coding", breaks = c(0, 0.5, 1), expand = c(0, 0)) +
  scale_colour_manual(values = cbPalette[-1]) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.box.spacing = unit(0, "cm"))

fit2 <- lm(class ~ Sepal.Length + Sepal.Width, data_train)

# Scalar product x^T beta = 0.5
# Leads to beta0 + beta1 * x1 + beta2 * x2 = 0.5 (two predictors and intercept)
# then x2 = (0.5 - beta0 - beta1 * x1) / beta2
decision_bd2 <- function(x1) {
  (0.5 - coef(fit2)[1] - coef(fit2)[2] * x1) / coef(fit2)[3]
}

p2 <- ggplot(data_train) +
  geom_line(
    aes(x = x, y = y),
    data = tibble(
      x = c(4.3, 7),
      y = c(decision_bd2(4.3), decision_bd2(7)))) +
  geom_point(aes(x = Sepal.Length, y = Sepal.Width, colour = Species)) +
  scale_x_continuous(name = "Sepal Length") +
  scale_y_continuous(name = "Sepal Width") +
  scale_colour_manual(values = cbPalette[-1]) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.box.spacing = unit(0, "cm"))

ggpubr::ggarrange(
  p1, p2, ncol = 2, widths = c(1.5, 1),
  common.legend = TRUE, legend = "bottom")
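# Added illustration (not part of the original figure code): the fitted 0-1
# regression turns into a hard classifier by thresholding its fitted values at
# 0.5, which is exactly the rule behind the plotted decision boundaries.
# A quick training-accuracy check for the two-predictor fit; the object names
# below are new and only used for this sketch
pred_class_01 <- as.integer(predict(fit2, data_train) >= 0.5)
train_accuracy_01 <- mean(pred_class_01 == data_train$class)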
## ----0-1-regression-and-outliers, fig.width=4.5, fig.height=3, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(7334482)

# Simulate two well-separated classes plus an outlier that does not seem
# relevant in the context of the classification problem
n <- 30
mu <- matrix(c(-1.5, 1.5, 1.5, -1.5), ncol = 2)
Sigma <- diag(c(1.5, 1.5))
# Generate variables
X_train <- do.call(
  rbind, lapply(1:2, function(i) MASS::mvrnorm(n, mu[i,], Sigma)))
y_train <- c(rep.int(0, n), rep.int(1, n)) # Class labels
data_train <- tibble(
  x1 = X_train[,1],
  x2 = X_train[,2],
  class = y_train)

# Fit without outlier
fit1 <- lm(class ~ x1 + x2, data_train)

# Add extreme outlier to class 2
data_train <- rbind(data_train, c(12, -6, 1))

# Fit including outlier
fit2 <- lm(class ~ x1 + x2, data_train)

# Decision boundary (fitted value equal to 0.5)
decision_bd <- function(x1, fit) {
  beta <- unname(coef(fit))
  (0.5 - beta[1] - beta[2] * x1) / beta[3]
}

# Predicted decision boundary
x1_pred <- c(-4, 12)
x2_pred_no_outlier <- sapply(x1_pred, function(x1) decision_bd(x1, fit1))
x2_pred_with_outlier <- sapply(x1_pred, function(x1) decision_bd(x1, fit2))

data_pred <- tibble(
  x1 = rep.int(x1_pred, 2),
  x2 = c(x2_pred_no_outlier, x2_pred_with_outlier),
  case = as.factor(
    c("No outlier", "No outlier", "With outlier", "With outlier")))

ggplot(data_train, aes(x = x1, y = x2)) +
  geom_point(aes(colour = as.factor(class))) +
  geom_point(
    data = tibble(x1 = c(12), x2 = c(-6)),
    colour = "red", shape = 22, size = 4) +
  geom_line(aes(linetype = case), data = data_pred) +
  scale_colour_manual(name = NULL, guide = FALSE, values = cbPalette[-1]) +
  scale_linetype_discrete(name = "Case") +
  scale_x_continuous(name = TeX("$x_1$")) +
  scale_y_continuous(name = TeX("$x_2$")) +
  theme_minimal()

## ----multiple-0-1-regressions-example, fig.width=4.5, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(9294)

# Simulate data with a continuous predictor between 0 and 15
# and three different classes which are well separated
# at x = 6 and x = 9
X_train <- runif(n = 30, min = 0, max = 15)

# Create y vectors in dummy 0-1 encoding with each class being the
# reference class once
# Break points are chosen for simulation purposes
data_train <- tibble(x = X_train) %>%
  mutate(
    class = as.factor(case_when(
      x <= 6 ~ 1,
      x <= 9 ~ 2,
      TRUE ~ 3)),
    ref1 = case_when( # Class 1 is reference class
      class == 1 ~ 1,
      TRUE ~ 0),
    ref2 = case_when( # Class 2 is reference class
      class == 2 ~ 1,
      TRUE ~ 0),
    ref3 = case_when( # Class 3 is reference class
      class == 3 ~ 1,
      TRUE ~ 0)) %>%
  gather(ref, y, -x, -class) %>%
  mutate(ref = as.integer(str_replace(ref, "ref", "")))

# Determine regression coefficients (both linear and quadratic)
coefs_linear <- lapply(1:3, function(i) {
  coef(lm(y ~ x, data_train %>% filter(ref == i)))
})
coefs_quad <- lapply(1:3, function(i) {
  coef(lm(y ~ x + I(x ^ 2), data_train %>% filter(ref == i)))
})

# Create prediction data
X_pred <- seq(0, 14.2, length.out = 60)
y_pred_linear <- do.call(
  c, lapply(coefs_linear, function(coef) coef[1] + coef[2] * X_pred))
y_pred_quad <- do.call(c, lapply(coefs_quad, function(coef) {
  coef[1] + coef[2] * X_pred + coef[3] * X_pred ^ 2
}))
data_pred <- tibble(
  x = rep.int(X_pred, 3),
  y_lin = y_pred_linear,
  y_quad = y_pred_quad,
  class = as.factor(c(
    rep.int(1, 60), rep.int(2, 60), rep.int(3, 60))))
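# Added illustration (not part of the original figure code): with only linear
# terms the middle class tends to be "masked" -- assigning each grid point to
# the class whose 0-1 regression has the largest fitted value rarely, if ever,
# picks class 2, whereas the quadratic fits below do not suffer from this
fitted_linear <- sapply(
  coefs_linear, function(coef) coef[1] + coef[2] * X_pred)
pred_class_linear <- apply(fitted_linear, 1, which.max)
# table(pred_class_linear) # class 2 is (almost) never the winner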
# Place classes slightly apart on zero line for visual purposes
data_train$y[data_train$ref == 2 & data_train$class != 2] <- 0.05
data_train$y[data_train$ref == 1 & data_train$class == 2] <- 0.05

# Visualise the three 0-1 regressions (data and fits) with only linear terms
p1 <- ggplot(data_train, aes(x = x, y = y, colour = class)) +
  geom_point(size = 2) +
  geom_line(aes(y = y_lin), data = data_pred) +
  scale_y_continuous(
    name = "Coding", breaks = c(0, 1), limits = c(-0.07, 1.05)) +
  scale_x_continuous(name = NULL, labels = NULL, expand = c(0, 0)) +
  scale_color_manual(name = "Class", values = cbPalette) +
  theme_minimal() +
  theme(
    plot.margin = unit(c(0, 1, 0, 1), "lines"))

# Visualise the three 0-1 regressions (data and fits) with quadratic terms
p2 <- ggplot(data_train, aes(x = x, y = y, colour = class)) +
  geom_point(size = 2) +
  geom_line(aes(y = y_quad), data = data_pred) +
  scale_y_continuous(
    name = "Coding", breaks = c(0, 1), limits = c(-0.07, 1.05)) +
  scale_x_continuous(name = "Predictor", expand = c(0, 0)) +
  scale_color_manual(name = "Class", values = cbPalette) +
  theme_minimal() +
  theme(plot.margin = unit(c(0, 1, 0, 1), "lines"))

ggpubr::ggarrange(
  p1, p2, ncol = 1, nrow = 2, heights = c(0.9, 1.1),
  common.legend = TRUE, legend = "bottom")

## ----logistic-function-and-normal-cdf, fig.width=3.5, fig.height=2.25, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
xs <- seq(-4, 4, by = 0.1)
ys_logistic <- exp(xs) / (1 + exp(xs))
ys_normal_cdf <- pnorm(xs)

data_plot <- tibble(
  x = c(xs, xs),
  y = c(ys_logistic, ys_normal_cdf),
  type = as.factor(c(
    rep.int("Logistic Function", length(xs)),
    rep.int("Standard Normal CDF", length(xs)))))

ggplot(data_plot, aes(x = x, y = y)) +
  geom_line(aes(colour = type)) +
  geom_line(
    data = tibble(x = c(-4, 4), y = c(0.5, 0.5)),
    linetype = 2) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_colour_manual("Type", values = cbPalette[-1]) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.margin = margin(.5, 1.5, 0, 0, "lines"),
    legend.box.spacing = unit(0, "cm"))
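# Added sanity check (not part of the original figure code): the logistic
# function plotted above is available in base R as plogis(), the CDF of the
# standard logistic distribution and the inverse of the logit link used by
# glm(..., family = binomial) in the next chunk
stopifnot(isTRUE(all.equal(ys_logistic, plogis(xs))))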
## ----0-1-and-logistic-regression-and-outliers, fig.width=4.5, fig.height=3, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# This is for the most part the same code as before
# The call to glm solves the logistic regression problem
set.seed(7334482)

# Simulate two well-separated classes plus an outlier that does not seem
# relevant in the context of the classification problem
n <- 30
mu <- matrix(c(-1.5, 1.5, 1.5, -1.5), ncol = 2)
Sigma <- diag(c(1.5, 1.5))
# Generate variables
X_train <- do.call(
  rbind, lapply(1:2, function(i) MASS::mvrnorm(n, mu[i,], Sigma)))
y_train <- c(rep.int(0, n), rep.int(1, n)) # Class labels
data_train <- tibble(
  x1 = X_train[,1],
  x2 = X_train[,2],
  class = y_train)

# Fit without outlier
fit1 <- lm(class ~ x1 + x2, data_train)

# Add extreme outlier to class 2
data_train <- rbind(data_train, c(12, -2, 1))

# Fit including outlier
fit2 <- lm(class ~ x1 + x2, data_train)

#### This is new: Now with logistic regression
# glmnet is a very versatile function. It uses a different algorithm than glm,
# the standard R function for logistic regression, and is more stable in some
# situations
# fit_logistic <- glmnet::glmnet(
#   as.matrix(data_train[,1:2]),
#   as.vector(data_train$class),
#   family = "binomial", lambda = 0)
# The standard R alternative is
fit_logistic <- glm(class ~ x1 + x2, family = binomial, data_train)

# Decision boundary: for the 0-1 regressions it lies where the fitted value
# equals 0.5; for logistic regression it lies where the linear predictor
# equals 0 (i.e. where the predicted probability is 0.5)
decision_bd <- function(x1, fit, threshold = 0.5) {
  beta <- unname(as.vector(coef(fit)))
  (threshold - beta[1] - beta[2] * x1) / beta[3]
}

# Predicted decision boundary
x1_pred <- c(-4, 12)
x2_pred_no_outlier <- sapply(x1_pred, function(x1) decision_bd(x1, fit1))
x2_pred_with_outlier <- sapply(x1_pred, function(x1) decision_bd(x1, fit2))
x2_pred_logistic_with_outlier <- sapply(
  x1_pred, function(x1) decision_bd(x1, fit_logistic, threshold = 0))

data_pred <- tibble(
  x1 = rep.int(x1_pred, 3),
  x2 = c(
    x2_pred_no_outlier,
    x2_pred_with_outlier,
    x2_pred_logistic_with_outlier),
  case = factor(
    c(
      "0-1: no outlier", "0-1: no outlier",
      "0-1: with outlier", "0-1: with outlier",
      "Logistic: with outlier", "Logistic: with outlier"),
    levels = c(
      "0-1: no outlier",
      "0-1: with outlier",
      "Logistic: with outlier")))

ggplot(data_train, aes(x = x1, y = x2)) +
  geom_point(aes(colour = as.factor(class))) +
  geom_point(
    data = tibble(x1 = c(12), x2 = c(-2)),
    colour = "red", shape = 22, size = 4) +
  geom_line(aes(linetype = case), data = data_pred) +
  scale_colour_manual(name = NULL, guide = FALSE, values = cbPalette[-1]) +
  scale_linetype_discrete(name = "Case") +
  scale_x_continuous(name = TeX("$x_1$")) +
  scale_y_continuous(name = TeX("$x_2$")) +
  theme_minimal()

## ----mulit-class-logistic-regression-example, fig.width=5, fig.height=3, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
# Keep all classes this time
data_train <- as_tibble(datasets::iris)

# Fit multinomial logistic model == multi-class logistic model
# nnet::multinom takes a factor on the left hand side and takes the
# first factor level as a reference class
# levels(data_train$Species) # returns the factor levels in order
#                            # this shows that "setosa" is taken as a reference
fit <- nnet::multinom(Species ~ Sepal.Length, data_train)

# Linear predictor
n_pred <- 60
X_pred <- matrix(c(
  rep.int(1, n_pred),
  seq(4.2, 8.1, length.out = n_pred)), ncol = 2)
y_pred_lin <- X_pred %*% t(unname(coef(fit)))

# Transform to probabilities (softmax with the reference class fixed at 0)
y_pred <- cbind(
  rep.int(1, n_pred),
  exp(y_pred_lin)) / (1 + rowSums(exp(y_pred_lin)))

# Collect all predicted data for plotting
data_pred <- tibble(
  x = rep.int(X_pred[,2], 3),
  y = as.vector(y_pred),
  Species = as.factor(
    rep(levels(data_train$Species), each = n_pred)))

p1 <- ggplot(
  data_pred,
  aes(x = x, y = y, colour = Species)) +
  geom_line() +
  scale_x_continuous(
    name = NULL, labels = NULL, lim = c(4.2, 8.1), expand = c(0, 0)) +
  scale_y_continuous(name = "Probability", breaks = c(0, 0.5, 1)) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal() +
  theme(
    axis.title.y = element_text(
      margin = margin(0, 2.85, 0, 0, "lines")),
    plot.margin = margin(.25, .5, 0, 0.1, "lines"))

p2 <- ggplot(
  data_train,
  aes(x = Sepal.Length, y = as.numeric(Species), colour = Species)) +
  geom_jitter(height = 0, width = 0.1) +
  scale_x_continuous(
    name = "Sepal Length", lim = c(4.2, 8.1), expand = c(0, 0)) +
  scale_y_continuous(
    name = "Species", breaks = c(1, 2, 3),
    labels = levels(data_train$Species)) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal() +
  theme(
    plot.margin = margin(0, .5, 0, 0.1, "lines"))

ggpubr::ggarrange(
  p1, p2, nrow = 2, heights = c(1, 1))
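# Added sanity check (not part of the original figure code): the manual softmax
# above should reproduce the class probabilities that nnet::multinom returns
# itself; in practice predict(..., type = "probs") is the more convenient route
y_pred_check <- predict(
  fit, newdata = tibble(Sepal.Length = X_pred[, 2]), type = "probs")
# all.equal(unname(y_pred_check), unname(y_pred)) # should be TRUE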
## ----nearest-centroids-example, fig.width=3.5, fig.height=4, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
# Keep all classes this time
data_train <- as_tibble(datasets::iris) %>%
  select(Species, Sepal.Length, Sepal.Width) %>%
  rename(class = Species, x1 = Sepal.Length, x2 = Sepal.Width)

# Calculate centroids per class
data_centroid <- data_train %>%
  group_by(class) %>%
  summarise(
    x1 = mean(x1),
    x2 = mean(x2))

# Classify with nearest centroid method
h <- 0.03
x1s <- seq(4, 8, by = h)
x2s <- seq(1.8, 4.8, by = h)
X_pred <- expand.grid(x1s, x2s)
colnames(X_pred) <- c("x1", "x2")
n_pred <- dim(X_pred)[1]

centroids <- as.matrix(data_centroid[,2:3])
y_pred <- apply(
  apply(centroids, 1, function(c) {
    # Calculate squared Euclidean norm for each vector in the grid
    # for the current centroid c
    rowSums(
      (X_pred - matrix(
        rep.int(c, n_pred),
        nrow = n_pred, byrow = TRUE)) ^ 2)
  }),
  1, which.min) # Determine if the vector is closest to centroid 1, 2, or 3

data_pred <- tibble(
  x1 = X_pred[,1],
  x2 = X_pred[,2],
  class = factor(y_pred, levels = 1:3, labels = levels(data_train$class)))

ggplot(
  data_train,
  aes(x = x1, y = x2, colour = class)) +
  geom_tile(
    aes(fill = class), colour = "transparent",
    data = data_pred,
    width = h, height = h, alpha = 0.3) +
  geom_jitter(aes(shape = class), height = 0.1, width = 0.1) +
  geom_point(data = data_centroid, size = 4, shape = 3, colour = "black") +
  geom_point(data = data_centroid, size = 6, shape = 1, stroke = 1) +
  scale_fill_manual(values = cbPalette[-1], guide = FALSE) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  scale_shape_discrete(guide = FALSE) +
  guides(
    # Repeatedly using the class factor messes up the legend
    # Manual setup needed
    colour = guide_legend(
      title = "Species",
      override.aes = list(
        fill = "transparent",
        colour = cbPalette[2:4],
        shape = c(16, 17, 15),
        size = 2,
        linetype = 0))) +
  scale_x_continuous(name = "Sepal Length", expand = c(0, 0)) +
  scale_y_continuous(name = "Sepal Width", expand = c(0, 0)) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    legend.position = "bottom")
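# Added illustration (not part of the original figure code): the same
# nearest-centroid rule applied to the training observations themselves, giving
# a rough training accuracy for this two-predictor version of the problem;
# the object names below are new
X_train_mat <- as.matrix(data_train[, c("x1", "x2")])
y_train_nc <- apply(
  apply(centroids, 1, function(c) {
    rowSums((X_train_mat - matrix(
      rep.int(c, nrow(X_train_mat)),
      nrow = nrow(X_train_mat), byrow = TRUE)) ^ 2)
  }),
  1, which.min)
nc_train_accuracy <- mean(
  levels(data_train$class)[y_train_nc] == data_train$class)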
## ----lda-qda-example, fig.width=5, fig.height=2.5, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
# Same data as for nearest centroid example
fit_lda <- MASS::lda(class ~ x1 + x2, data_train)
fit_qda <- MASS::qda(class ~ x1 + x2, data_train)

y_pred_lda <- predict(fit_lda, X_pred)$class
y_pred_qda <- predict(fit_qda, X_pred)$class

data_pred_da <- cbind(
  data_pred,
  tibble(class_lda = y_pred_lda, class_qda = y_pred_qda)) %>%
  rename(class_centroid = class)

p_centroid <- ggplot() +
  geom_tile(
    aes(x = x1, y = x2, fill = class_centroid), colour = "transparent",
    data = data_pred_da,
    width = h, height = h, alpha = 0.4) +
  ggtitle("Nearest Centroids")

p_lda <- ggplot() +
  geom_tile(
    aes(x = x1, y = x2, fill = class_lda), colour = "transparent",
    data = data_pred_da,
    width = h, height = h, alpha = 0.4) +
  ggtitle("LDA")

p_qda <- ggplot() +
  geom_tile(
    aes(x = x1, y = x2, fill = class_qda), colour = "transparent",
    data = data_pred_da,
    width = h, height = h, alpha = 0.4) +
  ggtitle("QDA")

plots <- lapply(list(p_centroid, p_lda, p_qda), function(p) {
  p +
    geom_jitter(
      aes(x = x1, y = x2, colour = class, shape = class),
      data = data_train,
      height = 0.1, width = 0.1, size = 1) +
    scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
    scale_fill_manual(values = cbPalette[-1], guide = FALSE) +
    scale_shape_discrete(guide = FALSE) +
    guides(
      # Repeatedly using the class factor messes up the legend
      # Manual setup needed
      colour = guide_legend(
        title = "Species",
        override.aes = list(
          fill = "transparent",
          colour = cbPalette[2:4],
          shape = c(16, 17, 15),
          size = 2,
          linetype = 0))) +
    scale_x_continuous(name = "Sepal Length", expand = c(0, 0)) +
    scale_y_continuous(name = "Sepal Width", expand = c(0, 0)) +
    theme_minimal() +
    theme(
      panel.grid = element_blank())
})

ggpubr::ggarrange(
  plotlist = plots, ncol = 3,
  common.legend = TRUE, legend = "bottom")
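# Added comparison (not part of the original figure code): training accuracy of
# LDA and QDA on the same two predictors, as a rough numerical summary of the
# decision regions shown above; the object name below is new
da_train_accuracy <- c(
  lda = mean(predict(fit_lda, data_train)$class == data_train$class),
  qda = mean(predict(fit_qda, data_train)$class == data_train$class))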