## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp) # Latex in ggplot2 labels

cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette

# Packages for actual computation
# library(MASS)   # Contains mvrnorm to simulate multivariate normal
#                 # As well as functionality for LDA and QDA
#                 # Use as MASS::mvrnorm is recommended since MASS::select
#                 # clashes with dplyr::select
# library(glmnet) # Here: Can be used for (multi-class) logistic regression
#                 # In general: Provides lots of functionality for regularized
#                 # regression. It will show up in later lectures again
# library(nnet)   # Package for fitting neural networks. Contains a function
#                 # to fit multi-class logistic regression

## ----0-1-regression-example, fig.width=5, fig.height=3, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# Use the infamous iris data set for illustration purposes
data_train <- as_tibble(datasets::iris) %>%
  filter(Species %in% c("setosa", "versicolor")) %>% # Focus on two classes
  mutate(
    class = case_when(
      Species == "setosa" ~ 0,
      TRUE ~ 1)) # Set 'setosa' to 0 and 'versicolor' to 1

# Fit with only one predictor (e.g. Sepal.Length)
fit1 <- lm(class ~ Sepal.Length, data_train)

X_pred <- matrix(c(1, 1, 4.4, 6.6), ncol = 2)
y_pred <- X_pred %*% matrix(coef(fit1), ncol = 1)
data_pred <- tibble(
  x = X_pred[,2],
  y = y_pred)

# Decision boundary (regression line intersects horizontal line at 0.5)
decision_bd1 <- (0.5 - coef(fit1)[1]) / coef(fit1)[2]

p1 <- ggplot(data_train) +
  geom_jitter(
    aes(x = Sepal.Length, y = class, colour = Species),
    height = 0, width = 0.1) +
  geom_line(
    aes(x = x, y = y),
    data = tibble(x = c(4.25, 7.05), y = c(0.5, 0.5)),
    colour = cbPalette[1], linetype = 2) +
  geom_line(
    aes(x = x, y = y),
    data = data_pred,
    colour = cbPalette[1], linetype = 2) +
  geom_line(
    aes(x = x, y = y),
    data = tibble(
      x = c(decision_bd1, decision_bd1),
      y = c(1.15, -0.15))) +
  scale_x_continuous(name = "Sepal Length", expand = c(0, 0)) +
  scale_y_continuous(
    name = "Coding", breaks = c(0, 0.5, 1), expand = c(0, 0)) +
  scale_colour_manual(values = cbPalette[-1]) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.box.spacing = unit(0, "cm"))

fit2 <- lm(class ~ Sepal.Length + Sepal.Width, data_train)

# Scalar product x^T beta = 0.5
# Leads to beta0 + beta1 * x1 + beta2 * x2 = 0.5 (two predictors and intercept)
# then x2 = (0.5 - beta0 - beta1 * x1) / beta2
decision_bd2 <- function(x1) {
  (0.5 - coef(fit2)[1] - coef(fit2)[2] * x1) / coef(fit2)[3]
}

p2 <- ggplot(data_train) +
  geom_line(
    aes(x = x, y = y),
    data = tibble(
      x = c(4.3, 7),
      y = c(decision_bd2(4.3), decision_bd2(7)))) +
  geom_point(aes(x = Sepal.Length, y = Sepal.Width, colour = Species)) +
  scale_x_continuous(name = "Sepal Length") +
  scale_y_continuous(name = "Sepal Width") +
  scale_colour_manual(values = cbPalette[-1]) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.box.spacing = unit(0, "cm"))

ggpubr::ggarrange(
  p1, p2, ncol = 2, widths = c(1.5, 1),
  common.legend = TRUE, legend = "bottom")
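# Added illustration (not part of the original figure code): the fitted 0-1
# regression turns into a hard classifier by thresholding its fitted values at
# 0.5, which is exactly the rule behind the plotted decision boundaries.
# A quick training-accuracy check for the two-predictor fit; the object names
# below are new and only used for this sketch
pred_class_01 <- as.integer(predict(fit2, data_train) >= 0.5)
train_accuracy_01 <- mean(pred_class_01 == data_train$class)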
## ----0-1-regression-and-outliers, fig.width=4.5, fig.height=3, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(7334482)

# Simulate two well-separated classes plus an outlier that does not seem
# relevant in the context of the classification problem
n <- 30
mu <- matrix(c(-1.5, 1.5, 1.5, -1.5), ncol = 2)
Sigma <- diag(c(1.5, 1.5))
# Generate variables
X_train <- do.call(
  rbind, lapply(1:2, function(i) MASS::mvrnorm(n, mu[i,], Sigma)))
y_train <- c(rep.int(0, n), rep.int(1, n)) # Class labels
data_train <- tibble(
  x1 = X_train[,1],
  x2 = X_train[,2],
  class = y_train)

# Fit without outlier
fit1 <- lm(class ~ x1 + x2, data_train)

# Add extreme outlier to class 2
data_train <- rbind(data_train, c(12, -6, 1))

# Fit including outlier
fit2 <- lm(class ~ x1 + x2, data_train)

# Decision boundary (fitted value equal to 0.5)
decision_bd <- function(x1, fit) {
  beta <- unname(coef(fit))
  (0.5 - beta[1] - beta[2] * x1) / beta[3]
}

# Predicted decision boundary
x1_pred <- c(-4, 12)
x2_pred_no_outlier <- sapply(x1_pred, function(x1) decision_bd(x1, fit1))
x2_pred_with_outlier <- sapply(x1_pred, function(x1) decision_bd(x1, fit2))

data_pred <- tibble(
  x1 = rep.int(x1_pred, 2),
  x2 = c(x2_pred_no_outlier, x2_pred_with_outlier),
  case = as.factor(
    c("No outlier", "No outlier", "With outlier", "With outlier")))

ggplot(data_train, aes(x = x1, y = x2)) +
  geom_point(aes(colour = as.factor(class))) +
  geom_point(
    data = tibble(x1 = c(12), x2 = c(-6)),
    colour = "red", shape = 22, size = 4) +
  geom_line(aes(linetype = case), data = data_pred) +
  scale_colour_manual(name = NULL, guide = FALSE, values = cbPalette[-1]) +
  scale_linetype_discrete(name = "Case") +
  scale_x_continuous(name = TeX("$x_1$")) +
  scale_y_continuous(name = TeX("$x_2$")) +
  theme_minimal()

## ----multiple-0-1-regressions-example, fig.width=4.5, fig.height=3.5, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
set.seed(9294)

# Simulate data with a continuous predictor between 0 and 15
# and three different classes which are well separated
# at x = 6 and x = 9
X_train <- runif(n = 30, min = 0, max = 15)

# Create y vectors in dummy 0-1 encoding with each class being the
# reference class once
# Break points are chosen for simulation purposes
data_train <- tibble(x = X_train) %>%
  mutate(
    class = as.factor(case_when(
      x <= 6 ~ 1,
      x <= 9 ~ 2,
      TRUE ~ 3)),
    ref1 = case_when( # Class 1 is reference class
      class == 1 ~ 1,
      TRUE ~ 0),
    ref2 = case_when( # Class 2 is reference class
      class == 2 ~ 1,
      TRUE ~ 0),
    ref3 = case_when( # Class 3 is reference class
      class == 3 ~ 1,
      TRUE ~ 0)) %>%
  gather(ref, y, -x, -class) %>%
  mutate(ref = as.integer(str_replace(ref, "ref", "")))

# Determine regression coefficients (both linear and quadratic)
coefs_linear <- lapply(1:3, function(i) {
  coef(lm(y ~ x, data_train %>% filter(ref == i)))
})
coefs_quad <- lapply(1:3, function(i) {
  coef(lm(y ~ x + I(x ^ 2), data_train %>% filter(ref == i)))
})

# Create prediction data
X_pred <- seq(0, 14.2, length.out = 60)
y_pred_linear <- do.call(
  c, lapply(coefs_linear, function(coef) coef[1] + coef[2] * X_pred))
y_pred_quad <- do.call(c, lapply(coefs_quad, function(coef) {
  coef[1] + coef[2] * X_pred + coef[3] * X_pred ^ 2
}))
data_pred <- tibble(
  x = rep.int(X_pred, 3),
  y_lin = y_pred_linear,
  y_quad = y_pred_quad,
  class = as.factor(c(
    rep.int(1, 60), rep.int(2, 60), rep.int(3, 60))))
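# Added illustration (not part of the original figure code): with only linear
# terms the middle class tends to be "masked" -- assigning each grid point to
# the class whose 0-1 regression has the largest fitted value rarely, if ever,
# picks class 2, whereas the quadratic fits below do not suffer from this
fitted_linear <- sapply(
  coefs_linear, function(coef) coef[1] + coef[2] * X_pred)
pred_class_linear <- apply(fitted_linear, 1, which.max)
# table(pred_class_linear) # class 2 is (almost) never the winner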
# Place classes slightly apart on zero line for visual purposes
data_train$y[data_train$ref == 2 & data_train$class != 2] <- 0.05
data_train$y[data_train$ref == 1 & data_train$class == 2] <- 0.05

# Visualise the three 0-1 regressions (data and fits) with only linear terms
p1 <- ggplot(data_train, aes(x = x, y = y, colour = class)) +
  geom_point(size = 2) +
  geom_line(aes(y = y_lin), data = data_pred) +
  scale_y_continuous(
    name = "Coding", breaks = c(0, 1), limits = c(-0.07, 1.05)) +
  scale_x_continuous(name = NULL, labels = NULL, expand = c(0, 0)) +
  scale_color_manual(name = "Class", values = cbPalette) +
  theme_minimal() +
  theme(
    plot.margin = unit(c(0, 1, 0, 1), "lines"))

# Visualise the three 0-1 regressions (data and fits) with quadratic terms
p2 <- ggplot(data_train, aes(x = x, y = y, colour = class)) +
  geom_point(size = 2) +
  geom_line(aes(y = y_quad), data = data_pred) +
  scale_y_continuous(
    name = "Coding", breaks = c(0, 1), limits = c(-0.07, 1.05)) +
  scale_x_continuous(name = "Predictor", expand = c(0, 0)) +
  scale_color_manual(name = "Class", values = cbPalette) +
  theme_minimal() +
  theme(plot.margin = unit(c(0, 1, 0, 1), "lines"))

ggpubr::ggarrange(
  p1, p2, ncol = 1, nrow = 2, heights = c(0.9, 1.1),
  common.legend = TRUE, legend = "bottom")

## ----logistic-function-and-normal-cdf, fig.width=3.5, fig.height=2.25, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
xs <- seq(-4, 4, by = 0.1)
ys_logistic <- exp(xs) / (1 + exp(xs))
ys_normal_cdf <- pnorm(xs)

data_plot <- tibble(
  x = c(xs, xs),
  y = c(ys_logistic, ys_normal_cdf),
  type = as.factor(c(
    rep.int("Logistic Function", length(xs)),
    rep.int("Standard Normal CDF", length(xs)))))

ggplot(data_plot, aes(x = x, y = y)) +
  geom_line(aes(colour = type)) +
  geom_line(
    data = tibble(x = c(-4, 4), y = c(0.5, 0.5)),
    linetype = 2) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_colour_manual("Type", values = cbPalette[-1]) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.margin = margin(.5, 1.5, 0, 0, "lines"),
    legend.box.spacing = unit(0, "cm"))
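# Added sanity check (not part of the original figure code): the logistic
# function plotted above is available in base R as plogis(), the CDF of the
# standard logistic distribution and the inverse of the logit link used by
# glm(..., family = binomial) in the next chunk
stopifnot(isTRUE(all.equal(ys_logistic, plogis(xs))))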
## ----0-1-and-logistic-regression-and-outliers, fig.width=4.5, fig.height=3, fig.align="center", echo=FALSE, warning=FALSE, cache=TRUE, message=FALSE----
# This is for the most part the same code as before
# The call to glm solves the logistic regression problem
set.seed(7334482)

# Simulate two well-separated classes plus an outlier that does not seem
# relevant in the context of the classification problem
n <- 30
mu <- matrix(c(-1.5, 1.5, 1.5, -1.5), ncol = 2)
Sigma <- diag(c(1.5, 1.5))
# Generate variables
X_train <- do.call(
  rbind, lapply(1:2, function(i) MASS::mvrnorm(n, mu[i,], Sigma)))
y_train <- c(rep.int(0, n), rep.int(1, n)) # Class labels
data_train <- tibble(
  x1 = X_train[,1],
  x2 = X_train[,2],
  class = y_train)

# Fit without outlier
fit1 <- lm(class ~ x1 + x2, data_train)

# Add extreme outlier to class 2
data_train <- rbind(data_train, c(12, -2, 1))

# Fit including outlier
fit2 <- lm(class ~ x1 + x2, data_train)

#### This is new: Now with logistic regression
# glmnet is a very versatile function. It uses a different algorithm than glm,
# the standard R function for logistic regression, and is more stable in some
# situations
# fit_logistic <- glmnet::glmnet(
#   as.matrix(data_train[,1:2]),
#   as.vector(data_train$class),
#   family = "binomial", lambda = 0)
# The standard R alternative is
fit_logistic <- glm(class ~ x1 + x2, family = binomial, data_train)

# Decision boundary: for the 0-1 regressions it lies where the fitted value
# equals 0.5; for logistic regression it lies where the linear predictor
# equals 0 (i.e. where the predicted probability is 0.5)
decision_bd <- function(x1, fit, threshold = 0.5) {
  beta <- unname(as.vector(coef(fit)))
  (threshold - beta[1] - beta[2] * x1) / beta[3]
}

# Predicted decision boundary
x1_pred <- c(-4, 12)
x2_pred_no_outlier <- sapply(x1_pred, function(x1) decision_bd(x1, fit1))
x2_pred_with_outlier <- sapply(x1_pred, function(x1) decision_bd(x1, fit2))
x2_pred_logistic_with_outlier <- sapply(
  x1_pred, function(x1) decision_bd(x1, fit_logistic, threshold = 0))

data_pred <- tibble(
  x1 = rep.int(x1_pred, 3),
  x2 = c(
    x2_pred_no_outlier,
    x2_pred_with_outlier,
    x2_pred_logistic_with_outlier),
  case = factor(
    c(
      "0-1: no outlier", "0-1: no outlier",
      "0-1: with outlier", "0-1: with outlier",
      "Logistic: with outlier", "Logistic: with outlier"),
    levels = c(
      "0-1: no outlier",
      "0-1: with outlier",
      "Logistic: with outlier")))

ggplot(data_train, aes(x = x1, y = x2)) +
  geom_point(aes(colour = as.factor(class))) +
  geom_point(
    data = tibble(x1 = c(12), x2 = c(-2)),
    colour = "red", shape = 22, size = 4) +
  geom_line(aes(linetype = case), data = data_pred) +
  scale_colour_manual(name = NULL, guide = FALSE, values = cbPalette[-1]) +
  scale_linetype_discrete(name = "Case") +
  scale_x_continuous(name = TeX("$x_1$")) +
  scale_y_continuous(name = TeX("$x_2$")) +
  theme_minimal()

## ----mulit-class-logistic-regression-example, fig.width=5, fig.height=3, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
# Keep all classes this time
data_train <- as_tibble(datasets::iris)

# Fit multinomial logistic model == multi-class logistic model
# nnet::multinom takes a factor on the left hand side and takes the
# first factor level as a reference class
# levels(data_train$Species) # returns the factor levels in order
#                            # this shows that "setosa" is taken as a reference
fit <- nnet::multinom(Species ~ Sepal.Length, data_train)

# Linear predictor
n_pred <- 60
X_pred <- matrix(c(
  rep.int(1, n_pred),
  seq(4.2, 8.1, length.out = n_pred)), ncol = 2)
y_pred_lin <- X_pred %*% t(unname(coef(fit)))

# Transform to probabilities (softmax with the reference class fixed at 0)
y_pred <- cbind(
  rep.int(1, n_pred),
  exp(y_pred_lin)) / (1 + rowSums(exp(y_pred_lin)))

# Collect all predicted data for plotting
data_pred <- tibble(
  x = rep.int(X_pred[,2], 3),
  y = as.vector(y_pred),
  Species = as.factor(
    rep(levels(data_train$Species), each = n_pred)))

p1 <- ggplot(
  data_pred,
  aes(x = x, y = y, colour = Species)) +
  geom_line() +
  scale_x_continuous(
    name = NULL, labels = NULL, lim = c(4.2, 8.1), expand = c(0, 0)) +
  scale_y_continuous(name = "Probability", breaks = c(0, 0.5, 1)) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal() +
  theme(
    axis.title.y = element_text(
      margin = margin(0, 2.85, 0, 0, "lines")),
    plot.margin = margin(.25, .5, 0, 0.1, "lines"))

p2 <- ggplot(
  data_train,
  aes(x = Sepal.Length, y = as.numeric(Species), colour = Species)) +
  geom_jitter(height = 0, width = 0.1) +
  scale_x_continuous(
    name = "Sepal Length", lim = c(4.2, 8.1), expand = c(0, 0)) +
  scale_y_continuous(
    name = "Species", breaks = c(1, 2, 3),
    labels = levels(data_train$Species)) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  theme_minimal() +
  theme(
    plot.margin = margin(0, .5, 0, 0.1, "lines"))

ggpubr::ggarrange(
  p1, p2, nrow = 2, heights = c(1, 1))
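# Added sanity check (not part of the original figure code): the manual softmax
# above should reproduce the class probabilities that nnet::multinom returns
# itself; in practice predict(..., type = "probs") is the more convenient route
y_pred_check <- predict(
  fit, newdata = tibble(Sepal.Length = X_pred[, 2]), type = "probs")
# all.equal(unname(y_pred_check), unname(y_pred)) # should be TRUE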
## ----nearest-centroids-example, fig.width=3.5, fig.height=4, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
# Keep all classes this time
data_train <- as_tibble(datasets::iris) %>%
  select(Species, Sepal.Length, Sepal.Width) %>%
  rename(class = Species, x1 = Sepal.Length, x2 = Sepal.Width)

# Calculate centroids per class
data_centroid <- data_train %>%
  group_by(class) %>%
  summarise(
    x1 = mean(x1),
    x2 = mean(x2))

# Classify with nearest centroid method
h <- 0.03
x1s <- seq(4, 8, by = h)
x2s <- seq(1.8, 4.8, by = h)
X_pred <- expand.grid(x1s, x2s)
colnames(X_pred) <- c("x1", "x2")
n_pred <- dim(X_pred)[1]

centroids <- as.matrix(data_centroid[,2:3])
y_pred <- apply(
  apply(centroids, 1, function(c) {
    # Calculate squared Euclidean norm for each vector in the grid
    # for the current centroid c
    rowSums(
      (X_pred - matrix(
        rep.int(c, n_pred),
        nrow = n_pred, byrow = TRUE)) ^ 2)
  }),
  1, which.min) # Determine if the vector is closest to centroid 1, 2, or 3

data_pred <- tibble(
  x1 = X_pred[,1],
  x2 = X_pred[,2],
  class = factor(y_pred, levels = 1:3, labels = levels(data_train$class)))

ggplot(
  data_train,
  aes(x = x1, y = x2, colour = class)) +
  geom_tile(
    aes(fill = class), colour = "transparent",
    data = data_pred,
    width = h, height = h, alpha = 0.3) +
  geom_jitter(aes(shape = class), height = 0.1, width = 0.1) +
  geom_point(data = data_centroid, size = 4, shape = 3, colour = "black") +
  geom_point(data = data_centroid, size = 6, shape = 1, stroke = 1) +
  scale_fill_manual(values = cbPalette[-1], guide = FALSE) +
  scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
  scale_shape_discrete(guide = FALSE) +
  guides(
    # Repeatedly using the class factor messes up the legend
    # Manual setup needed
    colour = guide_legend(
      title = "Species",
      override.aes = list(
        fill = "transparent",
        colour = cbPalette[2:4],
        shape = c(16, 17, 15),
        size = 2,
        linetype = 0))) +
  scale_x_continuous(name = "Sepal Length", expand = c(0, 0)) +
  scale_y_continuous(name = "Sepal Width", expand = c(0, 0)) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    legend.position = "bottom")
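# Added illustration (not part of the original figure code): the same
# nearest-centroid rule applied to the training observations themselves, giving
# a rough training accuracy for this two-predictor version of the problem;
# the object names below are new
X_train_mat <- as.matrix(data_train[, c("x1", "x2")])
y_train_nc <- apply(
  apply(centroids, 1, function(c) {
    rowSums((X_train_mat - matrix(
      rep.int(c, nrow(X_train_mat)),
      nrow = nrow(X_train_mat), byrow = TRUE)) ^ 2)
  }),
  1, which.min)
nc_train_accuracy <- mean(
  levels(data_train$class)[y_train_nc] == data_train$class)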
## ----lda-qda-example, fig.width=5, fig.height=2.5, fig.align="center", echo=FALSE, warning=FALSE, results=FALSE, cache=TRUE, message=FALSE----
# Same data as for nearest centroid example
fit_lda <- MASS::lda(class ~ x1 + x2, data_train)
fit_qda <- MASS::qda(class ~ x1 + x2, data_train)

y_pred_lda <- predict(fit_lda, X_pred)$class
y_pred_qda <- predict(fit_qda, X_pred)$class

data_pred_da <- cbind(
  data_pred,
  tibble(class_lda = y_pred_lda, class_qda = y_pred_qda)) %>%
  rename(class_centroid = class)

p_centroid <- ggplot() +
  geom_tile(
    aes(x = x1, y = x2, fill = class_centroid), colour = "transparent",
    data = data_pred_da,
    width = h, height = h, alpha = 0.4) +
  ggtitle("Nearest Centroids")

p_lda <- ggplot() +
  geom_tile(
    aes(x = x1, y = x2, fill = class_lda), colour = "transparent",
    data = data_pred_da,
    width = h, height = h, alpha = 0.4) +
  ggtitle("LDA")

p_qda <- ggplot() +
  geom_tile(
    aes(x = x1, y = x2, fill = class_qda), colour = "transparent",
    data = data_pred_da,
    width = h, height = h, alpha = 0.4) +
  ggtitle("QDA")

plots <- lapply(list(p_centroid, p_lda, p_qda), function(p) {
  p +
    geom_jitter(
      aes(x = x1, y = x2, colour = class, shape = class),
      data = data_train,
      height = 0.1, width = 0.1, size = 1) +
    scale_colour_manual(values = cbPalette[-1], guide = FALSE) +
    scale_fill_manual(values = cbPalette[-1], guide = FALSE) +
    scale_shape_discrete(guide = FALSE) +
    guides(
      # Repeatedly using the class factor messes up the legend
      # Manual setup needed
      colour = guide_legend(
        title = "Species",
        override.aes = list(
          fill = "transparent",
          colour = cbPalette[2:4],
          shape = c(16, 17, 15),
          size = 2,
          linetype = 0))) +
    scale_x_continuous(name = "Sepal Length", expand = c(0, 0)) +
    scale_y_continuous(name = "Sepal Width", expand = c(0, 0)) +
    theme_minimal() +
    theme(
      panel.grid = element_blank())
})

ggpubr::ggarrange(
  plotlist = plots, ncol = 3,
  common.legend = TRUE, legend = "bottom")
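# Added comparison (not part of the original figure code): training accuracy of
# LDA and QDA on the same two predictors, as a rough numerical summary of the
# decision regions shown above; the object name below is new
da_train_accuracy <- c(
  lda = mean(predict(fit_lda, data_train)$class == data_train$class),
  qda = mean(predict(fit_qda, data_train)$class == data_train$class))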