## ----setup, include=FALSE------------------------------------------------
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(latex2exp) # Latex in ggplot2 labels
cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette
theme_set(theme_minimal()) # Set a less verbose standard theme
bgCol <- "#FAFAFA"
theme_set(theme_minimal()) # Set a less verbose standard theme
library(rgl) # 3D plotting
knit_hooks$set(rgl = hook_rgl)

# Packages for actual computation


## ----mesh-prep, echo=FALSE-----------------------------------------------
# For creation see the code of lecture 10
meshes <- readRDS("~/Documents/PhD/stat-learning-for-bigdata/lectures/presentations/lasso_elnet_meshes.Rds")

# Create axes
x0 <- 1.5
XYZ_axes <- matrix(c(
  -x0,   0,   0,
   x0,   0,   0,
    0, -x0,   0,
    0,  x0,   0,
    0,   0, -x0,
    0,   0,  x0
   ), ncol = 3, byrow = TRUE)

# Create axes labels
x0_lab <- 1.7
XYZ_labels <- matrix(c(
   x0_lab,       0,       0,
        0,  x0_lab,       0,
        0,       0,  x0_lab
   ), ncol = 3, byrow = TRUE)


## ----elnet-group-lasso, fig.width=4.5, fig.height=1.6, fig.align="center", rgl=TRUE, echo=FALSE, dev="png", dpi=300, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
bg3d(color = bgCol)
mfrow3d(nr = 1, nc = 3)
text3d(
  c(-0.5, 2.1, 1),
  texts = "Lasso",
  cex = 1.6)
segments3d(XYZ_axes)
plotmath3d(XYZ_labels, text = c(
  TeX("$\\beta_1$"), TeX("$\\beta_2$"), TeX("$\\beta_3$")),
  fixedSize = FALSE,
  cex = 0.3)
triangles3d(
  meshes$XYZ_lasso[as.vector(t(meshes$XYZ_lasso_tris)),],
  col = cbPalette[3])
rgl.viewpoint(theta = 30, phi = 15, zoom = 0.6)

next3d()
text3d(
  c(0.1, 2.1, 1),
  texts = "Elastic net (α = 0.7)",
  cex = 1.6)
segments3d(XYZ_axes)
plotmath3d(XYZ_labels, text = c(
  TeX("$\\beta_1$"), TeX("$\\beta_2$"), TeX("$\\beta_3$")),
  fixedSize = FALSE,
  cex = 0.3)
triangles3d(
  meshes$XYZ_elnet[as.vector(t(meshes$XYZ_elnet_tris)),],
  col = cbPalette[3])
rgl.viewpoint(theta = 30, phi = 15, zoom = 0.6)

next3d()
text3d(
  c(0.1, 2.1, 1),
  texts = "Group lasso ({β₁, β₃}, {β₂})",
  cex = 1.6)
segments3d(XYZ_axes)
plotmath3d(XYZ_labels, text = c(
  TeX("$\\beta_1$"), TeX("$\\beta_2$"), TeX("$\\beta_3$")),
  fixedSize = FALSE,
  cex=0.3)
triangles3d(
  meshes$XYZ_group_lasso[as.vector(t(meshes$XYZ_group_lasso_tris)),],
  col = cbPalette[3])
rgl.viewpoint(theta = 30, phi = 15, zoom = 0.6)


## ----scad-penalty, fig.width=4, fig.height=1.6, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
scad_penalty <- function(theta, lambda, a) {
  if (any(theta < 0)) {
    stop("Value error: theta >= 0 not fulfilled")
  }
  if (lambda < 0) {
    stop("Value error: lambda >= 0 not fulfilled")
  }
  if (a <= 2) {
    stop("Value error: a > 2 not fulfilled")
  }

  ifelse(
    theta <= lambda,
    lambda * theta,
    ifelse(
      theta <= a * lambda,
      -(theta ^ 2 - 2 * a * lambda * theta + lambda ^ 2) / (2 * (a - 1)),
      (a + 1) * lambda ^ 2 / 2))
}

beta_seq <- seq(-10, 10, by = 0.1)
lambda <- 2
a <- 3.7

data_plot <- tibble(
  beta = rep.int(beta_seq, 2),
  pabsbeta = c(lambda * abs(beta_seq), scad_penalty(abs(beta_seq), lambda, a)),
  type = as.factor(rep(c("Lasso", "SCAD"), each = length(beta_seq))))

p1 <- ggplot() +
  geom_line(
    aes(beta, pabsbeta),
    data = data_plot %>% filter(type == "Lasso"),
    size = 0.4) +
  scale_x_continuous(TeX("$\\beta$"), expand = c(0, 0)) +
  scale_y_continuous(TeX("$\\lambda |\\beta |$")) +
  ggtitle("Lasso") +
  theme(
    plot.title = element_text(size = 8),
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 8))

p2 <- ggplot() +
  geom_vline(
    aes(xintercept = c(-a * lambda, -lambda, lambda, a * lambda)),
    colour = "red",
    linetype = "dashed",
    size = 0.4) +
  geom_line(
    aes(beta, pabsbeta),
    data = data_plot %>% filter(type == "SCAD"),
    size = 0.4) +
  scale_x_continuous(TeX("$\\beta$"), expand = c(0, 0)) +
  scale_y_continuous(TeX("$p_{2, 3.5}(|\\beta |)$")) +
  ggtitle("SCAD") +
  theme(
    plot.title = element_text(size = 8),
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 8))

ggpubr::ggarrange(
  p1, p2)


## ----scad-shrinkage, fig.width=4.5, fig.height=1.8, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
soft_thresholding <- function(theta, lambda) {
  dif <- abs(theta) - lambda
  sign(theta) * dif * (dif > 0)
}

scad_solution <- function(theta, lambda, a) {
  if (lambda < 0) {
    stop("Value error: lambda >= 0 not fulfilled")
  }
  if (a <= 2) {
    stop("Value error: a > 2 not fulfilled")
  }

  ifelse(
    abs(theta) <= 2 * lambda,
    soft_thresholding(theta, lambda),
    ifelse(
      abs(theta) <= a * lambda,
      ((a - 1) * theta - sign(theta) * a * lambda) / (a - 2),
      theta))
}

bnd <- 10
beta_seq <- seq(-bnd, bnd, by = 0.1)
lambda <- 2
a <- 3.7

data_plot <- tibble(
  beta = rep.int(beta_seq, 4),
  beta_pen = c(
    beta_seq,
    beta_seq / (1 + lambda),
    soft_thresholding(beta_seq, lambda),
    scad_solution(beta_seq, lambda, a)),
  type = as.factor(
    rep(c("Identity", "Ridge", "Lasso", "SCAD"), each = length(beta_seq))))

p <- ggplot() +
  geom_segment(
    aes(x, y, xend=xend, yend=yend),
    data = tibble(
      x = c(-bnd, 0),
      y = c(0, -bnd),
      xend = c(bnd, 0),
      yend = c(0, bnd)),
    arrow = arrow(length = unit(2, "mm")),
    size = 0.3) +
  scale_x_continuous(TeX("$\\beta$"), expand = c(0, 0)) +
  scale_y_continuous(name = NULL) +
  scale_colour_manual(values = cbPalette[c(1, 3)], guide = FALSE) +
  scale_linetype_manual(values = c("dashed", "solid"), guide = FALSE) +
  coord_fixed() +
  theme(
    panel.grid = element_blank(),
    plot.title = element_text(size = 8),
    axis.title = element_text(size = 6),
    axis.text = element_text(size = 6))

p1 <- p +
  geom_line(
    aes(beta, beta_pen, colour = type, linetype = type),
    data = data_plot %>% filter(type %in% c("Identity", "Ridge")),
    size = 0.4) +
  ggtitle("Ridge")

p2 <- p +
  geom_line(
    aes(beta, beta_pen, colour = type, linetype = type),
    data = data_plot %>% filter(type %in% c("Identity", "Lasso")),
    size = 0.4) +
  ggtitle("Lasso")

p3 <- p +
  geom_line(
    aes(beta, beta_pen, colour = type, linetype = type),
    data = data_plot %>% filter(type %in% c("Identity", "SCAD")),
    size = 0.4) +
  ggtitle("SCAD")

ggpubr::ggarrange(
  p1, p2, p3, ncol = 3)


## ----sparse-multi-class-logistic, fig.width=4.5, fig.height=2, fig.align="center", echo=FALSE, results=FALSE, warning=FALSE, message=FALSE, cache=TRUE----
# These are greyscale images of handwritten digits. Each image has dimensions
# 16 x 16 and results therefore in a 256 dimensional predictor space
# The goal is to predict the digit from the image
data_train <- ElemStatLearn::zip.train
colnames(data_train) <- c("digit", sprintf("x%d", 1:256))
y_train <- data_train[,1]
X_train <- data_train[,-1]

# Prepare the class means for plotting
clm_plot <- as_tibble(data_train) %>%
  group_by(digit) %>%
  summarise_all(mean) %>%
  gather(r, cs, -digit) %>%
  mutate(
    r = as.numeric(str_remove(r, "x")) - 1,
    x = r %% 16,
    y = 16 - r %/% 16) %>%
  select(digit, x, y, cs)

# Run multinomial fit
## Very slow!!
# cv_fit <- glmnet::cv.glmnet(
#   X_train, y_train, family = "multinomial", alpha = 1)
# saveRDS(cv_fit,
#   "~/Documents/PhD/stat-learning-for-bigdata/lectures/presentations/multinomial-fit.Rds")
cv_fit <- readRDS("~/Documents/PhD/stat-learning-for-bigdata/lectures/presentations/multinomial-fit.Rds")

mclog_coefs <- glmnet::coef.glmnet(cv_fit, s = cv_fit$lambda.1se)
mclog_plot <- do.call(bind_rows,
  lapply(1:length(mclog_coefs), function(i) {
  cs <- mclog_coefs[[i]]
  tibble(
    digit = rep.int(i - 1, length(cs[-1,])),
    r = as.numeric(str_remove(names(cs[-1,]), "x")) - 1,
    cs = cs[-1,]) %>%
    filter(abs(cs) > 0) %>%
    mutate(
      x = r %% 16,
      y = 16 - r %/% 16) %>%
    select(digit, x, y, cs)
}))

ggplot() +
  geom_tile(
    aes(x, y, fill = cs),
    data = clm_plot,
    width = 1, height = 1,
    alpha = 0.7) +
  geom_tile(
    aes(x, y),
    data = mclog_plot %>% filter(cs > 0),
    width = 1, height = 1,
    fill = cbPalette[2],
    alpha = 0.9) +
  geom_tile(
    aes(x, y),
    data = mclog_plot %>% filter(cs < 0),
    width = 1, height = 1,
    fill = cbPalette[3],
    alpha = 0.9) +
  facet_wrap(~ digit, ncol = 5) +
  theme_void() +
  coord_fixed() +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_gradient(low = "black", high = "white", guide = FALSE)