##### Setup

# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(lattice)
library(latex2exp) # Latex in ggplot2 labels

# Colour-blind friendly palette
cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Set a less verbose standard theme
theme_set(theme_minimal() + theme(
  axis.title = element_text(size = 7),
  axis.text = element_text(size = 7),
  plot.title = element_text(size = 8),
  plot.subtitle = element_text(size = 7),
  legend.text = element_text(size = 7),
  legend.title = element_text(size = 7),
  strip.text = element_text(size = 7)))

# Packages for actual computation
# library(subspace)       # Kind of an annoying package. Hard to install and has
#                         # to be loaded globally. However, this seems to be the
#                         # only reasonable package for subspace clustering in R
# library(ggraph)         # Used for plotting of graphs
# library(data.table)     # Used for reading data from a URL, but this is actually
#                         # a very good implementation of data frames that is
#                         # highly optimized for large scale computation on a
#                         # single machine
# library(ParallelForest) # Can be installed from GitHub with
#                         # devtools::install_github("bert9bert/ParallelForest")
#                         # Only used for a certain dataset here
# library(datadr)         # Package for divide and recombine. Implements e.g.
#                         # the bag of little bootstraps
# library(trelliscope)    # Can be installed from GitHub with
#                         # devtools::install_github("delta-rho/trelliscope")
# library(housingData)    # This package provides a data set of housing data
# library(dqrng)          # Fast random number generators
# library(rsvd)           # An implementation of randomized low-rank SVD, similar
#                         # to the algorithm described in the lecture

### Exploring the Johnson-Lindenstrauss theorem and why it is useful even
### though distances in high dimensions are generally considered useless

# Dimensions to look at
ps <- ceiling(exp(seq(0, 10, length.out = 20)))
# Number of samples
n <- 100

# Calculate distances between vectors
ds <- cbind(p = ps, do.call(rbind, lapply(ps, function(p) {
  X <- matrix(dqrng::dqrnorm(n * p), nrow = n)
  D <- dist(X)
  c(min = min(D),
    max = max(D),
    # Mean distance normalised by sqrt(2p), the typical distance scale
    # between two standard normal vectors in p dimensions
    mean = mean(D) / sqrt(2 * p),
    sd = sd(D),
    dmm = max(D) - min(D))
})))

# Calculate distances to nearest and farthest neighbour
ds_nn <- cbind(p = ps, do.call(rbind, lapply(ps, function(p) {
  X <- matrix(dqrng::dqrnorm(n * p), nrow = n)
  D <- dist(X)
  # Row 2 is the nearest neighbour (row 1 is the point itself at distance 0),
  # row 100 is the farthest neighbour
  minmax <- apply(as.matrix(D), 2, sort)[c(2, 100), ]
  minmax <- rowMeans(minmax)
  c(min = minmax[1],
    max = minmax[2],
    diff = minmax[2] - minmax[1],
    rel = (minmax[2] - minmax[1]) / mean(minmax))
})))

# Observations by inspecting the matrices ds and ds_nn
# 1) Both the minimum and maximum distances between vectors grow with
#    dimension.
# 2) The difference between the minimum and maximum distance is roughly the
#    same in low and high dimensions.
# 3) The distance to the nearest and farthest neighbour grows with dimension.
# 4) The difference in distance to the nearest and the farthest neighbour
#    remains roughly the same across dimensions.
# 5) If any sample is taken as the reference point, then the relative
#    difference in distance to its nearest and farthest neighbour goes to
#    zero as the dimension increases, i.e. the difference in distance to the
#    nearest and farthest neighbour becomes negligible relative to how large
#    these distances become in magnitude.
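# As a complement to the experiments above, here is a minimal sketch (not
# part of the original analysis) of the Johnson-Lindenstrauss idea itself:
# projecting onto k random Gaussian directions, scaled by 1 / sqrt(k),
# approximately preserves pairwise distances. The dimensions p_jl, k_jl and
# the sample size n_jl are arbitrary choices for illustration only.
p_jl <- 2000
k_jl <- 200
n_jl <- 50
X_jl <- matrix(dqrng::dqrnorm(n_jl * p_jl), nrow = n_jl)
# Random Gaussian projection matrix
R_jl <- matrix(dqrng::dqrnorm(p_jl * k_jl), nrow = p_jl) / sqrt(k_jl)
Z_jl <- X_jl %*% R_jl
# Ratios of projected to original pairwise distances; these should be
# concentrated around 1
summary(as.vector(dist(Z_jl) / dist(X_jl)))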
#### Big Data visualisation

## Trelliscope can be used for interactive exploratory data analysis
## This example is mostly stolen from their official tutorial at
## http://deltarho.org/docs-trelliscope/

# Establish a trelliscope visual database connection (the directory
# vdb needs to be created in the current working directory first)
conn <- trelliscope::vdbConn("vdb", name = "SLBD")

# Split the data by county and state
byCounty <- datadr::divide(housingData::housing, by = c("county", "state"))

# Create a panel function telling what to plot for each subset
timePanel <- function(x) {
  lattice::xyplot(medListPriceSqft + medSoldPriceSqft ~ time,
    data = x, auto.key = TRUE, ylab = "Price / Sq. Ft.")
}

# Test the visualisation
timePanel(byCounty[[20]]$value)

# Create a cognostics function; a cognostic is a metric that tells us an
# interesting attribute about a subset of the data
# The function needs to return a list
priceCog <- function(x) {
  zillowString <- gsub(" ", "-", do.call(paste, datadr::getSplitVars(x)))
  list(
    slope = trelliscope::cog(coef(lm(medListPriceSqft ~ time, data = x))[2],
      desc = "list price slope"),
    meanList = trelliscope::cogMean(x$medListPriceSqft),
    meanSold = trelliscope::cogMean(x$medSoldPriceSqft),
    nObs = trelliscope::cog(length(which(!is.na(x$medListPriceSqft))),
      desc = "number of non-NA list prices"),
    zillowHref = trelliscope::cogHref(
      sprintf("http://www.zillow.com/homes/%s_rb/", zillowString),
      desc = "zillow link")
  )
}

# Test the cognostics function
priceCog(byCounty[[1]]$value)

# Create the visual database which makes the following exploratory
# data analysis much quicker
trelliscope::makeDisplay(
  byCounty,
  name = "list_sold_vs_time_example",
  desc = "List and sold price over time",
  panelFn = timePanel,
  cogFn = priceCog,
  width = 400, height = 400,
  lims = list(x = "same"))

# Actually view the plots
trelliscope::view()

###### Big-n methods

## Prepare a housing data set
kc <- read_csv("http://www.math.chalmers.se/Stat/Grundutb/GU/MSA220/S18/kc_house_data.csv",
  col_types = cols(
    .default = col_double(),
    id = col_character(),
    date = col_datetime(),
    yr_built = col_integer(),
    yr_renovated = col_integer()))

# Do some basic cleanup: look at prices on the log scale, get rid of squares
# for everything given in square feet and convert yr_renovated to a binary
# "renovated or not"
kc <- kc %>%
  select(-id, -date, -zipcode) %>%
  mutate(
    price = log10(price),
    renovated = case_when(
      yr_renovated != 0 ~ 1,
      TRUE ~ 0),
    sqft_living = sqrt(sqft_living),
    sqft_lot = sqrt(sqft_lot),
    sqft_above = sqrt(sqft_above),
    sqft_basement = sqrt(sqft_basement),
    sqft_living15 = sqrt(sqft_living15),
    sqft_lot15 = sqrt(sqft_lot15)) %>%
  select(-yr_renovated)

##### Bag of little bootstraps
##### This implementation interprets the bag of little bootstraps as a
##### technique in the divide and conquer framework

# This splits the original data into random subsets of roughly the
# specified size
rrkc <- datadr::divide(
  kc,
  by = datadr::rrDiv(500),
  update = TRUE)

kcBLB <- rrkc %>% datadr::addTransform(function(x) {
  datadr::drBLB(
    x,
    statistic = function(x, weights) {
      coef(glm(price ~ sqft_living + bedrooms + condition + grade,
        data = x, family = "gaussian", weights = weights))
    },
    metric = function(x) {
      quantile(x, c(0.025, 0.975))
    },
    R = 100,
    n = nrow(rrkc))
})

coefs <- datadr::recombine(kcBLB, datadr::combMean)
# Reshape so that each row holds the averaged 95% interval for one coefficient
matrix(coefs, ncol = 2, byrow = TRUE)
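# To make explicit what drBLB does above, here is a small hand-rolled sketch
# of the bag of little bootstraps using only base R. The subset size b, the
# number of subsets s and the number of resamples R are arbitrary
# illustration choices, not taken from the analysis above: each subset of
# size b is "resampled up" to the full sample size n via multinomial weights,
# the weighted statistic is computed R times, a quantile interval is formed
# per subset, and the intervals are averaged across subsets.
blb_sketch <- function(data, b = 500, s = 10, R = 50) {
  n <- nrow(data)
  subsets <- replicate(s, data[sample(n, b), , drop = FALSE],
                       simplify = FALSE)
  cis <- lapply(subsets, function(sub) {
    stats <- replicate(R, {
      # Multinomial weights summing to n play the role of a size-n resample
      w <- as.vector(rmultinom(1, n, rep(1 / b, b)))
      coef(lm(price ~ sqft_living + bedrooms + condition + grade,
              data = sub, weights = w))
    })
    # 2 x (number of coefficients) matrix of interval endpoints
    apply(stats, 1, quantile, probs = c(0.025, 0.975))
  })
  # Average the interval endpoints across subsets
  Reduce(`+`, cis) / length(cis)
}
blb_sketch(kc)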
##### Leveraging

# Let's try to predict prices from the remaining variables
# Scale them since they are on different scales
kc_std <- kc %>%
  mutate_at(
    vars(-price, -renovated, -waterfront),
    scale) %>%
  mutate(price = scale(price, scale = FALSE))

X <- as.matrix(kc_std %>% select(-price))
y <- kc_std$price

# More than 20000 observations
dim(X)

# Since the data is not crazy large, we could run the linear model
# on all data directly
fit_all <- lm(price ~ ., kc_std)

# Perform leveraging: the leverage scores are the row sums of U^2 from
# the SVD of X
X_svd <- svd(X, nv = 0)
leverage <- apply(X_svd$u ^ 2, 1, sum)

# Looking at e.g. a boxplot of the leverage scores makes it clear that one
# data point probably is an outlier
# In a real analysis you'd look at that data point in detail and decide
# whether it should be excluded or not
boxplot(leverage)

# Sample 5000 observations from the original data where each observation
# has probability h_ii / sum(h_ii) of being chosen
n <- 5000
probs <- leverage / sum(leverage)
idx <- sample(nrow(kc_std), n, replace = FALSE, prob = probs)
fit_lev <- lm(price ~ ., kc_std[idx, ])

# Compare these results: there is not much difference between fit_lev and
# fit_all (from an interpretation point of view, not in exact numbers),
# whereas the former is based on only 5000 instead of over 20000 data points.
# Differences are notable in the small coefficients. The intercept,
# sqft_above, and sqft_basement are some of the smallest coefficients and
# more data makes them significant, which is a typical big-n effect.
summary(fit_all)
confint(fit_all)
summary(fit_lev)
confint(fit_lev)
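# As a small follow-up (not part of the original comparison), we can also
# contrast the leverage-based subsample with a plain uniform subsample of
# the same size. Leveraging is expected to stay closer to the full-data
# coefficients for a fixed subsample size, although the difference can be
# modest on well-behaved data.
idx_unif <- sample(nrow(kc_std), n, replace = FALSE)
fit_unif <- lm(price ~ ., kc_std[idx_unif, ])
# Side-by-side coefficient estimates: full data, leverage-based, uniform
cbind(
  all = coef(fit_all),
  leverage = coef(fit_lev),
  uniform = coef(fit_unif))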