##### Setup
# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(lattice)
library(latex2exp) # Latex in ggplot2 labels
cbPalette <- c(
"#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7") # colour-blind friendly palette
theme_set(theme_minimal() +
theme(
axis.title = element_text(size = 7),
axis.text = element_text(size = 7),
plot.title = element_text(size = 8),
plot.subtitle = element_text(size = 7),
legend.text = element_text(size = 7),
legend.title = element_text(size = 7),
strip.text = element_text(size = 7))) # Set a less verbose standard theme
# Packages for actual computation
# library(subspace) # Kind of an annoying package. Hard to install and has to
# # be loaded globally. However, this seems to be the only
# # reasonable package for subspace clustering in R
# library(ggraph) # Used for plotting of graphs
# library(data.table) # Used for reading data from an URL, but this is actually
# # a very good implementation of data frames that is
# # highly optimized for large scale computation on a
# # single machine
# library(ParallelForest) # Can be installed from GitHub with
# # devtools::install_github("bert9bert/ParallelForest")
# # Only used for a certain dataset here
# library(datadr) # Package for divide and recombine. Implements e.g.
# # the bag of little bootstraps
# library(trelliscope) # Can be installed from GitHub with
# # devtools::install_github("delta-rho/trelliscope")
# library(housingData) # This package provides a data set of housing data
# library(dqrng) # Fast random number generators
# library(rsvd) # An implementation of randomized low-rank SVD, similar
# # to the algorithm described in the lecture.
### Exploring the Johnson Lindenstrass theorem and why it is useful even
### though distances in high dimensions are generally considered useless
# Dimensions to look at
ps <- ceiling(exp(seq(0, 10, length.out = 20)))
# Number of samples
n <- 100
# Calculate distances between vectors
ds <- cbind(p = ps, do.call(rbind, lapply(ps, function(p) {
X <- matrix(dqrng::dqrnorm(n * p), nrow = n)
D <- dist(X)
c(min = min(D), max = max(D),
mean = mean(D) / sqrt(2 * p), sd = sd(D),
dmm = max(D) - min(D))
})))
# Calculate distances to nearest and farthest neighbour
ds_nn <- cbind(p = ps, do.call(rbind, lapply(ps, function(p) {
X <- matrix(dqrng::dqrnorm(n * p), nrow = n)
D <- dist(X)
minmax <- apply(as.matrix(D), 2, sort)[c(2, 100),]
minmax <- rowMeans(minmax)
c(min = minmax[1], max = minmax[2],
diff = minmax[2] - minmax[1],
rel = (minmax[2] - minmax[1]) / mean(minmax))
})))
# Observations by inspecting the matrices ds and ds_nn
# 1) Both the minimum and maximum distances between vectors grow with
# dimension.
# 2) The distance between minimum and maximum distance is roughly the
# same in low and high dimensions
# 3) The distance to the nearest and farthest neighbour grows with dimension
# 4) The difference in distance to the nearest and the farthest neighbour
# remain roughly the same across dimensions
# 5) If any sample is the reference point, then the relative difference in
# distance to the nearest and the farthest neighbour is going to zero
# for an increase in dimension, i.e. the difference in distance to the
# nearest and farthest neighbour relative to how large these distances
# become in magnitude
#### Big Data visualisation
## Trelliscope can be used for response exploratory data analysis
## This example is mostly stolen from their official tutorial at
## http://deltarho.org/docs-trelliscope/
# Establish a trelliscope visual database connection (the directory
# vdb needs to be first created in the current working directory)
conn <- trelliscope::vdbConn("vdb", name = "SLBD")
# Split the data by county and state
byCounty <- datadr::divide(housingData::housing, by = c("county", "state"))
# Create a panel function telling what to plot for each subset
timePanel <- function(x) {
lattice::xyplot(medListPriceSqft + medSoldPriceSqft ~ time,
data = x, auto.key = TRUE, ylab = "Price / Sq. Ft.")
}
# Test the visualisation
timePanel(byCounty[[20]]$value)
# Create a cognostics function, which is a metric that tells us an
# interesting attribute about a subset of data
# Needs to return a list
priceCog <- function(x) {
zillowString <- gsub(" ", "-", do.call(paste, datadr::getSplitVars(x)))
list(
slope = trelliscope::cog(coef(lm(medListPriceSqft ~ time, data = x))[2],
desc = "list price slope"),
meanList = trelliscope::cogMean(x$medListPriceSqft),
meanSold = trelliscope::cogMean(x$medSoldPriceSqft),
nObs = trelliscope::cog(length(which(!is.na(x$medListPriceSqft))),
desc = "number of non-NA list prices"),
zillowHref = trelliscope::cogHref(
sprintf("http://www.zillow.com/homes/%s_rb/", zillowString),
desc = "zillow link")
)
}
# Test the cognostics function
priceCog(byCounty[[1]]$value)
# Create the visual database which makes the following exploratory
# data analysis much quicker
trelliscope::makeDisplay(
byCounty,
name = "list_sold_vs_time_example",
desc = "List and sold price over time",
panelFn = timePanel,
cogFn = priceCog,
width = 400, height = 400,
lims = list(x = "same"))
# Actually view the plots
trelliscope::view()
###### Big-n methods
## Prepare a housing data set
kc <- read_csv("http://www.math.chalmers.se/Stat/Grundutb/GU/MSA220/S18/kc_house_data.csv",
col_types = cols(
.default = col_double(),
id = col_character(),
date = col_datetime(),
yr_built = col_integer(),
yr_renovated = col_integer()))
# Do some basic cleanup, look at prices on the log scale, get rid of squares
# for everything given in square feet and convert yr_renovated to a binary
# "renovated or not"
kc <- kc %>%
select(-id, -date, -zipcode) %>%
mutate(
price = log10(price),
renovated = case_when(
yr_renovated != 0 ~ 1,
TRUE ~ 0),
sqft_living = sqrt(sqft_living),
sqft_lot = sqrt(sqft_lot),
sqft_above = sqrt(sqft_above),
sqft_basement = sqrt(sqft_basement),
sqft_living15 = sqrt(sqft_living15),
sqft_lot15 = sqrt(sqft_lot15)) %>%
select(-yr_renovated)
##### Bag of little bootstraps
##### This implementation interprets the bag of little bootstraps as a
##### technique in the divide and conquer framework
# This splits the original data into random subsets of roughly the
# specified size
rrkc <- datadr::divide(
kc, by = datadr::rrDiv(500), update = TRUE)
kcBLB <- rrkc %>% datadr::addTransform(function(x) {
drBLB(
x,
statistic = function(x, weights) {
coef(glm(price ~ sqft_living + bedrooms + condition + grade,
data = x, family = "gaussian", weights = weights))
},
metric = function(x) {
quantile(x, c(0.025, 0.975))
},
R = 100,
n = nrow(rrkc))
})
coefs <- datadr::recombine(kcBLB, datadr::combMean)
matrix(coefs, ncol = 2, byrow = TRUE)
##### Leveraging
# Let's try to predict prices from the remaining variables
# Scale them since they are on different scales
kc_std <- kc %>%
mutate_at(
vars(-price, -renovated, -waterfront),
scale) %>%
mutate(price = scale(price, scale = FALSE))
X <- as.matrix(kc_std %>% select(-price))
y <- kc_std$price
# More than 20000 observations
dim(X)
# Since the data is not crazy large, we could run the linear model
# on all data directly
fit_all <- lm(price ~ ., kc_std)
# Perform leveraging
X_svd <- svd(X, nv = 0)
leverage <- apply(X_svd$u ^ 2, 1, sum)
# Looking at e.g. a boxplot of leveraging makes it clear that one
# data point probably is an outlier
# In a real analysis you'd look at that data point in detail and decide
# whether it is to be excluded or not
boxplot(leverage)
# Sample 5000 observations from the original data where each observation
# has probability hii / sum(hii) to be chosen
n <- 5000
probs <- leverage / sum(leverage)
idx <- sample(nrow(kc), 5000, replace = FALSE, prob = probs)
fit_lev <- lm(price ~ ., kc_std[idx,])
# Compare these results, there is not much difference between
# fit_lev and fit_all (from an interpretation
# point of view, not in exact numbers), whereas the first is only based on 5000
# instead of over 20000 data points. Differences are notable in small
# coefficients. The intercept, sqft_above, and sqft_basement are some of the
# smallest coefficients and more data makes them significant, which is a
# typical big-n effect
summary(fit_all)
confint(fit_all)
summary(fit_lev)
confint(fit_lev)