##### Setup

# General purpose packages (data handling, pretty plotting, ...)
library(tidyverse)
library(lattice)
library(latex2exp) # Latex in ggplot2 labels

# Colour-blind friendly palette
cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Set a less verbose standard theme
theme_set(theme_minimal() + theme(
  axis.title = element_text(size = 7),
  axis.text = element_text(size = 7),
  plot.title = element_text(size = 8),
  plot.subtitle = element_text(size = 7),
  legend.text = element_text(size = 7),
  legend.title = element_text(size = 7),
  strip.text = element_text(size = 7)))

# Packages for actual computation
# library(subspace)       # Kind of an annoying package. Hard to install and has
#                         # to be loaded globally. However, this seems to be the
#                         # only reasonable package for subspace clustering in R
# library(ggraph)         # Used for plotting of graphs
# library(data.table)     # Used for reading data from a URL, but this is actually
#                         # a very good implementation of data frames that is
#                         # highly optimized for large scale computation on a
#                         # single machine
# library(ParallelForest) # Can be installed from GitHub with
#                         # devtools::install_github("bert9bert/ParallelForest")
#                         # Only used for a certain dataset here
# library(datadr)         # Package for divide and recombine. Implements e.g.
#                         # the bag of little bootstraps
# library(trelliscope)    # Can be installed from GitHub with
#                         # devtools::install_github("delta-rho/trelliscope")
# library(housingData)    # This package provides a data set of housing data
# library(dqrng)          # Fast random number generators
# library(rsvd)           # An implementation of randomized low-rank SVD, similar
#                         # to the algorithm described in the lecture

### Exploring the Johnson-Lindenstrauss theorem and why it is useful even
### though distances in high dimensions are generally considered useless

# Dimensions to look at
ps <- ceiling(exp(seq(0, 10, length.out = 20)))
# Number of samples
n <- 100

# Calculate distances between vectors
ds <- cbind(p = ps, do.call(rbind, lapply(ps, function(p) {
  X <- matrix(dqrng::dqrnorm(n * p), nrow = n)
  D <- dist(X)
  c(min = min(D),
    max = max(D),
    # Mean distance normalised by sqrt(2p), the typical distance scale
    # between two standard normal vectors in p dimensions
    mean = mean(D) / sqrt(2 * p),
    sd = sd(D),
    dmm = max(D) - min(D))
})))

# Calculate distances to nearest and farthest neighbour
ds_nn <- cbind(p = ps, do.call(rbind, lapply(ps, function(p) {
  X <- matrix(dqrng::dqrnorm(n * p), nrow = n)
  D <- dist(X)
  # Row 2 is the nearest neighbour (row 1 is the point itself at distance 0),
  # row 100 is the farthest neighbour
  minmax <- apply(as.matrix(D), 2, sort)[c(2, 100), ]
  minmax <- rowMeans(minmax)
  c(min = minmax[1],
    max = minmax[2],
    diff = minmax[2] - minmax[1],
    rel = (minmax[2] - minmax[1]) / mean(minmax))
})))

# Observations by inspecting the matrices ds and ds_nn
# 1) Both the minimum and maximum distances between vectors grow with
#    dimension.
# 2) The difference between the minimum and maximum distance is roughly the
#    same in low and high dimensions.
# 3) The distance to the nearest and farthest neighbour grows with dimension.
# 4) The difference in distance to the nearest and the farthest neighbour
#    remains roughly the same across dimensions.
# 5) If any sample is taken as the reference point, then the relative
#    difference in distance to its nearest and farthest neighbour goes to
#    zero as the dimension increases, i.e. the difference in distance to the
#    nearest and farthest neighbour becomes negligible relative to how large
#    these distances become in magnitude.
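# As a complement to the experiments above, here is a minimal sketch (not
# part of the original analysis) of the Johnson-Lindenstrauss idea itself:
# projecting onto k random Gaussian directions, scaled by 1 / sqrt(k),
# approximately preserves pairwise distances. The dimensions p_jl, k_jl and
# the sample size n_jl are arbitrary choices for illustration only.
p_jl <- 2000
k_jl <- 200
n_jl <- 50
X_jl <- matrix(dqrng::dqrnorm(n_jl * p_jl), nrow = n_jl)
# Random Gaussian projection matrix
R_jl <- matrix(dqrng::dqrnorm(p_jl * k_jl), nrow = p_jl) / sqrt(k_jl)
Z_jl <- X_jl %*% R_jl
# Ratios of projected to original pairwise distances; these should be
# concentrated around 1
summary(as.vector(dist(Z_jl) / dist(X_jl)))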
#### Big Data visualisation

## Trelliscope can be used for interactive exploratory data analysis
## This example is mostly stolen from their official tutorial at
## http://deltarho.org/docs-trelliscope/

# Establish a trelliscope visual database connection (the directory
# vdb needs to be created in the current working directory first)
conn <- trelliscope::vdbConn("vdb", name = "SLBD")

# Split the data by county and state
byCounty <- datadr::divide(housingData::housing, by = c("county", "state"))

# Create a panel function telling what to plot for each subset
timePanel <- function(x) {
  lattice::xyplot(medListPriceSqft + medSoldPriceSqft ~ time,
    data = x, auto.key = TRUE, ylab = "Price / Sq. Ft.")
}

# Test the visualisation
timePanel(byCounty[[20]]$value)

# Create a cognostics function; a cognostic is a metric that tells us an
# interesting attribute about a subset of the data
# The function needs to return a list
priceCog <- function(x) {
  zillowString <- gsub(" ", "-", do.call(paste, datadr::getSplitVars(x)))
  list(
    slope = trelliscope::cog(coef(lm(medListPriceSqft ~ time, data = x))[2],
      desc = "list price slope"),
    meanList = trelliscope::cogMean(x$medListPriceSqft),
    meanSold = trelliscope::cogMean(x$medSoldPriceSqft),
    nObs = trelliscope::cog(length(which(!is.na(x$medListPriceSqft))),
      desc = "number of non-NA list prices"),
    zillowHref = trelliscope::cogHref(
      sprintf("http://www.zillow.com/homes/%s_rb/", zillowString),
      desc = "zillow link")
  )
}

# Test the cognostics function
priceCog(byCounty[[1]]$value)

# Create the visual database which makes the following exploratory
# data analysis much quicker
trelliscope::makeDisplay(
  byCounty,
  name = "list_sold_vs_time_example",
  desc = "List and sold price over time",
  panelFn = timePanel,
  cogFn = priceCog,
  width = 400, height = 400,
  lims = list(x = "same"))

# Actually view the plots
trelliscope::view()

###### Big-n methods

## Prepare a housing data set
kc <- read_csv("http://www.math.chalmers.se/Stat/Grundutb/GU/MSA220/S18/kc_house_data.csv",
  col_types = cols(
    .default = col_double(),
    id = col_character(),
    date = col_datetime(),
    yr_built = col_integer(),
    yr_renovated = col_integer()))

# Do some basic cleanup: look at prices on the log scale, get rid of squares
# for everything given in square feet and convert yr_renovated to a binary
# "renovated or not"
kc <- kc %>%
  select(-id, -date, -zipcode) %>%
  mutate(
    price = log10(price),
    renovated = case_when(
      yr_renovated != 0 ~ 1,
      TRUE ~ 0),
    sqft_living = sqrt(sqft_living),
    sqft_lot = sqrt(sqft_lot),
    sqft_above = sqrt(sqft_above),
    sqft_basement = sqrt(sqft_basement),
    sqft_living15 = sqrt(sqft_living15),
    sqft_lot15 = sqrt(sqft_lot15)) %>%
  select(-yr_renovated)

##### Bag of little bootstraps
##### This implementation interprets the bag of little bootstraps as a
##### technique in the divide and conquer framework

# This splits the original data into random subsets of roughly the
# specified size
rrkc <- datadr::divide(
  kc,
  by = datadr::rrDiv(500),
  update = TRUE)

kcBLB <- rrkc %>% datadr::addTransform(function(x) {
  datadr::drBLB(
    x,
    statistic = function(x, weights) {
      coef(glm(price ~ sqft_living + bedrooms + condition + grade,
        data = x, family = "gaussian", weights = weights))
    },
    metric = function(x) {
      quantile(x, c(0.025, 0.975))
    },
    R = 100,
    n = nrow(rrkc))
})

coefs <- datadr::recombine(kcBLB, datadr::combMean)
# Reshape so that each row holds the averaged 95% interval for one coefficient
matrix(coefs, ncol = 2, byrow = TRUE)
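# To make explicit what drBLB does above, here is a small hand-rolled sketch
# of the bag of little bootstraps using only base R. The subset size b, the
# number of subsets s and the number of resamples R are arbitrary
# illustration choices, not taken from the analysis above: each subset of
# size b is "resampled up" to the full sample size n via multinomial weights,
# the weighted statistic is computed R times, a quantile interval is formed
# per subset, and the intervals are averaged across subsets.
blb_sketch <- function(data, b = 500, s = 10, R = 50) {
  n <- nrow(data)
  subsets <- replicate(s, data[sample(n, b), , drop = FALSE],
                       simplify = FALSE)
  cis <- lapply(subsets, function(sub) {
    stats <- replicate(R, {
      # Multinomial weights summing to n play the role of a size-n resample
      w <- as.vector(rmultinom(1, n, rep(1 / b, b)))
      coef(lm(price ~ sqft_living + bedrooms + condition + grade,
              data = sub, weights = w))
    })
    # 2 x (number of coefficients) matrix of interval endpoints
    apply(stats, 1, quantile, probs = c(0.025, 0.975))
  })
  # Average the interval endpoints across subsets
  Reduce(`+`, cis) / length(cis)
}
blb_sketch(kc)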
##### Leveraging

# Let's try to predict prices from the remaining variables
# Scale them since they are on different scales
kc_std <- kc %>%
  mutate_at(
    vars(-price, -renovated, -waterfront),
    scale) %>%
  mutate(price = scale(price, scale = FALSE))

X <- as.matrix(kc_std %>% select(-price))
y <- kc_std$price

# More than 20000 observations
dim(X)

# Since the data is not crazy large, we could run the linear model
# on all data directly
fit_all <- lm(price ~ ., kc_std)

# Perform leveraging: the leverage scores are the row sums of U^2 from
# the SVD of X
X_svd <- svd(X, nv = 0)
leverage <- apply(X_svd$u ^ 2, 1, sum)

# Looking at e.g. a boxplot of the leverage scores makes it clear that one
# data point probably is an outlier
# In a real analysis you'd look at that data point in detail and decide
# whether it should be excluded or not
boxplot(leverage)

# Sample 5000 observations from the original data where each observation
# has probability h_ii / sum(h_ii) of being chosen
n <- 5000
probs <- leverage / sum(leverage)
idx <- sample(nrow(kc_std), n, replace = FALSE, prob = probs)
fit_lev <- lm(price ~ ., kc_std[idx, ])

# Compare these results: there is not much difference between fit_lev and
# fit_all (from an interpretation point of view, not in exact numbers),
# whereas the former is based on only 5000 instead of over 20000 data points.
# Differences are notable in the small coefficients. The intercept,
# sqft_above, and sqft_basement are some of the smallest coefficients and
# more data makes them significant, which is a typical big-n effect.
summary(fit_all)
confint(fit_all)
summary(fit_lev)
confint(fit_lev)
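# As a small follow-up (not part of the original comparison), we can also
# contrast the leverage-based subsample with a plain uniform subsample of
# the same size. Leveraging is expected to stay closer to the full-data
# coefficients for a fixed subsample size, although the difference can be
# modest on well-behaved data.
idx_unif <- sample(nrow(kc_std), n, replace = FALSE)
fit_unif <- lm(price ~ ., kc_std[idx_unif, ])
# Side-by-side coefficient estimates: full data, leverage-based, uniform
cbind(
  all = coef(fit_all),
  leverage = coef(fit_lev),
  uniform = coef(fit_unif))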