# package for big n methods
library(datadr)
library(dplyr)
library(ggplot)
kc<-read.csv("kc_house_data.csv")
names(kc)
head(kc)
dim(kc)
#
kc<-kc[,-c(1,2,17)]
kc$price<-log10(kc$price)
kc$yr_renovated[kc$yr_renovated!=0]<-1 # 
### Leveraging
library(rsvd)
ss<-rsvd(as.matrix(kc[,-1])) # excluding the response variable
lev<-apply(ss$u^2,1,sum)
# the leverage values
# sampling 5000 observations with different probabilities given by leverage
probs<-lev*5000/sum(lev)
probs[probs>1]<-1
pp<-rbinom(dim(kc)[1],prob=probs,size=1)
#
library(GGally)
ggpairs(data=kc, columns=c(4,6,11,12,3),
        axisLabels="show")
#
ggpairs(data=kc[pp==1,], columns=c(4,6,11,12,3),
        axisLabels="show")
ggpairs(data=kc[pp==1,c(2,3,4,5,1)],
        axisLabels="show")
# add the linear fits and smooth fits to plot
add_lines <- function(data, mapping, ...){
  p <- ggplot(data = data, mapping = mapping) + 
    geom_point() + 
    geom_smooth(method="lm", fill="red", color="red", ...)
  p
}
#
ggpairs(kc[pp==1,],columns = c(2,3,4,5,1), lower = list(continuous = add_lines)) # takes a while
#
kc[,c(4,5,11,12,17,18)]<-sqrt(kc[,c(4,5,11,12,17,18)])
# a clear outlier!
boxplot(lev)
#
ss<-rsvd(as.matrix(kc[,-1])) # excluding the response variable
lev<-apply(ss$u^2,1,sum)
probs<-lev*5000/sum(lev)
probs[probs>1]<-1
pp<-rbinom(dim(kc)[1],prob=probs,size=1)
#
ggpairs(kc[pp==1,],columns = c(2,3,4,5,1), lower = list(continuous = add_lines)) # takes a while
#
boxplot(lev)
kc2<-kc[lev<0.015,]
#
ss<-rsvd(as.matrix(kc2[,-1])) # excluding the response variable
lev<-apply(ss$u^2,1,sum)
probs<-lev*5000/sum(lev)
probs[probs>1]<-1
pp<-rbinom(dim(kc2)[1],prob=probs,size=1)
#
ggpairs(kc2[pp==1,],columns = c(2,3,4,5,1), lower = list(continuous = add_lines)) # takes a while
#########
mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc2)
par(mfrow=c(2,2))
plot(mm)
summary(mm)
# Notice that everything is super significant while the R-squared is "only" 57 percent...
# This is the "problem" with large sample sizes - everything becomes significant.
#
# Compare with a smaller sample size
mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc2,
       subset=sample(seq(1,21000),250))
par(mfrow=c(2,2))
plot(mm)
summary(mm)
# Leverage results
mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc2,subset=seq(1,dim(kc2)[1])[pp==1])
coef(mm)
confint(mm) #fast
par(mfrow=c(2,2))
plot(mm) # 
#
# running on the whole data
mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc2)
coef(mm)
confint(mm) 
plot(mm) # difficult to see trends in plots with many points...
########################################
# BLB method
rrkc<- divide(kc2, by = rrDiv(500), update = TRUE)
# 
###### Here you choose your own statistics of interest
###### I am asking for the L and U limits of a 95% CI for each regression coefficient here
###### but you could ask for the estimates also
kcBlb <- rrkc%>% addTransform(function(x) {
  drBLB(x,
        statistic = function(x, weights)
          coef(glm(price ~ sqft_living+bedrooms+condition+grade,
                   data = x, weights = weights, family="gaussian")),
        metric = function(x)
          quantile(x, c(0.025, 0.975)), ### Here you choose your function of interest
        R = 100,
        n = nrow(rrkc)
  )
})
######
coefs <- recombine(kcBlb, combMean) # takes a while for the inference
# compare BLB and full data results
matrix(coefs, ncol = 2, byrow = TRUE)
# compare with original data CIs
confint(lm(price~sqft_living+bedrooms+condition+grade, data = kc2))
### 
#####################################
###
# run again but on 10 times as much data...
kc3<-kc[sample(seq(1,dim(kc2)[1]),200000,replace=T),]
#
ss<-rsvd(as.matrix(kc3[,-1])) # excluding the response variable
lev<-apply(ss$u^2,1,sum)
probs<-lev*5000/sum(lev)
probs[probs>1]<-1
pp<-rbinom(dim(kc3)[1],prob=probs,size=1)
#
ggpairs(kc3[pp==1,],columns = c(2,3,4,5,1), lower = list(continuous = add_lines)) # takes a while
#########
mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc3)
par(mfrow=c(2,2))
plot(mm) # takes forever....
summary(mm)
###
# Leverage results
mm<-lm(price~sqft_living+bedrooms+condition+grade,
       data=kc3,subset=seq(1,dim(kc3)[1])[pp==1])
coef(mm)
confint(mm) #fast
par(mfrow=c(2,2))
plot(mm) # 
#
boxplot(lev)
kc3<-kc3[lev<0.002,]
# remove extremes...
#
ss<-rsvd(as.matrix(kc3[,-1])) # excluding the response variable
lev<-apply(ss$u^2,1,sum)
probs<-lev*5000/sum(lev)
probs[probs>1]<-1
pp<-rbinom(dim(kc3)[1],prob=probs,size=1)
#
ggpairs(kc3[pp==1,],columns = c(2,3,4,5,1), lower = list(continuous = add_lines)) # takes a while
#########
mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc3)
summary(mm)
###
# Leverage results
mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc3,subset=seq(1,dim(kc3)[1])[pp==1])
coef(mm)
confint(mm) #fast
par(mfrow=c(2,2))
plot(mm) # 
########################################
# BLB method
rrkc<- divide(kc3, by = rrDiv(500), update = TRUE)
# 
###### Here you choose your own statistics of interest
###### I am asking for the L and U limits of a 95% CI for each regression coefficient here
###### but you could ask for the estimates also
kcBlb <- rrkc%>% addTransform(function(x) {
  drBLB(x,
        statistic = function(x, weights)
          coef(glm(price ~ sqft_living+bedrooms+condition+grade,
                   data = x, weights = weights, family="gaussian")),
        metric = function(x)
          quantile(x, c(0.025, 0.975)), ### Here you choose your function of interest
        R = 100,
        n = nrow(rrkc)
  )
})
######
coefs <- recombine(kcBlb, combMean) # takes a while for the inference
# compare BLB and full data results
matrix(coefs, ncol = 2, byrow = TRUE)
# compare with original data CIs
confint(lm(price~sqft_living+bedrooms+condition+grade, data = kc3))
###