# package for big n methods library(datadr) library(dplyr) library(ggplot) kc<-read.csv("kc_house_data.csv") names(kc) head(kc) dim(kc) # kc<-kc[,-c(1,2,17)] kc$price<-log10(kc$price) kc$yr_renovated[kc$yr_renovated!=0]<-1 # ### Leveraging library(rsvd) ss<-rsvd(as.matrix(kc[,-1])) # excluding the response variable lev<-apply(ss$u^2,1,sum) # the leverage values # sampling 5000 observations with different probabilities given by leverage probs<-lev*5000/sum(lev) probs[probs>1]<-1 pp<-rbinom(dim(kc)[1],prob=probs,size=1) # library(GGally) ggpairs(data=kc, columns=c(4,6,11,12,3), axisLabels="show") # ggpairs(data=kc[pp==1,], columns=c(4,6,11,12,3), axisLabels="show") ggpairs(data=kc[pp==1,c(2,3,4,5,1)], axisLabels="show") # add the linear fits and smooth fits to plot add_lines <- function(data, mapping, ...){ p <- ggplot(data = data, mapping = mapping) + geom_point() + geom_smooth(method="lm", fill="red", color="red", ...) p } # ggpairs(kc[pp==1,],columns = c(2,3,4,5,1), lower = list(continuous = add_lines)) # takes a while # kc[,c(4,5,11,12,17,18)]<-sqrt(kc[,c(4,5,11,12,17,18)]) # a clear outlier! boxplot(lev) # ss<-rsvd(as.matrix(kc[,-1])) # excluding the response variable lev<-apply(ss$u^2,1,sum) probs<-lev*5000/sum(lev) probs[probs>1]<-1 pp<-rbinom(dim(kc)[1],prob=probs,size=1) # ggpairs(kc[pp==1,],columns = c(2,3,4,5,1), lower = list(continuous = add_lines)) # takes a while # boxplot(lev) kc2<-kc[lev<0.015,] # ss<-rsvd(as.matrix(kc2[,-1])) # excluding the response variable lev<-apply(ss$u^2,1,sum) probs<-lev*5000/sum(lev) probs[probs>1]<-1 pp<-rbinom(dim(kc2)[1],prob=probs,size=1) # ggpairs(kc2[pp==1,],columns = c(2,3,4,5,1), lower = list(continuous = add_lines)) # takes a while ######### mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc2) par(mfrow=c(2,2)) plot(mm) summary(mm) # Notice that everything is super significant while the R-squared is "only" 57 percent... # This is the "problem" with large sample sizes - everything becomes significant. # # Compare with a smaller sample size mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc2, subset=sample(seq(1,21000),250)) par(mfrow=c(2,2)) plot(mm) summary(mm) # Leverage results mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc2,subset=seq(1,dim(kc2)[1])[pp==1]) coef(mm) confint(mm) #fast par(mfrow=c(2,2)) plot(mm) # # # running on the whole data mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc2) coef(mm) confint(mm) plot(mm) # difficult to see trends in plots with many points... ######################################## # BLB method rrkc<- divide(kc2, by = rrDiv(500), update = TRUE) # ###### Here you choose your own statistics of interest ###### I am asking for the L and U limits of a 95% CI for each regression coefficient here ###### but you could ask for the estimates also kcBlb <- rrkc%>% addTransform(function(x) { drBLB(x, statistic = function(x, weights) coef(glm(price ~ sqft_living+bedrooms+condition+grade, data = x, weights = weights, family="gaussian")), metric = function(x) quantile(x, c(0.025, 0.975)), ### Here you choose your function of interest R = 100, n = nrow(rrkc) ) }) ###### coefs <- recombine(kcBlb, combMean) # takes a while for the inference # compare BLB and full data results matrix(coefs, ncol = 2, byrow = TRUE) # compare with original data CIs confint(lm(price~sqft_living+bedrooms+condition+grade, data = kc2)) ### ##################################### ### # run again but on 10 times as much data... kc3<-kc[sample(seq(1,dim(kc2)[1]),200000,replace=T),] # ss<-rsvd(as.matrix(kc3[,-1])) # excluding the response variable lev<-apply(ss$u^2,1,sum) probs<-lev*5000/sum(lev) probs[probs>1]<-1 pp<-rbinom(dim(kc3)[1],prob=probs,size=1) # ggpairs(kc3[pp==1,],columns = c(2,3,4,5,1), lower = list(continuous = add_lines)) # takes a while ######### mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc3) par(mfrow=c(2,2)) plot(mm) # takes forever.... summary(mm) ### # Leverage results mm<-lm(price~sqft_living+bedrooms+condition+grade, data=kc3,subset=seq(1,dim(kc3)[1])[pp==1]) coef(mm) confint(mm) #fast par(mfrow=c(2,2)) plot(mm) # # boxplot(lev) kc3<-kc3[lev<0.002,] # remove extremes... # ss<-rsvd(as.matrix(kc3[,-1])) # excluding the response variable lev<-apply(ss$u^2,1,sum) probs<-lev*5000/sum(lev) probs[probs>1]<-1 pp<-rbinom(dim(kc3)[1],prob=probs,size=1) # ggpairs(kc3[pp==1,],columns = c(2,3,4,5,1), lower = list(continuous = add_lines)) # takes a while ######### mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc3) summary(mm) ### # Leverage results mm<-lm(price~sqft_living+bedrooms+condition+grade,data=kc3,subset=seq(1,dim(kc3)[1])[pp==1]) coef(mm) confint(mm) #fast par(mfrow=c(2,2)) plot(mm) # ######################################## # BLB method rrkc<- divide(kc3, by = rrDiv(500), update = TRUE) # ###### Here you choose your own statistics of interest ###### I am asking for the L and U limits of a 95% CI for each regression coefficient here ###### but you could ask for the estimates also kcBlb <- rrkc%>% addTransform(function(x) { drBLB(x, statistic = function(x, weights) coef(glm(price ~ sqft_living+bedrooms+condition+grade, data = x, weights = weights, family="gaussian")), metric = function(x) quantile(x, c(0.025, 0.975)), ### Here you choose your function of interest R = 100, n = nrow(rrkc) ) }) ###### coefs <- recombine(kcBlb, combMean) # takes a while for the inference # compare BLB and full data results matrix(coefs, ncol = 2, byrow = TRUE) # compare with original data CIs confint(lm(price~sqft_living+bedrooms+condition+grade, data = kc3)) ###