# package for big n methods library(datadr) library(dplyr) library(nycflights13) head(flights) data(weather) # flights$hour <- ifelse(flights$hour == 24, 0, flights$hour) #Joining the 'flights' and 'weather' datasets based on unique identifiers. flights_weather <- left_join(flights, weather) #Only focus on delays not early arrivals flights_weather$arr_delay <- ifelse(flights_weather$arr_delay >= 0, flights_weather$arr_delay, 0) flights_weather$dep_delay <- ifelse(flights_weather$dep_delay >= 0, flights_weather$dep_delay, 0) flights_weather$total_delay <- flights_weather$arr_delay + flights_weather$dep_delay # select a subset of interesting variables flights_weather<-select(flights_weather,month,day,sched_dep_time,dep_delay,sched_arr_time,arr_delay,carrier, origin,dest,air_time,distance,temp,dewp,humid,wind_dir,wind_speed,precip,pressure,visib,total_delay) ###### # ######################################################################3 ### Leveraging library(rsvd) # random SVD for fast computation flights_u<-select(flights_weather,month,day,sched_dep_time,air_time,distance,temp,dewp,humid,wind_dir,wind_speed,precip,pressure,visib,total_delay) # selected variables # flights_u2<-select(flights_weather,month,day,sched_dep_time,air_time,distance,total_delay) vv<-apply(is.na(flights_u2),1,sum) flights_u2<-flights_u2[vv==0,] # remove missing values flights_u2<-as.data.frame(flights_u2) # create month indicator variables flights_u2[["Feb"]]<-rep(0,dim(flights_u2)[1]) flights_u2[["Mar"]]<-rep(0,dim(flights_u2)[1]) flights_u2[["Apr"]]<-rep(0,dim(flights_u2)[1]) flights_u2[["May"]]<-rep(0,dim(flights_u2)[1]) flights_u2[["Jun"]]<-rep(0,dim(flights_u2)[1]) flights_u2[["Jul"]]<-rep(0,dim(flights_u2)[1]) flights_u2[["Aug"]]<-rep(0,dim(flights_u2)[1]) flights_u2[["Sep"]]<-rep(0,dim(flights_u2)[1]) flights_u2[["Oct"]]<-rep(0,dim(flights_u2)[1]) flights_u2[["Nov"]]<-rep(0,dim(flights_u2)[1]) flights_u2[["Dec"]]<-rep(0,dim(flights_u2)[1]) # flights_u2$Feb[flights_u2$month==2]<-1 flights_u2$Mar[flights_u2$month==3]<-1 flights_u2$Apr[flights_u2$month==4]<-1 flights_u2$May[flights_u2$month==5]<-1 flights_u2$Jun[flights_u2$month==6]<-1 flights_u2$Jul[flights_u2$month==7]<-1 flights_u2$Aug[flights_u2$month==8]<-1 flights_u2$Sep[flights_u2$month==9]<-1 flights_u2$Oct[flights_u2$month==10]<-1 flights_u2$Nov[flights_u2$month==11]<-1 flights_u2$Dec[flights_u2$month==12]<-1 # flights_u2<-flights_u2[,-c(1,2)] # drop month and day # ss<-rsvd(as.matrix(flights_u2[,-dim(flights_u2)[2]])) # excluding the response variable lev<-apply(ss$u^2,1,sum) # the leverage values # sampling 5000 observations with different probabilities given by leverage probs<-lev*5000/sum(lev) probs[probs>1]<-1 pp<-rbinom(dim(flights_u2)[1],prob=probs,size=1) # Leverage results mm<-glm(log(total_delay+1)~., data=flights_u2,subset=seq(1,dim(flights_u2)[1])[pp==1],family=gaussian) coef(mm) confint(mm) #fast par(mfrow=c(2,2)) plot(mm) # fast to check diagnostics - may want to look for outliers here... # and transformations that might help # # running on the whole data mm<-glm(log(total_delay+1)~., data=flights_u2,family=gaussian) coef(mm) confint(mm) # takes a lot longer... #### some plots - transformations? perhaps turn delay into a 0-1 variable for delays>30 min and run binomial regression instead? pairs(flights_u2[pp==1,c(1:3,15)]) plot(flights_u2$air_time,log(flights_u2$total_delay+1)) plot(flights_u2$air_time[pp==1],log(flights_u2$total_delay[pp==1]+1)) # ######################################## # BLB method rrflights<- divide(flights_u2, by = rrDiv(500), update = TRUE) # ###### Here you choose your own statistics of interest ###### I am asking for the L and U limits of a 95% CI for each regression coefficient here ###### but you could ask for the estimates also flightsBlb <- rrflights%>% addTransform(function(x) { drBLB(x, statistic = function(x, weights) coef(glm(log(total_delay+1) ~ ., data = x, weights = weights, family="gaussian")), metric = function(x) quantile(x, c(0.025, 0.975)), R = 100, n = nrow(rrflights) ) }) ###### coefs <- recombine(flightsBlb, combMean) # takes a while for the inference # compare BLB and full data results matrix(coefs, ncol = 2, byrow = TRUE) # compare with original data CIs confint(lm(log(total_delay+1)~., data = flights_u2))