##
# A rich data set with many variables.
# We'll consider modeling the price and the mpg as a
# function of other variables.
#
print(names(cars2))
par(mfrow=c(2,2))
plot(cars2$mid,cars2$ci,main="citympg on price")
plot(cars2$hw,cars2$ci,main="citympg on hwmpg")
plot(cars2$le,cars2$ci,main="citympg on length")
plot(cars2$engsize,cars2$ci,main="citympg on enginesize")
p<-locator()
plot(cars2$le,cars2$wi,main="width on length")
plot(cars2$wi,cars2$we,main="weight on width")
plot(cars2$le,cars2$wh,main="wheelbase on length")
plot(cars2$lu,cars2$rea,main="rearroom on luggage room")
p<-locator()
# Clearly, the city mpg is not linearly related to many of the variables,
# and some of the potential x-variables are related to each other.
# Try at home - polynomial models for price.
#
# Start modeling mpg
# Intro to ANCOVA
#
par(mfrow=c(2,2))
hist(cars2$ci)
qqnorm(cars2$ci)
qqline(cars2$ci)
# Clearly, the distribution of mpg is very skewed.
# Before jumping into polynomial modeling, let's try a simple transformation
# into gpm (gallons per mile) instead of mpg.
p<-locator()
hist(1/cars2$ci)
qqnorm(1/cars2$ci)
qqline(1/cars2$ci)
#
p<-locator()
plot(cars2$we,1/cars2$ci,main="GPM on weight")
plot(cars2$wi,1/cars2$ci,main="GPM on width")
plot(cars2$ty,1/cars2$ci,main="GPM on type")
plot(cars2$cyl,1/cars2$ci,main="GPM on cylnbr")
p<-locator()
# Some indication of a linear relationship of GPM on weight,
# and a looser one on width.
# Overall size runs through all of these predictors, except that sporty cars
# tend to be small but have GPM like large cars!
# For now, drop cyl.
#
# Focus on the qualitative variable type: let's model it
# as a factor variable (separate intercept for each car type).
#
mod1<-lm((1/cars2$ci)~-1+cars2$we+cars2$wi+as.factor(cars2$ty))
print(summary(mod1))
mod1s<-summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
# Residual plots look OK, but perhaps 3 outliers (observations 5, 37, 80).
p<-locator()
# Above - all car types have their own intercept.
# We can fit this using indicator variables (0 or 1) for each car type and add
# these indicators to our set of explanatory x-variables.
xc<-matrix(0,82,5)                  # 82 cars, one 0/1 indicator column per type
xc[cars2$type=="Compact",1]<-1
xc[cars2$type=="Small",2]<-1
xc[cars2$type=="Midsize",3]<-1
xc[cars2$type=="Large",4]<-1
xc[cars2$type=="Sporty",5]<-1
mod1<-lm((1/cars2$ci)~-1+cars2$we+cars2$wi+xc)
print(summary(mod1))
mod1s<-summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
p<-locator()
# Same fit as the as.factor() version above.
# Note how good the fit is.
#
# Alternative - use compact as the baseline for comparison.
mod1<-lm((1/cars2$ci)~cars2$we+cars2$wi+as.factor(cars2$ty))
print(summary(mod1))
mod1s<-summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
p<-locator()
xc2<-xc
xc2[,1]<-1                          # column 1 becomes the intercept (all ones)
mod1<-lm((1/cars2$ci)~-1+cars2$we+cars2$wi+xc2)
print(summary(mod1))
mod1s<-summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
p<-locator()
# Same thing with indicator variables,
# with compact corresponding to the intercept.
#########
mod1<-lm((1/cars2$ci)~cars2$we+cars2$wi+as.factor(cars2$ty))
# back to using compact as the baseline....
#
# some diagnostic plots
par(mfrow=c(2,2))
lmm<-lm.influence(mod1)
e.std<-mod1$res/sqrt(1-lmm$hat)     # residuals rescaled by leverage
plot(mod1$fit,e.std,main="Residuals (standardized) on fitted")
abline(h=0)
qqnorm(e.std,main="residuals normal?")
qqline(e.std)
plot(cars2$we,e.std,main="Residuals on weight")
abline(h=0)
plot(cars2$ty,e.std,main="Residuals on type")
abline(h=0)
p<-locator()
#
par(mfrow=c(2,2))
plot(lmm$hat,type='h')
plot(lmm$coef[,1],type='h')
plot(lmm$coef[,2],type='h')
plot(lmm$sig,type='h')
p<-locator()
# obs 80 and 37 outliers?
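#
# As a cross-check of the hand-rolled diagnostics above, here is a minimal
# sketch using the standard stats-package helpers rstandard(), rstudent(),
# and cooks.distance(); it is an aside, not part of the original analysis,
# and should point at the same suspect observations.
r.int<-rstandard(mod1)                 # internally studentized residuals
r.ext<-rstudent(mod1)                  # externally studentized residuals
cd<-cooks.distance(mod1)               # overall influence of each car
par(mfrow=c(2,2))
plot(mod1$fit,r.int,main="Studentized residuals on fitted")
abline(h=0)
qqnorm(r.ext,main="studentized residuals normal?")
qqline(r.ext)
plot(cd,type='h',main="Cook's distance")
print(order(cd,decreasing=TRUE)[1:5])  # indices of the most influential cars
p<-locator()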
# fivenum summaries
#FNS<-cbind(fivenum(1/cars2$ci),fivenum(cars2$we),fivenum(cars2$wi))
#print(FNS)
#boxplot(1/cars2$ci~as.factor(cars2$typ))
#boxplot(cars2$we~as.factor(cars2$typ))
#boxplot(cars2$wi~as.factor(cars2$typ))
#
#print(c(1/cars2$ci[80],cars2$we[80],cars2$wi[80],cars2$type[80]))
#print(c(1/cars2$ci[37],cars2$we[37],cars2$wi[37],cars2$type[37]))
# 80 is a sporty car, high-end on GPM and we, but low-end on wi.
# 37 is a small car, low-end on GPM, typical we, but high-end on wi.
# 80 could be influential; 37 is more of a 'pure outlier'.
#
#p<-locator()
print(step(mod1))
# drop type and wi! only we is kept
p<-locator()
mod2<-lm((1/cars2$ci)~cars2$we)
print(summary(mod1))
print(summary(mod2))
p<-locator()
#
print(anova(mod1,mod2))
#
#
print("try dropping 37 and 80?")
mod1c<-lm((1/cars2$ci)[-c(37,80)]~cars2$we[-c(37,80)]+cars2$wi[-c(37,80)]+as.factor(cars2$ty)[-c(37,80)])
print(summary(mod1c))
p<-locator()
print(step(mod1c))
# Same final model, but we trust the tests more now that sigma is estimated
# more stably (without the two suspect observations).
#
# some diagnostic plots
p<-locator()
par(mfrow=c(2,2))
plot(mod1c)
p<-locator()
par(mfrow=c(2,2))
lmm<-lm.influence(mod1c)
e.std<-mod1c$res/sqrt(1-lmm$hat)       # residuals rescaled by leverage
plot(mod1c$fit,e.std,main="Residuals (standardized) on fitted")
abline(h=0)
qqnorm(e.std,main="residuals normal?")
qqline(e.std)
plot(cars2$we[-c(37,80)],e.std,main="Residuals on weight")
abline(h=0)
plot(cars2$ty[-c(37,80)],e.std,main="Residuals on type")
abline(h=0)
p<-locator()
#
par(mfrow=c(2,2))
plot(lmm$hat,type='h')
plot(lmm$coef[,1],type='h')
plot(lmm$coef[,2],type='h')
plot(lmm$sig,type='h')
p<-locator()
# perhaps 5 and 10?
#
# The fit is a little better.
# Let's compute CIs for the coefficient estimates.
mod1cs<-summary(mod1c)
SEbeta<-mod1cs$coef[,2]
CI<-cbind(mod1cs$coef[,1]-qt(.975,mod1c$df)*SEbeta,
          mod1cs$coef[,1]+qt(.975,mod1c$df)*SEbeta)
print("Coefficient CI")
print(CI)
# Only the we (weight) interval excludes 0; for the other coefficients we
# cannot reject beta=0.
#
#
p<-locator()
mod2c<-lm((1/cars2$ci)[-c(37,80)]~cars2$we[-c(37,80)])
mod3c<-lm((1/cars2$ci)[-c(37,80)]~cars2$we[-c(37,80)]-1)
print(anova(mod1c,mod2c,mod3c))
# can drop the intercept and all predictors except weight
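#
# A quick cross-check of the hand-built intervals above: a minimal sketch
# using base R's confint(), which computes the same t-based CIs directly
# from the fitted model.
print(confint(mod1c,level=0.95))       # should match the CI matrix above
#
# Reading off the weight-only model: GPM is roughly linear in weight, and a
# fitted GPM converts back to mpg by taking the reciprocal.  The weight value
# w0 below is purely illustrative, not taken from the data.
b<-coef(mod2c)
w0<-3000                               # hypothetical weight value
gpm.hat<-b[1]+b[2]*w0                  # fitted gallons per mile at weight w0
print(1/gpm.hat)                       # implied city mpg at weight w0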