##
# A rich data set with many variables.
# We'll consider modeling the price and the mpg as a
# function of other variables.
#
print(names(cars2))
par(mfrow=c(2,2))
plot(cars2$mid,cars2$ci,main="citympg on price")
plot(cars2$hw,cars2$ci,main="citympg on hwmpg")
plot(cars2$le,cars2$ci,main="citympg on length")
plot(cars2$engsize,cars2$ci,main="citympg on enginesize")
p<-locator()
plot(cars2$le,cars2$wi,main="width on length")
plot(cars2$wi,cars2$we,main="weight on width")
plot(cars2$le,cars2$wh,main="wheelbase on length")
plot(cars2$lu,cars2$rea,main="rearroom on luggage room")
p<-locator()
# Clearly, the city mpg is not linearly related to many of the variables,
# and some of the potential x-variables are related to each other.
# Try at home - polynomial models for price.
#
# Start modeling mpg
# Intro to ANCOVA
#
par(mfrow=c(2,2))
hist(cars2$ci)
qqnorm(cars2$ci)
qqline(cars2$ci)
# Clearly, the distribution of mpg is very skewed.
# Before jumping into polynomial modeling, let's try a simple transformation
# into gpm (gallons per mile) instead of mpg.
p<-locator()
hist(1/cars2$ci)
qqnorm(1/cars2$ci)
qqline(1/cars2$ci)
#
p<-locator()
plot(cars2$we,1/cars2$ci,main="GPM on weight")
plot(cars2$wi,1/cars2$ci,main="GPM on width")
plot(cars2$ty,1/cars2$ci,main="GPM on type")
plot(cars2$cyl,1/cars2$ci,main="GPM on cylnbr")
p<-locator()
# Some indication of a linear relationship of GPM on weight,
# and a looser one on width.
# Overall size runs through all of these predictors, except that sporty cars
# tend to be small but have GPM like large cars!
# For now, drop cyl.
#
# Focus on the qualitative variable type: let's model it
# as a factor variable (separate intercept for each car type).
#
mod1<-lm((1/cars2$ci)~-1+cars2$we+cars2$wi+as.factor(cars2$ty))
print(summary(mod1))
mod1s<-summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
# Residual plots look OK, but perhaps 3 outliers (observations 5, 37, 80).
p<-locator()
# Above - all car types have their own intercept.
# We can fit this using indicator variables (0 or 1) for each car type and add
# these indicators to our set of explanatory x-variables.
xc<-matrix(0,82,5)                  # 82 cars, one 0/1 indicator column per type
xc[cars2$type=="Compact",1]<-1
xc[cars2$type=="Small",2]<-1
xc[cars2$type=="Midsize",3]<-1
xc[cars2$type=="Large",4]<-1
xc[cars2$type=="Sporty",5]<-1
mod1<-lm((1/cars2$ci)~-1+cars2$we+cars2$wi+xc)
print(summary(mod1))
mod1s<-summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
p<-locator()
# Same fit as the as.factor() version above.
# Note how good the fit is.
#
# Alternative - use compact as the baseline for comparison.
mod1<-lm((1/cars2$ci)~cars2$we+cars2$wi+as.factor(cars2$ty))
print(summary(mod1))
mod1s<-summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
p<-locator()
xc2<-xc
xc2[,1]<-1                          # column 1 becomes the intercept (all ones)
mod1<-lm((1/cars2$ci)~-1+cars2$we+cars2$wi+xc2)
print(summary(mod1))
mod1s<-summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
p<-locator()
# Same thing with indicator variables,
# with compact corresponding to the intercept.
#########
mod1<-lm((1/cars2$ci)~cars2$we+cars2$wi+as.factor(cars2$ty))
# back to using compact as the baseline....
#
# some diagnostic plots
par(mfrow=c(2,2))
lmm<-lm.influence(mod1)
e.std<-mod1$res/sqrt(1-lmm$hat)     # residuals rescaled by leverage
plot(mod1$fit,e.std,main="Residuals (standardized) on fitted")
abline(h=0)
qqnorm(e.std,main="residuals normal?")
qqline(e.std)
plot(cars2$we,e.std,main="Residuals on weight")
abline(h=0)
plot(cars2$ty,e.std,main="Residuals on type")
abline(h=0)
p<-locator()
#
par(mfrow=c(2,2))
plot(lmm$hat,type='h')
plot(lmm$coef[,1],type='h')
plot(lmm$coef[,2],type='h')
plot(lmm$sig,type='h')
p<-locator()
# obs 80 and 37 outliers?
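#
# As a cross-check of the hand-rolled diagnostics above, here is a minimal
# sketch using the standard stats-package helpers rstandard(), rstudent(),
# and cooks.distance(); it is an aside, not part of the original analysis,
# and should point at the same suspect observations.
r.int<-rstandard(mod1)                 # internally studentized residuals
r.ext<-rstudent(mod1)                  # externally studentized residuals
cd<-cooks.distance(mod1)               # overall influence of each car
par(mfrow=c(2,2))
plot(mod1$fit,r.int,main="Studentized residuals on fitted")
abline(h=0)
qqnorm(r.ext,main="studentized residuals normal?")
qqline(r.ext)
plot(cd,type='h',main="Cook's distance")
print(order(cd,decreasing=TRUE)[1:5])  # indices of the most influential cars
p<-locator()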
# fivenum summaries
#FNS<-cbind(fivenum(1/cars2$ci),fivenum(cars2$we),fivenum(cars2$wi))
#print(FNS)
#boxplot(1/cars2$ci~as.factor(cars2$typ))
#boxplot(cars2$we~as.factor(cars2$typ))
#boxplot(cars2$wi~as.factor(cars2$typ))
#
#print(c(1/cars2$ci[80],cars2$we[80],cars2$wi[80],cars2$type[80]))
#print(c(1/cars2$ci[37],cars2$we[37],cars2$wi[37],cars2$type[37]))
# 80 is a sporty car, high-end on GPM and we, but low-end on wi.
# 37 is a small car, low-end on GPM, typical we, but high-end on wi.
# 80 could be influential; 37 is more of a 'pure outlier'.
#
#p<-locator()
print(step(mod1))
# drop type and wi! only we is kept
p<-locator()
mod2<-lm((1/cars2$ci)~cars2$we)
print(summary(mod1))
print(summary(mod2))
p<-locator()
#
print(anova(mod1,mod2))
#
#
print("try dropping 37 and 80?")
mod1c<-lm((1/cars2$ci)[-c(37,80)]~cars2$we[-c(37,80)]+cars2$wi[-c(37,80)]+as.factor(cars2$ty)[-c(37,80)])
print(summary(mod1c))
p<-locator()
print(step(mod1c))
# Same final model, but we trust the tests more now that sigma is estimated
# more stably (without the two suspect observations).
#
# some diagnostic plots
p<-locator()
par(mfrow=c(2,2))
plot(mod1c)
p<-locator()
par(mfrow=c(2,2))
lmm<-lm.influence(mod1c)
e.std<-mod1c$res/sqrt(1-lmm$hat)       # residuals rescaled by leverage
plot(mod1c$fit,e.std,main="Residuals (standardized) on fitted")
abline(h=0)
qqnorm(e.std,main="residuals normal?")
qqline(e.std)
plot(cars2$we[-c(37,80)],e.std,main="Residuals on weight")
abline(h=0)
plot(cars2$ty[-c(37,80)],e.std,main="Residuals on type")
abline(h=0)
p<-locator()
#
par(mfrow=c(2,2))
plot(lmm$hat,type='h')
plot(lmm$coef[,1],type='h')
plot(lmm$coef[,2],type='h')
plot(lmm$sig,type='h')
p<-locator()
# perhaps 5 and 10?
#
# The fit is a little better.
# Let's compute CIs for the coefficient estimates.
mod1cs<-summary(mod1c)
SEbeta<-mod1cs$coef[,2]
CI<-cbind(mod1cs$coef[,1]-qt(.975,mod1c$df)*SEbeta,
          mod1cs$coef[,1]+qt(.975,mod1c$df)*SEbeta)
print("Coefficient CI")
print(CI)
# Only the we (weight) interval excludes 0; for the other coefficients we
# cannot reject beta=0.
#
#
p<-locator()
mod2c<-lm((1/cars2$ci)[-c(37,80)]~cars2$we[-c(37,80)])
mod3c<-lm((1/cars2$ci)[-c(37,80)]~cars2$we[-c(37,80)]-1)
print(anova(mod1c,mod2c,mod3c))
# can drop the intercept and all predictors except weight
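#
# A quick cross-check of the hand-built intervals above: a minimal sketch
# using base R's confint(), which computes the same t-based CIs directly
# from the fitted model.
print(confint(mod1c,level=0.95))       # should match the CI matrix above
#
# Reading off the weight-only model: GPM is roughly linear in weight, and a
# fitted GPM converts back to mpg by taking the reciprocal.  The weight value
# w0 below is purely illustrative, not taken from the data.
b<-coef(mod2c)
w0<-3000                               # hypothetical weight value
gpm.hat<-b[1]+b[2]*w0                  # fitted gallons per mile at weight w0
print(1/gpm.hat)                       # implied city mpg at weight w0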