# Here's my code for lab 2
# Only the key steps are included
#
print(names(polldata))
par(mfrow=c(2,2))
hist(polldata$mort)
qqnorm(polldata$mort)
qqline(polldata$mort)
plot(polldata$educ,polldata$mort,main="Mortality vs education")
plot(polldata$nox,polldata$mort,main="Mortality vs nox")
p<-locator()
# mort vs educ OK, perhaps a few outliers
# nox not OK
plot(polldata$educ,polldata$mort,main="Mortality vs education")
plot(log(polldata$nox),polldata$mort,main="Mortality vs nox")
p<-locator()
# logs help - but outliers present
#
mod1<-lm(mort~educ,data=polldata)
print(summary(mod1))
mod1s<-summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
p<-locator()
# some diagnostic plots
par(mfrow=c(2,2))
lmm<-lm.influence(mod1)
e.std<-mod1$res/sqrt(1-lmm$hat)
plot(mod1$fit,e.std,main="Residuals (standardized) on fitted")
abline(h=0)
qqnorm(e.std,main="residuals normal?")
qqline(e.std)
plot(polldata$educ,e.std,main="Residuals on education")
abline(h=0)
plot(log(polldata$nox),e.std,main="Residuals on log(nox)")
abline(h=0)
p<-locator()
#
par(mfrow=c(1,1))
plot(lmm$hat,xlab="Duration", ylab="Leverage",type="h")
abline(h=2*2/length(pollution$mort))
p<-locator()
par(mfrow=c(2,1))
plot(lmm$coeff[,2], ylab="change in educ slope")
abline(h=0)
plot(lmm$sigma,ylab="Residual MSE when obs is dropped")
abline(h=mod1s$sig)
p<-locator()
# worried about 28, 37 and 59.
#
#
#
# Now try nox
mod2<-lm(mort~log(nox),data=polldata)
print(summary(mod2))
mod2s<-summary(mod2)
par(mfrow=c(2,2))
plot(mod2)
p<-locator()
# some diagnostic plots
par(mfrow=c(2,2))
lmm<-lm.influence(mod2)
e.std<-mod2$res/sqrt(1-lmm$hat)
plot(mod2$fit,e.std,main="Residuals (standardized) on fitted")
abline(h=0)
qqnorm(e.std,main="residuals normal?")
qqline(e.std)
plot(polldata$educ,e.std,main="Residuals on education")
abline(h=0)
plot(log(polldata$nox),e.std,main="Residuals on log(nox)")
abline(h=0)
p<-locator()
#
par(mfrow=c(1,1))
plot(lmm$hat,xlab="", ylab="Leverage",type="h")
abline(h=2*2/length(pollution$mort))
p<-locator()
par(mfrow=c(2,1))
plot(lmm$coeff[,2], ylab="change in nox slope")
abline(h=0)
plot(lmm$sigma,ylab="Residual MSE when obs is dropped")
abline(h=mod2s$sig)
p<-locator()
# worried about 29, 47,48,49 in terms of slope, and 37 in terms of MSE
#
par(mfrow=c(2,1))
plot(log(polldata$nox),polldata$mort)
identify(log(polldata$nox),polldata$mort)
plot((polldata$educ),polldata$mort)
identify((polldata$educ),polldata$mort)
p<-locator()
# 37 seems to be a "pure outlier", 29 and 59 influential outliers
# 
# start with those
mod3<-lm(mort~educ+log(nox),data=polldata,subset=-c(29,59))
par(mfrow=c(2,2))
plot(mod3)
mod3s<-summary(mod3)
print(mod3s)
p<-locator()
#
par(mfrow=c(2,2))
lmm<-lm.influence(mod3)
e.std<-mod3$res/sqrt(1-lmm$hat)
plot(mod3$fit,e.std,main="Residuals (standardized) on fitted")
abline(h=0)
qqnorm(e.std,main="residuals normal?")
qqline(e.std)
plot(polldata$educ[-c(29,59)],e.std,main="Residuals on education")
abline(h=0)
plot(log(polldata$nox)[-c(29,59)],e.std,main="Residuals on log(nox)")
abline(h=0)
p<-locator()
#
#
par(mfrow=c(1,1))
plot(lmm$hat,xlab="", ylab="Leverage",type="h")
abline(h=3*2/length(pollution$mort))
p<-locator()
par(mfrow=c(3,1))
plot(lmm$coeff[,2], ylab="change in educ slope")
abline(h=0)
plot(lmm$coeff[,3], ylab="change in nox slope")
abline(h=0)
plot(lmm$sigma,ylab="Residual MSE when obs is dropped")
abline(h=mod3s$sig)
p<-locator()
# 28 impacts educ slop, 47-49 impact nox slope
#
# try dropping these (can add and drop combos as well - try at home)
#
mod3<-lm(mort~educ+log(nox),data=polldata,subset=-c(28,29,37,47,48,49,59))
par(mfrow=c(2,2))
plot(mod3)
mod3s<-summary(mod3)
print(mod3s)
p<-locator()
#
par(mfrow=c(2,2))
lmm<-lm.influence(mod3)
e.std<-mod3$res/sqrt(1-lmm$hat)
plot(mod3$fit,e.std,main="Residuals (standardized) on fitted")
abline(h=0)
qqnorm(e.std,main="residuals normal?")
qqline(e.std)
plot(polldata$educ[-c(28,29,37,47,48,49,59)],e.std,main="Residuals on education")
abline(h=0)
plot(log(polldata$nox)[-c(28,29,37,47,48,49,59)],e.std,main="Residuals on log(nox)")
abline(h=0)
p<-locator()
# now 28, 49 and 37 look bad
#
par(mfrow=c(1,1))
plot(lmm$hat,xlab="", ylab="Leverage",type="h")
abline(h=3*2/length(pollution$mort))
p<-locator()
par(mfrow=c(3,1))
plot(lmm$coeff[,2], ylab="change in educ slope")
abline(h=0)
plot(lmm$coeff[,3], ylab="change in nox slope")
abline(h=0)
plot(lmm$sigma,ylab="Residual MSE when obs is dropped")
abline(h=mod3s$sig)
p<-locator()
#
#
# Let's do the CI of the coefficient estimates
mod3s<-summary(mod3)
SEbeta<-mod3s$coef[,2]
CI<-cbind(mod3s$coef[,1]-qt(.975,mod3$df)*SEbeta,mod3s$coef[,1]+qt(.975,mod3$df)*SEbeta)
print("Coefficient CI")
print(CI)
# None of the intervals cover 0, so none are rejected
# Caveat - these are separate CI for each parameter, not a joint evaluation.
#
#
# In terms of significance nox have the larger
# p-values, though they're still small so doesn't indicate the coefficient
# should be dropped. Check this with an F-test
p<-locator()
mod3b<-update(mod3,.~.-log(nox))
# dropping the intercept
print(anova(mod3,mod3b))
p<-locator()
# No, cannot drop the nox 
#
#
# potatodata
mod1<-lm(weight~length+breadth,data=potato)
par(mfrow=c(2,2))
plot(mod1)
print(summary(mod1))
p<-locator()
#
B<-100
Coefmata<-matrix(0,B,3)
Coefmatb<-matrix(0,B,2)
for (k in (1:B)) { 
newweight<-potato$weight+rnorm(18,sd=4)
moda2<-lm(newweight ~ potato$length+potato$breadth)
modb2<-lm(newweight ~ potato$length)
Coefmata[k,]<-moda2$coef 
Coefmatb[k,]<-modb2$coef  }
#
par(mfrow=c(1,1))
plot(Coefmata[,2],Coefmata[,3],xlab="length coef",ylab="breadth coef")
p<-locator()
par(mfrow=c(1,1))
boxplot(as.data.frame(cbind(Coefmata[,2],Coefmatb[,2])),names=c("length full","length reduced"))
p<-locator()
boxplot(as.data.frame(cbind(Coefmata[,2],Coefmata[,3],Coefmata[,2]+Coefmata[,3])),names=c("length coef", "breadth coef","sum"))