# Here's my code for lab 2 # Only the key steps are included # print(names(polldata)) par(mfrow=c(2,2)) hist(polldata$mort) qqnorm(polldata$mort) qqline(polldata$mort) plot(polldata$educ,polldata$mort,main="Mortality vs education") plot(polldata$nox,polldata$mort,main="Mortality vs nox") p<-locator() # mort vs educ OK, perhaps a few outliers # nox not OK plot(polldata$educ,polldata$mort,main="Mortality vs education") plot(log(polldata$nox),polldata$mort,main="Mortality vs nox") p<-locator() # logs help - but outliers present # mod1<-lm(mort~educ,data=polldata) print(summary(mod1)) mod1s<-summary(mod1) par(mfrow=c(2,2)) plot(mod1) p<-locator() # some diagnostic plots par(mfrow=c(2,2)) lmm<-lm.influence(mod1) e.std<-mod1$res/sqrt(1-lmm$hat) plot(mod1$fit,e.std,main="Residuals (standardized) on fitted") abline(h=0) qqnorm(e.std,main="residuals normal?") qqline(e.std) plot(polldata$educ,e.std,main="Residuals on education") abline(h=0) plot(log(polldata$nox),e.std,main="Residuals on log(nox)") abline(h=0) p<-locator() # par(mfrow=c(1,1)) plot(lmm$hat,xlab="Duration", ylab="Leverage",type="h") abline(h=2*2/length(pollution$mort)) p<-locator() par(mfrow=c(2,1)) plot(lmm$coeff[,2], ylab="change in educ slope") abline(h=0) plot(lmm$sigma,ylab="Residual MSE when obs is dropped") abline(h=mod1s$sig) p<-locator() # worried about 28, 37 and 59. # # # # Now try nox mod2<-lm(mort~log(nox),data=polldata) print(summary(mod2)) mod2s<-summary(mod2) par(mfrow=c(2,2)) plot(mod2) p<-locator() # some diagnostic plots par(mfrow=c(2,2)) lmm<-lm.influence(mod2) e.std<-mod2$res/sqrt(1-lmm$hat) plot(mod2$fit,e.std,main="Residuals (standardized) on fitted") abline(h=0) qqnorm(e.std,main="residuals normal?") qqline(e.std) plot(polldata$educ,e.std,main="Residuals on education") abline(h=0) plot(log(polldata$nox),e.std,main="Residuals on log(nox)") abline(h=0) p<-locator() # par(mfrow=c(1,1)) plot(lmm$hat,xlab="", ylab="Leverage",type="h") abline(h=2*2/length(pollution$mort)) p<-locator() par(mfrow=c(2,1)) plot(lmm$coeff[,2], ylab="change in nox slope") abline(h=0) plot(lmm$sigma,ylab="Residual MSE when obs is dropped") abline(h=mod2s$sig) p<-locator() # worried about 29, 47,48,49 in terms of slope, and 37 in terms of MSE # par(mfrow=c(2,1)) plot(log(polldata$nox),polldata$mort) identify(log(polldata$nox),polldata$mort) plot((polldata$educ),polldata$mort) identify((polldata$educ),polldata$mort) p<-locator() # 37 seems to be a "pure outlier", 29 and 59 influential outliers # # start with those mod3<-lm(mort~educ+log(nox),data=polldata,subset=-c(29,59)) par(mfrow=c(2,2)) plot(mod3) mod3s<-summary(mod3) print(mod3s) p<-locator() # par(mfrow=c(2,2)) lmm<-lm.influence(mod3) e.std<-mod3$res/sqrt(1-lmm$hat) plot(mod3$fit,e.std,main="Residuals (standardized) on fitted") abline(h=0) qqnorm(e.std,main="residuals normal?") qqline(e.std) plot(polldata$educ[-c(29,59)],e.std,main="Residuals on education") abline(h=0) plot(log(polldata$nox)[-c(29,59)],e.std,main="Residuals on log(nox)") abline(h=0) p<-locator() # # par(mfrow=c(1,1)) plot(lmm$hat,xlab="", ylab="Leverage",type="h") abline(h=3*2/length(pollution$mort)) p<-locator() par(mfrow=c(3,1)) plot(lmm$coeff[,2], ylab="change in educ slope") abline(h=0) plot(lmm$coeff[,3], ylab="change in nox slope") abline(h=0) plot(lmm$sigma,ylab="Residual MSE when obs is dropped") abline(h=mod3s$sig) p<-locator() # 28 impacts educ slop, 47-49 impact nox slope # # try dropping these (can add and drop combos as well - try at home) # mod3<-lm(mort~educ+log(nox),data=polldata,subset=-c(28,29,37,47,48,49,59)) par(mfrow=c(2,2)) plot(mod3) mod3s<-summary(mod3) print(mod3s) p<-locator() # par(mfrow=c(2,2)) lmm<-lm.influence(mod3) e.std<-mod3$res/sqrt(1-lmm$hat) plot(mod3$fit,e.std,main="Residuals (standardized) on fitted") abline(h=0) qqnorm(e.std,main="residuals normal?") qqline(e.std) plot(polldata$educ[-c(28,29,37,47,48,49,59)],e.std,main="Residuals on education") abline(h=0) plot(log(polldata$nox)[-c(28,29,37,47,48,49,59)],e.std,main="Residuals on log(nox)") abline(h=0) p<-locator() # now 28, 49 and 37 look bad # par(mfrow=c(1,1)) plot(lmm$hat,xlab="", ylab="Leverage",type="h") abline(h=3*2/length(pollution$mort)) p<-locator() par(mfrow=c(3,1)) plot(lmm$coeff[,2], ylab="change in educ slope") abline(h=0) plot(lmm$coeff[,3], ylab="change in nox slope") abline(h=0) plot(lmm$sigma,ylab="Residual MSE when obs is dropped") abline(h=mod3s$sig) p<-locator() # # # Let's do the CI of the coefficient estimates mod3s<-summary(mod3) SEbeta<-mod3s$coef[,2] CI<-cbind(mod3s$coef[,1]-qt(.975,mod3$df)*SEbeta,mod3s$coef[,1]+qt(.975,mod3$df)*SEbeta) print("Coefficient CI") print(CI) # None of the intervals cover 0, so none are rejected # Caveat - these are separate CI for each parameter, not a joint evaluation. # # # In terms of significance nox have the larger # p-values, though they're still small so doesn't indicate the coefficient # should be dropped. Check this with an F-test p<-locator() mod3b<-update(mod3,.~.-log(nox)) # dropping the intercept print(anova(mod3,mod3b)) p<-locator() # No, cannot drop the nox # # # potatodata mod1<-lm(weight~length+breadth,data=potato) par(mfrow=c(2,2)) plot(mod1) print(summary(mod1)) p<-locator() # B<-100 Coefmata<-matrix(0,B,3) Coefmatb<-matrix(0,B,2) for (k in (1:B)) { newweight<-potato$weight+rnorm(18,sd=4) moda2<-lm(newweight ~ potato$length+potato$breadth) modb2<-lm(newweight ~ potato$length) Coefmata[k,]<-moda2$coef Coefmatb[k,]<-modb2$coef } # par(mfrow=c(1,1)) plot(Coefmata[,2],Coefmata[,3],xlab="length coef",ylab="breadth coef") p<-locator() par(mfrow=c(1,1)) boxplot(as.data.frame(cbind(Coefmata[,2],Coefmatb[,2])),names=c("length full","length reduced")) p<-locator() boxplot(as.data.frame(cbind(Coefmata[,2],Coefmata[,3],Coefmata[,2]+Coefmata[,3])),names=c("length coef", "breadth coef","sum"))