# The DATA# # sleeptab<-read.table('sleeptab',header=T) # 1 - list the variables and data entries bwt<-sleeptab$bwt brwt<-sleeptab$brwt print(sleeptab$Spec) print(names(sleeptab)) print(sleeptab[1:9,-c(4,5,6)]) ## Histograms # 2 par(mfrow=c(1,2)) hist(bwt,20,xlab="Body weight, kg") hist(brwt,20,xlab="Brain weight, g") p<-locator() # p<-locator() halts the execution until R receives input from you: # Right-click in the graphics window and choose "STOP" in order to continue. ## Data transforms # 3 par(mfrow=c(1,2)) hist(log(bwt),20,xlab="Body weight, kg") hist(log(brwt),20,xlab="Brain weight, g") p<-locator() # # Boxplots & Barcharts par(mfrow=c(1,1)) boxplot(log(brwt)~sleeptab$pred,main="Boxplot brwt on predator index") # # Data summaries # 4 print("Fivenumber summary of log body weight") print(fivenum(log(bwt))) print("Fivenumber summary of log brain weight") print(fivenum(log(brwt))) p<-locator() ## Robustness # 5 #print("Robustness") #bwt2<-bwt #bwt2[1]<-bwt2[1]/1000 #print("Fivenumber summary of log bwt contaminated") #print(fivenum(log(bwt2))) # 6 #p<-locator() #print("What about the mean and SD?") #print(c("original data:","mean=",round(mean(log(bwt)),3),"SD=",round(sd(log(bwt)),3))) #print(c("contaminated data:","mean=",round(mean(log(bwt2)),3),"SD=",round(sd(log(bwt2)),3))) # NOTE: the mean and sd are much more affected by a contamination of the data # than the five number summaries. # # # 7 #p<-locator() ## QQ plots# short tails compared to normal# par(mfrow=c(1,1)) qqnorm(log(brwt),main="QQplot of log brain weight") qqline(log(brwt)) p<-locator() ### scatter plots## gestation<-sleeptab$gest lifespan<-sleeptab$life danger<-sleeptab$dang dream<-sleeptab$dream totsleep<-sleeptab$totsleep ## Lifespan on gestation # 8 - To highlight points in the graph, left-click on them. Then, right-click # to "stop" and proceed with the next set of commands. par(mfrow=c(1,1)) plot(gestation[gestation!=-999 & lifespan!=-999],lifespan[gestation!=-999 & lifespan!=-999], main="Lifespan on Gestation") identify(gestation[gestation!=-999 & lifespan!=-999],lifespan[gestation!=-999 & lifespan!=-999],sleeptab$Spec[gestation!=-999 & lifespan!=-999]) p<-locator() ## brain wt on gestation # 9 plot(log(brwt)[gestation!=-999],gestation[gestation!=-999],main="Gestation on Brain weight") identify(log(brwt)[gestation!=-999],gestation[gestation!=-999],sleeptab$Spec[gestation!=-999]) p<-locator() # plot(log(brwt)[gestation!=-999],sqrt(gestation[gestation!=-999]),main="SQRT gestation on Brain weight") identify(log(brwt)[gestation!=-999],sqrt(gestation[gestation!=-999]),sleeptab$Spec[gestation!=-999]) p<-locator() # plot(log(brwt)[gestation!=-999],(gestation[gestation!=-999])^(1/3),main="3-rd root gestation on Brain weight") identify(log(brwt)[gestation!=-999],(gestation[gestation!=-999])^(1/3),sleeptab$Spec[gestation!=-999]) p<-locator() # ## brain wt on body wt # 10 plot(log(bwt),log(brwt),main="Brain weight on Body Weight") identify(log(bwt),log(brwt),sleeptab$Spec) p<-locator() ## sleep on dream # 11 plot(totsleep[dream!=-999 & totsleep!=-999],dream[dream!=-999 & totsleep!=-999],main="Dream hours on Sleep hours") identify(totsleep[dream!=-999 & totsleep!=-999],dream[dream!=-999 & totsleep!=-999],sleeptab$Spec[dream!=-999 & totsleep!=-999]) p<-locator() ## Regression # Brain weight on Body weight # scatter plots, correlation# par(mfrow=c(1,2)) plot((bwt),(brwt),xlab="Body weight",ylab="Brain weigh") plot(log(bwt),log(brwt),xlab="Body weight (log)",ylab="Brain weigh (log)") p<-locator() print(c("Correlation coefficient (logs first)",cor(log(bwt),log(brwt)))) p<-locator() # # # # ## Linear models # Regression line, fitted# par(mfrow=c(1,1)) plot(log(bwt),log(brwt),xlab="Body weight (log)",ylab="Brain weigh (log)") mod1<-lm(log(brwt)~log(bwt)) mod1o<-mod1 print(summary(mod1)) lines(log(bwt),mod1$fitted) p<-locator() ## Regression diagnostics # residual plots# par(mfrow=c(1,1)) plot(log(bwt),mod1$res,xlab="Body weight (log)",ylab="Residuals",main="Diagnostic plots") p<-locator() plot(mod1$fitted,mod1$res,xlab="Fitted values",ylab="Residuals") p<-locator() qqnorm(mod1$residuals) qqline(mod1$residual) p<-locator() ## Leverage# lmi<-lm.influence(mod1) par(mfrow=c(1,1)) plot(log(bwt),lmi$hat,xlab="Body weight (log)", ylab="Leverage") abline(h=3/length(bwt)) p<-locator() plot(log(bwt),lmi$coeff[,2],xlab="Body weight (log)", ylab="change in slope") p<-locator() plot(log(bwt),lmi$sigma,xlab="Body weight (log)", ylab="Residual MSE when obs is dropped") p<-locator() lmi<-lm.influence(mod1) par(mfrow=c(1,1)) plot(lmi$hat, ylab="Leverage",type="h") abline(h=3/length(bwt)) p<-locator() # resstd<-mod1$res/(summary(mod1)$sigma*sqrt(1-lmi$hat)) resstud <- mod1$res*sqrt((mod1$df-1)/(sum(mod1$res^2)*(1-lmi$hat)-mod1$res^2)) cooksd<-resstud^2/(length(mod1$coef)*sum(mod1$res^2)/mod1$df) par(mfrow=c(1,1)) plot(cooksd,main="Cook's Distance",type="h") abline(h=qf(.95,1,mod1$df),lty=2) print(sleeptab[cooksd>qf(.95,1,mod1$df),]) p<-locator() plot(log(bwt),log(brwt),xlab="Body weights", ylab="brwt") lines(log(bwt),mod1$fitted) points(log(bwt)[cooksd>qf(.95,1,mod1$df)],log(brwt)[cooksd>qf(.95,1,mod1$df)],pch=3,col=2) p<-locator() # Cook's D is a way of combining both leverage and residual size into one # measure of "outlyingness". # Cook's D = residual^2 * (leverage/(1-leverage)^2) * 1/(p*MSE) # which is essentially the sum of the squared studentized residuals # normalized by p*MSE # If there are no outliers, and errors are normal, Cook's D follows a # F-distribution on p and n-p degrees of freedom # ## Contamination# par(mfrow=c(1,1)) plot(log(bwt),c(log(250000),log(brwt[2:62])),xlab="Body weight (log)",ylab="Brain weight (log)",main="contaminated data") mod1<-lm(c(log(250000),log(brwt[2:62]))~log(bwt)) print(summary(mod1)) lines(log(bwt),mod1$fitted) lines(log(bwt),mod1o$fitted,col='red',lty=2) p<-locator() ## Leverage again# lmi<-lm.influence(mod1) par(mfrow=c(1,1)) plot(log(bwt),lmi$hat,xlab="Body weight (log)", ylab="Leverage") abline(h=2/length(bwt)) p<-locator() plot(log(bwt),lmi$coeff[,2],xlab="Body weight (log)", ylab="change in slope") p<-locator() plot(log(bwt),lmi$sigma,xlab="Body weight (log)", ylab="Residual MSE when obs is dropped") p<-locator() lmi<-lm.influence(mod1) par(mfrow=c(1,1)) plot(lmi$hat, ylab="Leverage",type="h") abline(h=3/length(bwt)) p<-locator() # resstd<-mod1$res/(summary(mod1)$sigma*sqrt(1-lmi$hat)) resstud <- mod1$res*sqrt((mod1$df-1)/(sum(mod1$res^2)*(1-lmi$hat)-mod1$res^2)) cooksd<-resstud^2/(length(mod1$coef)*sum(mod1$res^2)/mod1$df) par(mfrow=c(1,1)) plot(cooksd,main="Cook's Distance",type="h") abline(h=qf(.95,1,mod1$df),lty=2) print(sleeptab[cooksd>qf(.95,1,mod1$df),]) p<-locator() plot(log(bwt),c(log(250000),log(brwt[2:62])),xlab="Body weights", ylab="brwt") lines(log(bwt),mod1$fitted) points(log(bwt)[cooksd>qf(.95,1,mod1$df)],c(log(250000),log(brwt[2:62]))[cooksd>qf(.95,1,mod1$df)],pch=3,col=2) p<-locator() # ## Extreme leverage# par(mfrow=c(1,1)) newx<-14 newy<-mod1o$coef[1]+mod1o$coef[2]*newx plot(c(newx,log(bwt)[2:62]),c(newy,log(brwt)[2:62]),xlab="Body weight (log)",ylab="Brain weight (log)",main="contaminated data") mod1<-lm(c(newy,log(brwt)[2:62])~c(newx,log(bwt)[2:62])) print(summary(mod1)) lines(c(newx,log(bwt)[2:62]),mod1$fitted) lines(log(bwt),mod1o$fitted,col='red',lty=2) p<-locator() ## Leverage again# lmi<-lm.influence(mod1) par(mfrow=c(1,1)) plot(c(14,log(bwt)[2:62]),lmi$hat,xlab="Body weight (log)", ylab="Leverage") abline(h=2/length(bwt)) p<-locator() plot(c(14,log(bwt)[2:62]),lmi$coeff[,2],xlab="Body weight (log)", ylab="change in slope") p<-locator() plot(c(14,log(bwt)[2:62]),lmi$sigma,xlab="Body weight (log)", ylab="Residual MSE when obs is dropped") p<-locator() lmi<-lm.influence(mod1) par(mfrow=c(1,1)) plot(lmi$hat, ylab="Leverage",type="h") abline(h=3/length(bwt)) p<-locator() # resstd<-mod1$res/(summary(mod1)$sigma*sqrt(1-lmi$hat)) resstud <- mod1$res*sqrt((mod1$df-1)/(sum(mod1$res^2)*(1-lmi$hat)-mod1$res^2)) cooksd<-resstud^2/(length(mod1$coef)*sum(mod1$res^2)/mod1$df) par(mfrow=c(1,1)) plot(cooksd,main="Cook's Distance",type="h") abline(h=qf(.95,1,mod1$df),lty=2) print(sleeptab[cooksd>qf(.95,1,mod1$df),]) p<-locator() plot(c(newx,log(bwt)[2:62]),c(newy,log(brwt[2:62])),xlab="Body weights", ylab="brwt") lines(c(newx,log(bwt)[2:62]),mod1$fitted) points(c(newx,log(bwt)[2:62])[cooksd>qf(.95,1,mod1$df)],c(newy,log(brwt[2:62]))[cooksd>qf(.95,1,mod1$df)],pch=3,col=2) p<-locator() # # # # Below I create a data set with more noise to make curvature # of the variance of the fitted values easier to spot. mod1s<-summary(mod1) nbrwt <- exp(log(brwt) + rnorm(length(brwt), sd=.5*mod1s$sig)) mod1b <- lm(log(nbrwt)~log(bwt)) mod1bs <- summary(mod1b) # par(mfrow=c(1,1)) plot(log(bwt),log(nbrwt),xlab="Body weight (log)",ylab="Brain weigh (log)") mod1<-lm(log(nbrwt)~log(bwt)) mod1s <- summary(mod1b) print(summary(mod1b)) cc <- seq(min(log(bwt))-1,max(log(bwt))+1,by=.01) aa <- (cc-mean(cc))^2 ba <- aa/sum(aa) ciwi <- mod1bs$sig*sqrt(1/length(brwt)+ba) dd <- mod1b$coef[1]+mod1b$coef[2]*cc lines(log(bwt),mod1b$fitted) lines(cc,(dd+qt(.975,length(brwt)-2)*ciwi)) lines(cc,(dd-qt(.975,length(brwt)-2)*ciwi)) p<-locator() # # Here I sample 50 data sets from the same true model. For each new data set # I estimate a new regression line and draw it in red in the graphics window. # Notice how stable the fitted values are in the mid-range of x, and how much # variable they are in the high-leverage regions. # Right-click STOP in the graphics window for each simulation to display. B<-50 for (b in (1:B)) { sa <- sample(seq(1,length(brwt)),length(brwt),replace=T) mm <- lm(log(nbrwt)~log(bwt),subset=sa) ms <-summary(mm) lines(log(bwt[sa]),mm$fit,col='red',lty=2) if (b<5) { p <- locator() } } p<-locator() # lines(cc,(dd+qt(.975,length(brwt)-2)*sqrt(mod1bs$sig^2+ciwi^2))) lines(cc,(dd-qt(.975,length(brwt)-2)*sqrt(mod1bs$sig^2+ciwi^2))) p<-locator() # B<-50 for (b in (1:B)) { xnew <- sample(log(bwt),1) ynew <- mod1b$coef[1]+mod1b$coef[2]*xnew+rnorm(1,sd=mod1bs$sig) points(xnew,ynew,col='blue',pch=2) } p<-locator() #