# The DATA#
# sleeptab<-read.table('sleeptab',header=T)
# 1 - list the variables and data entries
bwt<-sleeptab$bwt
brwt<-sleeptab$brwt
print(sleeptab$Spec)
print(names(sleeptab))
print(sleeptab[1:9,-c(4,5,6)])
## Histograms
# 2
par(mfrow=c(1,2))
hist(bwt,20,xlab="Body weight, kg")
hist(brwt,20,xlab="Brain weight, g")
p<-locator()
# p<-locator() halts the execution until R receives input from you:
# Right-click in the graphics window and choose "STOP" in order to continue.
## Data transforms 
# 3
par(mfrow=c(1,2))
hist(log(bwt),20,xlab="Body weight, kg")
hist(log(brwt),20,xlab="Brain weight, g")
p<-locator()
#
# Boxplots & Barcharts
par(mfrow=c(1,1))
boxplot(log(brwt)~sleeptab$pred,main="Boxplot brwt on predator index")
#
# Data summaries
# 4
print("Fivenumber summary of log body weight")
print(fivenum(log(bwt)))
print("Fivenumber summary of log brain weight")
print(fivenum(log(brwt)))
p<-locator()
## Robustness
# 5
#print("Robustness")
#bwt2<-bwt
#bwt2[1]<-bwt2[1]/1000
#print("Fivenumber summary of log bwt contaminated")
#print(fivenum(log(bwt2)))
# 6
#p<-locator()
#print("What about the mean and SD?")
#print(c("original data:","mean=",round(mean(log(bwt)),3),"SD=",round(sd(log(bwt)),3)))
#print(c("contaminated data:","mean=",round(mean(log(bwt2)),3),"SD=",round(sd(log(bwt2)),3)))
# NOTE: the mean and sd are much more affected by a contamination of the data
# than the five number summaries.
#
#
# 7
#p<-locator()
## QQ plots# short tails compared to normal#
par(mfrow=c(1,1))
qqnorm(log(brwt),main="QQplot of log brain weight")
qqline(log(brwt))
p<-locator()
### scatter plots##
gestation<-sleeptab$gest
lifespan<-sleeptab$life
danger<-sleeptab$dang
dream<-sleeptab$dream
totsleep<-sleeptab$totsleep
## Lifespan on gestation
# 8 - To highlight points in the graph, left-click on them. Then, right-click
# to "stop" and proceed with the next set of commands. 
par(mfrow=c(1,1))
plot(gestation[gestation!=-999 & lifespan!=-999],lifespan[gestation!=-999 & lifespan!=-999], main="Lifespan on Gestation")
identify(gestation[gestation!=-999 & lifespan!=-999],lifespan[gestation!=-999 & lifespan!=-999],sleeptab$Spec[gestation!=-999 & lifespan!=-999])
p<-locator()
## brain wt on gestation
# 9
plot(log(brwt)[gestation!=-999],gestation[gestation!=-999],main="Gestation on Brain weight")
identify(log(brwt)[gestation!=-999],gestation[gestation!=-999],sleeptab$Spec[gestation!=-999])
p<-locator()
#
plot(log(brwt)[gestation!=-999],sqrt(gestation[gestation!=-999]),main="SQRT gestation on Brain weight")
identify(log(brwt)[gestation!=-999],sqrt(gestation[gestation!=-999]),sleeptab$Spec[gestation!=-999])
p<-locator()
#
plot(log(brwt)[gestation!=-999],(gestation[gestation!=-999])^(1/3),main="3-rd root gestation on Brain weight")
identify(log(brwt)[gestation!=-999],(gestation[gestation!=-999])^(1/3),sleeptab$Spec[gestation!=-999])
p<-locator()
#
## brain wt on body wt
# 10
plot(log(bwt),log(brwt),main="Brain weight on Body Weight")
identify(log(bwt),log(brwt),sleeptab$Spec)
p<-locator()
## sleep on dream
# 11
plot(totsleep[dream!=-999 & totsleep!=-999],dream[dream!=-999 & totsleep!=-999],main="Dream hours on Sleep hours")
identify(totsleep[dream!=-999 & totsleep!=-999],dream[dream!=-999 & totsleep!=-999],sleeptab$Spec[dream!=-999 & totsleep!=-999])
p<-locator()
## Regression
# Brain weight on Body weight
# scatter plots, correlation#
par(mfrow=c(1,2))
plot((bwt),(brwt),xlab="Body weight",ylab="Brain weigh")
plot(log(bwt),log(brwt),xlab="Body weight (log)",ylab="Brain weigh (log)")
p<-locator()
print(c("Correlation coefficient (logs first)",cor(log(bwt),log(brwt))))
p<-locator()
#
#
#
#
## Linear models
# Regression line, fitted#
par(mfrow=c(1,1))
plot(log(bwt),log(brwt),xlab="Body weight (log)",ylab="Brain weigh (log)")
mod1<-lm(log(brwt)~log(bwt))
mod1o<-mod1
print(summary(mod1))
lines(log(bwt),mod1$fitted)
p<-locator()
## Regression diagnostics
# residual plots#
par(mfrow=c(1,1))
plot(log(bwt),mod1$res,xlab="Body weight (log)",ylab="Residuals",main="Diagnostic plots")
p<-locator()
plot(mod1$fitted,mod1$res,xlab="Fitted values",ylab="Residuals")
p<-locator()
qqnorm(mod1$residuals)
qqline(mod1$residual)
p<-locator()
## Leverage#
lmi<-lm.influence(mod1)
par(mfrow=c(1,1))
plot(log(bwt),lmi$hat,xlab="Body weight (log)", ylab="Leverage")
abline(h=3/length(bwt))
p<-locator()
plot(log(bwt),lmi$coeff[,2],xlab="Body weight (log)", ylab="change in slope")
p<-locator()
plot(log(bwt),lmi$sigma,xlab="Body weight (log)", ylab="Residual MSE when obs is dropped")
p<-locator()
lmi<-lm.influence(mod1)
par(mfrow=c(1,1))
plot(lmi$hat, ylab="Leverage",type="h")
abline(h=3/length(bwt))
p<-locator()
#
resstd<-mod1$res/(summary(mod1)$sigma*sqrt(1-lmi$hat))
resstud <- mod1$res*sqrt((mod1$df-1)/(sum(mod1$res^2)*(1-lmi$hat)-mod1$res^2))
cooksd<-resstud^2/(length(mod1$coef)*sum(mod1$res^2)/mod1$df)
par(mfrow=c(1,1))
plot(cooksd,main="Cook's Distance",type="h")
abline(h=qf(.95,1,mod1$df),lty=2)
print(sleeptab[cooksd>qf(.95,1,mod1$df),])
p<-locator()
plot(log(bwt),log(brwt),xlab="Body weights", ylab="brwt")
lines(log(bwt),mod1$fitted)
points(log(bwt)[cooksd>qf(.95,1,mod1$df)],log(brwt)[cooksd>qf(.95,1,mod1$df)],pch=3,col=2)
p<-locator()
# Cook's D is a way of combining both leverage and residual size into one 
# measure of "outlyingness".
# Cook's D = residual^2 * (leverage/(1-leverage)^2) * 1/(p*MSE)
# which is essentially the sum of the squared studentized residuals 
# normalized by p*MSE
# If there are no outliers, and errors are normal, Cook's D follows a
# F-distribution on p and n-p degrees of freedom
#
## Contamination#
par(mfrow=c(1,1))
plot(log(bwt),c(log(250000),log(brwt[2:62])),xlab="Body weight (log)",ylab="Brain weight (log)",main="contaminated data")
mod1<-lm(c(log(250000),log(brwt[2:62]))~log(bwt))
print(summary(mod1))
lines(log(bwt),mod1$fitted)
lines(log(bwt),mod1o$fitted,col='red',lty=2)
p<-locator()
## Leverage again#
lmi<-lm.influence(mod1)
par(mfrow=c(1,1))
plot(log(bwt),lmi$hat,xlab="Body weight (log)", ylab="Leverage")
abline(h=2/length(bwt))
p<-locator()
plot(log(bwt),lmi$coeff[,2],xlab="Body weight (log)", ylab="change in slope")
p<-locator()
plot(log(bwt),lmi$sigma,xlab="Body weight (log)", ylab="Residual MSE when obs is dropped")
p<-locator()
lmi<-lm.influence(mod1)
par(mfrow=c(1,1))
plot(lmi$hat, ylab="Leverage",type="h")
abline(h=3/length(bwt))
p<-locator()
#
resstd<-mod1$res/(summary(mod1)$sigma*sqrt(1-lmi$hat))
resstud <- mod1$res*sqrt((mod1$df-1)/(sum(mod1$res^2)*(1-lmi$hat)-mod1$res^2))
cooksd<-resstud^2/(length(mod1$coef)*sum(mod1$res^2)/mod1$df)
par(mfrow=c(1,1))
plot(cooksd,main="Cook's Distance",type="h")
abline(h=qf(.95,1,mod1$df),lty=2)
print(sleeptab[cooksd>qf(.95,1,mod1$df),])
p<-locator()
plot(log(bwt),c(log(250000),log(brwt[2:62])),xlab="Body weights", ylab="brwt")
lines(log(bwt),mod1$fitted)
points(log(bwt)[cooksd>qf(.95,1,mod1$df)],c(log(250000),log(brwt[2:62]))[cooksd>qf(.95,1,mod1$df)],pch=3,col=2)
p<-locator()
#
## Extreme leverage#
par(mfrow=c(1,1))
newx<-14
newy<-mod1o$coef[1]+mod1o$coef[2]*newx
plot(c(newx,log(bwt)[2:62]),c(newy,log(brwt)[2:62]),xlab="Body weight (log)",ylab="Brain weight (log)",main="contaminated data")
mod1<-lm(c(newy,log(brwt)[2:62])~c(newx,log(bwt)[2:62]))
print(summary(mod1))
lines(c(newx,log(bwt)[2:62]),mod1$fitted)
lines(log(bwt),mod1o$fitted,col='red',lty=2)
p<-locator()
## Leverage again#
lmi<-lm.influence(mod1)
par(mfrow=c(1,1))
plot(c(14,log(bwt)[2:62]),lmi$hat,xlab="Body weight (log)", ylab="Leverage")
abline(h=2/length(bwt))
p<-locator()
plot(c(14,log(bwt)[2:62]),lmi$coeff[,2],xlab="Body weight (log)", ylab="change in slope")
p<-locator()
plot(c(14,log(bwt)[2:62]),lmi$sigma,xlab="Body weight (log)", ylab="Residual MSE when obs is dropped")
p<-locator()
lmi<-lm.influence(mod1)
par(mfrow=c(1,1))
plot(lmi$hat, ylab="Leverage",type="h")
abline(h=3/length(bwt))
p<-locator()
#
resstd<-mod1$res/(summary(mod1)$sigma*sqrt(1-lmi$hat))
resstud <- mod1$res*sqrt((mod1$df-1)/(sum(mod1$res^2)*(1-lmi$hat)-mod1$res^2))
cooksd<-resstud^2/(length(mod1$coef)*sum(mod1$res^2)/mod1$df)
par(mfrow=c(1,1))
plot(cooksd,main="Cook's Distance",type="h")
abline(h=qf(.95,1,mod1$df),lty=2)
print(sleeptab[cooksd>qf(.95,1,mod1$df),])
p<-locator()
plot(c(newx,log(bwt)[2:62]),c(newy,log(brwt[2:62])),xlab="Body weights", ylab="brwt")
lines(c(newx,log(bwt)[2:62]),mod1$fitted)
points(c(newx,log(bwt)[2:62])[cooksd>qf(.95,1,mod1$df)],c(newy,log(brwt[2:62]))[cooksd>qf(.95,1,mod1$df)],pch=3,col=2)
p<-locator()
#
#
#
# Below I create a data set with more noise to make curvature 
# of the variance of the fitted values easier to spot. 
mod1s<-summary(mod1)
nbrwt <- exp(log(brwt) + rnorm(length(brwt), sd=.5*mod1s$sig))
mod1b <- lm(log(nbrwt)~log(bwt))
mod1bs <- summary(mod1b)
#
par(mfrow=c(1,1))
plot(log(bwt),log(nbrwt),xlab="Body weight (log)",ylab="Brain weigh (log)")
mod1<-lm(log(nbrwt)~log(bwt))
mod1s <- summary(mod1b)
print(summary(mod1b))
cc <- seq(min(log(bwt))-1,max(log(bwt))+1,by=.01)
aa <- (cc-mean(cc))^2
ba <- aa/sum(aa)
ciwi <- mod1bs$sig*sqrt(1/length(brwt)+ba)
dd <- mod1b$coef[1]+mod1b$coef[2]*cc
lines(log(bwt),mod1b$fitted)
lines(cc,(dd+qt(.975,length(brwt)-2)*ciwi))
lines(cc,(dd-qt(.975,length(brwt)-2)*ciwi))
p<-locator()
#
# Here I sample 50 data sets from the same true model. For each new data set
# I estimate a new regression line and draw it in red in the graphics window.
# Notice how stable the fitted values are in the mid-range of x, and how much
# variable they are in the high-leverage regions. 
# Right-click STOP in the graphics window for each simulation to display. 
B<-50
for (b in (1:B)) {
  sa <- sample(seq(1,length(brwt)),length(brwt),replace=T)
  mm <- lm(log(nbrwt)~log(bwt),subset=sa)
  ms <-summary(mm)
  lines(log(bwt[sa]),mm$fit,col='red',lty=2) 
  if (b<5) { p <- locator() }
   }
p<-locator()
#
lines(cc,(dd+qt(.975,length(brwt)-2)*sqrt(mod1bs$sig^2+ciwi^2)))
lines(cc,(dd-qt(.975,length(brwt)-2)*sqrt(mod1bs$sig^2+ciwi^2)))
p<-locator()
#
B<-50
for (b in (1:B)) {
  xnew <- sample(log(bwt),1)
  ynew <- mod1b$coef[1]+mod1b$coef[2]*xnew+rnorm(1,sd=mod1bs$sig)
  points(xnew,ynew,col='blue',pch=2) }
p<-locator()
#