LDLdata<-read.table('ldldata',header=T)
#
print(names(LDLdata))
# variables in the data
LDLdata$tobind<-LDLdata$tobacco
LDLdata$tobind[LDLdata$tobind>0]<-1
LDLdata$alcind<-LDLdata$alcohol
LDLdata$alcind[LDLdata$alcind>0]<-1
# creating indicators for usage of tobacco and alcohol
#
# Start exploring the data set
par(mfrow=c(2,2))
plot(LDLdata$obesity,LDLdata$ldl,main="LDL on obesity")
plot(LDLdata$sbp,LDLdata$ldl,main="LDL on blood pressure")
plot(LDLdata$tobacco,LDLdata$ldl,main="LDL on tobacco usage")
plot(LDLdata$alcohol,LDLdata$ldl,main="LDL on alcohol consumption")
p<-locator()
boxplot(LDLdata$ldl~LDLdata$chd,main="LDL on coronary heart disease status")
boxplot(LDLdata$ldl~as.factor(LDLdata$famhist),main="LDL on family history of same")
p<-locator()
# Do these plots indicate that a linear model is sufficient? constant variance?
#
#
boxplot(LDLdata$ldl~as.factor(LDLdata$tobind),main="LDL on tobacco indicator")
boxplot(LDLdata$ldl~as.factor(LDLdata$alcind),main="LDL on alcohol indicator")
p<-locator()
#
#
# How about after a data transformation?
plot(log(LDLdata$obesity),log(LDLdata$ldl),main="log(LDL) on log(obesity)")
plot(log(LDLdata$age),log(LDLdata$ldl),main="log(LDL) on log(age)")
plot(LDLdata$adi,log(LDLdata$ldl),main="log(LDL) on adiposity")
p<-locator()
# Transform to log helps with normality of errors and enhances the relationship
# between some variables
#
# First model fit.
mm1<-lm(log(LDLdata$ldl)~log(LDLdata$age)+log(LDLdata$obesity)+as.factor(LDLdata$chd)+as.factor(LDLdata$famhist)+as.factor(LDLdata$tobind)+as.factor(LDLdata$alcind)+LDLdata$tobacco+LDLdata$alcohol+LDLdata$adiposity+LDLdata$typea+LDLdata$sbp)
print(summary(mm1))
par(mfrow=c(2,2))
plot(mm1)
p<-locator()
#
# possible outlier 139
#
lm1<-lm.influence(mm1)
par(mfrow=c(1,1))
resstd<-mm1$res/(summary(mm1)$sigma*sqrt(1-lm1$hat))
resstud <- mm1$res*sqrt((mm1$df-1)/(sum(mm1$res^2)*(1-lm1$hat)-mm1$res^2))
cooksd<-resstud^2/(length(mm1$coef)*sum(mm1$res^2)/mm1$df)
plot(cooksd,main="Cook's Distance",type="h")
abline(h=qf(.95,1,mod1$df),lty=2)
p<-locator()
# 139 does stand out
#
par(mfrow=c(2,2))
plot(lm1$hat,main="Leverage")
abline(h=3*length(mm1$coef)/dim(LDLdata)[1])
identify(lm1$hat)
plot(lm1$coeff[,5],main="change in slope 5")
identify(lm1$coeff[,5])
plot(lm1$coeff[,8],main="change in slope 8")
identify(lm1$coeff[,8])
plot(lm1$coeff[,9],main="change in slope 9")
identify(lm1$coeff[,9])
plot(lm1$sigma, main="Residual MSE when obs is dropped")
identify(lm1$sigma)
p<-locator()
# You can also check the other slope coefficients - here I just looked at the significant ones. 
# Some high leverage observations, but 
# 139 stands out though - for now, let's drop 139 but consider going back to
# drop the rest....
#
mm1b<-lm(log(LDLdata$ldl)~log(LDLdata$age)+log(LDLdata$obesity)+as.factor(LDLdata$chd)+as.factor(LDLdata$famhist)+as.factor(LDLdata$tobind)+as.factor(LDLdata$alcind)+LDLdata$tobacco+LDLdata$alcohol+LDLdata$adiposity+LDLdata$typea+LDLdata$sbp,subset=-c(139))
print(summary(mm1b))
par(mfrow=c(2,2))
plot(mm1b)
p<-locator()
# Better fit.
#
# Checking collinearity
par(mfrow=c(1,1))
LDLpl<-LDLdata
LDLpl$famhist<-as.real(LDLpl$famhist)
image(seq(1,12),seq(1,12),abs(cor(LDLpl[2:13,2:13])))
print("Correlation matrix")
print(cor(LDLpl[2:13,2:13]))
# plotting the absolute correlations here - look for off-diagonal elements
# that are large - these variable may cause problems when we fit
# the model. Here, I would be worried e.g. about the very strong correlation
# between adiposity and obesity (0.94).