LDLdata<-read.table('ldldata',header=T) # print(names(LDLdata)) # variables in the data LDLdata$tobind<-LDLdata$tobacco LDLdata$tobind[LDLdata$tobind>0]<-1 LDLdata$alcind<-LDLdata$alcohol LDLdata$alcind[LDLdata$alcind>0]<-1 # creating indicators for usage of tobacco and alcohol # # Start exploring the data set par(mfrow=c(2,2)) plot(LDLdata$obesity,LDLdata$ldl,main="LDL on obesity") plot(LDLdata$sbp,LDLdata$ldl,main="LDL on blood pressure") plot(LDLdata$tobacco,LDLdata$ldl,main="LDL on tobacco usage") plot(LDLdata$alcohol,LDLdata$ldl,main="LDL on alcohol consumption") p<-locator() boxplot(LDLdata$ldl~LDLdata$chd,main="LDL on coronary heart disease status") boxplot(LDLdata$ldl~as.factor(LDLdata$famhist),main="LDL on family history of same") p<-locator() # Do these plots indicate that a linear model is sufficient? constant variance? # # boxplot(LDLdata$ldl~as.factor(LDLdata$tobind),main="LDL on tobacco indicator") boxplot(LDLdata$ldl~as.factor(LDLdata$alcind),main="LDL on alcohol indicator") p<-locator() # # # How about after a data transformation? plot(log(LDLdata$obesity),log(LDLdata$ldl),main="log(LDL) on log(obesity)") plot(log(LDLdata$age),log(LDLdata$ldl),main="log(LDL) on log(age)") plot(LDLdata$adi,log(LDLdata$ldl),main="log(LDL) on adiposity") p<-locator() # Transform to log helps with normality of errors and enhances the relationship # between some variables # # First model fit. mm1<-lm(log(LDLdata$ldl)~log(LDLdata$age)+log(LDLdata$obesity)+as.factor(LDLdata$chd)+as.factor(LDLdata$famhist)+as.factor(LDLdata$tobind)+as.factor(LDLdata$alcind)+LDLdata$tobacco+LDLdata$alcohol+LDLdata$adiposity+LDLdata$typea+LDLdata$sbp) print(summary(mm1)) par(mfrow=c(2,2)) plot(mm1) p<-locator() # # possible outlier 139 # lm1<-lm.influence(mm1) par(mfrow=c(1,1)) resstd<-mm1$res/(summary(mm1)$sigma*sqrt(1-lm1$hat)) resstud <- mm1$res*sqrt((mm1$df-1)/(sum(mm1$res^2)*(1-lm1$hat)-mm1$res^2)) cooksd<-resstud^2/(length(mm1$coef)*sum(mm1$res^2)/mm1$df) plot(cooksd,main="Cook's Distance",type="h") abline(h=qf(.95,1,mod1$df),lty=2) p<-locator() # 139 does stand out # par(mfrow=c(2,2)) plot(lm1$hat,main="Leverage") abline(h=3*length(mm1$coef)/dim(LDLdata)[1]) identify(lm1$hat) plot(lm1$coeff[,5],main="change in slope 5") identify(lm1$coeff[,5]) plot(lm1$coeff[,8],main="change in slope 8") identify(lm1$coeff[,8]) plot(lm1$coeff[,9],main="change in slope 9") identify(lm1$coeff[,9]) plot(lm1$sigma, main="Residual MSE when obs is dropped") identify(lm1$sigma) p<-locator() # You can also check the other slope coefficients - here I just looked at the significant ones. # Some high leverage observations, but # 139 stands out though - for now, let's drop 139 but consider going back to # drop the rest.... # mm1b<-lm(log(LDLdata$ldl)~log(LDLdata$age)+log(LDLdata$obesity)+as.factor(LDLdata$chd)+as.factor(LDLdata$famhist)+as.factor(LDLdata$tobind)+as.factor(LDLdata$alcind)+LDLdata$tobacco+LDLdata$alcohol+LDLdata$adiposity+LDLdata$typea+LDLdata$sbp,subset=-c(139)) print(summary(mm1b)) par(mfrow=c(2,2)) plot(mm1b) p<-locator() # Better fit. # # Checking collinearity par(mfrow=c(1,1)) LDLpl<-LDLdata LDLpl$famhist<-as.real(LDLpl$famhist) image(seq(1,12),seq(1,12),abs(cor(LDLpl[2:13,2:13]))) print("Correlation matrix") print(cor(LDLpl[2:13,2:13])) # plotting the absolute correlations here - look for off-diagonal elements # that are large - these variable may cause problems when we fit # the model. Here, I would be worried e.g. about the very strong correlation # between adiposity and obesity (0.94).