#HDdata<-read.csv('SAheart_data.txt') #print(names(HDdata)) #HDdata$tobind<-HDdata$tobacco #HDdata$tobind[HDdata$tobind>0]<-1 #HDdata$alcind<-HDdata$alcohol #HDdata$alcind[HDdata$alcind>0]<-1 # creating indicators for usage of tobacco and alcohol # #ii<-sort(sample(seq(1,dim(HDdata)[1]),150)) #HDdatatest<-HDdata[ii,] #HDdata<-HDdata[-ii,] # I start by splitting the data into a training and test set # par(mfrow=c(2,2)) plot(HDdata$obesity,HDdata$ldl,main="LDL on obesity") plot(HDdata$sbp,HDdata$ldl,main="LDL on blood pressure") plot(HDdata$tobacco,HDdata$ldl,main="LDL on tobacco usage") plot(HDdata$alcohol,HDdata$ldl,main="LDL on alcohol consumption") p<-locator() boxplot(HDdata$ldl~HDdata$chd,main="LDL on coronary heart disease status") boxplot(HDdata$ldl~as.factor(HDdata$famhist),main="LDL on family history of same") p<-locator() # boxplot(HDdata$ldl~as.factor(HDdata$tobind),main="LDL on tobacco indicator") boxplot(HDdata$ldl~as.factor(HDdata$alcind),main="LDL on alcohol indicator") p<-locator() # plot(log(HDdata$obesity),log(HDdata$ldl),main="log(LDL) on log(obesity)") plot(log(HDdata$age),log(HDdata$ldl),main="log(LDL) on log(age)") plot(HDdata$adi,log(HDdata$ldl),main="log(LDL) on adiposity") p<-locator() # Transform to log helps with normality of errors and enhances the relationship # between some variables # mm1<-lm(log(HDdata$ldl)~log(HDdata$age)+log(HDdata$obesity)+as.factor(HDdata$chd)+as.factor(HDdata$famhist)+as.factor(HDdata$tobind)+as.factor(HDdata$alcind)+HDdata$tobacco+HDdata$alcohol+HDdata$adiposity+HDdata$typea+HDdata$sbp) print(summary(mm1)) par(mfrow=c(2,2)) plot(mm1) p<-locator() # # possible outlier 139 - note, when you run the code the split is # random so you might get another outliers. From here on, replace 139 # with the outliers YOU detect. # lm1<-lm.influence(mm1) par(mfrow=c(1,1)) resstd<-mm1$res/(summary(mm1)$sigma*sqrt(1-lm1$hat)) resstud <- mm1$res*sqrt((mm1$df-1)/(sum(mm1$res^2)*(1-lm1$hat)-mm1$res^2)) cooksd<-resstud^2/(length(mm1$coef)*sum(mm1$res^2)/mm1$df) plot(cooksd,main="Cook's Distance",type="h") abline(h=qf(.95,1,mod1$df),lty=2) print(cbind(seq(1,dim(HDdata)[1])[cooksd>qf(.95,1,mm1$df)],HDdata[cooksd>qf(.95,1,mm1$df),])) p<-locator() # 139 does stand out # par(mfrow=c(2,2)) plot(lm1$hat,main="Leverage") abline(h=3*length(mm1$coef)/dim(HDdata)[1]) identify(lm1$hat) plot(lm1$coeff[,5],main="change in slope 5") identify(lm1$coeff[,5]) plot(lm1$coeff[,8],main="change in slope 8") identify(lm1$coeff[,8]) plot(lm1$coeff[,9],main="change in slope 9") identify(lm1$coeff[,9]) plot(lm1$sigma, main="Residual MSE when obs is dropped") identify(lm1$sigma) p<-locator() # Some high leverage observations, but # 139 stands out though - for now, let's drop 139 but consider going back to # drop the rest.... # mm1b<-lm(log(HDdata$ldl)~log(HDdata$age)+log(HDdata$obesity)+as.factor(HDdata$chd)+as.factor(HDdata$famhist)+as.factor(HDdata$tobind)+as.factor(HDdata$alcind)+HDdata$tobacco+HDdata$alcohol+HDdata$adiposity+HDdata$typea+HDdata$sbp,subset=-c(139)) print(summary(mm1b)) par(mfrow=c(2,2)) plot(mm1b) p<-locator() # # print(step(mm1b)) p<-locator() # HDdatanew<-HDdata[-c(139),] HDdatanew$age<-log(HDdatanew$age) HDdatanew$obesity<-log(HDdatanew$obesity) HDdatanew$ldl<-log(HDdatanew$ldl) ff<-as.real(HDdatanew$famhist)-1 HDdatanew$famhist<-ff HDdatatestnew<-HDdatatest HDdatatestnew$age<-log(HDdatatestnew$age) HDdatatestnew$obesity<-log(HDdatatestnew$obesity) HDdatatestnew$ldl<-log(HDdatatestnew$ldl) fft<-as.real(HDdatatestnew$famhist)-1 HDdatatestnew$famhist<-fft # create the training and test set with indicators.