#HDdata<-read.csv('SAheart_data.txt')
#print(names(HDdata))
#HDdata$tobind<-HDdata$tobacco
#HDdata$tobind[HDdata$tobind>0]<-1
#HDdata$alcind<-HDdata$alcohol
#HDdata$alcind[HDdata$alcind>0]<-1
# creating indicators for usage of tobacco and alcohol
#
#ii<-sort(sample(seq(1,dim(HDdata)[1]),150))
#HDdatatest<-HDdata[ii,]
#HDdata<-HDdata[-ii,]
# I start by splitting the data into a training and test set
#
par(mfrow=c(2,2))
plot(HDdata$obesity,HDdata$ldl,main="LDL on obesity")
plot(HDdata$sbp,HDdata$ldl,main="LDL on blood pressure")
plot(HDdata$tobacco,HDdata$ldl,main="LDL on tobacco usage")
plot(HDdata$alcohol,HDdata$ldl,main="LDL on alcohol consumption")
p<-locator()
boxplot(HDdata$ldl~HDdata$chd,main="LDL on coronary heart disease status")
boxplot(HDdata$ldl~as.factor(HDdata$famhist),main="LDL on family history of same")
p<-locator()
#
boxplot(HDdata$ldl~as.factor(HDdata$tobind),main="LDL on tobacco indicator")
boxplot(HDdata$ldl~as.factor(HDdata$alcind),main="LDL on alcohol indicator")
p<-locator()
#
plot(log(HDdata$obesity),log(HDdata$ldl),main="log(LDL) on log(obesity)")
plot(log(HDdata$age),log(HDdata$ldl),main="log(LDL) on log(age)")
plot(HDdata$adi,log(HDdata$ldl),main="log(LDL) on adiposity")
p<-locator()
# Transform to log helps with normality of errors and enhances the relationship
# between some variables
#
mm1<-lm(log(HDdata$ldl)~log(HDdata$age)+log(HDdata$obesity)+as.factor(HDdata$chd)+as.factor(HDdata$famhist)+as.factor(HDdata$tobind)+as.factor(HDdata$alcind)+HDdata$tobacco+HDdata$alcohol+HDdata$adiposity+HDdata$typea+HDdata$sbp)
print(summary(mm1))
par(mfrow=c(2,2))
plot(mm1)
p<-locator()
#
# possible outlier 139 - note, when you run the code the split is 
# random so you might get another outliers. From here on, replace 139
# with the outliers YOU detect.
#
lm1<-lm.influence(mm1)
par(mfrow=c(1,1))
resstd<-mm1$res/(summary(mm1)$sigma*sqrt(1-lm1$hat))
resstud <- mm1$res*sqrt((mm1$df-1)/(sum(mm1$res^2)*(1-lm1$hat)-mm1$res^2))
cooksd<-resstud^2/(length(mm1$coef)*sum(mm1$res^2)/mm1$df)
plot(cooksd,main="Cook's Distance",type="h")
abline(h=qf(.95,1,mod1$df),lty=2)
print(cbind(seq(1,dim(HDdata)[1])[cooksd>qf(.95,1,mm1$df)],HDdata[cooksd>qf(.95,1,mm1$df),]))
p<-locator()
# 139 does stand out
#
par(mfrow=c(2,2))
plot(lm1$hat,main="Leverage")
abline(h=3*length(mm1$coef)/dim(HDdata)[1])
identify(lm1$hat)
plot(lm1$coeff[,5],main="change in slope 5")
identify(lm1$coeff[,5])
plot(lm1$coeff[,8],main="change in slope 8")
identify(lm1$coeff[,8])
plot(lm1$coeff[,9],main="change in slope 9")
identify(lm1$coeff[,9])
plot(lm1$sigma, main="Residual MSE when obs is dropped")
identify(lm1$sigma)
p<-locator()
# Some high leverage observations, but 
# 139 stands out though - for now, let's drop 139 but consider going back to
# drop the rest....
#
mm1b<-lm(log(HDdata$ldl)~log(HDdata$age)+log(HDdata$obesity)+as.factor(HDdata$chd)+as.factor(HDdata$famhist)+as.factor(HDdata$tobind)+as.factor(HDdata$alcind)+HDdata$tobacco+HDdata$alcohol+HDdata$adiposity+HDdata$typea+HDdata$sbp,subset=-c(139))
print(summary(mm1b))
par(mfrow=c(2,2))
plot(mm1b)
p<-locator()
#
#
print(step(mm1b))
p<-locator()
# 
HDdatanew<-HDdata[-c(139),]
HDdatanew$age<-log(HDdatanew$age)
HDdatanew$obesity<-log(HDdatanew$obesity)
HDdatanew$ldl<-log(HDdatanew$ldl)
ff<-as.real(HDdatanew$famhist)-1
HDdatanew$famhist<-ff
HDdatatestnew<-HDdatatest
HDdatatestnew$age<-log(HDdatatestnew$age)
HDdatatestnew$obesity<-log(HDdatatestnew$obesity)
HDdatatestnew$ldl<-log(HDdatatestnew$ldl)
fft<-as.real(HDdatatestnew$famhist)-1
HDdatatestnew$famhist<-fft
# create the training and test set with indicators.