# LDA and QDA on the SAheart data (chd, column 10, is the class label)
library(MASS)           # lda() and qda()
library(ElemStatLearn)  # SAheart data set
data(SAheart)
#
plot(SAheart[,-10],col=SAheart[,10]+1)  # black = chd 0, red = chd 1
SAheart[,5]<-as.numeric(SAheart[,5])-1  # recode famhist from factor to 0/1
# can you see a red/black pattern in any of the variables?
# do the shapes look the same for both colored data sets?
# is a linear boundary appropriate?
p<-locator()  # pause - click on the plot to continue
#
ntrain<-300  # size of the training set - try different values here
puse<-c(1,2,3,4,5,6,7,8,9)  # variables to include - try removing some of these... e.g. obesity
# example of a subset: puse<-c(1,3,9) gives sbp, ldl and age
#
# Try a large training set with a small number of features - LDA or QDA?
# Try a small training set with a small number of features.....
# etc
#
ii<-sample(seq(1,dim(SAheart)[1]),ntrain)  # take out ntrain observations for training
ll<-lda(SAheart[ii,puse],SAheart[ii,10])   # run lda
pll<-predict(ll,newdata=SAheart[-ii,puse]) # predict the held-out observations
print(table(SAheart[-ii,10],pll$class))
print(c("LDA Misclassification error rate",round(sum(SAheart[-ii,10]!=pll$class)/length(pll$class),3)))
#
ll<-qda(SAheart[ii,puse],SAheart[ii,10])   # run qda
pll<-predict(ll,newdata=SAheart[-ii,puse])
print(table(SAheart[-ii,10],pll$class))
print(c("QDA Misclassification error rate",round(sum(SAheart[-ii,10]!=pll$class)/length(pll$class),3)))
#
p<-locator()
#
# Below I repeat this 25 times and summarize the output
B<-25
Errrates<-matrix(0,B,2)  # empty matrix to store the error rates (column 1: LDA, column 2: QDA)
for (bb in (1:B)) {
  #
  iie<-sample(seq(1,dim(SAheart)[1]),ntrain)  # new random split on every repeat
  trdata<-SAheart[iie,]   # ntrain observations for training, as in the single run above
  tedata<-SAheart[-iie,]  # the rest for testing
  ll<-lda(trdata[,puse],trdata[,10])  # run lda
  pll<-predict(ll,newdata=tedata[,puse])
  Errrates[bb,1]<-sum(tedata[,10]!=pll$class)/length(pll$class)
  ll<-qda(trdata[,puse],trdata[,10])  # run qda
  pll<-predict(ll,newdata=tedata[,puse])
  Errrates[bb,2]<-sum(tedata[,10]!=pll$class)/length(pll$class)
}
boxplot(Errrates[,1]-Errrates[,2],main="PElda-PEqda")
abline(h=0)  # comparing LDA error rates to QDA error rates: values above 0 favour QDA
p<-locator()
boxplot(Errrates,names=c("LDA","QDA"),main="Error rates")
# Is LDA or QDA better?
# Now try different size training sets and removing some observations - one possibility is sketched below
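#
# --- Sketch (not part of the original exercise): one way to follow up on the
# --- suggestion above is to repeat the LDA/QDA comparison for several training-set
# --- sizes and report the average test error of each. The object names below
# --- (sizes, avgErr, iis, err) are illustrative choices, not fixed by the exercise.
sizes<-c(100,200,300,400)
avgErr<-matrix(0,length(sizes),2,dimnames=list(sizes,c("LDA","QDA")))
for (ss in seq_along(sizes)) {
  err<-matrix(0,B,2)
  for (bb in 1:B) {
    iis<-sample(seq(1,dim(SAheart)[1]),sizes[ss])  # training indices for this split
    ll<-lda(SAheart[iis,puse],SAheart[iis,10])
    pll<-predict(ll,newdata=SAheart[-iis,puse])
    err[bb,1]<-mean(SAheart[-iis,10]!=pll$class)
    ll<-qda(SAheart[iis,puse],SAheart[iis,10])
    pll<-predict(ll,newdata=SAheart[-iis,puse])
    err[bb,2]<-mean(SAheart[-iis,10]!=pll$class)
  }
  avgErr[ss,]<-colMeans(err)  # average error over the B repeats
}
print(round(avgErr,3))  # does QDA catch up with LDA as the training set grows?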