# LDA and QDA on the SAheart data (chd, column 10, is the class label)
library(MASS)           # lda() and qda()
library(ElemStatLearn)  # SAheart data set
data(SAheart)
#
plot(SAheart[,-10],col=SAheart[,10]+1)  # black = chd 0, red = chd 1
SAheart[,5]<-as.numeric(SAheart[,5])-1  # recode famhist from factor to 0/1
# can you see a red/black pattern in any of the variables?
# do the shapes look the same for both colored data sets?
# is a linear boundary appropriate?
p<-locator()  # pause - click on the plot to continue
#
ntrain<-300  # size of the training set - try different values here
puse<-c(1,2,3,4,5,6,7,8,9)  # variables to include - try removing some of these... e.g. obesity
# example of a subset: puse<-c(1,3,9) gives sbp, ldl and age
#
# Try a large training set with a small number of features - LDA or QDA?
# Try a small training set with a small number of features.....
# etc
#
ii<-sample(seq(1,dim(SAheart)[1]),ntrain)  # take out ntrain observations for training
ll<-lda(SAheart[ii,puse],SAheart[ii,10])   # run lda
pll<-predict(ll,newdata=SAheart[-ii,puse]) # predict the held-out observations
print(table(SAheart[-ii,10],pll$class))
print(c("LDA Misclassification error rate",round(sum(SAheart[-ii,10]!=pll$class)/length(pll$class),3)))
#
ll<-qda(SAheart[ii,puse],SAheart[ii,10])   # run qda
pll<-predict(ll,newdata=SAheart[-ii,puse])
print(table(SAheart[-ii,10],pll$class))
print(c("QDA Misclassification error rate",round(sum(SAheart[-ii,10]!=pll$class)/length(pll$class),3)))
#
p<-locator()
#
# Below I repeat this 25 times and summarize the output
B<-25
Errrates<-matrix(0,B,2)  # empty matrix to store the error rates (column 1: LDA, column 2: QDA)
for (bb in (1:B)) {
  #
  iie<-sample(seq(1,dim(SAheart)[1]),ntrain)  # new random split on every repeat
  trdata<-SAheart[iie,]   # ntrain observations for training, as in the single run above
  tedata<-SAheart[-iie,]  # the rest for testing
  ll<-lda(trdata[,puse],trdata[,10])  # run lda
  pll<-predict(ll,newdata=tedata[,puse])
  Errrates[bb,1]<-sum(tedata[,10]!=pll$class)/length(pll$class)
  ll<-qda(trdata[,puse],trdata[,10])  # run qda
  pll<-predict(ll,newdata=tedata[,puse])
  Errrates[bb,2]<-sum(tedata[,10]!=pll$class)/length(pll$class)
}
boxplot(Errrates[,1]-Errrates[,2],main="PElda-PEqda")
abline(h=0)  # comparing LDA error rates to QDA error rates: values above 0 favour QDA
p<-locator()
boxplot(Errrates,names=c("LDA","QDA"),main="Error rates")
# Is LDA or QDA better?
# Now try different size training sets and removing some observations - one possibility is sketched below
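#
# --- Sketch (not part of the original exercise): one way to follow up on the
# --- suggestion above is to repeat the LDA/QDA comparison for several training-set
# --- sizes and report the average test error of each. The object names below
# --- (sizes, avgErr, iis, err) are illustrative choices, not fixed by the exercise.
sizes<-c(100,200,300,400)
avgErr<-matrix(0,length(sizes),2,dimnames=list(sizes,c("LDA","QDA")))
for (ss in seq_along(sizes)) {
  err<-matrix(0,B,2)
  for (bb in 1:B) {
    iis<-sample(seq(1,dim(SAheart)[1]),sizes[ss])  # training indices for this split
    ll<-lda(SAheart[iis,puse],SAheart[iis,10])
    pll<-predict(ll,newdata=SAheart[-iis,puse])
    err[bb,1]<-mean(SAheart[-iis,10]!=pll$class)
    ll<-qda(SAheart[iis,puse],SAheart[iis,10])
    pll<-predict(ll,newdata=SAheart[-iis,puse])
    err[bb,2]<-mean(SAheart[-iis,10]!=pll$class)
  }
  avgErr[ss,]<-colMeans(err)  # average error over the B repeats
}
print(round(avgErr,3))  # does QDA catch up with LDA as the training set grows?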