#
# In this code I run CV repeatedly.
# Scroll down to find the code to run it just one time.
#
library(leaps)   # needed for regsubsets()
#
K<-100
# The number of random splits I run CV on repeatedly.
#
BP <- 1/5
# The real test set - the fraction left out for prediction error estimation.
#
rleapsall<-summary(regsubsets(HDdatanew[,-c(1,4)],HDdatanew[,4],int=T,nbest=10,
                              nvmax=dim(HDdatanew)[2],really.big=T,method=c("ex")))
tt<-apply(rleapsall$which,1,sum)
cleaps2<-rleapsall
cleaps2$which<-rleapsall$which[tt>1,]
LL<-dim(cleaps2$which)[1]
# Here I created a library of models to search over. LL is the number
# of models I will fit to the data.
# This is a bit of a cheat since I used the whole data set to come up
# with all these models, but I try to play it safe by asking for 10
# models of each size.
#
# cleaps2$which is the library of models to search over.
# There are LL models in total.
#
PELL<-rep(0,LL)
PEK<-rep(0,LL)
# I create empty vectors to keep a record of the average prediction error for each model,
whichmodel<-rep(0,LL)
modsel<-rep(0,dim(HDdatanew)[2]-1)
modsize<-rep(0,K)
# and empty tables and vectors to count up the number of times a model is
# selected.
#
# This loop runs the CV several times on random splits of the data.
for (kk in (1:K)) {
  whichmodelkk<-rep(0,LL)
  #
  B<-3
  # The number of CV folds: B<-2 is 2-fold, B<-3 is 3-fold; can try 5 or 10 as well.
  #
  ii<-sample(seq(1,dim(HDdatanew)[1]),round((1-BP)*dim(HDdatanew)[1]))
  # Scramble the order of the observations.
  # This is just to make sure that the folds of the data are comparable;
  # it can be a problem otherwise if the data are stored in increasing
  # order of y, for example.
  HDdatanewb<-HDdatanew[ii,]
  HDdatanewc<-HDdatanew[-ii,]
  # left out for prediction error estimation
  yyte<-HDdatanewc[,4]
  xxte<-as.matrix(HDdatanewc[,-c(1,4)])
  #
  # HERE IS THE MAIN CV CODE
  # I create an empty vector to store the CV prediction errors in.
  PEL <- rep(0,LL)
  #
  for (bb in (1:B)) {
    # do for all folds
    ij<-seq(1,floor(dim(HDdatanewb)[1]/B))+floor(dim(HDdatanewb)[1]/B)*(bb-1)
    if (bb==B) {
      # the last fold picks up any leftover observations
      ij<-seq(floor(dim(HDdatanewb)[1]/B)*(bb-1)+1,dim(HDdatanewb)[1])
    }
    yy<-HDdatanewb[-ij,4]
    xx<-as.matrix(HDdatanewb[-ij,-c(1,4)])
    yyt<-HDdatanewb[ij,4]
    xxt<-as.matrix(HDdatanewb[ij,-c(1,4)])
    # create the training and test set - 1 fold for testing
    for (ll in (1:LL)) {
      mmcp<-lm(yy~cbind(rep(1,length(yy)),xx)[,cleaps2$which[ll,]==T]-1)
      # fit model ll to the training set
      PEL[ll]<-PEL[ll]+(1/B)*sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt)[,cleaps2$which[ll,]==T]%*%mmcp$coef)^2)/length(yyt)
      # compute the CV prediction errors;
      # store for average comparison after you've done this to all folds.
    }
  }
  # Find the winning model from the minimum prediction error.
  min(PEL)
  yy<-HDdatanewb[,4]
  xx<-as.matrix(HDdatanewb[,-c(1,4)])
  mmcp<-lm(yy~cbind(rep(1,length(yy)),xx)[,cleaps2$which[PEL==min(PEL),]==T]-1)
  PEK[PEL==min(PEL)] <- PEK[PEL==min(PEL)] +
    sum((yyte-cbind(rep(1,dim(xxte)[1]),xxte)[,cleaps2$which[PEL==min(PEL),]==T]%*%mmcp$coef)^2)/length(yyte)
  # PEK is the actual prediction error of this model on the test data.
  #
  # END OF MAIN CV CODE.
  # YOUR BEST MODEL IS mmcp
  #
  # Across the K splits, store the number of times each variable was selected.
  modsel[cleaps2$which[PEL==min(PEL),]]<-modsel[cleaps2$which[PEL==min(PEL),]]+1
  # How big was the model?
  modsize[kk]<-sum(cleaps2$which[PEL==min(PEL),])-1
  # How many times was an actual model selected?
  whichmodelkk[sort.list(PEL)[1]]<-whichmodelkk[sort.list(PEL)[1]]+1
  whichmodel[sort.list(PEL)[1]]<-whichmodel[sort.list(PEL)[1]]+1
  #
  for (ll in (1:LL)) {
    PELL[ll] <- PELL[ll]+PEL[ll]
  }
  # Here I record the prediction errors across the splits.
  # PEL was the prediction error from one split;
  # PELL accumulates it here and is divided by K below, so it becomes the
  # average prediction error across the K splits.
  #
}
PELL<-PELL/K
PEK<-PEK/K
modseltab<-cbind(names(HDdatanew)[-c(1,4)],modsel[-1])
# After the code stops running, look at
# modseltab to see which variables are selected and how often;
# modsize will tell you the size of the models selected.
#
plot(tt[tt>1][PEK>0],PEK[PEK>0],xlab="size",ylab="True Prediction Error")
# Plotting the actual prediction error performance.
# Not plotting the zeroes, since those are for models that were
# never selected.
#
topmodels<-cleaps2$which[rev(sort.list(whichmodel))[1:10],]
print(cbind(c("CV-PE",PELL[rev(sort.list(whichmodel))[1:10]]),
            c("PE",PEK[rev(sort.list(whichmodel))[1:10]]),
            rbind(names(HDdatanew)[-c(1,4)],topmodels[,-1])))
print(c("Topmodels selected #", rev(sort(whichmodel))[1:10]))
# How many times was each top model selected - is there a clear winner?
# Compare the CV prediction error PELL to the actual prediction errors PEK.
#
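#
# OPTIONAL FOLLOW-UP (a minimal sketch, not part of the original script):
# one way to refit the overall winner - the model selected most often across
# the K splits - on the full data set, and to summarise the selection results.
# It only uses objects created above (HDdatanew, cleaps2, whichmodel, modsize,
# modseltab); the names bestwhich, yyall, xxall and bestfit are new and
# introduced here purely for illustration.
#
bestwhich<-cleaps2$which[rev(sort.list(whichmodel))[1],]
# logical vector marking the columns (intercept + predictors) of the most
# frequently winning model
yyall<-HDdatanew[,4]
xxall<-as.matrix(HDdatanew[,-c(1,4)])
bestfit<-lm(yyall~cbind(rep(1,length(yyall)),xxall)[,bestwhich==T]-1)
# refit that model on all of the data, using the same design-matrix
# construction as in the CV loop above
summary(bestfit)
#
table(modsize)
# distribution of the selected model sizes across the K splits
print(modseltab)
# how often each variable appeared in the winning model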