frac<-.67 # I will use 2/3 for training - try another value if you like. K<-100 # The number of splits to perform - this takes a while. You can do # fewer reps if you want. # PEK<-matrix(0,K,2) avPEK<-PEK modselbic<-rep(0,dim(polltrain2)[2]) modselaic<-modselbic modsizebic<-rep(0,K) modsizeaic<-rep(0,K) # The above commands just create empty vectors and matrices so that I can # store results from the K random splits of data. # The for lopp below repeats the random splitting and model selection # K times. for (kk in (1:K)) { ii2<-sample(seq(1,dim(polltrain2)[1]),round(dim(polltrain2)[1]*frac)) yy<-polltrain2[ii2,16] xx<-as.matrix(polltrain2[ii2,-16]) yyt<-polltrain2[-ii2,16] xxt<-as.matrix(polltrain2[-ii2,-16]) # rleaps<-regsubsets(xx,yy,int=T,nbest=50,nvmax=dim(polltrain2)[2],really.big=T,method=c("ex")) cleaps<-summary(rleaps,matrix=T) tt<-apply(cleaps$which,1,sum) cleaps2<-cleaps cleaps2$which<-cleaps$which[tt>1,] BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy)) AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2 # various model selection criteria # aicmod<-cleaps$which[AIC==min(AIC),] bicmod<-cleaps$which[BIC==min(BIC),] mmbic<-lm(yy~xx[,bicmod[2:length(bicmod)]==T]) PEbic<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,bicmod[2:length(bicmod)]==T])%*%mmbic$coef)^2)/length(yyt) PEK[kk,1]<-PEbic mmaic<-lm(yy~xx[,aicmod[2:length(aicmod)]==T]) PEaic<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,aicmod[2:length(aicmod)]==T])%*%mmaic$coef)^2)/length(yyt) PEK[kk,2]<-PEaic modselbic[bicmod==T]<-modselbic[bicmod==T]+1 modsizebic[kk]<-sum(bicmod)-1 modselaic[aicmod==T]<-modselaic[aicmod==T]+1 modsizeaic[kk]<-sum(aicmod)-1 } # After the loop ends, the PEK matrix contains the prediction errors for # each split for the AIC (2nd column) and the BIC (1st column). # You can do boxplots or look at the difference between prediction error # for the two criteria. # # modsizeaic and modsizebic records the size of the models picked for the # different criteria. You can look at boxplots of these as well. # modselaic and modselbic record the number of times a variable was # picked to be in the model. modseltab<-cbind(names(polltrain2)[-16],modselbic[-1],modselaic[-1]) # modelseltab tabulates which variables are selected and how often # modsize will tell you the size of the models selected print(modseltab) print(modsize)