# Model selection code # # First you need to install the R-package "leaps". # Do this by issuing the command install.packages("leaps") # # Once you have installed the package on your computer you need to # activate it in R by issuing the command library(leaps) # You need to do the activation each time you start R, but the installation # you only need to do the first time. # # # From the file pollstart I have a training data set called polltrain2. # This will be a different data set for you since you're to work on a different # subset of data. frac<-.67 # I split the training data again, just to examine model selection # stability. # Here I use 2/3 of the data for training, 1/3 for testing # You can pick another size split like 50-50 with frac<-.5 etc # ii2<-sample(seq(1,dim(polltrain2)[1]),round(dim(polltrain2)[1]*frac)) yy<-polltrain2[ii2,16] xx<-as.matrix(polltrain2[ii2,-16]) yyt<-polltrain2[-ii2,16] xxt<-as.matrix(polltrain2[-ii2,-16]) # Here I create the design matrix x and the outcome variable y from the # subset of the pollution data. # xxt and yyt are the test set # Note, for the pollution data, column 16 is the outcome. This may not # be the case of your project data. # rleaps<-regsubsets(xx,yy,int=T,nbest=10,nvmax=dim(polltrain2)[2],really.big=T,method=c("ex")) cleaps<-summary(rleaps,matrix=T) # The regsubsets and summary commands do all subset model fitting. Here, # I am asking R to return the 10 best models of each size, and to always # include the intercept. # tt<-apply(cleaps$which,1,sum) # This is the size of each model. # The cleaps$which is a True/False table. Each row is a particular model # and the variables that are included in this model is marked with T for True. BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy)) AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2 # various model selection criteria par(mfrow=c(2,2)) plot(tt,cleaps$cp,main="Cp") plot(tt,AIC,main="AIC") plot(tt,BIC,main="BIC") # where is the minimum obtained? That is the selected model. p<-locator() # cpmod<-cleaps$which[cleaps$cp==min(cleaps$cp),] aicmod<-cleaps$which[AIC==min(AIC),] bicmod<-cleaps$which[BIC==min(BIC),] # these are the winning models for each criteria. # rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(polltrain2)[2],really.big=T,method=c("ex")) cleaps<-summary(rleaps,matrix=T) # Here I just ask for the best model of every size tt<-apply(cleaps$which,1,sum) BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy)) AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2 par(mfrow=c(1,1)) plot(tt,cleaps$cp-min(cleaps$cp),type="l",main="Cp-black,AIC-blue,BIC-red") lines(tt,AIC-min(AIC),col="blue") lines(tt,BIC-min(BIC),col="red") abline(h=0,lty=2) # Notice how flat then Cp and BIC curves are - it's going to be difficult # to select a model with any certainty here. p<-locator() # mmcp<-lm(yy~xx[,cpmod[2:length(cpmod)]==T]) mmaic<-lm(yy~xx[,aicmod[2:length(aicmod)]==T]) mmbic<-lm(yy~xx[,bicmod[2:length(bicmod)]==T]) # here I fit the models selected by Cp, AIC and BIC PEcp<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,cpmod[2:length(cpmod)]==T])%*%mmcp$coef)^2)/length(yyt) PEaic<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,aicmod[2:length(aicmod)]==T])%*%mmaic$coef)^2)/length(yyt) PEbic<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,bicmod[2:length(bicmod)]==T])%*%mmbic$coef)^2)/length(yyt) # I then apply those models for prediction on my test data # and compute the prediction MSE # print(c("PE and size of selected models")) print(c("PECP=",round(PEcp,4),"size=",length(cpmod[cpmod==T]))) print(c("PEAIC=",round(PEaic,4),"size=",length(aicmod[aicmod==T]))) print(c("PEBIC=",round(PEbic,4),"size=",length(bicmod[bicmod==T]))) # print(c("CP model", names(polltrain2[-16])[cpmod[2:length(cpmod)]==T])) print(c("AIC model", names(polltrain2[-16])[aicmod[2:length(aicmod)]==T])) print(c("BIC model", names(polltrain2[-16])[bicmod[2:length(bicmod)]==T])) # the prediction errors for each selected model. # Run this code several times to check the stability of model selection # Each time the code will create a new training and test set. # The commands in ModSelpollrep does this automatically for you. #