# Model selection code
#
# First you need to install the R-package "leaps".
# Do this by issuing the command
install.packages("leaps")
#
# Once you have installed the package on your computer you need to 
# activate it in R by issuing the command
library(leaps)
# You need to do the activation each time you start R, but the installation
# you only need to do the first time.
#
# 
# From the file pollstart I have a training data set called polltrain2.
# This will be a different data set for you since you're to work on a different
# subset of data.
frac<-.67
# I split the training data again, just to examine model selection
# stability.
# Here I use  2/3 of the data for training, 1/3 for testing
# You can pick another size split like 50-50 with frac<-.5 etc
#
ii2<-sample(seq(1,dim(polltrain2)[1]),round(dim(polltrain2)[1]*frac))
yy<-polltrain2[ii2,16]
xx<-as.matrix(polltrain2[ii2,-16])
yyt<-polltrain2[-ii2,16]
xxt<-as.matrix(polltrain2[-ii2,-16])
# Here I create the design matrix x and the outcome variable y from the 
# subset of the pollution data. 
# xxt and yyt are the test set
# Note, for the pollution data, column 16 is the outcome. This may not
# be the case of your project data.
#
rleaps<-regsubsets(xx,yy,int=T,nbest=10,nvmax=dim(polltrain2)[2],really.big=T,method=c("ex"))
cleaps<-summary(rleaps,matrix=T)
# The regsubsets and summary commands do all subset model fitting. Here,
# I am asking R to return the 10 best models of each size, and to always
# include the intercept.
#
tt<-apply(cleaps$which,1,sum)
# This is the size of each model. 
# The cleaps$which is a True/False table. Each row is a particular model
# and the variables that are included in this model is marked with T for True.
BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy))
AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2
# various model selection criteria
par(mfrow=c(2,2))
plot(tt,cleaps$cp,main="Cp")
plot(tt,AIC,main="AIC")
plot(tt,BIC,main="BIC")
# where is the minimum obtained? That is the selected model. 
p<-locator()
#
cpmod<-cleaps$which[cleaps$cp==min(cleaps$cp),]
aicmod<-cleaps$which[AIC==min(AIC),]
bicmod<-cleaps$which[BIC==min(BIC),]
# these are the winning models for each criteria.
#
rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(polltrain2)[2],really.big=T,method=c("ex"))
cleaps<-summary(rleaps,matrix=T)
# Here I just ask for the best model of every size
tt<-apply(cleaps$which,1,sum)
BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy))
AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2
par(mfrow=c(1,1))
plot(tt,cleaps$cp-min(cleaps$cp),type="l",main="Cp-black,AIC-blue,BIC-red")
lines(tt,AIC-min(AIC),col="blue")
lines(tt,BIC-min(BIC),col="red")
abline(h=0,lty=2)
# Notice how flat then Cp and BIC curves are - it's going to be difficult
# to select a model with any certainty here.
p<-locator()
#
mmcp<-lm(yy~xx[,cpmod[2:length(cpmod)]==T])
mmaic<-lm(yy~xx[,aicmod[2:length(aicmod)]==T])
mmbic<-lm(yy~xx[,bicmod[2:length(bicmod)]==T])
# here I fit the models selected by Cp, AIC and BIC
PEcp<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,cpmod[2:length(cpmod)]==T])%*%mmcp$coef)^2)/length(yyt)
PEaic<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,aicmod[2:length(aicmod)]==T])%*%mmaic$coef)^2)/length(yyt)
PEbic<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,bicmod[2:length(bicmod)]==T])%*%mmbic$coef)^2)/length(yyt)
# I then apply those models for prediction on my test data 
# and compute the prediction MSE
#
print(c("PE and size of selected models"))
print(c("PECP=",round(PEcp,4),"size=",length(cpmod[cpmod==T])))
print(c("PEAIC=",round(PEaic,4),"size=",length(aicmod[aicmod==T])))
print(c("PEBIC=",round(PEbic,4),"size=",length(bicmod[bicmod==T])))
#
print(c("CP model", names(polltrain2[-16])[cpmod[2:length(cpmod)]==T]))
print(c("AIC model", names(polltrain2[-16])[aicmod[2:length(aicmod)]==T]))
print(c("BIC model", names(polltrain2[-16])[bicmod[2:length(bicmod)]==T]))
# the prediction errors for each selected model.
# Run this code several times to check the stability of model selection
# Each time the code will create a new training and test set. 
# The commands in ModSelpollrep does this automatically for you.
#