# Best-subset model selection (Cp / AIC / BIC) and lasso on the HDdatanew data,
# with test-set prediction error for each selected model.
#
# Assumes HDdatanew is already loaded in the workspace; column 4 is treated as
# the response and column 1 is dropped along with it (presumably an ID
# column -- TODO confirm against the data documentation).
#
# Run this code several times to check the stability of model selection:
# each run draws a new random training/test split.

library(leaps)  # subset selection (regsubsets)
library(lars)   # lasso paths (used at the end; missing from the original script)

# Use 2/3 of the data for training, 1/3 for testing.
# Try other splits here to see what happens to model selection...
frac <- 0.67

n_obs <- nrow(HDdatanew)
ii <- sample(seq_len(n_obs), round(n_obs * frac))

yy  <- HDdatanew[ii, 4]
xx  <- as.matrix(HDdatanew[ii, -c(1, 4)])
yyt <- HDdatanew[-ii, 4]
xxt <- as.matrix(HDdatanew[-ii, -c(1, 4)])

n_tr <- length(yy)

# AIC/BIC on the Gaussian log-likelihood scale: n*log(RSS/n) + penalty * size,
# where tt counts the columns in each model (intercept included).
info_crit <- function(rss, tt, penalty) {
  n_tr * log(rss / n_tr) + tt * penalty
}

# Exhaustive best-subset search, keeping the 10 best models of each size,
# so the criterion plots show the spread across models.
rleaps <- regsubsets(xx, yy, int = TRUE, nbest = 10,
                     nvmax = ncol(HDdatanew), really.big = TRUE,
                     method = "exhaustive")
cleaps <- summary(rleaps, matrix = TRUE)
tt  <- apply(cleaps$which, 1, sum)               # model size incl. intercept
BIC <- info_crit(cleaps$rss, tt, log(n_tr))
AIC <- info_crit(cleaps$rss, tt, 2)

# Plot the model selection criteria against model size.
par(mfrow = c(2, 2))
plot(tt, cleaps$cp, main = "Cp")
plot(tt, AIC, main = "AIC")
plot(tt, BIC, main = "BIC")
# Where is the minimum obtained? That is the selected model.
p <- locator()

# Logical which-vectors (intercept first) for the winner under each criterion.
cpmod  <- cleaps$which[cleaps$cp == min(cleaps$cp), ]
aicmod <- cleaps$which[AIC == min(AIC), ]
bicmod <- cleaps$which[BIC == min(BIC), ]

# Re-run keeping just the best model of every size, to draw the criterion
# traces as curves.
rleaps <- regsubsets(xx, yy, int = TRUE, nbest = 1,
                     nvmax = ncol(HDdatanew), really.big = TRUE,
                     method = "exhaustive")
cleaps <- summary(rleaps, matrix = TRUE)  # just the best model of every size
tt  <- apply(cleaps$which, 1, sum)
BIC <- info_crit(cleaps$rss, tt, log(n_tr))
AIC <- info_crit(cleaps$rss, tt, 2)

par(mfrow = c(1, 1))
plot(tt, cleaps$cp - min(cleaps$cp), type = "l",
     main = "Cp-black,AIC-blue,BIC-red")
lines(tt, AIC - min(AIC), col = "blue")
lines(tt, BIC - min(BIC), col = "red")
abline(h = 0, lty = 2)
p <- locator()

# Refit each selected model on the training data. mod[-1] drops the intercept
# entry of the which-vector before indexing the predictor columns.
mmcp  <- lm(yy ~ xx[, cpmod[-1]])
mmaic <- lm(yy ~ xx[, aicmod[-1]])
mmbic <- lm(yy ~ xx[, bicmod[-1]])

# Test-set mean squared prediction error for one selected model:
# build the test design matrix (intercept column + selected predictors)
# and apply the fitted coefficients.
pred_error <- function(mod, fit) {
  design_t <- cbind(rep(1, nrow(xxt)), xxt[, mod[-1]])
  mean((yyt - design_t %*% fit$coef)^2)
}

PEcp  <- pred_error(cpmod, mmcp)
PEaic <- pred_error(aicmod, mmaic)
PEbic <- pred_error(bicmod, mmbic)

# The prediction errors and sizes for each selected model.
print(c("PE and size of selected models"))
print(c("PECP=",  round(PEcp, 4),  "size=", sum(cpmod)))
print(c("PEAIC=", round(PEaic, 4), "size=", sum(aicmod)))
print(c("PEBIC=", round(PEbic, 4), "size=", sum(bicmod)))

# Which predictors each criterion selected.
pred_names <- names(HDdatanew[-c(1, 4)])
print(c("CP model",  pred_names[cpmod[-1]]))
print(c("AIC model", pred_names[aicmod[-1]]))
print(c("BIC model", pred_names[bicmod[-1]]))
p <- locator()

# Lasso: fit the whole regularization path, then predict the test set at
# every step of the path.
lm1 <- lars(xx, yy, type = "lasso", trace = TRUE)
plm1 <- predict(lm1, xxt)
par(mfrow = c(1, 1))
plot(lm1)
p <- locator()

# Test-set MSE at each point along the lasso path
# (plm1$fit has one column per path step).
lpe1 <- apply((yyt - plm1$fit)^2, 2, mean)
print("Prediction errors - Lasso")

# as.real() is defunct in modern R: use as.numeric().
# The original lm1$ac relied on partial matching of lm1$actions; spell it out.
# Each action is the index of the variable entering (negative = dropped) at
# that step; label the per-step errors with the corresponding variable names.
steps <- as.numeric(lm1$actions)
cdf <- as.data.frame(rbind(as.vector(steps), lpe1[-1]))
names(cdf) <- names(HDdatanew[-c(1, 4)])[steps]
print(cdf)