# Best-subset model selection (Cp / AIC / BIC) and lasso on the HDdatanew data,
# with test-set prediction error for each selected model.
#
# Assumes HDdatanew is already loaded in the workspace; column 4 is treated as
# the response and column 1 is dropped along with it (presumably an ID
# column -- TODO confirm against the data documentation).
#
# Run this code several times to check the stability of model selection:
# each run draws a new random training/test split.

library(leaps)  # subset selection (regsubsets)
library(lars)   # lasso paths (used at the end; missing from the original script)

# Use 2/3 of the data for training, 1/3 for testing.
# Try other splits here to see what happens to model selection...
frac <- 0.67

n_obs <- nrow(HDdatanew)
ii <- sample(seq_len(n_obs), round(n_obs * frac))

yy  <- HDdatanew[ii, 4]
xx  <- as.matrix(HDdatanew[ii, -c(1, 4)])
yyt <- HDdatanew[-ii, 4]
xxt <- as.matrix(HDdatanew[-ii, -c(1, 4)])

n_tr <- length(yy)

# AIC/BIC on the Gaussian log-likelihood scale: n*log(RSS/n) + penalty * size,
# where tt counts the columns in each model (intercept included).
info_crit <- function(rss, tt, penalty) {
  n_tr * log(rss / n_tr) + tt * penalty
}

# Exhaustive best-subset search, keeping the 10 best models of each size,
# so the criterion plots show the spread across models.
rleaps <- regsubsets(xx, yy, int = TRUE, nbest = 10,
                     nvmax = ncol(HDdatanew), really.big = TRUE,
                     method = "exhaustive")
cleaps <- summary(rleaps, matrix = TRUE)
tt  <- apply(cleaps$which, 1, sum)               # model size incl. intercept
BIC <- info_crit(cleaps$rss, tt, log(n_tr))
AIC <- info_crit(cleaps$rss, tt, 2)

# Plot the model selection criteria against model size.
par(mfrow = c(2, 2))
plot(tt, cleaps$cp, main = "Cp")
plot(tt, AIC, main = "AIC")
plot(tt, BIC, main = "BIC")
# Where is the minimum obtained? That is the selected model.
p <- locator()

# Logical which-vectors (intercept first) for the winner under each criterion.
cpmod  <- cleaps$which[cleaps$cp == min(cleaps$cp), ]
aicmod <- cleaps$which[AIC == min(AIC), ]
bicmod <- cleaps$which[BIC == min(BIC), ]

# Re-run keeping just the best model of every size, to draw the criterion
# traces as curves.
rleaps <- regsubsets(xx, yy, int = TRUE, nbest = 1,
                     nvmax = ncol(HDdatanew), really.big = TRUE,
                     method = "exhaustive")
cleaps <- summary(rleaps, matrix = TRUE)  # just the best model of every size
tt  <- apply(cleaps$which, 1, sum)
BIC <- info_crit(cleaps$rss, tt, log(n_tr))
AIC <- info_crit(cleaps$rss, tt, 2)

par(mfrow = c(1, 1))
plot(tt, cleaps$cp - min(cleaps$cp), type = "l",
     main = "Cp-black,AIC-blue,BIC-red")
lines(tt, AIC - min(AIC), col = "blue")
lines(tt, BIC - min(BIC), col = "red")
abline(h = 0, lty = 2)
p <- locator()

# Refit each selected model on the training data. mod[-1] drops the intercept
# entry of the which-vector before indexing the predictor columns.
mmcp  <- lm(yy ~ xx[, cpmod[-1]])
mmaic <- lm(yy ~ xx[, aicmod[-1]])
mmbic <- lm(yy ~ xx[, bicmod[-1]])

# Test-set mean squared prediction error for one selected model:
# build the test design matrix (intercept column + selected predictors)
# and apply the fitted coefficients.
pred_error <- function(mod, fit) {
  design_t <- cbind(rep(1, nrow(xxt)), xxt[, mod[-1]])
  mean((yyt - design_t %*% fit$coef)^2)
}

PEcp  <- pred_error(cpmod, mmcp)
PEaic <- pred_error(aicmod, mmaic)
PEbic <- pred_error(bicmod, mmbic)

# The prediction errors and sizes for each selected model.
print(c("PE and size of selected models"))
print(c("PECP=",  round(PEcp, 4),  "size=", sum(cpmod)))
print(c("PEAIC=", round(PEaic, 4), "size=", sum(aicmod)))
print(c("PEBIC=", round(PEbic, 4), "size=", sum(bicmod)))

# Which predictors each criterion selected.
pred_names <- names(HDdatanew[-c(1, 4)])
print(c("CP model",  pred_names[cpmod[-1]]))
print(c("AIC model", pred_names[aicmod[-1]]))
print(c("BIC model", pred_names[bicmod[-1]]))
p <- locator()

# Lasso: fit the whole regularization path, then predict the test set at
# every step of the path.
lm1 <- lars(xx, yy, type = "lasso", trace = TRUE)
plm1 <- predict(lm1, xxt)
par(mfrow = c(1, 1))
plot(lm1)
p <- locator()

# Test-set MSE at each point along the lasso path
# (plm1$fit has one column per path step).
lpe1 <- apply((yyt - plm1$fit)^2, 2, mean)
print("Prediction errors - Lasso")

# as.real() is defunct in modern R: use as.numeric().
# The original lm1$ac relied on partial matching of lm1$actions; spell it out.
# Each action is the index of the variable entering (negative = dropped) at
# that step; label the per-step errors with the corresponding variable names.
steps <- as.numeric(lm1$actions)
cdf <- as.data.frame(rbind(as.vector(steps), lpe1[-1]))
names(cdf) <- names(HDdatanew[-c(1, 4)])[steps]
print(cdf)