#
# In this code I run CV repeatedly.
# Scroll down to find the code to run it just one time.
#
library(leaps)   # needed for regsubsets()
#
K<-100
# The number of random splits I run CV on repeatedly.
#
BP <- 1/5
# The real test set - the fraction left out for prediction error estimation.
#
rleapsall<-summary(regsubsets(HDdatanew[,-c(1,4)],HDdatanew[,4],int=T,nbest=10,
                              nvmax=dim(HDdatanew)[2],really.big=T,method=c("ex")))
tt<-apply(rleapsall$which,1,sum)
cleaps2<-rleapsall
cleaps2$which<-rleapsall$which[tt>1,]
LL<-dim(cleaps2$which)[1]
# Here I created a library of models to search over. LL is the number
# of models I will fit to the data.
# This is a bit of a cheat since I used the whole data set to come up
# with all these models, but I try to play it safe by asking for 10
# models of each size.
#
# cleaps2$which is the library of models to search over.
# There are LL models in total.
#
PELL<-rep(0,LL)
PEK<-rep(0,LL)
# I create empty vectors to keep a record of the average prediction error for each model,
whichmodel<-rep(0,LL)
modsel<-rep(0,dim(HDdatanew)[2]-1)
modsize<-rep(0,K)
# and empty tables and vectors to count up the number of times a model is
# selected.
#
# This loop runs the CV several times on random splits of the data.
for (kk in (1:K)) {
  whichmodelkk<-rep(0,LL)
  #
  B<-3
  # The number of CV folds: B<-2 is 2-fold, B<-3 is 3-fold; can try 5 or 10 as well.
  #
  ii<-sample(seq(1,dim(HDdatanew)[1]),round((1-BP)*dim(HDdatanew)[1]))
  # Scramble the order of the observations.
  # This is just to make sure that the folds of the data are comparable;
  # it can be a problem otherwise if the data are stored in increasing
  # order of y, for example.
  HDdatanewb<-HDdatanew[ii,]
  HDdatanewc<-HDdatanew[-ii,]
  # left out for prediction error estimation
  yyte<-HDdatanewc[,4]
  xxte<-as.matrix(HDdatanewc[,-c(1,4)])
  #
  # HERE IS THE MAIN CV CODE
  # I create an empty vector to store the CV prediction errors in.
  PEL <- rep(0,LL)
  #
  for (bb in (1:B)) {
    # do for all folds
    ij<-seq(1,floor(dim(HDdatanewb)[1]/B))+floor(dim(HDdatanewb)[1]/B)*(bb-1)
    if (bb==B) {
      # the last fold picks up any leftover observations
      ij<-seq(floor(dim(HDdatanewb)[1]/B)*(bb-1)+1,dim(HDdatanewb)[1])
    }
    yy<-HDdatanewb[-ij,4]
    xx<-as.matrix(HDdatanewb[-ij,-c(1,4)])
    yyt<-HDdatanewb[ij,4]
    xxt<-as.matrix(HDdatanewb[ij,-c(1,4)])
    # create the training and test set - 1 fold for testing
    for (ll in (1:LL)) {
      mmcp<-lm(yy~cbind(rep(1,length(yy)),xx)[,cleaps2$which[ll,]==T]-1)
      # fit model ll to the training set
      PEL[ll]<-PEL[ll]+(1/B)*sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt)[,cleaps2$which[ll,]==T]%*%mmcp$coef)^2)/length(yyt)
      # compute the CV prediction errors;
      # store for average comparison after you've done this to all folds.
    }
  }
  # Find the winning model from the minimum prediction error.
  min(PEL)
  yy<-HDdatanewb[,4]
  xx<-as.matrix(HDdatanewb[,-c(1,4)])
  mmcp<-lm(yy~cbind(rep(1,length(yy)),xx)[,cleaps2$which[PEL==min(PEL),]==T]-1)
  PEK[PEL==min(PEL)] <- PEK[PEL==min(PEL)] +
    sum((yyte-cbind(rep(1,dim(xxte)[1]),xxte)[,cleaps2$which[PEL==min(PEL),]==T]%*%mmcp$coef)^2)/length(yyte)
  # PEK is the actual prediction error of this model on the test data.
  #
  # END OF MAIN CV CODE.
  # YOUR BEST MODEL IS mmcp
  #
  # Across the K splits, store the number of times each variable was selected.
  modsel[cleaps2$which[PEL==min(PEL),]]<-modsel[cleaps2$which[PEL==min(PEL),]]+1
  # How big was the model?
  modsize[kk]<-sum(cleaps2$which[PEL==min(PEL),])-1
  # How many times was an actual model selected?
  whichmodelkk[sort.list(PEL)[1]]<-whichmodelkk[sort.list(PEL)[1]]+1
  whichmodel[sort.list(PEL)[1]]<-whichmodel[sort.list(PEL)[1]]+1
  #
  for (ll in (1:LL)) {
    PELL[ll] <- PELL[ll]+PEL[ll]
  }
  # Here I record the prediction errors across the splits.
  # PEL was the prediction error from one split;
  # PELL accumulates it here and is divided by K below, so it becomes the
  # average prediction error across the K splits.
  #
}
PELL<-PELL/K
PEK<-PEK/K
modseltab<-cbind(names(HDdatanew)[-c(1,4)],modsel[-1])
# After the code stops running, look at
# modseltab to see which variables are selected and how often;
# modsize will tell you the size of the models selected.
#
plot(tt[tt>1][PEK>0],PEK[PEK>0],xlab="size",ylab="True Prediction Error")
# Plotting the actual prediction error performance.
# Not plotting the zeroes, since those are for models that were
# never selected.
#
topmodels<-cleaps2$which[rev(sort.list(whichmodel))[1:10],]
print(cbind(c("CV-PE",PELL[rev(sort.list(whichmodel))[1:10]]),
            c("PE",PEK[rev(sort.list(whichmodel))[1:10]]),
            rbind(names(HDdatanew)[-c(1,4)],topmodels[,-1])))
print(c("Topmodels selected #", rev(sort(whichmodel))[1:10]))
# How many times was each top model selected - is there a clear winner?
# Compare the CV prediction error PELL to the actual prediction errors PEK.
#
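#
# OPTIONAL FOLLOW-UP (a minimal sketch, not part of the original script):
# one way to refit the overall winner - the model selected most often across
# the K splits - on the full data set, and to summarise the selection results.
# It only uses objects created above (HDdatanew, cleaps2, whichmodel, modsize,
# modseltab); the names bestwhich, yyall, xxall and bestfit are new and
# introduced here purely for illustration.
#
bestwhich<-cleaps2$which[rev(sort.list(whichmodel))[1],]
# logical vector marking the columns (intercept + predictors) of the most
# frequently winning model
yyall<-HDdatanew[,4]
xxall<-as.matrix(HDdatanew[,-c(1,4)])
bestfit<-lm(yyall~cbind(rep(1,length(yyall)),xxall)[,bestwhich==T]-1)
# refit that model on all of the data, using the same design-matrix
# construction as in the CV loop above
summary(bestfit)
#
table(modsize)
# distribution of the selected model sizes across the K splits
print(modseltab)
# how often each variable appeared in the winning model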