randomsplits<-function(dataset,yind,frac=.5,K=100) {
# dataset consist of x and y variables in one matrix with variables names in the header. yind is the column number corresponding to
# the response
#
#
# use frac of the data for training, 1-frac for testing
# Try other splits like frac=2/3..
# if frac=1, no test data set is used
#
# Number of random splits K
#
# Storage for models and
modsizecp<-rep(0,K)
modsizeaic<-rep(0,K)
modsizebic<-rep(0,K)
PEcpK<-rep(0,K)
PEaicK<-rep(0,K)
PEbicK<-rep(0,K)
modselcp<-rep(0,dim(dataset)[2]-1)
modselaic<-rep(0,dim(dataset)[2]-1)
modselbic<-rep(0,dim(dataset)[2]-1)
# The main program
for (kk in (1:K)) {
if (frac<1) {
 ii<-sample(seq(1,dim(dataset)[1]),round(dim(dataset)[1]*frac))
 yy<-dataset[ii,yind]
 xx<-as.matrix(dataset[ii,-yind])
 yyt<-dataset[-ii,yind]
 xxt<-as.matrix(dataset[-ii,-yind]) }
if (frac==1) {
 yy<-dataset[,yind]
 yyt<-yy
 xx<-dataset[,yind]
 xxt<-xx }
# model selection criteria computed for top 1 model of each size
library(leaps)
rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(dataset)[2],really.big=T,method=c("ex"))
cleaps<-summary(rleaps,matrix=T)
tt<-apply(cleaps$which,1,sum)
BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy))
AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2
# winning models
cpmod<-cleaps$which[cleaps$cp==min(cleaps$cp),]
aicmod<-cleaps$which[AIC==min(AIC),]
bicmod<-cleaps$which[BIC==min(BIC),]
# winning model parameters
mmcp<-lm(yy~xx[,cpmod[2:length(cpmod)]==T])
mmaic<-lm(yy~xx[,aicmod[2:length(aicmod)]==T])
mmbic<-lm(yy~xx[,bicmod[2:length(bicmod)]==T])
# prediction errors on test data
PEcpK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,cpmod[2:length(cpmod)]==T])%*%mmcp$coef)^2)/length(yyt)
PEaicK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,aicmod[2:length(aicmod)]==T])%*%mmaic$coef)^2)/length(yyt)
PEbicK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,bicmod[2:length(bicmod)]==T])%*%mmbic$coef)^2)/length(yyt)
# model sizes of winning models
modsizecp[kk]<-sum(cpmod)
modsizeaic[kk]<-sum(aicmod)
modsizebic[kk]<-sum(bicmod)
# keeping track of which variables were chosen
modselcp[cpmod[2:length(cpmod)]==T]<-modselcp[cpmod[2:length(cpmod)]==T]+1
modselaic[aicmod[2:length(cpmod)]==T]<-modselaic[aicmod[2:length(cpmod)]==T]+1
modselbic[bicmod[2:length(cpmod)]==T]<-modselbic[bicmod[2:length(cpmod)]==T]+1 }
modseltabs<-cbind(names(dataset)[-yind],modselcp,modselaic,modselbic)
return(list(PEcpK=PEcpK,PEaicK=PEaicK,PEbicK=PEbicK,modsizecp=modsizecp,modsizeaic=modsizeaic,modsizebic=modsizebic,modseltabs=modseltabs))
}