randomsplits<-function(dataset,yind,frac=.5,K=100) { # dataset consist of x and y variables in one matrix with variables names in the header. yind is the column number corresponding to # the response # # # use frac of the data for training, 1-frac for testing # Try other splits like frac=2/3.. # if frac=1, no test data set is used # # Number of random splits K # # Storage for models and modsizecp<-rep(0,K) modsizeaic<-rep(0,K) modsizebic<-rep(0,K) PEcpK<-rep(0,K) PEaicK<-rep(0,K) PEbicK<-rep(0,K) modselcp<-rep(0,dim(dataset)[2]-1) modselaic<-rep(0,dim(dataset)[2]-1) modselbic<-rep(0,dim(dataset)[2]-1) # The main program for (kk in (1:K)) { if (frac<1) { ii<-sample(seq(1,dim(dataset)[1]),round(dim(dataset)[1]*frac)) yy<-dataset[ii,yind] xx<-as.matrix(dataset[ii,-yind]) yyt<-dataset[-ii,yind] xxt<-as.matrix(dataset[-ii,-yind]) } if (frac==1) { yy<-dataset[,yind] yyt<-yy xx<-dataset[,yind] xxt<-xx } # model selection criteria computed for top 1 model of each size library(leaps) rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(dataset)[2],really.big=T,method=c("ex")) cleaps<-summary(rleaps,matrix=T) tt<-apply(cleaps$which,1,sum) BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy)) AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2 # winning models cpmod<-cleaps$which[cleaps$cp==min(cleaps$cp),] aicmod<-cleaps$which[AIC==min(AIC),] bicmod<-cleaps$which[BIC==min(BIC),] # winning model parameters mmcp<-lm(yy~xx[,cpmod[2:length(cpmod)]==T]) mmaic<-lm(yy~xx[,aicmod[2:length(aicmod)]==T]) mmbic<-lm(yy~xx[,bicmod[2:length(bicmod)]==T]) # prediction errors on test data PEcpK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,cpmod[2:length(cpmod)]==T])%*%mmcp$coef)^2)/length(yyt) PEaicK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,aicmod[2:length(aicmod)]==T])%*%mmaic$coef)^2)/length(yyt) PEbicK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,bicmod[2:length(bicmod)]==T])%*%mmbic$coef)^2)/length(yyt) # model sizes of winning models modsizecp[kk]<-sum(cpmod) modsizeaic[kk]<-sum(aicmod) modsizebic[kk]<-sum(bicmod) # keeping track of which variables were chosen modselcp[cpmod[2:length(cpmod)]==T]<-modselcp[cpmod[2:length(cpmod)]==T]+1 modselaic[aicmod[2:length(cpmod)]==T]<-modselaic[aicmod[2:length(cpmod)]==T]+1 modselbic[bicmod[2:length(cpmod)]==T]<-modselbic[bicmod[2:length(cpmod)]==T]+1 } modseltabs<-cbind(names(dataset)[-yind],modselcp,modselaic,modselbic) return(list(PEcpK=PEcpK,PEaicK=PEaicK,PEbicK=PEbicK,modsizecp=modsizecp,modsizeaic=modsizeaic,modsizebic=modsizebic,modseltabs=modseltabs)) }