randomsplitsBin<-function(dataset,yind,frac=.5,K=100) { # dataset consist of x and y variables in one matrix with variables names in the header. yind is the column number corresponding to # the response # # # use frac of the data for training, 1-frac for testing # Try other splits like frac=2/3.. # if frac=1, no test data set is used # # Number of random splits K # # Storage for models and modsizecp<-rep(0,K) modsizeaic<-rep(0,K) modsizebic<-rep(0,K) PEcpK<-rep(0,K) PEaicK<-rep(0,K) PEbicK<-rep(0,K) modselcp<-rep(0,dim(dataset)[2]-1) modselaic<-rep(0,dim(dataset)[2]-1) modselbic<-rep(0,dim(dataset)[2]-1) # The main program for (kk in (1:K)) { if (frac<1) { ii<-sample(seq(1,dim(dataset)[1]),round(dim(dataset)[1]*frac)) yy<-dataset[ii,yind] xx<-as.matrix(dataset[ii,-yind]) yyt<-dataset[-ii,yind] xxt<-as.matrix(dataset[-ii,-yind]) } if (frac==1) { yy<-dataset[,yind] yyt<-yy xx<-dataset[,-yind] xxt<-xx } # model selection criteria computed for top 1 model of each size library(leaps) rleaps<-regsubsets(xx,yy,int=T,nbest=100,nvmax=dim(dataset)[2],really.big=T,method=c("ex")) cleaps<-summary(rleaps,matrix=T) tt<-apply(cleaps$which,1,sum) BICvec<-rep(0,dim(cleaps$which)[1]) AICvec<-BICvec y<-dataset[,yind] x<-as.matrix(dataset[,-yind]) data1<-as.data.frame(cbind(y,x)) for (zz in (1:dim(cleaps$which)[1])) { gg<-glm(y~x[,cleaps$which[zz,2:dim(cleaps$which)[2]]==T],'binomial',subset=ii,data=data1) AICvec[zz]<-gg$dev+2*tt[zz] BICvec[zz]<-gg$dev+tt[zz]*log(length(yy)) } # winning models aicmod<-cleaps$which[AICvec==min(AICvec),] bicmod<-cleaps$which[BICvec==min(BICvec),] # winning model parameters mmaic<-glm(y~x[,aicmod[2:length(aicmod)]==T],'binomial',subset=ii) mmbic<-glm(y~x[,bicmod[2:length(bicmod)]==T],'binomial',subset=ii) # prediction errors on test data if (frac<1) { ppaic<-predict(mmaic,as.data.frame(x[-ii,]),'response') ppbic<-predict(mmbic,as.data.frame(x[-ii,]),'response')} if (frac==1) { ppaic<-predict(mmaic,as.data.frame(x),'response') ppbic<-predict(mmbic,as.data.frame(x),'response')} if (frac<1) { PEaicK[kk]<-sum(yyt!=round(ppaic[-ii]))/length(yyt) PEbicK[kk]<-sum(yyt!=round(ppbic[-ii]))/length(yyt)} if (frac==1) { PEaicK[kk]<-sum(yyt!=round(ppaic))/length(yyt) PEbicK[kk]<-sum(yyt!=round(ppbic))/length(yyt)} # model sizes of winning models modsizeaic[kk]<-sum(aicmod) modsizebic[kk]<-sum(bicmod) # keeping track of which variables were chosen modselaic[aicmod[2:length(aicmod)]==T]<-modselaic[aicmod[2:length(aicmod)]==T]+1 modselbic[bicmod[2:length(aicmod)]==T]<-modselbic[bicmod[2:length(aicmod)]==T]+1 } modseltabs<-cbind(names(dataset)[-yind],modselaic,modselbic) return(list(PEaicK=PEaicK,PEbicK=PEbicK,modsizeaic=modsizeaic,modsizebic=modsizebic,modseltabs=modseltabs)) }