library(leaps) # the subset selection library frac<-.67 # use 2/3 of the data for training, 1/3 for testing # Try other splits here, like 1/2... # # Number of random splits K<-100 modsizecp<-rep(0,K) modsizeaic<-rep(0,K) modsizebic<-rep(0,K) PEcpK<-rep(0,K) PEaicK<-rep(0,K) PEbicK<-rep(0,K) modselcp<-rep(0,dim(HDdatanew)[2]-2) modselaic<-rep(0,dim(HDdatanew)[2]-2) modselbic<-rep(0,dim(HDdatanew)[2]-2) for (kk in (1:K)) { # ii<-sample(seq(1,dim(HDdatanew)[1]),round(dim(HDdatanew)[1]*frac)) yy<-HDdatanew[ii,4] xx<-as.matrix(HDdatanew[ii,-c(1,4)]) yyt<-HDdatanew[-ii,4] xxt<-as.matrix(HDdatanew[-ii,-c(1,4)]) # rleaps<-regsubsets(xx,yy,int=T,nbest=10,nvmax=dim(HDdatanew)[2],really.big=T,method=c("ex")) cleaps<-summary(rleaps,matrix=T) tt<-apply(cleaps$which,1,sum) BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy)) AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2 # various model selection criteria # where is the minimum obtained? That is the selected model. # cpmod<-cleaps$which[cleaps$cp==min(cleaps$cp),] aicmod<-cleaps$which[AIC==min(AIC),] bicmod<-cleaps$which[BIC==min(BIC),] # rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(HDdatanew)[2],really.big=T,method=c("ex")) cleaps<-summary(rleaps,matrix=T) # just the best model of every size tt<-apply(cleaps$which,1,sum) BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy)) AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2 # mmcp<-lm(yy~xx[,cpmod[2:length(cpmod)]==T]) mmaic<-lm(yy~xx[,aicmod[2:length(aicmod)]==T]) mmbic<-lm(yy~xx[,bicmod[2:length(bicmod)]==T]) PEcp<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,cpmod[2:length(cpmod)]==T])%*%mmcp$coef)^2)/length(yyt) PEaic<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,aicmod[2:length(aicmod)]==T])%*%mmaic$coef)^2)/length(yyt) PEbic<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,bicmod[2:length(bicmod)]==T])%*%mmbic$coef)^2)/length(yyt) # PEcpK[kk]<-PEcp PEaicK[kk]<-PEaic PEbicK[kk]<-PEbic # modsizecp[kk]<-sum(cpmod) modsizeaic[kk]<-sum(aicmod) modsizebic[kk]<-sum(bicmod) # modselcp[cpmod[2:length(cpmod)]==T]<-modselcp[cpmod[2:length(cpmod)]==T]+1 modselaic[aicmod[2:length(cpmod)]==T]<-modselaic[aicmod[2:length(cpmod)]==T]+1 modselbic[bicmod[2:length(cpmod)]==T]<-modselbic[bicmod[2:length(cpmod)]==T]+1 } # modseltabs<-cbind(names(HDdatanew)[-c(1,4)],modselcp,modselaic,modselbic) print(modseltabs) # print(c("mean PE for cp, aic and bic")) print(c("PEcp=",round(mean(PEcpK),4 )," PEaic=",round(mean(PEaicK),4)," PEbicK=",round(mean(PEbicK),4))) print(c("mean model size for cp, aic and bic")) print(c("sizecp=",round(mean(modsizecp),4 )," sizeaic=",round(mean(modsizeaic),4)," sizebic=",round(mean(modsizebic),4))) # # par(mfrow=c(1,2)) boxplot(as.data.frame(cbind(modsizecp,modsizeaic,modsizebic))) boxplot(as.data.frame(cbind(PEcpK,PEaicK,PEbicK))) p<-locator() # pairwise comparisons par(mfrow=c(1,1)) boxplot(as.data.frame(cbind(modsizeaic-modsizecp,modsizebic-modsizecp,modsizebic-modsizeaic)),names=c("AIC-CP","BIC-CP","BIC-AIC"),main="Model sizes") abline(h=0,lty=2) p<-locator() boxplot(as.data.frame(cbind(PEaicK-PEcpK,PEbicK-PEcpK,PEbicK-PEaicK)),names=c("AIC-CP","BIC-CP","BIC-AIC"),main="PE") abline(h=0,lty=2) p<-locator() plot(PEcpK,PEaicK,xlab="PE with Cp", ylab="",main="o is AIC vs Cp, triangle is BIC vs Cp") points(PEcpK,PEbicK,pch=2,col=2) abline(0,1)