###################################################
### chunk number 1: selldl1
###################################################
#line 166 "Lect9.Rnw"
SA<-data.frame(read.table('SA.dat',sep='\t',header=T)) ## read in the data
yy<-SA[,12]
xx<-SA[,-12]
##
library(leaps)
rleaps<-regsubsets(xx,yy,int=T,nbest=250,nvmax=250,really.big=T,method=c("ex")) ## all subset models
cleaps<-summary(rleaps,matrix=T) ## True/False matrix. The r-th is a True/False statement about which
Models<-cleaps$which
Models<-rbind(c(T,rep(F,dim(xx)[2])),Models) ## adding the empty model (just an intercept) to the model matrix


###################################################
### chunk number 2: selldl2
###################################################
#line 179 "Lect9.Rnw"
K<-10
ii<-sample(seq(1,length(yy)),length(yy)) ## random perturbation of observations first.
foldsize<-floor(length(yy)/K)
sizefold<-rep(foldsize,K)
restdata<-length(yy)-K*foldsize
if (restdata>0) {
sizefold[1:restdata]<-sizefold[1:restdata]+1 }
## creates the size for each fold


###################################################
### chunk number 3: selldl3
###################################################
#line 191 "Lect9.Rnw"
Prederrors<-matrix(0,dim(Models)[1],K)
# a matrix to store the prediction errors in
iused<-0
Xmat<-as.matrix(cbind(rep(1,dim(xx)[1]),xx)) # the design matrix
for (k in (1:K)) {
  itest<-ii[(iused+1):(iused+sizefold[k])] ## the k-fold test set
  itrain<-ii[-c((iused+1):(iused+sizefold[k]))] ## the k-fold training set
  iused<-iused+length(itest)
  for (mm in (1:dim(Models)[1])) {
     betahat<-solve(t(Xmat[itrain,Models[mm,]])%*%Xmat[itrain,Models[mm,]])%*%t(Xmat[itrain,Models[mm,]])%*%yy[itrain]
     ypred<-Xmat[itest,Models[mm,]]%*%betahat ## predictions
     Prederrors[mm,k]<-sum((yy[itest]-ypred)^2) } }
PE<-apply(Prederrors,1,sum)/length(yy)  ## final prediction errors, average across all folds.


###################################################
### chunk number 4: selldl4
###################################################
#line 209 "Lect9.Rnw"
jj<-sort.list(PE)[1:5]
print(as.matrix(Models[jj,]))


###################################################
### chunk number 5: selldl4b
###################################################
#line 215 "Lect9.Rnw"
z<-data.frame(as.matrix(cbind(Prederrors[jj,],PE[jj])))
colnames(z) <- c("Fold1","Fold2","Fold3","Fold4","Fold5","Fold6","Fold7","Fold8","Fold9","Fold10","PE")
row.names<-c(seq(1,5))
library(xtable)
xtable(z, digits=c(0, rep(0,10),3),caption="Prederrors in different folds and total",label="tab:CVldl")


###################################################
### chunk number 6: cvldl5
###################################################
#line 224 "Lect9.Rnw"
winmod<-Models[which.min(PE),]
print(winmod)


###################################################
### chunk number 7: selldl6
###################################################
#line 232 "Lect9.Rnw"
mm<-lm(log(ldl)~log(age)+sbp+adiposity+log(obesity)+typea+alcohol+alcind+tobacco+tobind+as.factor(chd)+as.factor(famhist),data=SA)
ss<-step(mm,trace=F) ## backward selection (using AIC)
print(ss)


###################################################
### chunk number 8: sel1
###################################################
#line 240 "Lect9.Rnw"
tt<-apply(cleaps$which,1,sum) ## model size
BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy))
AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2


###################################################
### chunk number 9: sel1a
###################################################
#line 245 "Lect9.Rnw"
plot(tt,cleaps$cp,main="Cp")


###################################################
### chunk number 10: sel1b
###################################################
#line 248 "Lect9.Rnw"
plot(tt,AIC,main="AIC")


###################################################
### chunk number 11: sel1c
###################################################
#line 251 "Lect9.Rnw"
plot(tt,BIC,main="BIC")


###################################################
### chunk number 12: selone
###################################################
#line 267 "Lect9.Rnw"
rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(xx)[2]+1,really.big=T,method=c("ex")) ## nbest=1: the best model of each size
cleaps<-summary(rleaps,matrix=T)
# just the best model of every size
tt<-apply(cleaps$which,1,sum)
BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy))
AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2
par(mfrow=c(1,1))
plot(tt,cleaps$cp-min(cleaps$cp),type="l",main="Cp-black,AIC-blue,BIC-red")
lines(tt,AIC-min(AIC),col="blue")
lines(tt,BIC-min(BIC),col="red")
abline(h=0,lty=2)


###################################################
### chunk number 13: selcomp
###################################################
#line 293 "Lect9.Rnw"
cpmod<-cleaps$which[cleaps$cp==min(cleaps$cp),]
aicmod<-cleaps$which[AIC==min(AIC),]
bicmod<-cleaps$which[BIC==min(BIC),]
print(cpmod)
print(aicmod)
print(bicmod)


###################################################
### chunk number 14: r1
###################################################
#line 306 "Lect9.Rnw"
frac<-.5
# use frac of the data for training, 1-frac for testing
# Try other splits like frac=2/3..
# Number of random splits
K<-1000
# Storage for models and
modsizecp<-rep(0,K)
modsizeaic<-rep(0,K)
modsizebic<-rep(0,K)
PEcpK<-rep(0,K)
PEaicK<-rep(0,K)
PEbicK<-rep(0,K)
modselcp<-rep(0,dim(SA)[2]-1)
modselaic<-rep(0,dim(SA)[2]-1)
modselbic<-rep(0,dim(SA)[2]-1)


###################################################
### chunk number 15: sel2
###################################################
#line 324 "Lect9.Rnw"
# The main program
# Creating training and test data
for (kk in (1:K)) {
ii<-sample(seq(1,dim(SA)[1]),round(dim(SA)[1]*frac))
yy<-SA[ii,12]
xx<-as.matrix(SA[ii,-12])
yyt<-SA[-ii,12]
xxt<-as.matrix(SA[-ii,-12])
# model selection criteria computed for top 1 model of each size
rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(SA)[2],really.big=T,method=c("ex"))
cleaps<-summary(rleaps,matrix=T)
tt<-apply(cleaps$which,1,sum)
BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy))
AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2
# winning models
cpmod<-cleaps$which[cleaps$cp==min(cleaps$cp),]
aicmod<-cleaps$which[AIC==min(AIC),]
bicmod<-cleaps$which[BIC==min(BIC),]
# winning model parameters
mmcp<-lm(yy~xx[,cpmod[2:length(cpmod)]==T])
mmaic<-lm(yy~xx[,aicmod[2:length(aicmod)]==T])
mmbic<-lm(yy~xx[,bicmod[2:length(bicmod)]==T])
# prediction errors on test data
PEcpK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,cpmod[2:length(cpmod)]==T])%*%mmcp$coef)^2)/length(yyt)
PEaicK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,aicmod[2:length(aicmod)]==T])%*%mmaic$coef)^2)/length(yyt)
PEbicK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,bicmod[2:length(bicmod)]==T])%*%mmbic$coef)^2)/length(yyt)
# model sizes of winning models
modsizecp[kk]<-sum(cpmod)
modsizeaic[kk]<-sum(aicmod)
modsizebic[kk]<-sum(bicmod)
# keeping track of which variables were chosen
modselcp[cpmod[2:length(cpmod)]==T]<-modselcp[cpmod[2:length(cpmod)]==T]+1
modselaic[aicmod[2:length(cpmod)]==T]<-modselaic[aicmod[2:length(cpmod)]==T]+1
modselbic[bicmod[2:length(cpmod)]==T]<-modselbic[bicmod[2:length(cpmod)]==T]+1
}
# end program


###################################################
### chunk number 16: sel3
###################################################
#line 366 "Lect9.Rnw"
modseltabs<-cbind(names(SA)[-12],modselcp,modselaic,modselbic)
print(modseltabs)


###################################################
### chunk number 17: sel3b
###################################################
#line 371 "Lect9.Rnw"
z<-data.frame(as.matrix(cbind(modselcp,modselaic,modselbic)))
colnames(z) <- c("Cp","AIC","BIC")
row.names(z)<-c(names(SA)[-12])
library(xtable)
xtable(z, digits=c(0, rep(0,3)),caption="Random splits results",label="tab:seltable")


###################################################
### chunk number 18: sel4
###################################################
#line 381 "Lect9.Rnw"
print(c("mean PE for cp, aic and bic"))
print(c("PEcp=",round(mean(PEcpK),4
)," PEaic=",round(mean(PEaicK),4)," PEbicK=",round(mean(PEbicK),4)))
print(c("mean model size for cp, aic and bic"))
print(c("sizecp=",round(mean(modsizecp),4
)," sizeaic=",round(mean(modsizeaic),4)," sizebic=",round(mean(modsizebic),4)))


###################################################
### chunk number 19: sel4b
###################################################
#line 390 "Lect9.Rnw"
pes<-c(round(mean(PEcpK),4),round(mean(PEaicK),4),round(mean(PEbicK),4))
siz<-c(round(mean(modsizecp),4),round(mean(modsizeaic),4),round(mean(modsizebic),4))
z<-data.frame(as.matrix(rbind(pes,siz)))
colnames(z)<-c("Cp","AIC","BIC")
row.names(z)<-c("Prediction Error","Model size")
xtable(z, digits=c(0,rep(3,3)),caption="Random splits results: PE and model size",label="tab:seltable2")


###################################################
### chunk number 20: sel5
###################################################
#line 401 "Lect9.Rnw"
par(mfrow=c(1,2))
boxplot(as.data.frame(cbind(modsizecp,modsizeaic,modsizebic)))
boxplot(as.data.frame(cbind(PEcpK,PEaicK,PEbicK)))


###################################################
### chunk number 21: sel6
###################################################
#line 417 "Lect9.Rnw"
# pairwise comparisons
par(mfrow=c(1,1))
boxplot(as.data.frame(cbind(modsizeaic-modsizecp,modsizebic-modsizecp,modsizebic-modsizeaic)),names=c("AIC-CP","BIC-CP","BIC-AIC"),main="Model sizes")
abline(h=0,lty=2)


###################################################
### chunk number 22: sel7
###################################################
#line 423 "Lect9.Rnw"
boxplot(as.data.frame(cbind(PEaicK-PEcpK,PEbicK-PEcpK,PEbicK-PEaicK)),names=c("AIC-CP","BIC-CP","BIC-AIC"),main="PE")
abline(h=0,lty=2)


###################################################
### chunk number 23: sel8
###################################################
#line 440 "Lect9.Rnw"
plot(PEcpK,PEaicK,xlab="PE with Cp", ylab="")
points(PEcpK,PEbicK,pch=2,col=2)
abline(0,1)