###################################################
### chunk number 1: cv1
###################################################
#line 80 "Lecture8b.Rnw"
library(leaps)
n<-50; noisesd<-2
x<-sort(rnorm(n))
yy<-1+2*x-.5*x^2+rnorm(n,sd=noisesd)
xx<-as.matrix(cbind(x,x^2,x^3))
##
rleaps<-regsubsets(xx,yy,int=T,nbest=4,nvmax=4,really.big=T,method=c("ex")) ## all subset models
cleaps<-summary(rleaps,matrix=T) ## True/False matrix. The r-th is a True/False statement about which
Models<-cleaps$which
Models<-rbind(c(T,rep(F,3)),Models)
colnames(Models)<-c("intercept","b1","b2","b3")
## Note, I actually did the modeling fitting to get the model list instead of programming up an enumeration scheme


###################################################
### chunk number 2: cv2
###################################################
#line 96 "Lecture8b.Rnw"
K<-10
ii<-sample(seq(1,length(yy)),length(yy))
foldsize<-floor(length(yy)/K)
sizefold<-rep(foldsize,K)
restdata<-length(yy)-K*foldsize
if (restdata>0) {
sizefold[1:restdata]<-sizefold[1:restdata]+1}
## creates the size for each fold


###################################################
### chunk number 3: cv3
###################################################
#line 108 "Lecture8b.Rnw"
Prederrors<-matrix(0,dim(Models)[1],K)
# a matrix to store the prediction errors in
iused<-0
Xmat<-cbind(rep(1,n),xx)
yy<-yy[ii]
Xmat<-Xmat[ii,]
for (k in (1:K)) {
  itest<-ii[(iused+1):(iused+sizefold[k])]
  itrain<-ii[-c((iused+1):(iused+sizefold[k]))]
  iused<-iused+length(itest)
  for (mm in (1:dim(Models)[1])) {
     betahat<-solve(t(Xmat[itrain,Models[mm,]])%*%Xmat[itrain,Models[mm,]])%*%t(Xmat[itrain,Models[mm,]])%*%yy[itrain]
     ypred<-Xmat[itest,Models[mm,]]%*%betahat
     Prederrors[mm,k]<-sum((yy[itest]-ypred)^2) } }
PE<-apply(Prederrors,1,sum)/n  # final prediction errors.


###################################################
### chunk number 4: cv4
###################################################
#line 124 "Lecture8b.Rnw"
library(xtable)
z<-data.frame(as.matrix(cbind(Prederrors,PE)))
colnames(z) <- c("Fold1","Fold2","Fold3","Fold4","Fold5","Fold6","Fold7","Fold8","Fold9","Fold10","PE")
row.names<-c(seq(1,8))
xtable(z, digits=c(0, rep(2,11)),caption="Prederrors in different folds and total",label="tab:CVresult")


###################################################
### chunk number 5: cv5
###################################################
#line 134 "Lecture8b.Rnw"
winmod<-Models[which.min(PE),]
print(winmod)

print(Models[sort.list(PE),])


###################################################
### chunk number 6: cv6
###################################################
#line 140 "Lecture8b.Rnw"
n<-1500; noisesd<-2
x<-sort(rnorm(n))
yy<-1+2*x-.5*x^2+rnorm(n,sd=noisesd)
xx<-as.matrix(cbind(x,x^2,x^3))
##
rleaps<-regsubsets(xx,yy,int=T,nbest=4,nvmax=4,really.big=T,method=c("ex")) ## all subset models
cleaps<-summary(rleaps,matrix=T) ## True/False matrix. The r-th is a True/False statement about which
Models<-cleaps$which
Models<-rbind(c(T,rep(F,3)),Models)
colnames(Models)<-c("intercept","b1","b2","b3")
K<-10
ii<-sample(seq(1,length(yy)),length(yy))
foldsize<-floor(length(yy)/K)
sizefold<-rep(foldsize,K)
restdata<-length(yy)-K*foldsize
if (restdata>0) {
sizefold[1:restdata]<-sizefold[1:restdata]+1 }
## creates the size for each fold
Prederrors<-matrix(0,dim(Models)[1],K)
# a matrix to store the prediction errors in
iused<-0
Xmat<-cbind(rep(1,n),xx)
for (k in (1:K)) {
  itest<-ii[(iused+1):(iused+sizefold[k])]
  itrain<-ii[-c((iused+1):(iused+sizefold[k]))]
  iused<-iused+length(itest)
  for (mm in (1:dim(Models)[1])) {
     betahat<-solve(t(Xmat[itrain,Models[mm,]])%*%Xmat[itrain,Models[mm,]])%*%t(Xmat[itrain,Models[mm,]])%*%yy[itrain]
     ypred<-Xmat[itest,Models[mm,]]%*%betahat
     Prederrors[mm,k]<-sum((yy[itest]-ypred)^2) } }
PE<-apply(Prederrors,1,sum)/n  # final prediction errors.


###################################################
### chunk number 7: cv6
###################################################
#line 173 "Lecture8b.Rnw"
z<-data.frame(as.matrix(cbind(Prederrors,PE)))
colnames(z) <- c("Fold1","Fold2","Fold3","Fold4","Fold5","Fold6","Fold7","Fold8","Fold9","Fold10","PE")
row.names<-c(seq(1,8))
xtable(z, digits=c(0, rep(0,10),3),caption="Prederrors in different folds and total",label="tab:CVresult2")


###################################################
### chunk number 8: cv7
###################################################
#line 183 "Lecture8b.Rnw"
winmod<-Models[which.min(PE),]
print(winmod)


###################################################
### chunk number 9: cv8
###################################################
#line 222 "Lecture8b.Rnw"
#LOOCV
PE<-rep(0,dim(Models)[1])
for (mm in (1:dim(Models)[1])) {
     modfit<-lm(yy~Xmat[,Models[mm,]]-1)
     rstud<-rstudent(modfit)*summary(modfit)$sig
     PE[mm]<-sum((rstud)^2)/length(yy) }


###################################################
### chunk number 10: cv7
###################################################
#line 231 "Lecture8b.Rnw"
winmod<-Models[which.min(PE),]
print(winmod)


###################################################
### chunk number 11: cvldl1
###################################################
#line 239 "Lecture8b.Rnw"
SA<-data.frame(read.table('SA.dat',sep='\t',header=T)) ## read in the data
SA<-SA[sample(seq(1,dim(SA)[1]),150),]
yy<-SA[,12]
xx<-SA[,-12]
##
rleaps<-regsubsets(xx,yy,int=T,nbest=250,nvmax=250,really.big=T,method=c("ex")) ## all subset models
cleaps<-summary(rleaps,matrix=T) ## True/False matrix. The r-th is a True/False statement about which
Models<-cleaps$which
Models<-rbind(c(T,rep(F,dim(xx)[2])),Models)


###################################################
### chunk number 12: cvldl2
###################################################
#line 251 "Lecture8b.Rnw"
K<-25
ii<-sample(seq(1,length(yy)),length(yy))
foldsize<-floor(length(yy)/K)
sizefold<-rep(foldsize,K)
restdata<-length(yy)-K*foldsize
if (restdata>0) {
sizefold[1:restdata]<-sizefold[1:restdata]+1 }
## creates the size for each fold


###################################################
### chunk number 13: cvldl3
###################################################
#line 263 "Lecture8b.Rnw"
Prederrors<-matrix(0,dim(Models)[1],K)
# a matrix to store the prediction errors in
iused<-0
Xmat<-as.matrix(cbind(rep(1,dim(xx)[1]),xx))
for (k in (1:K)) {
  itest<-ii[(iused+1):(iused+sizefold[k])] ## the k-fold test set
  itrain<-ii[-c((iused+1):(iused+sizefold[k]))] ## the k-fold training set
  iused<-iused+length(itest)
  for (mm in (1:dim(Models)[1])) {
     betahat<-solve(t(Xmat[itrain,Models[mm,]])%*%Xmat[itrain,Models[mm,]])%*%t(Xmat[itrain,Models[mm,]])%*%yy[itrain]
     ypred<-Xmat[itest,Models[mm,]]%*%betahat ## predictions
     Prederrors[mm,k]<-sum((yy[itest]-ypred)^2) } }
PE<-apply(Prederrors,1,sum)/length(yy)  ## final prediction errors, average across all folds.


###################################################
### chunk number 14: cvldl4
###################################################
#line 281 "Lecture8b.Rnw"
jj<-sort.list(PE)[1:5]
print(as.matrix(Models[jj,]))


###################################################
### chunk number 15: cvldl4b
###################################################
#line 285 "Lecture8b.Rnw"
z<-data.frame(as.matrix(cbind(Prederrors[jj,],PE[jj])))
colnames(z) <- c("Fold1","Fold2","Fold3","Fold4","Fold5","Fold6","Fold7","Fold8","Fold9","Fold10","PE")
row.names<-c(seq(1,5))
xtable(z, digits=c(0, rep(0,10),3),caption="Prederrors in different folds and total",label="tab:CVldl")


###################################################
### chunk number 16: cvldl5
###################################################
#line 293 "Lecture8b.Rnw"
winmod<-Models[which.min(PE),]
print(winmod)


###################################################
### chunk number 17: cvldl6
###################################################
#line 301 "Lecture8b.Rnw"
mm<-lm(log(ldl)~log(age)+sbp+adiposity+log(obesity)+typea+alcohol+alcind+tobacco+tobind+as.factor(chd)+as.factor(famhist),data=SA)
ss<-step(mm,trace=F)
print(ss)


###################################################
### chunk number 18: cvldl6
###################################################
#line 310 "Lecture8b.Rnw"
#LOOCV
PE<-rep(0,dim(Models)[1])
for (mm in (1:dim(Models)[1])) {
     modfit<-lm(yy~Xmat[,Models[mm,]]-1)
     rstud<-rstudent(modfit)*summary(modfit)$sig
     PE[mm]<-sum((rstud)^2)/length(yy) }


###################################################
### chunk number 19: cvldl4
###################################################
#line 319 "Lecture8b.Rnw"
jj<-sort.list(PE)[1:5]
print(as.matrix(Models[jj,]))