################################################### ### chunk number 1: cv1 ################################################### #line 80 "Lecture8b.Rnw" library(leaps) n<-50; noisesd<-2 x<-sort(rnorm(n)) yy<-1+2*x-.5*x^2+rnorm(n,sd=noisesd) xx<-as.matrix(cbind(x,x^2,x^3)) ## rleaps<-regsubsets(xx,yy,int=T,nbest=4,nvmax=4,really.big=T,method=c("ex")) ## all subset models cleaps<-summary(rleaps,matrix=T) ## True/False matrix. The r-th is a True/False statement about which Models<-cleaps$which Models<-rbind(c(T,rep(F,3)),Models) colnames(Models)<-c("intercept","b1","b2","b3") ## Note, I actually did the modeling fitting to get the model list instead of programming up an enumeration scheme ################################################### ### chunk number 2: cv2 ################################################### #line 96 "Lecture8b.Rnw" K<-10 ii<-sample(seq(1,length(yy)),length(yy)) foldsize<-floor(length(yy)/K) sizefold<-rep(foldsize,K) restdata<-length(yy)-K*foldsize if (restdata>0) { sizefold[1:restdata]<-sizefold[1:restdata]+1} ## creates the size for each fold ################################################### ### chunk number 3: cv3 ################################################### #line 108 "Lecture8b.Rnw" Prederrors<-matrix(0,dim(Models)[1],K) # a matrix to store the prediction errors in iused<-0 Xmat<-cbind(rep(1,n),xx) yy<-yy[ii] Xmat<-Xmat[ii,] for (k in (1:K)) { itest<-ii[(iused+1):(iused+sizefold[k])] itrain<-ii[-c((iused+1):(iused+sizefold[k]))] iused<-iused+length(itest) for (mm in (1:dim(Models)[1])) { betahat<-solve(t(Xmat[itrain,Models[mm,]])%*%Xmat[itrain,Models[mm,]])%*%t(Xmat[itrain,Models[mm,]])%*%yy[itrain] ypred<-Xmat[itest,Models[mm,]]%*%betahat Prederrors[mm,k]<-sum((yy[itest]-ypred)^2) } } PE<-apply(Prederrors,1,sum)/n # final prediction errors. ################################################### ### chunk number 4: cv4 ################################################### #line 124 "Lecture8b.Rnw" library(xtable) z<-data.frame(as.matrix(cbind(Prederrors,PE))) colnames(z) <- c("Fold1","Fold2","Fold3","Fold4","Fold5","Fold6","Fold7","Fold8","Fold9","Fold10","PE") row.names<-c(seq(1,8)) xtable(z, digits=c(0, rep(2,11)),caption="Prederrors in different folds and total",label="tab:CVresult") ################################################### ### chunk number 5: cv5 ################################################### #line 134 "Lecture8b.Rnw" winmod<-Models[which.min(PE),] print(winmod) print(Models[sort.list(PE),]) ################################################### ### chunk number 6: cv6 ################################################### #line 140 "Lecture8b.Rnw" n<-1500; noisesd<-2 x<-sort(rnorm(n)) yy<-1+2*x-.5*x^2+rnorm(n,sd=noisesd) xx<-as.matrix(cbind(x,x^2,x^3)) ## rleaps<-regsubsets(xx,yy,int=T,nbest=4,nvmax=4,really.big=T,method=c("ex")) ## all subset models cleaps<-summary(rleaps,matrix=T) ## True/False matrix. The r-th is a True/False statement about which Models<-cleaps$which Models<-rbind(c(T,rep(F,3)),Models) colnames(Models)<-c("intercept","b1","b2","b3") K<-10 ii<-sample(seq(1,length(yy)),length(yy)) foldsize<-floor(length(yy)/K) sizefold<-rep(foldsize,K) restdata<-length(yy)-K*foldsize if (restdata>0) { sizefold[1:restdata]<-sizefold[1:restdata]+1 } ## creates the size for each fold Prederrors<-matrix(0,dim(Models)[1],K) # a matrix to store the prediction errors in iused<-0 Xmat<-cbind(rep(1,n),xx) for (k in (1:K)) { itest<-ii[(iused+1):(iused+sizefold[k])] itrain<-ii[-c((iused+1):(iused+sizefold[k]))] iused<-iused+length(itest) for (mm in (1:dim(Models)[1])) { betahat<-solve(t(Xmat[itrain,Models[mm,]])%*%Xmat[itrain,Models[mm,]])%*%t(Xmat[itrain,Models[mm,]])%*%yy[itrain] ypred<-Xmat[itest,Models[mm,]]%*%betahat Prederrors[mm,k]<-sum((yy[itest]-ypred)^2) } } PE<-apply(Prederrors,1,sum)/n # final prediction errors. ################################################### ### chunk number 7: cv6 ################################################### #line 173 "Lecture8b.Rnw" z<-data.frame(as.matrix(cbind(Prederrors,PE))) colnames(z) <- c("Fold1","Fold2","Fold3","Fold4","Fold5","Fold6","Fold7","Fold8","Fold9","Fold10","PE") row.names<-c(seq(1,8)) xtable(z, digits=c(0, rep(0,10),3),caption="Prederrors in different folds and total",label="tab:CVresult2") ################################################### ### chunk number 8: cv7 ################################################### #line 183 "Lecture8b.Rnw" winmod<-Models[which.min(PE),] print(winmod) ################################################### ### chunk number 9: cv8 ################################################### #line 222 "Lecture8b.Rnw" #LOOCV PE<-rep(0,dim(Models)[1]) for (mm in (1:dim(Models)[1])) { modfit<-lm(yy~Xmat[,Models[mm,]]-1) rstud<-rstudent(modfit)*summary(modfit)$sig PE[mm]<-sum((rstud)^2)/length(yy) } ################################################### ### chunk number 10: cv7 ################################################### #line 231 "Lecture8b.Rnw" winmod<-Models[which.min(PE),] print(winmod) ################################################### ### chunk number 11: cvldl1 ################################################### #line 239 "Lecture8b.Rnw" SA<-data.frame(read.table('SA.dat',sep='\t',header=T)) ## read in the data SA<-SA[sample(seq(1,dim(SA)[1]),150),] yy<-SA[,12] xx<-SA[,-12] ## rleaps<-regsubsets(xx,yy,int=T,nbest=250,nvmax=250,really.big=T,method=c("ex")) ## all subset models cleaps<-summary(rleaps,matrix=T) ## True/False matrix. The r-th is a True/False statement about which Models<-cleaps$which Models<-rbind(c(T,rep(F,dim(xx)[2])),Models) ################################################### ### chunk number 12: cvldl2 ################################################### #line 251 "Lecture8b.Rnw" K<-25 ii<-sample(seq(1,length(yy)),length(yy)) foldsize<-floor(length(yy)/K) sizefold<-rep(foldsize,K) restdata<-length(yy)-K*foldsize if (restdata>0) { sizefold[1:restdata]<-sizefold[1:restdata]+1 } ## creates the size for each fold ################################################### ### chunk number 13: cvldl3 ################################################### #line 263 "Lecture8b.Rnw" Prederrors<-matrix(0,dim(Models)[1],K) # a matrix to store the prediction errors in iused<-0 Xmat<-as.matrix(cbind(rep(1,dim(xx)[1]),xx)) for (k in (1:K)) { itest<-ii[(iused+1):(iused+sizefold[k])] ## the k-fold test set itrain<-ii[-c((iused+1):(iused+sizefold[k]))] ## the k-fold training set iused<-iused+length(itest) for (mm in (1:dim(Models)[1])) { betahat<-solve(t(Xmat[itrain,Models[mm,]])%*%Xmat[itrain,Models[mm,]])%*%t(Xmat[itrain,Models[mm,]])%*%yy[itrain] ypred<-Xmat[itest,Models[mm,]]%*%betahat ## predictions Prederrors[mm,k]<-sum((yy[itest]-ypred)^2) } } PE<-apply(Prederrors,1,sum)/length(yy) ## final prediction errors, average across all folds. ################################################### ### chunk number 14: cvldl4 ################################################### #line 281 "Lecture8b.Rnw" jj<-sort.list(PE)[1:5] print(as.matrix(Models[jj,])) ################################################### ### chunk number 15: cvldl4b ################################################### #line 285 "Lecture8b.Rnw" z<-data.frame(as.matrix(cbind(Prederrors[jj,],PE[jj]))) colnames(z) <- c("Fold1","Fold2","Fold3","Fold4","Fold5","Fold6","Fold7","Fold8","Fold9","Fold10","PE") row.names<-c(seq(1,5)) xtable(z, digits=c(0, rep(0,10),3),caption="Prederrors in different folds and total",label="tab:CVldl") ################################################### ### chunk number 16: cvldl5 ################################################### #line 293 "Lecture8b.Rnw" winmod<-Models[which.min(PE),] print(winmod) ################################################### ### chunk number 17: cvldl6 ################################################### #line 301 "Lecture8b.Rnw" mm<-lm(log(ldl)~log(age)+sbp+adiposity+log(obesity)+typea+alcohol+alcind+tobacco+tobind+as.factor(chd)+as.factor(famhist),data=SA) ss<-step(mm,trace=F) print(ss) ################################################### ### chunk number 18: cvldl6 ################################################### #line 310 "Lecture8b.Rnw" #LOOCV PE<-rep(0,dim(Models)[1]) for (mm in (1:dim(Models)[1])) { modfit<-lm(yy~Xmat[,Models[mm,]]-1) rstud<-rstudent(modfit)*summary(modfit)$sig PE[mm]<-sum((rstud)^2)/length(yy) } ################################################### ### chunk number 19: cvldl4 ################################################### #line 319 "Lecture8b.Rnw" jj<-sort.list(PE)[1:5] print(as.matrix(Models[jj,]))