set.seed(400) #install.packages('rpart') # if you haven't installed the rpart package, the above command will do so. library(rpart) # HDdataclass<-HDdatanew[,c(11,2:10,12:13)] tree1<-tree(as.factor(chd)~sbp+tobacco+ldl+adiposity+famhist+typea+obesity+alcohol+age+tobind+alcind,data=HDdataclass,minsize=10) par(mfrow=c(1,1)) plot(tree1) text(tree1) # p<-locator() cvtree1<-cv.tree(tree1,K=10) plot(cvtree1) print(cvtree1) # p<-locator() mim<-min(cvtree1$dev) sizesel<-max(8,max(cvtree1$size[cvtree1$dev==mim])) ptree1<-prune.tree(tree1,best=sizesel) plot(ptree1) text(ptree1) # predtree<-predict.tree(ptree1,newdata=HDdataclass,type="class") print(table(HDdataclass[,1],predtree)) # p<-locator() B<-100 # number of random splits UsedMatrix<-matrix(0,B,12) UsedFirst<-matrix(0,B,12) MSEMatrix<-matrix(0,B,2) for (bb in (1:B)) { # iie<-sample(seq(1,dim(HDdataclass)[1]),20) trdata<-HDdataclass[-iie,] tedata<-HDdataclass[iie,] tree1<-tree(as.factor(chd)~sbp+tobacco+ldl+adiposity+famhist+typea+obesity+alcohol+age+tobind+alcind,data=trdata,minsize=10) cvtree1<-cv.tree(tree1,K=10) mim<-min(cvtree1$dev) sizesel<-max(2,max(cvtree1$size[cvtree1$dev==mim])) ptree1<-prune.tree(tree1,best=sizesel) predtree<-predict.tree(ptree1,newdata=tedata,type="class") rlabels<-tedata[,11] MSEMatrix[bb,1]<-sum(rlabels!=predtree)/length(predtree) predtree<-predict(tree1,newdata=tedata) MSEMatrix[bb,2]<-sum(rlabels!=predtree)/length(predtree) UsedMatrix[bb,as.real(summary(ptree1)$used)]<-UsedMatrix[bb,as.real(summary(ptree1)$used)]+1 UsedFirst[bb,as.real(summary(ptree1)$used)[1]]<-UsedFirst[bb,as.real(summary(ptree1)$used)[1]]+1 } boxplot(MSEMatrix[,1]-MSEMatrix[,2],main="PE-CV - PE-full") abline(h=0) p<-locator() sizetrees<-apply(UsedMatrix,1,sum) hist(sizetrees,main="Size trees") modtab<-apply(UsedMatrix,2,sum)/B print(cbind(names(HDdataclass)[-1],modtab[-1])) modfirst<-apply(UsedFirst,2,sum)/B print(cbind(names(HDdataclass)[-1],modfirst[-1])) # # p<-locator() # GLM glm1<-glm(chd~sbp+tobacco+ldl+adiposity+famhist+typea+obesity+alcohol+age+tobind+alcind, data=HDdatanew,family='binomial') par(mfrow=c(2,2)) plot(glm1) print(summary(glm1)) p<-locator() rr<-residuals(glm1,'pearson') plot(HDdatanew[,2],rr,xlab="sbp",ylab="residuals") ll<-loess(rr[sort.list(HDdatanew[,2])]~sort(HDdatanew[,2])) lines(sort(HDdatanew[,2]),ll$fit,col='red',lwd=2) plot(HDdatanew[,3],rr,xlab="tobacco",ylab="residuals") ll<-loess(rr[sort.list(HDdatanew[,3])]~sort(HDdatanew[,3])) lines(sort(HDdatanew[,3]),ll$fit,col='red',lwd=2) plot(HDdatanew[,4],rr,xlab="ldl",ylab="residuals") ll<-loess(rr[sort.list(HDdatanew[,4])]~sort(HDdatanew[,4])) lines(sort(HDdatanew[,4]),ll$fit,col='red',lwd=2) plot(HDdatanew[,5],rr,xlab="adiposity",ylab="residuals") ll<-loess(rr[sort.list(HDdatanew[,5])]~sort(HDdatanew[,5])) lines(sort(HDdatanew[,5]),ll$fit,col='red',lwd=2) p<-locator() plot(HDdatanew[,6],rr,xlab="famhist",ylab="residuals") ll<-lm(rr[sort.list(HDdatanew[,6])]~sort(HDdatanew[,6])-1) lines(sort(HDdatanew[,6]),ll$fit,col='red',lwd=2) plot(HDdatanew[,7],rr,xlab="typea",ylab="residuals") ll<-loess(rr[sort.list(HDdatanew[,7])]~sort(HDdatanew[,7])) lines(sort(HDdatanew[,7]),ll$fit,col='red',lwd=2) plot(HDdatanew[,8],rr,xlab="obesity",ylab="residuals") ll<-loess(rr[sort.list(HDdatanew[,8])]~sort(HDdatanew[,8])) lines(sort(HDdatanew[,8]),ll$fit,col='red',lwd=2) plot(HDdatanew[,9],rr,xlab="alcohol",ylab="residuals") ll<-loess(rr[sort.list(HDdatanew[,9])]~sort(HDdatanew[,9])) lines(sort(HDdatanew[,9]),ll$fit,col='red',lwd=2) p<-locator() plot(HDdatanew[,10],rr,xlab="age",ylab="residuals") ll<-loess(rr[sort.list(HDdatanew[,10])]~sort(HDdatanew[,10])) lines(sort(HDdatanew[,10]),ll$fit,col='red',lwd=2) plot(HDdatanew[,12],rr,xlab="tobind",ylab="residuals") ll<-lm(rr[sort.list(HDdatanew[,12])]~sort(HDdatanew[,12])-1) lines(sort(HDdatanew[,12]),ll$fit,col='red',lwd=2) plot(HDdatanew[,13],rr,xlab="alcind",ylab="residuals") ll<-lm(rr[sort.list(HDdatanew[,13])]~sort(HDdatanew[,13])-1) lines(sort(HDdatanew[,13]),ll$fit,col='red',lwd=2) # p<-locator() print(step(glm1)) p<-locator() # Note obesity sign of slope negative! That's weird... par(mfrow=c(2,1)) plot(HDdatanew$ldl,HDdatanew$obesity) plot(HDdatanew$age,HDdatanew$obesity) # Aha - collinearities