library(randomForestSRC) library(ggplot2) library(ggRandomForests) library(plot3D) library(dplyr) library(parallel) library(data.table) library(RColorBrewer) library(GGally) library(ElemStatLearn) library(rpart) library(rpart.plot) data(SAheart) ##### Regression tree print(names(SAheart)) print(dim(SAheart)) pairs(SAheart) ##### Regression? then you would need to transform some of the variables or include nonlinear terms possibly. ##### CART - splits the data into regions where a constant outcome value is a good data summary. library(rpart) library(rpart.plot) ii<-sample(seq(1,dim(SAheart)[1]),300) #training dat tree1<-rpart(ldl~.,data=SAheart,subset=ii,control=rpart.control(cp=0.01, minsplit=10,xval=3)) prp(tree1,extra=100) #### CV to select tree size plotcp(tree1) printcp(tree1) # nspl<-seq(1,dim(tree1$cptable)[1]) rowmin<-nspl[tree1$cptable[,4]==min(tree1$cptable[,4])][1] minmin<-tree1$cptable[rowmin,4]+tree1$cptable[rowmin,5] #line value pickmin<-min(nspl[tree1$cptable[,4]<=minmin & tree1$cptable[,2]>0]) pickcp<-tree1$cptable[pickmin+1,1] # ptree1<-prune(tree1,cp=pickcp) prp(ptree1,extra=100) # predtree<-predict(ptree1,newdata=SAheart[-ii,-3]) print(c("Test error rate",round(sum(SAheart[-ii,3]-predtree)^2/length(predtree),3))) #### larger trees too? k<-1 # try larger and larger trees and check test error rate. # Is Cp a good selection criterion? pickcp<-tree1$cptable[pickmin+k,1] # ptree1<-prune(tree1,cp=pickcp) prp(ptree1,extra=100) # predtree<-predict(ptree1,newdata=SAheart[-ii,-3]) print(c("Test error rate",round(sum(SAheart[-ii,3]-predtree)^2/length(predtree),3))) ####### Random forest library(randomForestSRC) library(ggplot2) library(ggRandomForests) SA.rf <- rfsrc(ldl~., data=SAheart[ii,],statistics=T,tree.err=T,proximity=T,importance=T) print(SA.rf) ### plotting OOB error SA.error<-gg_error(SA.rf) plot(SA.error) #plot(SA.rf) ###reality check plot(gg_rfsrc(SA.rf),alpha=.5)+coord_cartesian(ylim=c(2,8)) ### variable importance plot(gg_vimp(SA.rf),lbls=names(SAheart[-3])) #### Test error predRF<-predict.rfsrc(SA.rf,newdata=SAheart[-ii,-3]) print(c("Test error rate",round(sum(SAheart[-ii,3]-predRF$predicted)^2/length(predRF$predicted),3))) #### variable selection via minimum depth SA.varsel <- var.select(SA.rf) # takes a long time SA.md<-gg_minimal_depth(SA.varsel) plot(SA.md, lbls=names(SAheart)[-3]) ## comparing with permutation VI plot(gg_minimal_vimp(SA.md)) ##### fitted values - nonlinear, interactions plot.variable(SA.rf) #SA.int<-find.interaction(SA.rf) #plot(gg_interaction(SA.int),xvar=SA.md$topvars,panel=TRUE) ### better plotting of fitted values!! library(randomForest) library(plotmo) SA.rf<-randomForest(ldl~.,data=SAheart,importance=TRUE,proximity=TRUE) plotmo(SA.rf) ############################ #### Classification library(rpart) library(rpart.plot) tree1<-rpart(chd~.,data=SAheart,subset=ii,control=rpart.control(cp=0.01, minsplit=10,xval=3),method="class") prp(tree1,extra=100) plotcp(tree1) printcp(tree1) # nspl<-seq(1,dim(tree1$cptable)[1]) rowmin<-nspl[tree1$cptable[,4]==min(tree1$cptable[,4])][1] minmin<-tree1$cptable[rowmin,4]+tree1$cptable[rowmin,5] #line value pickmin<-min(nspl[tree1$cptable[,4]<=minmin & tree1$cptable[,2]>0]) pickcp<-tree1$cptable[pickmin,1] # ptree1<-prune(tree1,cp=pickcp) prp(ptree1,extra=100) # predtree<-predict(ptree1,newdata=SAheart[-ii,-10],type="class") print(c("Misclassification error rate",round(sum(SAheart[-ii,10]!=predtree)/length(predtree),3))) ## k<-2 pickcp<-tree1$cptable[pickmin+k,1] # ptree1<-prune(tree1,cp=pickcp) prp(ptree1,extra=100) # predtree<-predict(ptree1,newdata=SAheart[-ii,-10],type="class") print(c("Misclassification error rate",round(sum(SAheart[-ii,10]!=predtree)/length(predtree),3))) #### RF library(randomForestSRC) library(ggplot2) library(ggRandomForests) SAclass<-SAheart SAclass$chd<-as.factor(SAclass$chd) SA.rf <- rfsrc(as.factor(chd)~., data=SAclass[ii,],tree.err=T,importance=TRUE,statistics=T,proximity=T) print(SA.rf) plot(SA.rf) hc<-hclust(as.dist(1-SA.rf$proximity)) plot(hc,labels=SAheart$chd[ii]) ### test error predRF<-predict.rfsrc(SA.rf,newdata=SAclass[-ii,-10]) pp<-apply(predRF$predicted,1,sort.list)[2,]-1 print(c("Test error rate",round(sum(SAheart[-ii,10]!=pp)/length(predRF$predicted),3))) table(pp,SAclass[-ii,10]) ### OOB error SA.error<-gg_error(SA.rf) plot(SA.error) #plot(SA.rf) ### variable importance plot(gg_vimp(SA.rf),lbls=names(SAheart[-10])) ### min depth selection SA.varsel.chd <- var.select(SA.rf) # takes a long time SA.md<-gg_minimal_depth(SA.varsel.chd) plot(SA.md, lbls=names(SAheart)[-10]) ########## library(randomForest) library(plotmo) SA.rf<-randomForest(chd~.,data=SAclass[ii,],importance=TRUE,proximity=TRUE, do.trace=100) plot(SA.rf) print(SA.rf$confusion) ### clustering SA.mds<-cmdscale(1-SA.rf$proximity) plot(SA.mds, col=SAheart$chd[ii]+1,xlab="Dim1",ylab="Dim2") ### outliers based on proximity plot(outlier(SA.rf),type="h",col=c("black","red")[SAheart$chd[ii]+1]) plotmo(SA.rf) varImpPlot(SA.rf) par(mfrow=c(2,2)) barplot(SA.rf$importance[,1],main="VarImp chd=0") barplot(SA.rf$importance[,2],main="VarImp chd=1") barplot(SA.rf$importance[,3],main="VarImp overall") barplot(SA.rf$importance[,4],main="VarImp GINI") ######## using ranger library(ranger) SA.rf<-ranger(chd~., data=SAheart,importance="permutation",write.forest=T, replace=T,classification=TRUE) print(SA.rf) barplot(importance(SA.rf)) ######### unsupervised RF library(MASS) library(randomForest) library(plot3D) library(rgl) data(fgl) names(fgl) fgl.rf<-randomForest(type~.,data=fgl,importance=T,do.trace=100) #### print(fgl.rf) varImpPlot(fgl.rf) #### fglR<-fgl[,-10] for (zz in (1:9)) { fglR[,zz]<-sample(fglR[,zz],dim(fgl)[1]) } fglall<-rbind(fgl[,-10],fglR) fglall$type<-c(rep(1,214),rep(0,214)) ssfgl<-svd(fglall) plot3d(ssfgl$u[,1:3],col=c(as.numeric(fgl$type),rep(0,214))+1, pch=c(as.numeric(fgl$type),rep(0,214))+1) clust.rf<-randomForest(as.factor(type)~.,data=fglall,proximity=T) hc<-hclust(as.dist(1-clust.rf$proximity[1:214,1:214])) plot(hc,labels=c(fgl$type)) #### SAclass$famhist<-as.numeric(SAclass$famhist) SAR<-SAclass[,-10] for (zz in (1:9)) { SAR[,zz]<-sample(SAR[,zz],dim(SAR)[1]) } SAall<-rbind(SAclass[,-10],SAR) SAall$type<-c(rep(1,462),rep(0,462)) ssSA<-svd(SAall) plot3d(ssSA$u[,1:3],col=c(as.numeric(SAclass$chd)+2,rep(1,462)), pch=c(as.numeric(SAclass$chd),rep(0,462))+1) clust.rf<-randomForest(as.factor(type)~.,data=SAall,proximity=T) hc<-hclust(as.dist(1-clust.rf$proximity[1:462,1:462])) plot(hc,labels=c(SAclass$chd)) ##### irisR<-iris[,-5] for (zz in (1:4)) { irisR[,zz]<-sample(irisR[,zz],dim(irisR)[1]) } irisall<-rbind(iris[,-5],irisR) irisall$type<-c(rep(1,150),rep(0,150)) ssSA<-svd(irisall) plot3d(ssSA$u[,1:3],col=c(as.numeric(iris$Species)+2,rep(1,150)), pch=c(as.numeric(iris$Species)+2,rep(0,150))+1) clust.rf<-randomForest(as.factor(type)~.,data=irisall,proximity=T) hc<-hclust(as.dist(1-clust.rf$proximity[1:150,1:150])) plot(hc,labels=c(iris$Species)) #### rfsrc has unsupervised RF as an option