# install the packages first
library(randomForestSRC)
library(ggplot2)
library(ggRandomForests)
library(plot3D)
library(dplyr)
library(parallel)
library(data.table)
library(RColorBrewer)
library(GGally)
library(ElemStatLearn)   # note: ElemStatLearn has been archived on CRAN and may need to be installed from the CRAN archive
library(rpart)
library(rpart.plot)

# Heart disease data
data(SAheart)

##### Regression tree
print(names(SAheart))
print(dim(SAheart))
pairs(SAheart)
##### Regression? Then you would probably need to transform some of the variables or include nonlinear terms.
##### CART splits the data into regions where a constant outcome value is a good summary of the data.

# Try CART
library(rpart)
library(rpart.plot)
ii<-sample(seq(1,dim(SAheart)[1]),300)   # training data - a subsample of patients. Try different training data sizes here and see what happens.
tree1<-rpart(ldl~.,data=SAheart,subset=ii,control=rpart.control(cp=0.01,minsplit=5,xval=10))
prp(tree1,extra=100)

# plotting the full tree
plot(tree1)
text(tree1,cex=.75,digits=2)
# here it is easier to see which splits actually reduce the error the most (the branch lengths):
# short branches don't reduce the error very much, long branches do.

#### CV to select the tree size
plotcp(tree1)
printcp(tree1)
# Pick the tree size that minimizes the CV error?
# Or the smallest tree within 1 SE of the minimum CV error (the 1-SE rule) - a more conservative option.
nspl<-seq(1,dim(tree1$cptable)[1])
rowmin<-which.min(tree1$cptable[,4])                                 # row with the minimum CV error
minmin<-tree1$cptable[rowmin,4]+tree1$cptable[rowmin,5]              # min CV error + 1 SE
pickmin<-min(nspl[tree1$cptable[,4]<=minmin & tree1$cptable[,2]>0])  # smallest tree (with at least one split) within this range
pickcp<-tree1$cptable[pickmin,1]   # the cp value (the tuning parameter alpha from the lecture notes) of this tree
#
ptree1<-prune(tree1,cp=pickcp)
plot(ptree1)
text(ptree1,cex=.75,digits=2)

# What's the prediction error?
predtree<-predict(ptree1,newdata=SAheart[-ii,-3])
print(c("Test MSE",round(sum((SAheart[-ii,3]-predtree)^2)/length(predtree),3)))

#####
# What about the tree with the smallest CV error?
rowmin<-which.min(tree1$cptable[,4])   # row with the minimum CV error
pickcp<-tree1$cptable[rowmin,1]        # its cp value
#
ptree1<-prune(tree1,cp=pickcp)
plot(ptree1)
text(ptree1,cex=.75,digits=2)

# What's the prediction error?
predtree<-predict(ptree1,newdata=SAheart[-ii,-3])
print(c("Test MSE",round(sum((SAheart[-ii,3]-predtree)^2)/length(predtree),3)))
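### Optional side note: the test-MSE computation above gets repeated several times below.
### A minimal sketch of a small helper (test_mse is a hypothetical name, not from any package);
### it assumes a numeric outcome in column y_col and a fitted model with a predict() method.
test_mse <- function(fit, data, test_rows, y_col) {
  pred <- predict(fit, newdata = data[test_rows, -y_col])   # predictions on the held-out rows
  mean((data[test_rows, y_col] - pred)^2)                   # mean squared prediction error
}
# e.g. round(test_mse(ptree1, SAheart, -ii, 3), 3) should reproduce the number printed above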
#### Larger trees too?
# Try larger and larger trees and check the test error. Is cp a good selection criterion?
nspl<-seq(1,dim(tree1$cptable)[1])
rowmin<-which.min(tree1$cptable[,4])                                 # row with the minimum CV error
minmin<-tree1$cptable[rowmin,4]+tree1$cptable[rowmin,5]              # min CV error + 1 SE
pickmin<-min(nspl[tree1$cptable[,4]<=minmin & tree1$cptable[,2]>0])  # smallest tree within this range
#
k<-3   # vary this to grow the tree beyond the conservative 1-SE choice
pickcp<-tree1$cptable[pickmin+k,1]
#
ptree1<-prune(tree1,cp=pickcp)
plot(ptree1)
text(ptree1,cex=.75,digits=2)
#
predtree<-predict(ptree1,newdata=SAheart[-ii,-3])
print(c("Test MSE",round(sum((SAheart[-ii,3]-predtree)^2)/length(predtree),3)))

##########################################
####### Random forest
library(randomForestSRC)
library(ggplot2)
library(ggRandomForests)

SA.rf <- rfsrc(ldl~., data=SAheart[ii,],statistics=T,tree.err=T,proximity=T,importance=T)
print(SA.rf)   # summary of the run

### plotting the OOB error
SA.error<-gg_error(SA.rf)
plot(SA.error)
# notice that the OOB error drops off quickly after about 50 trees. Also, no overfitting problem.
#
plot(SA.rf)
# the OOB error and the variable importance. Adiposity is the most important predictor for ldl (cholesterol).

### variable importance
plot(gg_vimp(SA.rf),lbls=names(SAheart[-3]))

#### Test error
predRF<-predict.rfsrc(SA.rf,newdata=SAheart[-ii,-3])
print(c("Test MSE",round(sum((SAheart[-ii,3]-predRF$predicted)^2)/length(predRF$predicted),3)))
# much better than CART above

#### variable selection via minimum depth (how high up in a tree a variable first appears)
SA.varsel <- var.select(SA.rf)   # takes a long time
SA.md<-gg_minimal_depth(SA.varsel)
plot(SA.md, lbls=names(SAheart)[-3])
## comparing with permutation variable importance
plot(gg_minimal_vimp(SA.md))
# The two measures agree on the most important variables.

##### fitted values - nonlinearities, interactions
plot.variable(SA.rf)
# Each tree gives you a fitted value for the outcome;
# the average of these is the random forest fitted value.
# Plot these fitted values against each predictor variable to see what kind of association
# RF has learnt between each x and y.
# Here: some nonlinear dependence between tobacco and ldl (2nd panel in the top row), roughly linear for sbp (1st panel in the top row).
# Etc...

# You can use the RF results to look for interactions (perhaps to later include in a regression model).
# Look at how high up in a tree a variable j shows up below a variable i: if variable i appears
# high in the tree (small minimal depth) and j follows soon after, that is indicative of an interaction.
SA.int<-find.interaction(SA.rf, method="maxsubtree")
plot(gg_interaction(SA.int),xvar=SA.md$topvars,panel=TRUE)

# You can also use the variable importance of PAIRS of variables to look for interactions:
# how does the sum of the individual (permutation-based) importances compare to the joint importance (both variables permuted together)?
SA.int<-find.interaction(SA.rf, method="vimp")
# comparing the additive effect to the paired effect
# suggests adiposity/obesity is an interaction

### better plotting of fitted values using the plotmo package
library(randomForest)
library(plotmo)
SA.rf<-randomForest(ldl~.,data=SAheart,importance=TRUE,proximity=TRUE)
plotmo(SA.rf)
# you get a visual summary of the fitted values vs the variables as before,
# and an interaction plot with a response surface of fitted values vs pairs of variables.
# Look for pairs where this response surface is NOT made up of parallel lines;
# parallel lines mean the effect is purely additive.
# Adiposity-obesity looks like a strong contender, as the paired vimp indicated above.
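### Optional: partial dependence plots are another way to look at the fitted relationships.
### A minimal sketch using randomForest::partialPlot on the randomForest fit above;
### the choice of adiposity and tobacco as the variables to display is just an illustration.
par(mfrow=c(1,2))
partialPlot(SA.rf, pred.data = SAheart, x.var = "adiposity")   # averaged effect of adiposity on predicted ldl
partialPlot(SA.rf, pred.data = SAheart, x.var = "tobacco")     # averaged effect of tobacco on predicted ldl
par(mfrow=c(1,1))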
############################
#### Classification - predicting heart disease (chd)
library(rpart)
library(rpart.plot)
#
ii<-sample(seq(1,dim(SAheart)[1]),300)
tree1<-rpart(chd~.,data=SAheart,subset=ii,control=rpart.control(cp=0.01,minsplit=10,xval=3),method="class")
plot(tree1)
text(tree1,cex=.75,digits=2,use.n=T)
#
plotcp(tree1)
printcp(tree1)
#
nspl<-seq(1,dim(tree1$cptable)[1])
rowmin<-which.min(tree1$cptable[,4])                                 # row with the minimum CV error
minmin<-tree1$cptable[rowmin,4]+tree1$cptable[rowmin,5]              # min CV error + 1 SE
pickmin<-min(nspl[tree1$cptable[,4]<=minmin & tree1$cptable[,2]>0])  # smallest tree within this range
pickcp<-tree1$cptable[pickmin,1]
#
ptree1<-prune(tree1,cp=pickcp)
plot(ptree1)
text(ptree1,digits=2,cex=.75,use.n=T)
#
predtree<-predict(ptree1,newdata=SAheart[-ii,-10],type="class")
print(c("Misclassification error rate",round(sum(SAheart[-ii,10]!=predtree)/length(predtree),3)))

##
k<-2   # try a bigger tree
pickcp<-tree1$cptable[pickmin+k,1]
#
ptree1<-prune(tree1,cp=pickcp)
plot(ptree1)
text(ptree1,digits=2,cex=.75,use.n=T)
#
predtree<-predict(ptree1,newdata=SAheart[-ii,-10],type="class")
print(c("Misclassification error rate",round(sum(SAheart[-ii,10]!=predtree)/length(predtree),3)))

##########
#### RF
library(randomForestSRC)
library(ggplot2)
library(ggRandomForests)

SAclass<-SAheart
SAclass$chd<-as.factor(SAclass$chd)
SA.rf <- rfsrc(chd~., data=SAclass[ii,],tree.err=T,importance=TRUE,statistics=T,proximity=T)
print(SA.rf)
plot(SA.rf)
# notice that the error rate for the chd=1 class is much higher than for the chd=0 class...
# also, slightly different variables are important for the two classes.
#
# proximity - a similarity based on how often a pair of observations ends up in the same terminal node -
# can be used to visualize the data and to find groups of observations
hc<-hclust(as.dist(1-SA.rf$proximity))
plot(hc,labels=SAheart$chd[ii])
# you can see that there is a tight group of chd=0 observations that stays together, but also a lot of
# spread within each class = a difficult classification problem!

### test error
predRF<-predict.rfsrc(SA.rf,newdata=SAclass[-ii,-10])
pp<-apply(predRF$predicted,1,sort.list)[2,]-1   # predicted class = the class with the larger predicted probability
print(c("Misclassification error rate",round(sum(SAheart[-ii,10]!=pp)/length(pp),3)))
table(pp,SAclass[-ii,10])

### OOB error
SA.error<-gg_error(SA.rf)
plot(SA.error)

### variable importance
plot(gg_vimp(SA.rf),lbls=names(SAheart[-10]))

### minimum depth selection
SA.varsel.chd <- var.select(SA.rf)   # takes a long time
SA.md<-gg_minimal_depth(SA.varsel.chd)
plot(SA.md, lbls=names(SAheart)[-10])
# tobacco and age are identified as the most important predictors...

##########
library(randomForest)
library(plotmo)
#
SA.rf<-randomForest(chd~.,data=SAclass[ii,],importance=TRUE,proximity=TRUE,do.trace=100)
plot(SA.rf)
print(SA.rf$confusion)   # on the training data - check how well we separate the classes

### clustering - can the proximities separate the chd labels?
SA.mds<-cmdscale(1-SA.rf$proximity)   # a projection based on the proximities - we will revisit this later
plot(SA.mds,col=SAheart$chd[ii]+1,xlab="Dim1",ylab="Dim2")
# some 0s are well separated, but this illustrates
# how difficult the classification problem is for this data set
#
plotmo(SA.rf)
# We see how each variable is related to an increased risk of chd=1. An adiposity/tobacco interaction?
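### Optional: the chd=1 class has a much higher error rate than chd=0.
### A minimal sketch (not part of the original analysis) of shifting the classification threshold on the
### predicted probability of chd=1; the cut-offs 0.5 and 0.3 are arbitrary illustrative values.
prob1 <- predict(SA.rf, newdata = SAclass[-ii,], type = "prob")[,"1"]   # P(chd=1) on the test set
table(pred = as.numeric(prob1 > 0.5), truth = SAclass[-ii,10])   # the default 0.5 cut-off
table(pred = as.numeric(prob1 > 0.3), truth = SAclass[-ii,10])   # catches more chd=1 cases at the price of more false alarms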
varImpPlot(SA.rf)
# tobacco, adiposity and age appear to be the most important predictors

# different variable importance measures
par(mfrow=c(2,2))
barplot(SA.rf$importance[,1],main="VarImp chd=0")
barplot(SA.rf$importance[,2],main="VarImp chd=1")
barplot(SA.rf$importance[,3],main="VarImp overall")
barplot(SA.rf$importance[,4],main="VarImp GINI")
par(mfrow=c(1,1))

######## using ranger - a different RF package
library(ranger)
SA.rf<-ranger(chd~., data=SAheart,importance="permutation",write.forest=T,replace=T,classification=TRUE)
print(SA.rf)
barplot(importance(SA.rf))

######### unsupervised RF
library(MASS)
library(randomForest)
library(plot3D)
library(rgl)

# a glass data set: classifying glass type from its chemical profile
data(fgl)
names(fgl)
fgl.rf<-randomForest(type~.,data=fgl,importance=T,do.trace=100)
####
print(fgl.rf)
varImpPlot(fgl.rf)
#### Pretty good result. Mg is the most important predictor.
#
fglR<-fgl[,-10]   # remove type
for (zz in (1:9)) {
  fglR[,zz]<-sample(fglR[,zz],dim(fgl)[1])   # permute each column to destroy the dependence structure
}
fglall<-rbind(fgl[,-10],fglR)
fglall$type<-c(rep(1,214),rep(0,214))   # label the real (1) vs the synthetic (0) data
ssfgl<-svd(fglall)
plot3d(ssfgl$u[,1:3],col=c(as.numeric(fgl$type),rep(0,214))+1,
       pch=c(as.numeric(fgl$type),rep(0,214))+1)   # plotting the data. Black is the synthetic data.
clust.rf<-randomForest(as.factor(type)~.,data=fglall,proximity=T)   # RF on the real + synthetic data
hc<-hclust(as.dist(1-clust.rf$proximity[1:214,1:214]))              # cluster on the real-data proximities only
plot(hc,labels=c(fgl$type))

#######
#### on the heart disease data
SAclass$famhist<-as.numeric(SAclass$famhist)
SAR<-SAclass[,-10]
for (zz in (1:9)) {
  SAR[,zz]<-sample(SAR[,zz],dim(SAR)[1])
}
SAall<-rbind(SAclass[,-10],SAR)
SAall$type<-c(rep(1,462),rep(0,462))
#
ssSA<-svd(SAall)
plot3d(ssSA$u[,1:3],col=c(as.numeric(SAclass$chd)+2,rep(1,462)),
       pch=c(as.numeric(SAclass$chd),rep(0,462))+1)
clust.rf<-randomForest(as.factor(type)~.,data=SAall,proximity=T)
hc<-hclust(as.dist(1-clust.rf$proximity[1:462,1:462]))
plot(hc,labels=c(SAclass$chd))
# again - we find a subset of the non-chd data that clusters together, the same as when we classify

#####
irisR<-iris[,-5]
for (zz in (1:4)) {
  irisR[,zz]<-sample(irisR[,zz],dim(irisR)[1])
}
irisall<-rbind(iris[,-5],irisR)
irisall$type<-c(rep(1,150),rep(0,150))
ssiris<-svd(irisall)
plot3d(ssiris$u[,1:3],col=c(as.numeric(iris$Species)+2,rep(1,150)),
       pch=c(as.numeric(iris$Species)+2,rep(0,150))+1)
clust.rf<-randomForest(as.factor(type)~.,data=irisall,proximity=T)
hc<-hclust(as.dist(1-clust.rf$proximity[1:150,1:150]),"complete")
plot(hc,labels=c(iris$Species))
# not bad? flowers of the same species stay together, but
# the clusters are not well separated...
plot(cmdscale(1-clust.rf$proximity[1:150,1:150]),col=as.numeric(iris$Species))
# project the proximities into a space that represents the distances between observations
xx<-cmdscale(1-clust.rf$proximity[1:150,1:150])
table(kmeans(xx,3)$cluster,iris$Species)

#### the rfsrc package also has unsupervised RF as a built-in option, with more settings to play with
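### Optional: a minimal sketch of the unsupervised mode in randomForestSRC, assuming the
### Unsupervised() formula interface and that proximity=TRUE is available for unsupervised forests;
### compare the resulting clustering to the synthetic-data trick used above.
iris.urf <- rfsrc(Unsupervised() ~ ., data = iris[,-5], proximity = TRUE)
hc <- hclust(as.dist(1 - iris.urf$proximity), "complete")   # cluster on the unsupervised proximities
plot(hc, labels = iris$Species)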