# install the packages first
library(randomForestSRC)
library(ggplot2)
library(ggRandomForests)
library(plot3D)
library(dplyr)
library(parallel)
library(data.table)
library(RColorBrewer)
library(GGally)
library(ElemStatLearn)   # note: ElemStatLearn has been archived on CRAN and may need to be installed from the CRAN archive
library(rpart)
library(rpart.plot)

# Heart disease data
data(SAheart)

##### Regression tree
print(names(SAheart))
print(dim(SAheart))
pairs(SAheart)
##### Regression? Then you would probably need to transform some of the variables or include nonlinear terms.
##### CART splits the data into regions where a constant outcome value is a good summary of the data.

# Try CART
library(rpart)
library(rpart.plot)
ii<-sample(seq(1,dim(SAheart)[1]),300)   # training data - a subsample of patients. Try different training data sizes here and see what happens.
tree1<-rpart(ldl~.,data=SAheart,subset=ii,control=rpart.control(cp=0.01,minsplit=5,xval=10))
prp(tree1,extra=100)

# plotting the full tree
plot(tree1)
text(tree1,cex=.75,digits=2)
# here it is easier to see which splits actually reduce the error the most (the branch lengths):
# short branches don't reduce the error very much, long branches do.

#### CV to select the tree size
plotcp(tree1)
printcp(tree1)
# Pick the tree size that minimizes the CV error?
# Or the smallest tree within 1 SE of the minimum CV error (the 1-SE rule) - a more conservative option.
nspl<-seq(1,dim(tree1$cptable)[1])
rowmin<-which.min(tree1$cptable[,4])                                 # row with the minimum CV error
minmin<-tree1$cptable[rowmin,4]+tree1$cptable[rowmin,5]              # min CV error + 1 SE
pickmin<-min(nspl[tree1$cptable[,4]<=minmin & tree1$cptable[,2]>0])  # smallest tree (with at least one split) within this range
pickcp<-tree1$cptable[pickmin,1]   # the cp value (the tuning parameter alpha from the lecture notes) of this tree
#
ptree1<-prune(tree1,cp=pickcp)
plot(ptree1)
text(ptree1,cex=.75,digits=2)

# What's the prediction error?
predtree<-predict(ptree1,newdata=SAheart[-ii,-3])
print(c("Test MSE",round(sum((SAheart[-ii,3]-predtree)^2)/length(predtree),3)))

#####
# What about the tree with the smallest CV error?
rowmin<-which.min(tree1$cptable[,4])   # row with the minimum CV error
pickcp<-tree1$cptable[rowmin,1]        # its cp value
#
ptree1<-prune(tree1,cp=pickcp)
plot(ptree1)
text(ptree1,cex=.75,digits=2)

# What's the prediction error?
predtree<-predict(ptree1,newdata=SAheart[-ii,-3])
print(c("Test MSE",round(sum((SAheart[-ii,3]-predtree)^2)/length(predtree),3)))
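### Optional side note: the test-MSE computation above gets repeated several times below.
### A minimal sketch of a small helper (test_mse is a hypothetical name, not from any package);
### it assumes a numeric outcome in column y_col and a fitted model with a predict() method.
test_mse <- function(fit, data, test_rows, y_col) {
  pred <- predict(fit, newdata = data[test_rows, -y_col])   # predictions on the held-out rows
  mean((data[test_rows, y_col] - pred)^2)                   # mean squared prediction error
}
# e.g. round(test_mse(ptree1, SAheart, -ii, 3), 3) should reproduce the number printed above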
#### Larger trees too?
# Try larger and larger trees and check the test error. Is cp a good selection criterion?
nspl<-seq(1,dim(tree1$cptable)[1])
rowmin<-which.min(tree1$cptable[,4])                                 # row with the minimum CV error
minmin<-tree1$cptable[rowmin,4]+tree1$cptable[rowmin,5]              # min CV error + 1 SE
pickmin<-min(nspl[tree1$cptable[,4]<=minmin & tree1$cptable[,2]>0])  # smallest tree within this range
#
k<-3   # vary this to grow the tree beyond the conservative 1-SE choice
pickcp<-tree1$cptable[pickmin+k,1]
#
ptree1<-prune(tree1,cp=pickcp)
plot(ptree1)
text(ptree1,cex=.75,digits=2)
#
predtree<-predict(ptree1,newdata=SAheart[-ii,-3])
print(c("Test MSE",round(sum((SAheart[-ii,3]-predtree)^2)/length(predtree),3)))

##########################################
####### Random forest
library(randomForestSRC)
library(ggplot2)
library(ggRandomForests)

SA.rf <- rfsrc(ldl~., data=SAheart[ii,],statistics=T,tree.err=T,proximity=T,importance=T)
print(SA.rf)   # summary of the run

### plotting the OOB error
SA.error<-gg_error(SA.rf)
plot(SA.error)
# notice that the OOB error drops off quickly after about 50 trees. Also, no overfitting problem.
#
plot(SA.rf)
# the OOB error and the variable importance. Adiposity is the most important predictor for ldl (cholesterol).

### variable importance
plot(gg_vimp(SA.rf),lbls=names(SAheart[-3]))

#### Test error
predRF<-predict.rfsrc(SA.rf,newdata=SAheart[-ii,-3])
print(c("Test MSE",round(sum((SAheart[-ii,3]-predRF$predicted)^2)/length(predRF$predicted),3)))
# much better than CART above

#### variable selection via minimum depth (how high up in a tree a variable first appears)
SA.varsel <- var.select(SA.rf)   # takes a long time
SA.md<-gg_minimal_depth(SA.varsel)
plot(SA.md, lbls=names(SAheart)[-3])
## comparing with permutation variable importance
plot(gg_minimal_vimp(SA.md))
# The two measures agree on the most important variables.

##### fitted values - nonlinearities, interactions
plot.variable(SA.rf)
# Each tree gives you a fitted value for the outcome;
# the average of these is the random forest fitted value.
# Plot these fitted values against each predictor variable to see what kind of association
# RF has learnt between each x and y.
# Here: some nonlinear dependence between tobacco and ldl (2nd panel in the top row), roughly linear for sbp (1st panel in the top row).
# Etc...

# You can use the RF results to look for interactions (perhaps to later include in a regression model).
# Look at how high up in a tree a variable j shows up below a variable i: if variable i appears
# high in the tree (small minimal depth) and j follows soon after, that is indicative of an interaction.
SA.int<-find.interaction(SA.rf, method="maxsubtree")
plot(gg_interaction(SA.int),xvar=SA.md$topvars,panel=TRUE)

# You can also use the variable importance of PAIRS of variables to look for interactions:
# how does the sum of the individual (permutation-based) importances compare to the joint importance (both variables permuted together)?
SA.int<-find.interaction(SA.rf, method="vimp")
# comparing the additive effect to the paired effect
# suggests adiposity/obesity is an interaction

### better plotting of fitted values using the plotmo package
library(randomForest)
library(plotmo)
SA.rf<-randomForest(ldl~.,data=SAheart,importance=TRUE,proximity=TRUE)
plotmo(SA.rf)
# you get a visual summary of the fitted values vs the variables as before,
# and an interaction plot with a response surface of fitted values vs pairs of variables.
# Look for pairs where this response surface is NOT made up of parallel lines;
# parallel lines mean the effect is purely additive.
# Adiposity-obesity looks like a strong contender, as the paired vimp indicated above.
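### Optional: partial dependence plots are another way to look at the fitted relationships.
### A minimal sketch using randomForest::partialPlot on the randomForest fit above;
### the choice of adiposity and tobacco as the variables to display is just an illustration.
par(mfrow=c(1,2))
partialPlot(SA.rf, pred.data = SAheart, x.var = "adiposity")   # averaged effect of adiposity on predicted ldl
partialPlot(SA.rf, pred.data = SAheart, x.var = "tobacco")     # averaged effect of tobacco on predicted ldl
par(mfrow=c(1,1))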
############################
#### Classification - predicting heart disease (chd)
library(rpart)
library(rpart.plot)
#
ii<-sample(seq(1,dim(SAheart)[1]),300)
tree1<-rpart(chd~.,data=SAheart,subset=ii,control=rpart.control(cp=0.01,minsplit=10,xval=3),method="class")
plot(tree1)
text(tree1,cex=.75,digits=2,use.n=T)
#
plotcp(tree1)
printcp(tree1)
#
nspl<-seq(1,dim(tree1$cptable)[1])
rowmin<-which.min(tree1$cptable[,4])                                 # row with the minimum CV error
minmin<-tree1$cptable[rowmin,4]+tree1$cptable[rowmin,5]              # min CV error + 1 SE
pickmin<-min(nspl[tree1$cptable[,4]<=minmin & tree1$cptable[,2]>0])  # smallest tree within this range
pickcp<-tree1$cptable[pickmin,1]
#
ptree1<-prune(tree1,cp=pickcp)
plot(ptree1)
text(ptree1,digits=2,cex=.75,use.n=T)
#
predtree<-predict(ptree1,newdata=SAheart[-ii,-10],type="class")
print(c("Misclassification error rate",round(sum(SAheart[-ii,10]!=predtree)/length(predtree),3)))

##
k<-2   # try a bigger tree
pickcp<-tree1$cptable[pickmin+k,1]
#
ptree1<-prune(tree1,cp=pickcp)
plot(ptree1)
text(ptree1,digits=2,cex=.75,use.n=T)
#
predtree<-predict(ptree1,newdata=SAheart[-ii,-10],type="class")
print(c("Misclassification error rate",round(sum(SAheart[-ii,10]!=predtree)/length(predtree),3)))

##########
#### RF
library(randomForestSRC)
library(ggplot2)
library(ggRandomForests)

SAclass<-SAheart
SAclass$chd<-as.factor(SAclass$chd)
SA.rf <- rfsrc(chd~., data=SAclass[ii,],tree.err=T,importance=TRUE,statistics=T,proximity=T)
print(SA.rf)
plot(SA.rf)
# notice that the error rate for the chd=1 class is much higher than for the chd=0 class...
# also, slightly different variables are important for the two classes.
#
# proximity - a similarity based on how often a pair of observations ends up in the same terminal node -
# can be used to visualize the data and to find groups of observations
hc<-hclust(as.dist(1-SA.rf$proximity))
plot(hc,labels=SAheart$chd[ii])
# you can see that there is a tight group of chd=0 observations that stays together, but also a lot of
# spread within each class = a difficult classification problem!

### test error
predRF<-predict.rfsrc(SA.rf,newdata=SAclass[-ii,-10])
pp<-apply(predRF$predicted,1,sort.list)[2,]-1   # predicted class = the class with the larger predicted probability
print(c("Misclassification error rate",round(sum(SAheart[-ii,10]!=pp)/length(pp),3)))
table(pp,SAclass[-ii,10])

### OOB error
SA.error<-gg_error(SA.rf)
plot(SA.error)

### variable importance
plot(gg_vimp(SA.rf),lbls=names(SAheart[-10]))

### minimum depth selection
SA.varsel.chd <- var.select(SA.rf)   # takes a long time
SA.md<-gg_minimal_depth(SA.varsel.chd)
plot(SA.md, lbls=names(SAheart)[-10])
# tobacco and age are identified as the most important predictors...

##########
library(randomForest)
library(plotmo)
#
SA.rf<-randomForest(chd~.,data=SAclass[ii,],importance=TRUE,proximity=TRUE,do.trace=100)
plot(SA.rf)
print(SA.rf$confusion)   # on the training data - check how well we separate the classes

### clustering - can the proximities separate the chd labels?
SA.mds<-cmdscale(1-SA.rf$proximity)   # a projection based on the proximities - we will revisit this later
plot(SA.mds,col=SAheart$chd[ii]+1,xlab="Dim1",ylab="Dim2")
# some 0s are well separated, but this illustrates
# how difficult the classification problem is for this data set
#
plotmo(SA.rf)
# We see how each variable is related to an increased risk of chd=1. An adiposity/tobacco interaction?
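### Optional: the chd=1 class has a much higher error rate than chd=0.
### A minimal sketch (not part of the original analysis) of shifting the classification threshold on the
### predicted probability of chd=1; the cut-offs 0.5 and 0.3 are arbitrary illustrative values.
prob1 <- predict(SA.rf, newdata = SAclass[-ii,], type = "prob")[,"1"]   # P(chd=1) on the test set
table(pred = as.numeric(prob1 > 0.5), truth = SAclass[-ii,10])   # the default 0.5 cut-off
table(pred = as.numeric(prob1 > 0.3), truth = SAclass[-ii,10])   # catches more chd=1 cases at the price of more false alarms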
varImpPlot(SA.rf)
# tobacco, adiposity and age appear to be the most important predictors

# different variable importance measures
par(mfrow=c(2,2))
barplot(SA.rf$importance[,1],main="VarImp chd=0")
barplot(SA.rf$importance[,2],main="VarImp chd=1")
barplot(SA.rf$importance[,3],main="VarImp overall")
barplot(SA.rf$importance[,4],main="VarImp GINI")
par(mfrow=c(1,1))

######## using ranger - a different RF package
library(ranger)
SA.rf<-ranger(chd~., data=SAheart,importance="permutation",write.forest=T,replace=T,classification=TRUE)
print(SA.rf)
barplot(importance(SA.rf))

######### unsupervised RF
library(MASS)
library(randomForest)
library(plot3D)
library(rgl)

# a glass data set: classifying glass type from its chemical profile
data(fgl)
names(fgl)
fgl.rf<-randomForest(type~.,data=fgl,importance=T,do.trace=100)
####
print(fgl.rf)
varImpPlot(fgl.rf)
#### Pretty good result. Mg is the most important predictor.
#
fglR<-fgl[,-10]   # remove type
for (zz in (1:9)) {
  fglR[,zz]<-sample(fglR[,zz],dim(fgl)[1])   # permute each column to destroy the dependence structure
}
fglall<-rbind(fgl[,-10],fglR)
fglall$type<-c(rep(1,214),rep(0,214))   # label the real (1) vs the synthetic (0) data
ssfgl<-svd(fglall)
plot3d(ssfgl$u[,1:3],col=c(as.numeric(fgl$type),rep(0,214))+1,
       pch=c(as.numeric(fgl$type),rep(0,214))+1)   # plotting the data. Black is the synthetic data.
clust.rf<-randomForest(as.factor(type)~.,data=fglall,proximity=T)   # RF on the real + synthetic data
hc<-hclust(as.dist(1-clust.rf$proximity[1:214,1:214]))              # cluster on the real-data proximities only
plot(hc,labels=c(fgl$type))

#######
#### on the heart disease data
SAclass$famhist<-as.numeric(SAclass$famhist)
SAR<-SAclass[,-10]
for (zz in (1:9)) {
  SAR[,zz]<-sample(SAR[,zz],dim(SAR)[1])
}
SAall<-rbind(SAclass[,-10],SAR)
SAall$type<-c(rep(1,462),rep(0,462))
#
ssSA<-svd(SAall)
plot3d(ssSA$u[,1:3],col=c(as.numeric(SAclass$chd)+2,rep(1,462)),
       pch=c(as.numeric(SAclass$chd),rep(0,462))+1)
clust.rf<-randomForest(as.factor(type)~.,data=SAall,proximity=T)
hc<-hclust(as.dist(1-clust.rf$proximity[1:462,1:462]))
plot(hc,labels=c(SAclass$chd))
# again - we find a subset of the non-chd data that clusters together, the same as when we classify

#####
irisR<-iris[,-5]
for (zz in (1:4)) {
  irisR[,zz]<-sample(irisR[,zz],dim(irisR)[1])
}
irisall<-rbind(iris[,-5],irisR)
irisall$type<-c(rep(1,150),rep(0,150))
ssiris<-svd(irisall)
plot3d(ssiris$u[,1:3],col=c(as.numeric(iris$Species)+2,rep(1,150)),
       pch=c(as.numeric(iris$Species)+2,rep(0,150))+1)
clust.rf<-randomForest(as.factor(type)~.,data=irisall,proximity=T)
hc<-hclust(as.dist(1-clust.rf$proximity[1:150,1:150]),"complete")
plot(hc,labels=c(iris$Species))
# not bad? flowers of the same species stay together, but
# the clusters are not well separated...
plot(cmdscale(1-clust.rf$proximity[1:150,1:150]),col=as.numeric(iris$Species))
# project the proximities into a space that represents the distances between observations
xx<-cmdscale(1-clust.rf$proximity[1:150,1:150])
table(kmeans(xx,3)$cluster,iris$Species)

#### the rfsrc package also has unsupervised RF as a built-in option, with more settings to play with
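### Optional: a minimal sketch of the unsupervised mode in randomForestSRC, assuming the
### Unsupervised() formula interface and that proximity=TRUE is available for unsupervised forests;
### compare the resulting clustering to the synthetic-data trick used above.
iris.urf <- rfsrc(Unsupervised() ~ ., data = iris[,-5], proximity = TRUE)
hc <- hclust(as.dist(1 - iris.urf$proximity), "complete")   # cluster on the unsupervised proximities
plot(hc, labels = iris$Species)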