library(randomForestSRC)
library(ggplot2)
library(ggRandomForests)
library(plot3D)
library(dplyr)
library(parallel)
library(data.table)
library(RColorBrewer)
library(GGally)
library(ElemStatLearn)
library(rpart)
library(rpart.plot)
data(SAheart)

##### Regression tree
print(names(SAheart))
print(dim(SAheart))
pairs(SAheart)
##### Regression? then you would need to transform some of the variables or include nonlinear terms possibly.
##### CART - splits the data into regions where a constant outcome value is a good data summary.


library(rpart)
library(rpart.plot)
ii<-sample(seq(1,dim(SAheart)[1]),300)
#training dat
tree1<-rpart(ldl~.,data=SAheart,subset=ii,control=rpart.control(cp=0.01, minsplit=10,xval=3))
prp(tree1,extra=100)

#### CV to select tree size
plotcp(tree1)
printcp(tree1)
#
nspl<-seq(1,dim(tree1$cptable)[1])
rowmin<-nspl[tree1$cptable[,4]==min(tree1$cptable[,4])][1]
minmin<-tree1$cptable[rowmin,4]+tree1$cptable[rowmin,5]
#line value
pickmin<-min(nspl[tree1$cptable[,4]<=minmin & tree1$cptable[,2]>0])
pickcp<-tree1$cptable[pickmin+1,1]
#
ptree1<-prune(tree1,cp=pickcp)
prp(ptree1,extra=100)
#
predtree<-predict(ptree1,newdata=SAheart[-ii,-3])
print(c("Test error rate",round(sum(SAheart[-ii,3]-predtree)^2/length(predtree),3)))
#### larger trees too?
k<-1
# try larger and larger trees and check test error rate.
# Is Cp a good selection criterion?
pickcp<-tree1$cptable[pickmin+k,1]
#
ptree1<-prune(tree1,cp=pickcp)
prp(ptree1,extra=100)
#
predtree<-predict(ptree1,newdata=SAheart[-ii,-3])
print(c("Test error rate",round(sum(SAheart[-ii,3]-predtree)^2/length(predtree),3)))

####### Random forest
library(randomForestSRC)
library(ggplot2)
library(ggRandomForests)
SA.rf <- rfsrc(ldl~., data=SAheart[ii,],statistics=T,tree.err=T,proximity=T,importance=T)
print(SA.rf)

### plotting OOB error
SA.error<-gg_error(SA.rf)
plot(SA.error)
#plot(SA.rf)

###reality check
plot(gg_rfsrc(SA.rf),alpha=.5)+coord_cartesian(ylim=c(2,8))

### variable importance
plot(gg_vimp(SA.rf),lbls=names(SAheart[-3]))

#### Test error
predRF<-predict.rfsrc(SA.rf,newdata=SAheart[-ii,-3])
print(c("Test error rate",round(sum(SAheart[-ii,3]-predRF$predicted)^2/length(predRF$predicted),3)))


#### variable selection via minimum depth
SA.varsel <- var.select(SA.rf) # takes a long time
SA.md<-gg_minimal_depth(SA.varsel)
plot(SA.md, lbls=names(SAheart)[-3])
## comparing with permutation VI
plot(gg_minimal_vimp(SA.md))


##### fitted values - nonlinear, interactions
plot.variable(SA.rf)


#SA.int<-find.interaction(SA.rf)
#plot(gg_interaction(SA.int),xvar=SA.md$topvars,panel=TRUE)


### better plotting of fitted values!!
library(randomForest)
library(plotmo)

SA.rf<-randomForest(ldl~.,data=SAheart,importance=TRUE,proximity=TRUE)
plotmo(SA.rf)


############################
#### Classification
library(rpart)
library(rpart.plot)

tree1<-rpart(chd~.,data=SAheart,subset=ii,control=rpart.control(cp=0.01, minsplit=10,xval=3),method="class")
prp(tree1,extra=100)


plotcp(tree1)
printcp(tree1)
#
nspl<-seq(1,dim(tree1$cptable)[1])
rowmin<-nspl[tree1$cptable[,4]==min(tree1$cptable[,4])][1]
minmin<-tree1$cptable[rowmin,4]+tree1$cptable[rowmin,5]
#line value
pickmin<-min(nspl[tree1$cptable[,4]<=minmin & tree1$cptable[,2]>0])
pickcp<-tree1$cptable[pickmin,1]
#
ptree1<-prune(tree1,cp=pickcp)
prp(ptree1,extra=100)
#
predtree<-predict(ptree1,newdata=SAheart[-ii,-10],type="class")
print(c("Misclassification error rate",round(sum(SAheart[-ii,10]!=predtree)/length(predtree),3)))
##
k<-2
pickcp<-tree1$cptable[pickmin+k,1]
#
ptree1<-prune(tree1,cp=pickcp)
prp(ptree1,extra=100)
#
predtree<-predict(ptree1,newdata=SAheart[-ii,-10],type="class")
print(c("Misclassification error rate",round(sum(SAheart[-ii,10]!=predtree)/length(predtree),3)))



#### RF
library(randomForestSRC)
library(ggplot2)
library(ggRandomForests)
SAclass<-SAheart
SAclass$chd<-as.factor(SAclass$chd)
SA.rf <- rfsrc(as.factor(chd)~., data=SAclass[ii,],tree.err=T,importance=TRUE,statistics=T,proximity=T)
print(SA.rf)
plot(SA.rf)

hc<-hclust(as.dist(1-SA.rf$proximity))
plot(hc,labels=SAheart$chd[ii])

### test error
predRF<-predict.rfsrc(SA.rf,newdata=SAclass[-ii,-10])
pp<-apply(predRF$predicted,1,sort.list)[2,]-1
print(c("Test error rate",round(sum(SAheart[-ii,10]!=pp)/length(predRF$predicted),3)))
table(pp,SAclass[-ii,10])

### OOB error
SA.error<-gg_error(SA.rf)
plot(SA.error)
#plot(SA.rf)


### variable importance
plot(gg_vimp(SA.rf),lbls=names(SAheart[-10]))


### min depth selection
SA.varsel.chd <- var.select(SA.rf) # takes a long time

SA.md<-gg_minimal_depth(SA.varsel.chd)
plot(SA.md, lbls=names(SAheart)[-10])


##########
library(randomForest)
library(plotmo)

SA.rf<-randomForest(chd~.,data=SAclass[ii,],importance=TRUE,proximity=TRUE,
                    do.trace=100)

plot(SA.rf)
print(SA.rf$confusion)



### clustering
SA.mds<-cmdscale(1-SA.rf$proximity)
plot(SA.mds, col=SAheart$chd[ii]+1,xlab="Dim1",ylab="Dim2")

### outliers based on proximity     
plot(outlier(SA.rf),type="h",col=c("black","red")[SAheart$chd[ii]+1])


plotmo(SA.rf)

varImpPlot(SA.rf)

par(mfrow=c(2,2))
barplot(SA.rf$importance[,1],main="VarImp chd=0")
barplot(SA.rf$importance[,2],main="VarImp chd=1")
barplot(SA.rf$importance[,3],main="VarImp overall")
barplot(SA.rf$importance[,4],main="VarImp GINI")


######## using ranger
library(ranger)

SA.rf<-ranger(chd~., data=SAheart,importance="permutation",write.forest=T,
              replace=T,classification=TRUE)
print(SA.rf)

barplot(importance(SA.rf))


######### unsupervised RF
library(MASS)
library(randomForest)
library(plot3D)
library(rgl)

data(fgl)
names(fgl)
fgl.rf<-randomForest(type~.,data=fgl,importance=T,do.trace=100)
####
print(fgl.rf)
varImpPlot(fgl.rf)
####

fglR<-fgl[,-10]
for (zz in (1:9)) {
  fglR[,zz]<-sample(fglR[,zz],dim(fgl)[1]) }
fglall<-rbind(fgl[,-10],fglR)
fglall$type<-c(rep(1,214),rep(0,214))

ssfgl<-svd(fglall)
plot3d(ssfgl$u[,1:3],col=c(as.numeric(fgl$type),rep(0,214))+1,
       pch=c(as.numeric(fgl$type),rep(0,214))+1)


clust.rf<-randomForest(as.factor(type)~.,data=fglall,proximity=T)

hc<-hclust(as.dist(1-clust.rf$proximity[1:214,1:214]))
plot(hc,labels=c(fgl$type))

####
SAclass$famhist<-as.numeric(SAclass$famhist)
SAR<-SAclass[,-10]
for (zz in (1:9)) {
  SAR[,zz]<-sample(SAR[,zz],dim(SAR)[1]) }
SAall<-rbind(SAclass[,-10],SAR)
SAall$type<-c(rep(1,462),rep(0,462))

ssSA<-svd(SAall)
plot3d(ssSA$u[,1:3],col=c(as.numeric(SAclass$chd)+2,rep(1,462)),
       pch=c(as.numeric(SAclass$chd),rep(0,462))+1)


clust.rf<-randomForest(as.factor(type)~.,data=SAall,proximity=T)

hc<-hclust(as.dist(1-clust.rf$proximity[1:462,1:462]))
plot(hc,labels=c(SAclass$chd))

#####
irisR<-iris[,-5]
for (zz in (1:4)) {
  irisR[,zz]<-sample(irisR[,zz],dim(irisR)[1]) }
irisall<-rbind(iris[,-5],irisR)
irisall$type<-c(rep(1,150),rep(0,150))

ssSA<-svd(irisall)
plot3d(ssSA$u[,1:3],col=c(as.numeric(iris$Species)+2,rep(1,150)),
       pch=c(as.numeric(iris$Species)+2,rep(0,150))+1)


clust.rf<-randomForest(as.factor(type)~.,data=irisall,proximity=T)

hc<-hclust(as.dist(1-clust.rf$proximity[1:150,1:150]))
plot(hc,labels=c(iris$Species))




#### rfsrc has unsupervised RF as an option