library(caret)
library(psych)
library(glmnet)
library(pamr)
library(sparseLDA)
library(mlr)
library(FSelector)
library(MASS)
#### NOTE: caret and mlr have overlapping function names!
# keep mlr's train() available as mlrtrain and let train() point to caret's version
mlrtrain<-mlr::train
train<-caret::train
## https://topepo.github.io/caret/index.html
##############################################
##############################################
# Feature selection for the iris data
mydata<-iris # swap in another data set here to reuse the code below
classlab<-5 # which column holds the class label
#
#mydata<-wine
#classlab<-1
#
#mydata<-SAheart
#mydata$famhist<-as.numeric(mydata$famhist)
#classlab<-10
#
standardize<-function(x) {
  (x-mean(x))/sd(x)
}
unrel<-10 #### how many unrelated (pure noise) features to add
mydata[,-classlab]<-apply(mydata[,-classlab],2,standardize) # standardize features
newdata<-matrix(rnorm(dim(mydata)[1]*unrel),dim(mydata)[1],unrel) # unrelated features
pv<-dim(mydata)[2]
mydata<-cbind(mydata,newdata)
names(mydata)[(pv+1):dim(mydata)[2]]<-paste("A",seq(1,unrel),sep=".")
#
#mydata<-as.data.frame(zip.train[sample(seq(1,7291),2000),]) # this takes a long time to run....
#classlab<-1
#names(mydata)<-c("Class",paste("A",seq(1,256),sep="."))
#
#####
names(mydata)[classlab]<-"Class"
splitprop<-.25 ####### proportion of the data to train on
######
ii<-createDataPartition(mydata[,classlab],p=splitprop,list=F)
#
x.train<-mydata[ii,-classlab]
x.test<-mydata[-ii,-classlab]
y<-as.factor(mydata[,classlab])
y.train<-y[ii]
y.test<-y[-ii]
#
#### data split to try methods on ###
# First out - RFE (recursive feature elimination through the caret package)
#
ctrl <- rfeControl(functions = ldaFuncs,
                   method = "cv", # look for other options in the help file
                   verbose = FALSE)
ldaProfile <- rfe(x.train, y.train,
                  sizes = 1:dim(x.train)[2],
                  rfeControl = ctrl)
plot(ldaProfile, type = c("o", "g"))
ldaProfile
Anames<-c("LDA") # store method name
AA<-postResample(predict(ldaProfile, x.test), y.test) # store test performance
FS<-matrix(0,1,dim(x.train)[2])
usewhich<-names(x.train) %in% ldaProfile$optVariables # which features were selected
FS[usewhich]<-1
FS
#########################
ctrl <- rfeControl(functions = nbFuncs,
                   method = "cv",
                   verbose = FALSE)
nbProfile <- rfe(x.train, y.train,
                 sizes = 1:dim(x.train)[2],
                 rfeControl = ctrl)
plot(nbProfile, type = c("o", "g"))
nbProfile
Anames<-c(Anames,"NB")
AA<-rbind(AA,postResample(predict(nbProfile, x.test), y.test))
usewhich<-names(x.train) %in% nbProfile$optVariables # which features were selected
FS<-rbind(FS,rep(0,length(names(x.train))))
FS[dim(FS)[1],usewhich]<-1
##############################
ctrl <- rfeControl(functions = rfFuncs,
                   method = "cv",
                   verbose = FALSE)
rfProfile <- rfe(x.train, y.train,
                 sizes = 1:dim(x.train)[2],
                 rfeControl = ctrl)
plot(rfProfile, type = c("o", "g"))
rfProfile
Anames<-c(Anames,"RF")
AA<-rbind(AA,postResample(predict(rfProfile, x.test), y.test))
usewhich<-names(x.train) %in% rfProfile$optVariables # which features were selected
FS<-rbind(FS,rep(0,length(names(x.train))))
FS[dim(FS)[1],usewhich]<-1
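#####
# Side note (a sketch, not part of the workflow above): each method repeats
# the same three bookkeeping steps - append a method name to Anames, a row of
# test performance to AA, and a 0/1 row of selected features to FS. A
# hypothetical helper like record_result() could bundle them; the function
# name and structure here are illustrative only.
record_result<-function(Anames,AA,FS,name,perf,selected) {
  # selected: logical vector over names(x.train), TRUE for features kept
  newrow<-rep(0,length(selected))
  newrow[selected]<-1
  list(Anames=c(Anames,name),
       AA=rbind(AA,perf),
       FS=rbind(FS,newrow))
}
# e.g. (hypothetical usage, equivalent to the three lines above):
# res<-record_result(Anames,AA,FS,"RF",
#                    postResample(predict(rfProfile,x.test),y.test),
#                    names(x.train) %in% rfProfile$optVariables)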
######
# shrunken centroids (nearest shrunken centroids / PAM)
cc<-train(x=x.train,y=y.train,method="pam")
confusionMatrix(cc) # resampled training performance
plot(cc)
pp<-predict(cc,x.test) # test-set predictions
confusionMatrix(pp,y.test)
cc$finalModel$centroids
cc$bestTune
# soft-threshold the class centroids by the tuned shrinkage amount to see
# which features survive
tt<-cc$finalModel$centroids
ts<-as.matrix(abs(tt))-as.numeric(cc$bestTune)
ts[ts<0]<-0
ts<-sign(tt)*ts
ts<-apply(abs(ts),1,sum) # total shrunken-centroid mass per feature
# test error
pp<-predict(cc,x.test,type="prob")
pp$obs<-y.test
head(pp)
pp$pred<-predict(cc,x.test)
Anames<-c(Anames,"ShrCent")
AA<-rbind(AA,multiClassSummary(pp,lev=levels(pp$obs))[4:5])
FS<-rbind(FS,rep(0,length(names(x.train))))
FS[dim(FS)[1],ts!=0]<-1
##
# sparseLDA
cc<-train(x=x.train,y=y.train,method="sparseLDA")
confusionMatrix(cc) # resampled training performance
plot(cc)
pp<-predict(cc,x.test) # test-set predictions
confusionMatrix(pp,y.test)
cc$finalModel
# test error
pp<-predict(cc,x.test,type="prob")
pp$obs<-y.test
head(pp)
pp$pred<-predict(cc,x.test)
Anames<-c(Anames,"spLDA")
AA<-rbind(AA,multiClassSummary(pp,lev=levels(pp$obs))[4:5])
FS<-rbind(FS,rep(0,length(names(x.train))))
FS[dim(FS)[1],cc$finalModel$varIndex]<-1
###
# sparse (penalized) logistic regression
cc<-train(x=x.train,y=y.train,method="glmnet")
confusionMatrix(cc) # resampled training performance
plot(cc)
pp<-predict(cc,x.test) # test-set predictions
confusionMatrix(pp,y.test)
coef(cc$finalModel,cc$bestTune$lambda)
if (length(unique(mydata$Class))>2) { # multinomial: sum the per-class coefficient matrices
  BB<-Reduce("+",coef(cc$finalModel,cc$bestTune$lambda))
}
if (length(unique(mydata$Class))==2) {
  BB<-coef(cc$finalModel,cc$bestTune$lambda)
}
#
FS<-rbind(FS,rep(0,length(names(x.train))))
FS[dim(FS)[1],as.vector(BB)[-1]!=0]<-1
# test error
pp<-predict(cc,x.test,type="prob")
pp$obs<-y.test
head(pp)
pp$pred<-predict(cc,x.test)
Anames<-c(Anames,"spLogcaret")
AA<-rbind(AA,multiClassSummary(pp,lev=levels(pp$obs))[4:5])
## instead of caret - use the built-in cross-validation in glmnet
typeclass<-"multinomial"
if (length(unique(mydata$Class))==2) {
  typeclass<-"binomial"
}
#
gg<-glmnet(x=as.matrix(x.train),y=y.train,family=typeclass)
plot(gg,xvar="lambda")
cv.gg<-cv.glmnet(x=as.matrix(x.train),y=y.train,family=typeclass)
plot(cv.gg)
names(cv.gg)
coef(cv.gg,s="lambda.1se")
coef(cv.gg,s="lambda.min")
#
if (length(unique(mydata$Class))>2) {
  BB<-Reduce("+",coef(cv.gg,s="lambda.1se"))
}
if (length(unique(mydata$Class))==2) {
  BB<-coef(cv.gg,s="lambda.1se")
}
#
FS<-rbind(FS,rep(0,length(names(x.train))))
FS[dim(FS)[1],as.vector(BB)[-1]!=0]<-1
#
pp<-predict(gg,s=cv.gg$lambda.1se,type="class",newx=as.matrix(x.test))
acc<-mean(pp==y.test)
kap<-cohen.kappa(table(pp,y.test),n.obs=length(y.test))
AA<-rbind(AA,c(acc,kap$kappa))
Anames<-c(Anames,"spLog1SE")
#
if (length(unique(mydata$Class))>2) {
  BB<-Reduce("+",coef(cv.gg,s="lambda.min"))
}
if (length(unique(mydata$Class))==2) {
  BB<-coef(cv.gg,s="lambda.min")
}
FS<-rbind(FS,rep(0,length(names(x.train))))
FS[dim(FS)[1],as.vector(BB)[-1]!=0]<-1
#
pp<-predict(gg,s=cv.gg$lambda.min,type="class",newx=as.matrix(x.test))
acc<-mean(pp==y.test)
kap<-cohen.kappa(table(pp,y.test),n.obs=length(y.test))
AA<-rbind(AA,c(acc,kap$kappa))
Anames<-c(Anames,"spLogmin")
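# Side note (a sketch, not in the original workflow): for the multinomial fits
# above, summing the per-class coefficient matrices with Reduce("+") can in
# principle miss a selected feature whose coefficients cancel across classes.
# Summing absolute values instead is sign-safe; nonzero_any_class() is a
# hypothetical helper, not part of glmnet.
nonzero_any_class<-function(coefs) {
  # coefs: the list of per-class coefficient matrices returned by coef()
  # on a multinomial glmnet/cv.glmnet fit
  BBabs<-Reduce("+",lapply(coefs,abs))
  as.vector(BBabs)[-1]!=0 # drop the intercept entry
}
# e.g. (multiclass case only): nonzero_any_class(coef(cv.gg,s="lambda.1se"))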
# try the grouped penalty as well (only for more than 2 classes)
if (length(unique(mydata$Class))>2) {
  gg<-glmnet(x=as.matrix(x.train),y=y.train,family="multinomial",type.multinomial="grouped")
  plot(gg,xvar="lambda")
  cv.gg<-cv.glmnet(x=as.matrix(x.train),y=y.train,family="multinomial",type.multinomial="grouped")
  plot(cv.gg)
  names(cv.gg)
  coef(cv.gg,s="lambda.1se")
  coef(cv.gg,s="lambda.min")
  # inside this block the task is always multiclass, so coef() returns a list
  BB<-Reduce("+",coef(cv.gg,s="lambda.1se"))
  FS<-rbind(FS,rep(0,length(names(x.train))))
  FS[dim(FS)[1],as.vector(BB)[-1]!=0]<-1
  pp<-predict(gg,s=cv.gg$lambda.1se,type="class",newx=as.matrix(x.test))
  acc<-mean(pp==y.test)
  kap<-cohen.kappa(table(pp,y.test),n.obs=length(y.test))
  AA<-rbind(AA,c(acc,kap$kappa))
  Anames<-c(Anames,"spLogGroup1se")
  #
  BB<-Reduce("+",coef(cv.gg,s="lambda.min"))
  FS<-rbind(FS,rep(0,length(names(x.train))))
  FS[dim(FS)[1],as.vector(BB)[-1]!=0]<-1
  pp<-predict(gg,s=cv.gg$lambda.min,type="class",newx=as.matrix(x.test))
  acc<-mean(pp==y.test)
  kap<-cohen.kappa(table(pp,y.test),n.obs=length(y.test))
  AA<-rbind(AA,c(acc,kap$kappa))
  Anames<-c(Anames,"spLogGroupmin")
}
#####
####################
##############################################
#### wrapper functions for some classifiers (through mlr)
# https://mlr-org.github.io/mlr-tutorial/devel/html/feature_selection/index.html
my.task<-makeClassifTask(data = mydata[ii,], target = "Class")
ctrl = makeFeatSelControlSequential(method = "sbs", alpha = 0.02) # backward search - change method for other searches
set.seed(1)
rdesc = makeResampleDesc("CV", iters = 10) # how to evaluate candidate feature sets
sfeats = selectFeatures(learner = "classif.lda", task = my.task,
                        resampling = rdesc, control = ctrl, show.info = FALSE)
sfeats
#
usewhich<-names(x.train) %in% sfeats$x # which features were selected
#tr<-lda(x.train[,usewhich==T],y.train)
#pp<-predict(tr,x.test[,usewhich==T])$class
#acc<-mean(pp==y.test)
#kap<-cohen.kappa(table(pp,y.test),n.obs=length(y.test))
##### Alternatively, predict on the selected features through mlr
uu<-rep(T,dim(mydata)[2])
uu[-classlab]<-usewhich
my.task2<-makeClassifTask(data = mydata[,uu==T], target = "Class")
lrn = makeFeatSelWrapper("classif.lda", resampling = rdesc,
                         control = makeFeatSelControlRandom(maxit = 10), show.info = FALSE)
mod<-mlrtrain(lrn,my.task2,subset=ii)
pp2<-predict(mod,my.task2,subset=setdiff(seq(1,dim(mydata)[1]),ii))
#
acc<-mean(pp2$data$response==y.test)
kap<-cohen.kappa(table(pp2$data$response,y.test),n.obs=length(y.test))
c(acc,kap$kappa)
AA<-rbind(AA,c(acc,kap$kappa))
Anames<-c(Anames,"LDA-sbs") # sequential backward selection (not RFE)
FS<-rbind(FS,rep(0,length(names(x.train))))
FS[dim(FS)[1],usewhich]<-1
#### which learners can be used here?
#lrns = listLearners()
#head(lrns[c("class", "package")])
#### which can be used for multiclass tasks with class probabilities?
#lrns = listLearners(iris.task, properties = "prob")
#head(lrns[c("class", "package")])
##### Results?
rownames(AA)<-Anames
rownames(FS)<-Anames
colnames(FS)<-names(x.train)
AA
FS
#####
# Filter methods: score each variable's importance separately, then select the top K
fscore<-function(x,y) {
  summary(aov(x~y))[[1]][1,4] # F statistic of the one-way ANOVA
}
ff<-apply(x.train,2,fscore,y=y.train)
ff
# top 2 variables (largest F statistics)
usevar<-order(ff,decreasing=TRUE)[1:2]
# Now you could run your classifier on only the features in usevar...
# other filter methods, from the FSelector package
ff<-relief(Class~., data=mydata, neighbours.count = 5, sample.size = 25)
ff
ff<-information.gain(Class~.,data=mydata)
ff
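#####
# The importance tables from FSelector can be turned into a top-K subset with
# cutoff.k(); e.g. keep the 4 highest-scoring attributes by information gain:
topk<-cutoff.k(ff,4)
as.simple.formula(topk,"Class")
#
# Finally, a sketch (not in the original workflow) for judging the selections
# stored in FS: the columns named A.1..A.10 were generated as pure noise, so
# counting how often each method picked them gives a rough false-selection
# count per method.
noisecols<-grep("^A\\.",colnames(FS))
data.frame(noise.selected=rowSums(FS[,noisecols]),
           real.selected=rowSums(FS[,-noisecols]))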