SimMini3<-function(B=25,ns=50,noiselev=0.25,grcorr=c(1,1,1),sparsity=.1,plot=F) { # A lasso simulation # B is the number of reps # ns is the sample size. The dimension is fixed to p=80 # noiselev is the additive error # there are 3 clusters of variables in the sim - grcorr is vector that controls how correlated the variables # within the groups are. Setting an entry to a smaller number like 0.25 makes the group correlated, a large number # like 4 makes the group uncorrelated. Give one number per group e.g. grcorr=c(0.25,4,4) # sparsity is the percent of variabels that are true predictors # plot is optional. Default is False # Truth<-rep(0,80) #Truth[c(1,2,11,21,31)]<-1 Truth<-round(rbinom(n=80,size=1,prob=sparsity)) # The true model Coeflm<-matrix(0,80,B) CoefR<-matrix(0,80,B) Coefla<-matrix(0,80,B) Coefala<-matrix(0,80,B) Coefala2<-matrix(0,80,B) Coefen<-matrix(0,80,B) MSElm<-matrix(0,B,1) MSER<-matrix(0,B,1) MSEla<-matrix(0,B,1) MSEala<-matrix(0,B,1) MSEen<-matrix(0,B,1) # nt<-sum(Truth) tr.coef<-rnorm(nt,mean=1,sd=.25)*sign(rbinom(n=nt,size=1,prob=.5)*2-1) all.coef<-rep(0,80) all.coef[Truth==1]<-tr.coef # for (b in (1:B)) { # x1<-rnorm(ns) x1<-matrix(rep(x1,10),ns,10,byrow=F)+matrix(rnorm(ns*10,sd=grcorr[1]),ns,10) x1<-apply(x1,2,standardize) # x2<-rnorm(ns) x2<-matrix(rep(x2,10),ns,10,byrow=F)+matrix(rnorm(ns*10,sd=grcorr[2]),ns,10) x2<-apply(x2,2,standardize) # x3<-rnorm(ns) x3<-matrix(rep(x3,10),ns,10,byrow=F)+matrix(rnorm(ns*10,sd=grcorr[3]),ns,10) x3<-apply(x3,2,standardize) # x4<-matrix(rnorm(ns*50),ns,50) x4<-apply(x4,2,standardize) # y<-cbind(x1,x2,x3,x4)%*%all.coef+rnorm(ns)*noiselev df<-as.data.frame(cbind(x1,x2,x3,x4,y)) names(df)<-c(as.character(seq(1,80)),"y") #### y.new<-cbind(x1,x2,x3,x4)%*%all.coef+rnorm(ns)*noiselev df.new<-as.data.frame(cbind(x1,x2,x3,x4,y.new)) names(df.new)<-c(as.character(seq(1,80)),"y") #### # Run OLS if possible if (ns>80) { mm<-lm(y~., data=df) Coeflm[,b]<-mm$coef[-1] pm<-predict(mm,newdata=df.new) MSElm[b]<-sum((y.new-pm)^2)/length(pm) } # # lasso cv.gg<-cv.glmnet(x=cbind(x1,x2,x3,x4),y=y) Coefla[,b]<-coef(cv.gg,s="lambda.1se")[-1] pm<-cbind(rep(1,ns),cbind(x1,x2,x3,x4))%*%coef(cv.gg,s="lambda.1se") MSEla[b]<-sum((y.new-pm)^2)/length(pm) # # adaptive lasso with ridge initialization gamma<-1 ma<-lm.ridge(y~., data=df,lambda=1) CoefR[,b]<-coef(ma)[-1] pm<-cbind(rep(1,ns),cbind(x1,x2,x3,x4))%*%coef(ma) MSER[b]<-sum((y.new-pm)^2)/length(pm) # penf<-1/(abs(coef(ma)[-1])^gamma) cv.gg2<-cv.glmnet(x=cbind(x1,x2,x3,x4),y=y,penalty.factor=penf) cc<-coef(cv.gg2,s="lambda.1se")[-1] Coefala[,b]<-cc pm<-cbind(rep(1,ns),cbind(x1,x2,x3,x4))%*%coef(cv.gg2,s="lambda.1se") MSEala[b]<-sum((y.new-pm)^2)/length(pm) # # alternative adaptive lasso xx3<-cbind(x1,x2,x3,x4)%*%diag(abs(coef(ma)[-1])^gamma) gg3<-cv.glmnet(x=xx3,y=y,standardize=F) cc<-(coef(gg3,s="lambda.1se")[-1])*(abs(coef(ma)[-1])^gamma) Coefala2[,b]<-cc ##### # elastic net gg4<-cv.glmnet(x=cbind(x1,x2,x3,x4),y=y,alpha=0.5) cc<-coef(gg4,s="lambda.1se")[-1] Coefen[,b]<-cc pm<-cbind(rep(1,ns),cbind(x1,x2,x3,x4))%*%coef(gg4,s="lambda.1se") MSEen[b]<-sum((y.new-pm)^2)/length(pm) } ### plotting the coefficient estimates, highlighting the true model with red stars if (plot==T) { par(mfrow=c(2,2)) if (ns>80) { boxplot(t(Coeflm),ylim=c(min(tr.coef)-.25,max(tr.coef)+.25)) } if (ns<=80) { boxplot(t(CoefR),ylim=c(min(tr.coef)-.25,max(tr.coef)+.25)) } abline(h=0) abline(v=c(10.5,20.5,30.5),lty=2) points(seq(1,80)[Truth==1],tr.coef,col=2,pch=3) boxplot(t(Coefla),ylim=c(min(tr.coef)-.25,max(tr.coef)+.25),main="Lasso") abline(h=0) abline(v=c(10.5,20.5,30.5),lty=2) points(seq(1,80)[Truth==1],tr.coef,col=2,pch=3) boxplot(t(Coefala),ylim=c(min(tr.coef)-.25,max(tr.coef)+.25),main="ALasso") abline(h=0) abline(v=c(10.5,20.5,30.5),lty=2) points(seq(1,80)[Truth==1],tr.coef,col=2,pch=3) boxplot(t(Coefen),ylim=c(min(tr.coef)-.25,max(tr.coef)+.25),main="Enet") abline(h=0) abline(v=c(10.5,20.5,30.5),lty=2) points(seq(1,80)[Truth==1],tr.coef,col=2,pch=3) } ##### Bias<-rbind(apply(Coeflm,1,mean)[Truth==1]-tr.coef, apply(CoefR,1,mean)[Truth==1]-tr.coef, apply(Coefla,1,mean)[Truth==1]-tr.coef, apply(Coefala,1,mean)[Truth==1]-tr.coef, apply(Coefen,1,mean)[Truth==1]-tr.coef) Var<-rbind(apply(Coeflm,1,var)[Truth==1], apply(CoefR,1,var)[Truth==1], apply(Coefla,1,var)[Truth==1], apply(Coefala,1,var)[Truth==1], apply(Coefen,1,var)[Truth==1]) ### Bias and Variance of the different estimates (of true parameters only) if (plot==T) { print(Bias) print(Var) } ##### Function to compute the True Positive Rate, The False positive rate and the false detection rate # TPR = % of true model found, FPR = % percent of false variables include, FDR =% percent of selected variables that are false TPFP<-function(Coefs,truth) { tr<-truth cp<-Coefs TP<-length(cp[cp!=0 & tr==1])/length(tr[tr==1]) FP<-length(cp[cp!=0 & tr==0])/length(tr[tr==0]) FDR<-length(cp[cp!=0 & tr==0])/length(cp[cp!=0]) return(list(FP=FP,FDR=FDR,TP=TP))} #### Same thing but at the group level grTPFP<-function(Coefs,truth) { grtr<-c(max(truth[1:10]),max(truth[11:20]),max(truth[21:30]),max(truth[31:80])) tr<-truth cp<-c(max(abs(Coefs)[1:10]),max(abs(Coefs)[11:20]),max(abs(Coefs[21:30])),max(abs(Coefs[31:80]))) TP<-length(cp[cp!=0 & grtr==1])/length(grtr[grtr==1]) FP<-length(cp[cp!=0 & grtr==0])/length(grtr[grtr==0]) FDR<-length(cp[cp!=0 & grtr==0])/length(cp[cp!=0]) return(list(FP=FP,FDR=FDR,TP=TP))} ##### Applying the functions LassoSel<-apply(Coefla,2,TPFP,truth=Truth) ll<-unlist(LassoSel) ALassoSel<-apply(Coefala,2,TPFP,truth=Truth) lla<-unlist(ALassoSel) EnetSel<-apply(Coefen,2,TPFP,truth=Truth) lle<-unlist(EnetSel) # LassoSelgr<-apply(Coefla,2,grTPFP,truth=Truth) llgr<-unlist(LassoSelgr) ALassoSelgr<-apply(Coefala,2,grTPFP,truth=Truth) llagr<-unlist(ALassoSelgr) EnetSelgr<-apply(Coefen,2,grTPFP,truth=Truth) llegr<-unlist(EnetSelgr) ##### Plotting the TPR, FPR and FDR for the methods if (plot==T) { locator() par(mfrow=c(3,2)) boxplot(cbind(ll[seq(1,(B*3),by=3)],ll[seq(3,(B*3),by=3)]),names=c("FPR","TPR")) boxplot(ll[seq(2,(B*3),by=3)],names=c("FDR"),ylim=c(0,1)) boxplot(cbind(lla[seq(1,(B*3),by=3)],lla[seq(3,(B*3),by=3)]),names=c("FPR","TPR")) boxplot(lla[seq(2,(B*3),by=3)],names=c("FDR"),ylim=c(0,1)) boxplot(cbind(lle[seq(1,(B*3),by=3)],lle[seq(3,(B*3),by=3)]),names=c("FPR","TPR")) boxplot(lle[seq(2,(B*3),by=3)],names=c("FDR"),ylim=c(0,1)) ### At the group level locator() par(mfrow=c(3,2)) boxplot(cbind(llgr[seq(1,(B*3),by=3)],llgr[seq(3,(B*3),by=3)]),names=c("FPR","TPR")) boxplot(llgr[seq(2,(B*3),by=3)],names=c("FDR"),ylim=c(0,1)) boxplot(cbind(lla[seq(1,(B*3),by=3)],llagr[seq(3,(B*3),by=3)]),names=c("FPR","TPR")) boxplot(llagr[seq(2,(B*3),by=3)],names=c("FDR"),ylim=c(0,1)) boxplot(cbind(lle[seq(1,(B*3),by=3)],llegr[seq(3,(B*3),by=3)]),names=c("FPR","TPR")) boxplot(llegr[seq(2,(B*3),by=3)],names=c("FDR"),ylim=c(0,1)) ### cross-validation plots locator() par(mfrow=c(2,3)) plot(cv.gg) plot(cv.gg2) plot(gg4) ### Bias plot for true variables # black is ridge, red is lasso, green is adaptive lasso and cyan is enet plot(Bias[2,],type="l",ylim=c(min(Bias),max(Bias)),main="Bias") abline(h=0) lines(Bias[3,],col=2) lines(Bias[4,],col=3) lines(Bias[5,],col=5) ### Prediction performance locator() boxplot(cbind(MSElm,MSER,MSEla,MSEala,MSEen),names=c("LM","R","Lasso","ALasso","Enet")) } # Returning prediction errors and TPR, FPR, FDR results return(list(MSE=cbind(MSElm,MSER,MSEla,MSEala,MSEen),LL=cbind(ll,lla,lle))) }