#### Regression methods #### SAheart$famhist<-as.numeric(SAheart$famhist)-1 # make the famhist variable numeric # plotting the data... pairs(SAheart) mm<-lm(ldl~., data=SAheart,subset=-c(17,214,347,433)) library(car) # package for nicer diagnostic plots, amongst other things residualPlots(mm) leveragePlots(mm) qqPlot(mm,id.n=3) # perhaps better to transform some of the data? SAheart$ldl<-log(SAheart$ldl) SAheart$obesity<-log(SAheart$obesity) SAheart$age<-log(SAheart$age) mm<-lm(ldl~., data=SAheart) residualPlots(mm) leveragePlots(mm) qqPlot(mm,id.n=3) # drop some outliers? mm<-lm(ldl~., data=SAheart,subset=-c(17,214,347,433)) residualPlots(mm) leveragePlots(mm) qqPlot(mm,id.n=3) # a bunch of tools for spotting influential observations influenceIndexPlot(mm,id.n=3) influencePlot(mm,id.n=3) # variable importance in terms of Rsquared # What's the Rsquared contribution of a variable, if averaged out over # order it is entered, if it's the first or last (aka usefulness) variable # and one Rsquared decomposition (there are many) library(relaimpo) calc.relimp(mm,type=c("lmg","last","first","genizi"), rela=TRUE) # Bootstrap Measures of Relative Importance (takes some time) boot <- boot.relimp(mm, b = 250, type = c("lmg", "last", "first", "genizi"), rank = TRUE, diff = TRUE, rela = TRUE) booteval.relimp(boot) # print result plot(booteval.relimp(boot,sort=TRUE)) ################### # all-subset selection library(leaps) mm<-regsubsets(ldl~., data=SAheart) plot(mm) # model map plot(mm,scale="adjr2") # R2 instead of bic to rank models subsets(mm,statistics="rsq") # which subsets correspond to which models? ## redo but ask for more candidate models of each size library(leaps) mm<-regsubsets(ldl~., data=SAheart, nbest=50, really.big=T) ms<-summary(mm) plot(mm) #BIC curve plot(apply(ms$which,1,sum),ms$bic,xlab="size",ylab="ic") # Importance measure # Here, I score variables by BIC of the models where they # enter so variables that are present in the best models # get a big score ms$which[sort.list(ms$bic)[1:5],] wt<-exp(-ms$bic) mat<-ms$which mat[mat==T]<-1 mat[mat==F]<-0 mat<-wt*mat impwt<-apply(mat,2,sum) barplot(impwt[-1],names.arg=c(names(SAheart)[-3])) ########glmulti - another subset selection tool # works for logistic regression also! library(glmulti) # ll<-glmulti(ldl~., data=SAheart,level=1,fitfunction=lm,method="h",crit="bic") # here I use an exhaustive search plot(ll,type="s") # variable importance in terms of BIC weights plot(ll,type="r") # residual plots for model # the winning model summary(ll)$bestmodel # You can search for models using a genetic algorithm instead # also level=2 option allows you to include interactions in the model # Be careful though, since it is an automatic procedure it may # result in models with interactions but without main effects... ll<-glmulti(ldl~., data=SAheart,level=2,fitfunction=lm,method="g",crit="bic") plot(ll,type="s") ## Top-10 models weightable(ll)[1:10,] # To fix the problems with main effects, use option marginality=T ll<-glmulti(ldl~., data=SAheart,level=2,fitfunction=lm,method="g",marginality=T,crit="bic") plot(ll,type="s") ## Top-10 models weightable(ll)[1:10,] ############################ # logistic regression example ll<-glmulti(chd~., data=SAheart,level=1,fitfunction=glm,family=binomial,method="g",crit="bic") plot(ll,type="s") summary(ll)$bestmodel gg<-glm(chd~tobacco+ldl+famhist+typea+age,data=SAheart,family=binomial) residualPlots(gg) influencePlot(gg,id.n=3)