#### Regression methods ####

SAheart$famhist<-as.numeric(SAheart$famhist)-1
# make the famhist variable numeric

# plotting the data...
pairs(SAheart)

mm<-lm(ldl~., data=SAheart,subset=-c(17,214,347,433))

library(car)
# package for nicer diagnostic plots, amongst other things
residualPlots(mm)
leveragePlots(mm)
qqPlot(mm,id.n=3)

# perhaps better to transform some of the data? 

SAheart$ldl<-log(SAheart$ldl)
SAheart$obesity<-log(SAheart$obesity)
SAheart$age<-log(SAheart$age)

mm<-lm(ldl~., data=SAheart)
residualPlots(mm)
leveragePlots(mm)
qqPlot(mm,id.n=3)

# drop some outliers?
mm<-lm(ldl~., data=SAheart,subset=-c(17,214,347,433))

residualPlots(mm)
leveragePlots(mm)
qqPlot(mm,id.n=3)


# a bunch of tools for spotting influential observations
influenceIndexPlot(mm,id.n=3)

influencePlot(mm,id.n=3)


# variable importance in terms of Rsquared
# What's the Rsquared contribution of a variable, if averaged out over
# order it is entered, if it's the first or last (aka usefulness) variable
# and one Rsquared decomposition (there are many)
library(relaimpo)
calc.relimp(mm,type=c("lmg","last","first","genizi"),
            rela=TRUE)

# Bootstrap Measures of Relative Importance  (takes some time)
boot <- boot.relimp(mm, b = 250, type = c("lmg", 
                                          "last", "first", "genizi"), rank = TRUE, 
                    diff = TRUE, rela = TRUE)
booteval.relimp(boot) # print result
plot(booteval.relimp(boot,sort=TRUE)) 

###################
# all-subset selection
library(leaps)
mm<-regsubsets(ldl~., data=SAheart)
plot(mm)
# model map
plot(mm,scale="adjr2")
# R2 instead of bic to rank models
subsets(mm,statistics="rsq")
# which subsets correspond to which models?


## redo but ask for more candidate models of each size
library(leaps)
mm<-regsubsets(ldl~., data=SAheart, nbest=50, really.big=T)
ms<-summary(mm)
plot(mm)

#BIC curve
plot(apply(ms$which,1,sum),ms$bic,xlab="size",ylab="ic")

# Importance measure
# Here, I score variables by BIC of the models where they
# enter so variables that are present in the best models
# get a big score
ms$which[sort.list(ms$bic)[1:5],]
wt<-exp(-ms$bic)
mat<-ms$which
mat[mat==T]<-1
mat[mat==F]<-0
mat<-wt*mat
impwt<-apply(mat,2,sum)
barplot(impwt[-1],names.arg=c(names(SAheart)[-3]))


########glmulti - another subset selection tool
# works for logistic regression also!

library(glmulti)
#
ll<-glmulti(ldl~., data=SAheart,level=1,fitfunction=lm,method="h",crit="bic")
# here I use an exhaustive search
plot(ll,type="s")
# variable importance in terms of BIC weights
plot(ll,type="r")
# residual plots for model

# the winning model
summary(ll)$bestmodel

# You can search for models using a genetic algorithm instead
# also level=2 option allows you to include interactions in the model
# Be careful though, since it is an automatic procedure it may
# result in models with interactions but without main effects...
ll<-glmulti(ldl~., data=SAheart,level=2,fitfunction=lm,method="g",crit="bic")
plot(ll,type="s")
## Top-10 models
weightable(ll)[1:10,]


# To fix the problems with main effects, use option marginality=T
ll<-glmulti(ldl~., data=SAheart,level=2,fitfunction=lm,method="g",marginality=T,crit="bic")
plot(ll,type="s")
## Top-10 models
weightable(ll)[1:10,]


############################
# logistic regression example
ll<-glmulti(chd~., data=SAheart,level=1,fitfunction=glm,family=binomial,method="g",crit="bic")
plot(ll,type="s")
summary(ll)$bestmodel

gg<-glm(chd~tobacco+ldl+famhist+typea+age,data=SAheart,family=binomial)
residualPlots(gg)

influencePlot(gg,id.n=3)