# Interactions.... # # Before we get started - here are some notes on fitting models with # qualitative variables. Let's say SI is an indicator variable (smokers). # I can fit an interaction model in R with the commands: # mod1 <- lm(bloodpressure~SI*age) # the * symbol tells R I want the main effects (different intercepts) and # the different slopes. # An alternative way of fitting these models is # mod1 <- lm(bloodpressure~SI+age+SI:age) # which more clearly follows the outline I did in the lecture: that is # that I use the indicator variable just like any other predictor, and # that I create a new age variable using this indicator as well. # # If your qualitative variable is not encoded as 0 and 1, or has more than # 2 levels you need to tell R this by calling the variable a "factor". # Example: # mod1 <- lm(bloodpressure~as.factor(SI)*age) # # Back to the demo # cars4<-cars2 cars4[,7]<-1/cars4[,7] cars4<-cars4[,c(-1,-2,-4,-5,-6,-8,-9,-16,-14,-15)] cars4<-cars4[cars4$cyl!=5,] names(cars4)[2]<-c("city.gpm") # The above commands removes some variables I don't want to include in the # analysis, and also transforms mpg into gpm (variable 7) - that's our # outcome variable print(names(cars4)) # # Let's predict city.gpm from predictors (not including price and hw.mpg # par(mfrow=c(2,2)) plot(cars4$type,cars4$ci) # type is clearly a qualitative variable - and the relationship is not linear plot(cars4$dr,cars4$ci) # this relationship is not too far from linear? can we use drive train # as numeric? plot(cars4$cyl,cars4$ci) # not linear - a qualitative variable plot(cars4$pas,cars4$ci) # passenger room, maybe linear - and is a numerical variable p<-locator() # # other variabels plot(cars4$eng,cars4$ci) # second order term? (non-linear) plot(cars4$ho,cars4$ci) # second order? (non-linear) plot(cars4$fu,cars4$ci) # linear plot(cars4$le,cars4$ci) # linear p<-locator() # plot(cars4$wh,cars4$ci) plot(cars4$wi,cars4$ci) plot(cars4$ut,cars4$ci) plot(cars4$we,cars4$ci) p<-locator() # # print(c("Let's try to limit to the most crucial variables")) print(c("perhaps type, drivetrn, cylnbr, engsize, weight, domestic?")) # print(c("In fact; barely significant improvement if keep more than these (p-val 4.3)")) # p<-locator() print(c("INTERACTIONS?")) par(mfrow=c(1,1)) plot(cars4$we,cars4$ci,main="Weight effect, type interaction?") points(cars4$we[cars4$ty=="Compact"],cars4$ci[cars4$ty=="Compact"],pch=2,col="red") points(cars4$we[cars4$ty=="Large"],cars4$ci[cars4$ty=="Large"],pch=3,col="blue") points(cars4$we[cars4$ty=="Midsize"],cars4$ci[cars4$ty=="Midsize"],pch=4,col="green") points(cars4$we[cars4$ty=="Small"],cars4$ci[cars4$ty=="Small"],pch=5,col="yellow") print(c("INTERACTION WITH TYPE")) print(c("No indication of interaction")) print(c("almost as if weight takes care of type variation")) # exception being sporty-black with steeper slope. p<-locator() plot(cars4$we,cars4$ci,main="Weight effect, domestic interaction?") points(cars4$we[cars4$dom==1],cars4$ci[cars4$dom==1],col="red",pch=2) print(c("INTERACTION WITH DOMESTIC?")) print(c("slight indication of interaction, different slope of red and black data set")) p<-locator() # plot(cars4$we,cars4$ci,main="Weight effect, by cylinder nbr") points(cars4$we[cars4$cy==3],cars4$ci[cars4$cy==3],pch=2,col="red") points(cars4$we[cars4$cy==4],cars4$ci[cars4$cy==4],pch=3,col="blue") points(cars4$we[cars4$cy==6],cars4$ci[cars4$cy==6],pch=5,col="green") print(c("INTERACTION WITH CYLINDER NBR?")) print(c("again see no indication of interaction")) p<-locator() # # plot(cars4$we,cars4$ci,main="Weight effect, by drive train") points(cars4$we[cars4$dr==0],cars4$ci[cars4$dr==0],col="red") points(cars4$we[cars4$dr==1],cars4$ci[cars4$dr==1],col="blue") print(c("INTERACTION WITH DRIVE TRAIN?")) print(c(" maybe interaction - but very little data to fit this")) p<-locator() # print(c("SUGGESTED MODEL")) print(c("type+dom*we+log(cyl)+log(eng)+dr")) # cyl sqr term? # sqrt term from plotting also - poss interaction with type and eng? # mm<-lm(cars4$ci~as.factor(cars4$ty)+as.factor(cars4$dom)*cars4$we+log(cars4$cy)+log(cars4$eng)+cars4$dr) print(c("How did this model work out?")) p<-locator() print(summary(mm)) p<-locator() print(step(mm)) # performing backward selection we # keep type (as factor), log(cyl), log(eng), we # p<-locator() print(c("Let's try the simplified model selected by step")) mm2<-lm(cars4$ci~as.factor(cars4$ty)+log(cars4$cy)+log(cars4$eng)+cars4$we) print(summary(mm2)) p<-locator() print(anova(mm,mm2)) # p<-locator() par(mfrow=c(2,2)) plot(mm2) p<-locator() par(mfrow=c(2,2)) plot(cars4$ty,mm2$res) plot(log(cars4$cy),mm2$res) plot(log(cars4$eng),mm2$res) plot(cars4$we,mm2$res) # residual plots look good p<-locator() plot(cars4$dr,mm2$res) plot(cars4$fu,mm2$res) plot(cars4$ho,mm2$res) plot(cars4$wh,mm2$res) # and so do residual plots on omitted variables p<-locator() # # # Example of an interaction: simulated data # mm<-lm(cars4$ci~as.factor(cars4$dom)*cars4$we) newy <- rep(0,length(cars4$ci)) newy[cars4$dom==0] <- mm$coef[1]+mm$coef[3]*cars4$we[cars4$dom==0]+rnorm(length(cars4$dom[cars4$dom==0]),sd=summary(mm)$sig) newy[cars4$dom==1] <- mm$coef[1]+mm$coef[2]*3+(mm$coef[3]+mm$coef[4]*3)*cars4$we[cars4$dom==1]+rnorm(length(cars4$dom[cars4$dom==1]),sd=summary(mm)$sig) # Note, if dom=1, we have a different intercept and slope! par(mfrow=c(1,1)) plot(cars4$we,newy,main="Weight effect, domestic interaction?") points(cars4$we[cars4$dom==1],newy[cars4$dom==1],pch=2,col="red") mmnew<-lm(newy~cars4$we+cars4$dom+as.factor(cars4$dom):cars4$we) print(summary(mmnew)) print(step(mmnew)) # backward selection keeps both the different slope and intercept p<-locator() # # Example of an additive model newy <- rep(0,length(cars4$ci)) newy[cars4$dom==0] <- mm$coef[1]+mm$coef[3]*cars4$we[cars4$dom==0]+rnorm(length(cars4$dom[cars4$dom==0]),sd=summary(mm)$sig) newy[cars4$dom==1] <- mm$coef[1]+3*mm$coef[2]+(mm$coef[3])*cars4$we[cars4$dom==1]+rnorm(length(cars4$dom[cars4$dom==1]),sd=summary(mm)$sig) # now we only have a different intercept # par(mfrow=c(1,1)) plot(cars4$we,newy,main="Weight effect, domestic interaction?") points(cars4$we[cars4$dom==1],newy[cars4$dom==1],pch=2,col="red") mmnew<-lm(newy~cars4$we+cars4$dom+as.factor(cars4$dom):cars4$we) print(summary(mmnew)) print(step(mmnew)) # backward selection removes the difference in slope p<-locator() # # Example of a reinforcement model newy <- rep(0,length(cars4$ci)) newy[cars4$dom==0] <- mm$coef[1]+mm$coef[3]*cars4$we[cars4$dom==0]+rnorm(length(cars4$dom[cars4$dom==0]),sd=summary(mm)$sig) newy[cars4$dom==1] <- mm$coef[1]+(mm$coef[3]*1.5)*cars4$we[cars4$dom==1]+rnorm(length(cars4$dom[cars4$dom==1]),sd=summary(mm)$sig) # par(mfrow=c(1,1)) plot(cars4$we,newy,main="Weight effect, domestic interaction?") points(cars4$we[cars4$dom==1],newy[cars4$dom==1],pch=2,col="red") mmnew<-lm(newy~cars4$we+cars4$dom+as.factor(cars4$dom):cars4$we) print(summary(mmnew)) print(step(mmnew)) p<-locator() #