# Interactions....
#
# Before we get started - here are some notes on fitting models with
# qualitative variables. Let's say SI is an indicator variable (smokers).
# I can fit an interaction model in R with the commands:
# mod1 <- lm(bloodpressure~SI*age)
# the * symbol tells R I want the main effects (different intercepts) and
# the different slopes.
# An alternative way of fitting these models is
# mod1 <- lm(bloodpressure~SI+age+SI:age)
# which more clearly follows the outline I did in the lecture: that is
# that I use the indicator variable just like any other predictor, and 
# that I create a new age variable using this indicator as well. 
#
# If your qualitative variable is not encoded as 0 and 1, or has more than
# 2 levels you need to tell R this by calling the variable a "factor". 
# Example:
# mod1 <- lm(bloodpressure~as.factor(SI)*age)
#
# Back to the demo
#
cars4<-cars2
cars4[,7]<-1/cars4[,7]
cars4<-cars4[,c(-1,-2,-4,-5,-6,-8,-9,-16,-14,-15)]
cars4<-cars4[cars4$cyl!=5,]
names(cars4)[2]<-c("city.gpm")
# The above commands removes some variables I don't want to include in the 
# analysis, and also transforms mpg into gpm (variable 7) - that's our
# outcome variable
print(names(cars4))
#
# Let's predict city.gpm from predictors (not including price and hw.mpg
#
par(mfrow=c(2,2))
plot(cars4$type,cars4$ci)
# type is clearly a qualitative variable - and the relationship is not linear
plot(cars4$dr,cars4$ci)
# this relationship is not too far from linear? can we use drive train 
# as numeric? 
plot(cars4$cyl,cars4$ci)
# not linear - a qualitative variable
plot(cars4$pas,cars4$ci)
# passenger room, maybe linear - and is a numerical variable
p<-locator()
#
# other variabels
plot(cars4$eng,cars4$ci)
# second order term? (non-linear)
plot(cars4$ho,cars4$ci)
# second order? (non-linear)
plot(cars4$fu,cars4$ci)
# linear
plot(cars4$le,cars4$ci)
# linear
p<-locator()
#
plot(cars4$wh,cars4$ci)
plot(cars4$wi,cars4$ci)
plot(cars4$ut,cars4$ci)
plot(cars4$we,cars4$ci)
p<-locator()
#
#
print(c("Let's try to limit to the most crucial variables"))
print(c("perhaps type, drivetrn, cylnbr, engsize, weight, domestic?")) 
# 
print(c("In fact; barely significant improvement if keep more than these (p-val 4.3)"))
#
p<-locator()
print(c("INTERACTIONS?"))
par(mfrow=c(1,1))
plot(cars4$we,cars4$ci,main="Weight effect, type interaction?")
points(cars4$we[cars4$ty=="Compact"],cars4$ci[cars4$ty=="Compact"],pch=2,col="red")
points(cars4$we[cars4$ty=="Large"],cars4$ci[cars4$ty=="Large"],pch=3,col="blue")
points(cars4$we[cars4$ty=="Midsize"],cars4$ci[cars4$ty=="Midsize"],pch=4,col="green")
points(cars4$we[cars4$ty=="Small"],cars4$ci[cars4$ty=="Small"],pch=5,col="yellow")
print(c("INTERACTION WITH TYPE")) 
print(c("No indication of interaction"))
print(c("almost as if weight takes care of type variation"))
# exception being sporty-black with steeper slope. 
p<-locator()
plot(cars4$we,cars4$ci,main="Weight effect, domestic interaction?")
points(cars4$we[cars4$dom==1],cars4$ci[cars4$dom==1],col="red",pch=2)
print(c("INTERACTION WITH DOMESTIC?"))
print(c("slight indication of interaction, different slope of red and black data set"))
p<-locator()
# 
plot(cars4$we,cars4$ci,main="Weight effect, by cylinder nbr")
points(cars4$we[cars4$cy==3],cars4$ci[cars4$cy==3],pch=2,col="red")
points(cars4$we[cars4$cy==4],cars4$ci[cars4$cy==4],pch=3,col="blue")
points(cars4$we[cars4$cy==6],cars4$ci[cars4$cy==6],pch=5,col="green")
print(c("INTERACTION WITH CYLINDER NBR?"))
print(c("again see no indication of interaction"))
p<-locator()
# 
# 
plot(cars4$we,cars4$ci,main="Weight effect, by drive train")
points(cars4$we[cars4$dr==0],cars4$ci[cars4$dr==0],col="red")
points(cars4$we[cars4$dr==1],cars4$ci[cars4$dr==1],col="blue")
print(c("INTERACTION WITH DRIVE TRAIN?"))
print(c(" maybe interaction - but very little data to fit this"))
p<-locator()
#
print(c("SUGGESTED MODEL"))
print(c("type+dom*we+log(cyl)+log(eng)+dr"))
# cyl sqr term? 
# sqrt term from plotting also - poss interaction with type and eng?
#
mm<-lm(cars4$ci~as.factor(cars4$ty)+as.factor(cars4$dom)*cars4$we+log(cars4$cy)+log(cars4$eng)+cars4$dr)
print(c("How did this model work out?"))
p<-locator()
print(summary(mm))
p<-locator()
print(step(mm))
# performing backward selection we 
# keep type (as factor), log(cyl), log(eng), we
#
p<-locator()
print(c("Let's try the simplified model selected by step"))
mm2<-lm(cars4$ci~as.factor(cars4$ty)+log(cars4$cy)+log(cars4$eng)+cars4$we)
print(summary(mm2))
p<-locator()
print(anova(mm,mm2))
#
p<-locator()
par(mfrow=c(2,2))
plot(mm2)
p<-locator()
par(mfrow=c(2,2))
plot(cars4$ty,mm2$res)
plot(log(cars4$cy),mm2$res)
plot(log(cars4$eng),mm2$res)
plot(cars4$we,mm2$res)
# residual plots look good
p<-locator()
plot(cars4$dr,mm2$res)
plot(cars4$fu,mm2$res)
plot(cars4$ho,mm2$res)
plot(cars4$wh,mm2$res)
# and so do residual plots on omitted variables
p<-locator()
#
#
# Example of an interaction: simulated data
#
mm<-lm(cars4$ci~as.factor(cars4$dom)*cars4$we)
newy <- rep(0,length(cars4$ci))
newy[cars4$dom==0] <- mm$coef[1]+mm$coef[3]*cars4$we[cars4$dom==0]+rnorm(length(cars4$dom[cars4$dom==0]),sd=summary(mm)$sig)
newy[cars4$dom==1] <- mm$coef[1]+mm$coef[2]*3+(mm$coef[3]+mm$coef[4]*3)*cars4$we[cars4$dom==1]+rnorm(length(cars4$dom[cars4$dom==1]),sd=summary(mm)$sig)
# Note, if dom=1, we have a different intercept and slope!
par(mfrow=c(1,1))
plot(cars4$we,newy,main="Weight effect, domestic interaction?")
points(cars4$we[cars4$dom==1],newy[cars4$dom==1],pch=2,col="red")
mmnew<-lm(newy~cars4$we+cars4$dom+as.factor(cars4$dom):cars4$we)
print(summary(mmnew))
print(step(mmnew))
# backward selection keeps both the different slope and intercept
p<-locator()
#
# Example of an additive model
newy <- rep(0,length(cars4$ci))
newy[cars4$dom==0] <- mm$coef[1]+mm$coef[3]*cars4$we[cars4$dom==0]+rnorm(length(cars4$dom[cars4$dom==0]),sd=summary(mm)$sig)
newy[cars4$dom==1] <- mm$coef[1]+3*mm$coef[2]+(mm$coef[3])*cars4$we[cars4$dom==1]+rnorm(length(cars4$dom[cars4$dom==1]),sd=summary(mm)$sig)
# now we only have a different intercept
#
par(mfrow=c(1,1))
plot(cars4$we,newy,main="Weight effect, domestic interaction?")
points(cars4$we[cars4$dom==1],newy[cars4$dom==1],pch=2,col="red")
mmnew<-lm(newy~cars4$we+cars4$dom+as.factor(cars4$dom):cars4$we)
print(summary(mmnew))
print(step(mmnew))
# backward selection removes the difference in slope
p<-locator()
#
# Example of a reinforcement model
newy <- rep(0,length(cars4$ci))
newy[cars4$dom==0] <- mm$coef[1]+mm$coef[3]*cars4$we[cars4$dom==0]+rnorm(length(cars4$dom[cars4$dom==0]),sd=summary(mm)$sig)
newy[cars4$dom==1] <- mm$coef[1]+(mm$coef[3]*1.5)*cars4$we[cars4$dom==1]+rnorm(length(cars4$dom[cars4$dom==1]),sd=summary(mm)$sig)
#
par(mfrow=c(1,1))
plot(cars4$we,newy,main="Weight effect, domestic interaction?")
points(cars4$we[cars4$dom==1],newy[cars4$dom==1],pch=2,col="red")
mmnew<-lm(newy~cars4$we+cars4$dom+as.factor(cars4$dom):cars4$we)
print(summary(mmnew))
print(step(mmnew))
p<-locator()
#