# READ FIRST:
# This demo was run with some commands blocked, because the data had
# already been read into R. Unblock (uncomment) those commands to run the
# demo yourself -- e.g. the read.table() call below loads the data.
#
# pollution <- read.table('polldata.txt', sep = '\t')

# List the available variables.
print(names(pollution))

# GOAL: predict mortality rate (mort) as a function of socio-economic
# factors, climate factors and pollution factors.
#
# Start with a graphical exploration of the data, four panels at a time.
par(mfrow = c(2, 2))

# Climate / demographic variables vs mortality.
plot(pollution$prec, pollution$mort, main = "mort on prec")
plot(pollution$jant, pollution$mort, main = "mort on jant")
plot(pollution$jult, pollution$mort, main = "mort on jult")
plot(pollution$ovr65, pollution$mort, main = "mort on ovr65")
p <- locator()  # pause: click in the graphics window to continue
# Precipitation is clearly linearly related to mortality, while the other
# panels show outliers or less clear relationships.

# Socio-economic variables vs mortality.
plot(pollution$popn, pollution$mort, main = "mort on popn")
plot(pollution$educ, pollution$mort, main = "mort on educ")
plot(pollution$hous, pollution$mort, main = "mort on hous")
plot(pollution$dens, pollution$mort, main = "mort on dens")
p <- locator()
# Education level and housing conditions are negatively correlated with
# mortality -- makes sense: richer, more affluent neighborhoods may be
# associated with better health status. Similarly, persons per household
# and population density are positively related to mortality: in less
# affluent neighborhoods the household size may be larger and the
# neighborhood more crowded.

plot(pollution$nonw, pollution$mort, main = "mort on nonw")
plot(pollution$wwdrk, pollution$mort, main = "mort on wwdrk")
plot(pollution$poor, pollution$mort, main = "mort on poor")
plot(pollution$hc, pollution$mort, main = "mort on hc")
p <- locator()
# Mortality is positively correlated with the percentage of nonwhites and
# with the percentage living below the poverty line, and negatively
# correlated with the white-collar working percentage.
# hc is one of the pollution variables -- on the raw scale it is hard to
# see any pattern, so take logs to suppress the extreme scale of hc.
plot(log(pollution$hc), pollution$mort, main = "mort on log(hc)")
plot(log(pollution$nox), pollution$mort, main = "mort on log(nox)")
plot(log(pollution$so), pollution$mort, main = "mort on log(so)")
plot(pollution$humid, pollution$mort, main = "mort on humid")
p <- locator()  # pause: click in the graphics window to continue
# Mortality is positively correlated with all the pollution factors;
# humidity seems to play no significant role.

# New data set with the pollution variables log-transformed.
polltrans <- pollution
polltrans$hc <- log(polltrans$hc)
polltrans$nox <- log(polltrans$nox)
polltrans$so <- log(polltrans$so)

# Split the data into a training and a test set.
# Unblock the above 3 command lines when you run the code yourself.
# ii <- sort(sample(seq_len(nrow(pollution)), 20))
# polltest <- polltrans[ii, ]
# polltrain <- polltrans[-ii, ]

# Full main-effects model on the training data.
mm1 <- lm(mort ~ prec + jant + jult + ovr65 + popn + educ + hous + dens +
            nonw + wwdrk + poor + hc + nox + so + humid,
          data = polltrain)
print(summary(mm1))
par(mfrow = c(2, 2))
plot(mm1)
p <- locator()
# The fit looks pretty good, though there are some outliers in the data
# set: possibly 6, 31, 37, 56. Note these may be different for you since
# your random training subset differs from mine.

# Influence diagnostics: Cook's distance for each training observation.
cooksd <- cooks.distance(mm1)
par(mfrow = c(1, 1))
plot(cooksd, main = "Cook's Distance", type = "h")
# BUGFIX: use df.residual(mm1) explicitly -- the original mm1$df only
# worked through fragile partial matching of the df.residual component.
abline(h = qf(.95, 1, df.residual(mm1)), lty = 2)
p <- locator()
# 37 does stand out (observation 24 in the training set is observation 37
# in the original data set).
# Influence measures for the training-set fit.
par(mfrow = c(2, 2))
lm1 <- lm.influence(mm1)

# Leverage, with the conventional 3p/n reference line.
# BUGFIX: coef(mm1) and nrow(polltrain) -- the original mm1$coef relied
# on fragile $ partial matching of the coefficients component.
plot(lm1$hat, main = "Leverage")
abline(h = 3 * length(coef(mm1)) / nrow(polltrain))
identify(lm1$hat)
# No excessively large leverages.

# Change in individual slopes when each observation is deleted.
# (lm1$coefficients spelled out; the original lm1$coeff used partial
# matching.)
plot(lm1$coefficients[, 6], main = "change in slope 6")
identify(lm1$coefficients[, 6])
plot(lm1$coefficients[, 12], main = "change in slope 12")
identify(lm1$coefficients[, 12])
plot(lm1$coefficients[, 13], main = "change in slope 13")
identify(lm1$coefficients[, 13])

plot(lm1$sigma, main = "Residual MSE when obs is dropped")
identify(lm1$sigma)
p <- locator()  # pause: click in the graphics window to continue
# Note observations 21, 31, 6, 24, 38 in the training set are
# observations 6, 31, 37, 48, 56 in the original data set (as seen in
# the plot(mm1) figure).
#
# Let's drop 6, 31, 37 and 56 -- that is 21, 31, 6, 38 in the training
# data. These observations seem to be problematic both in terms of
# affecting slopes and the MSE estimates.
mm1b <- lm(mort ~ prec + jant + jult + ovr65 + popn + educ + hous + dens +
             nonw + wwdrk + poor + hc + nox + so + humid,
           data = polltrain, subset = -c(6, 21, 31, 38))
print(summary(mm1b))
par(mfrow = c(2, 2))
plot(mm1b)
p <- locator()
# Looks pretty good.

# Backward model search by AIC.
print(step(mm1b))
p <- locator()
# Drop poor and humidity.

# Checking collinearity via the absolute correlation matrix.
par(mfrow = c(1, 1))
image(seq_len(15), seq_len(15), abs(cor(pollution[, 1:15])))
# hc and nox (variables 12 and 13) are very highly correlated (white
# block in the figure). Variables 5, 6, 7 (popn, educ, hous) are also
# highly correlated. Collinearity problems will definitely be an issue!

# New training data without the outliers.
polltrain2 <- polltrain[-c(6, 21, 31, 38), ]

# Here is another run on a different subset of data. Note that you may
# get different variables retained in the model after the backward
# search.
# NOTE: this reuses the name mm1b, overwriting the outlier-free fit
# above -- rename if you need both models later.
iij <- sort(sample(seq_len(nrow(pollution)), 20))
polltestb <- polltrans[iij, ]
polltrainb <- polltrans[-iij, ]

mm1b <- lm(mort ~ prec + jant + jult + ovr65 + popn + educ + hous + dens +
             nonw + wwdrk + poor + hc + nox + so + humid,
           data = polltrainb)
print(summary(mm1b))
par(mfrow = c(2, 2))
plot(mm1b)
p <- locator()
# Looks pretty good.

print(step(mm1b))
p <- locator()