# READ FIRST:
# This demo was run with some commands blocked, because the data had
# already been read into R. Unblock (uncomment) those commands to run the
# demo yourself -- e.g. the read.table() call below loads the data.
#
# pollution <- read.table('polldata.txt', sep = '\t')

# List the available variables.
print(names(pollution))

# GOAL: predict mortality rate (mort) as a function of socio-economic
# factors, climate factors and pollution factors.
#
# Start with a graphical exploration of the data, four panels at a time.
par(mfrow = c(2, 2))

# Climate / demographic variables vs mortality.
plot(pollution$prec, pollution$mort, main = "mort on prec")
plot(pollution$jant, pollution$mort, main = "mort on jant")
plot(pollution$jult, pollution$mort, main = "mort on jult")
plot(pollution$ovr65, pollution$mort, main = "mort on ovr65")
p <- locator()  # pause: click in the graphics window to continue
# Precipitation is clearly linearly related to mortality, while the other
# panels show outliers or less clear relationships.

# Socio-economic variables vs mortality.
plot(pollution$popn, pollution$mort, main = "mort on popn")
plot(pollution$educ, pollution$mort, main = "mort on educ")
plot(pollution$hous, pollution$mort, main = "mort on hous")
plot(pollution$dens, pollution$mort, main = "mort on dens")
p <- locator()
# Education level and housing conditions are negatively correlated with
# mortality -- makes sense: richer, more affluent neighborhoods may be
# associated with better health status. Similarly, persons per household
# and population density are positively related to mortality: in less
# affluent neighborhoods the household size may be larger and the
# neighborhood more crowded.

plot(pollution$nonw, pollution$mort, main = "mort on nonw")
plot(pollution$wwdrk, pollution$mort, main = "mort on wwdrk")
plot(pollution$poor, pollution$mort, main = "mort on poor")
plot(pollution$hc, pollution$mort, main = "mort on hc")
p <- locator()
# Mortality is positively correlated with the percentage of nonwhites and
# with the percentage living below the poverty line, and negatively
# correlated with the white-collar working percentage.
# hc is one of the pollution variables -- on the raw scale it is hard to
# see any pattern, so take logs to suppress the extreme scale of hc.
plot(log(pollution$hc), pollution$mort, main = "mort on log(hc)")
plot(log(pollution$nox), pollution$mort, main = "mort on log(nox)")
plot(log(pollution$so), pollution$mort, main = "mort on log(so)")
plot(pollution$humid, pollution$mort, main = "mort on humid")
p <- locator()  # pause: click in the graphics window to continue
# Mortality is positively correlated with all the pollution factors;
# humidity seems to play no significant role.

# New data set with the pollution variables log-transformed.
polltrans <- pollution
polltrans$hc <- log(polltrans$hc)
polltrans$nox <- log(polltrans$nox)
polltrans$so <- log(polltrans$so)

# Split the data into a training and a test set.
# Unblock the above 3 command lines when you run the code yourself.
# ii <- sort(sample(seq_len(nrow(pollution)), 20))
# polltest <- polltrans[ii, ]
# polltrain <- polltrans[-ii, ]

# Full main-effects model on the training data.
mm1 <- lm(mort ~ prec + jant + jult + ovr65 + popn + educ + hous + dens +
            nonw + wwdrk + poor + hc + nox + so + humid,
          data = polltrain)
print(summary(mm1))
par(mfrow = c(2, 2))
plot(mm1)
p <- locator()
# The fit looks pretty good, though there are some outliers in the data
# set: possibly 6, 31, 37, 56. Note these may be different for you since
# your random training subset differs from mine.

# Influence diagnostics: Cook's distance for each training observation.
cooksd <- cooks.distance(mm1)
par(mfrow = c(1, 1))
plot(cooksd, main = "Cook's Distance", type = "h")
# BUGFIX: use df.residual(mm1) explicitly -- the original mm1$df only
# worked through fragile partial matching of the df.residual component.
abline(h = qf(.95, 1, df.residual(mm1)), lty = 2)
p <- locator()
# 37 does stand out (observation 24 in the training set is observation 37
# in the original data set).
# Influence measures for the training-set fit.
par(mfrow = c(2, 2))
lm1 <- lm.influence(mm1)

# Leverage, with the conventional 3p/n reference line.
# BUGFIX: coef(mm1) and nrow(polltrain) -- the original mm1$coef relied
# on fragile $ partial matching of the coefficients component.
plot(lm1$hat, main = "Leverage")
abline(h = 3 * length(coef(mm1)) / nrow(polltrain))
identify(lm1$hat)
# No excessively large leverages.

# Change in individual slopes when each observation is deleted.
# (lm1$coefficients spelled out; the original lm1$coeff used partial
# matching.)
plot(lm1$coefficients[, 6], main = "change in slope 6")
identify(lm1$coefficients[, 6])
plot(lm1$coefficients[, 12], main = "change in slope 12")
identify(lm1$coefficients[, 12])
plot(lm1$coefficients[, 13], main = "change in slope 13")
identify(lm1$coefficients[, 13])

plot(lm1$sigma, main = "Residual MSE when obs is dropped")
identify(lm1$sigma)
p <- locator()  # pause: click in the graphics window to continue
# Note observations 21, 31, 6, 24, 38 in the training set are
# observations 6, 31, 37, 48, 56 in the original data set (as seen in
# the plot(mm1) figure).
#
# Let's drop 6, 31, 37 and 56 -- that is 21, 31, 6, 38 in the training
# data. These observations seem to be problematic both in terms of
# affecting slopes and the MSE estimates.
mm1b <- lm(mort ~ prec + jant + jult + ovr65 + popn + educ + hous + dens +
             nonw + wwdrk + poor + hc + nox + so + humid,
           data = polltrain, subset = -c(6, 21, 31, 38))
print(summary(mm1b))
par(mfrow = c(2, 2))
plot(mm1b)
p <- locator()
# Looks pretty good.

# Backward model search by AIC.
print(step(mm1b))
p <- locator()
# Drop poor and humidity.

# Checking collinearity via the absolute correlation matrix.
par(mfrow = c(1, 1))
image(seq_len(15), seq_len(15), abs(cor(pollution[, 1:15])))
# hc and nox (variables 12 and 13) are very highly correlated (white
# block in the figure). Variables 5, 6, 7 (popn, educ, hous) are also
# highly correlated. Collinearity problems will definitely be an issue!

# New training data without the outliers.
polltrain2 <- polltrain[-c(6, 21, 31, 38), ]

# Here is another run on a different subset of data. Note that you may
# get different variables retained in the model after the backward
# search.
# NOTE: this reuses the name mm1b, overwriting the outlier-free fit
# above -- rename if you need both models later.
iij <- sort(sample(seq_len(nrow(pollution)), 20))
polltestb <- polltrans[iij, ]
polltrainb <- polltrans[-iij, ]

mm1b <- lm(mort ~ prec + jant + jult + ovr65 + popn + educ + hous + dens +
             nonw + wwdrk + poor + hc + nox + so + humid,
           data = polltrainb)
print(summary(mm1b))
par(mfrow = c(2, 2))
plot(mm1b)
p <- locator()
# Looks pretty good.

print(step(mm1b))
p <- locator()