#install.packages('mda')    # if you haven't installed the mda package, this command will do so
#install.packages('lars')
#
library(lars)
library(MASS)    # lm.ridge() lives in MASS
#library(mda)    # mda is not actually used below
#
cars4 <- read.table('cars4.txt', sep='\t', header=TRUE)
print(names(cars4))
#
# standardize the x-variables
xx <- as.matrix(cars4[,-1])
xxs <- apply(xx, 2, function(z) (z - mean(z))/sd(z))
ppr <- prcomp(xxs)    # PCA of the standardized predictors; ppr$rotation holds the loadings
#
par(mfrow=c(1,1))
plot(ppr$sdev^2/sum(ppr$sdev^2), main="Percent variance explained", xlab="Components")
p <- locator()
plot(ppr$rotation[,1], main="loading factors in the PC decomposition")
abline(h=0)
#
print(cbind(names(cars4[,-1]), ppr$rotation[,1]))
p <- locator()
#
par(mfrow=c(2,2))
plot(cars4[,2], cars4[,1], xlab="city", ylab="price")
plot(cars4[,3], cars4[,1], xlab="hwy", ylab="price")
plot(cars4[,6], cars4[,1], xlab="fueltank", ylab="price")
p <- locator()
par(mfrow=c(2,2))
plot(cars4[,7], cars4[,1], xlab="length", ylab="price")
plot(cars4[,8], cars4[,1], xlab="width", ylab="price")
plot(cars4[,9], cars4[,1], xlab="uturn", ylab="price")
plot(cars4[,10], cars4[,1], xlab="luggage", ylab="price")
p <- locator()
plot(cars4[,4], cars4[,1], xlab="rpm.at.max", ylab="price")
plot(cars4[,5], cars4[,1], xlab="engrev.high", ylab="price")
p <- locator()
#
print(round(cor(cars4), 2))
p <- locator()
#
# So it looks like the first principal component is an average of the
# "weight"-type variables, minus an average of the "rpm" measurements.
# Of course, we don't know yet how well these components can predict price...
#
mm1 <- lm(cars4[,1] ~ xxs)
par(mfrow=c(2,2))
plot(mm1)
print(summary(mm1))
p <- locator()
#
# regression on the principal component scores
newxxs <- xxs %*% ppr$rotation
mm2 <- lm(cars4[,1] ~ newxxs)
par(mfrow=c(2,2))
plot(mm2)
print(summary(mm2))
p <- locator()
# we need only the first two components - they explain almost all of the variance...
mm3 <- lm(cars4[,1] ~ newxxs[,1:2])
par(mfrow=c(2,2))
plot(mm3)
print(summary(mm3))
p <- locator()
#
# prediction....
#
iie <- sample(seq(1, dim(cars4)[1]), 40)    # hold out 40 cars as a test set
trcars <- cars4[-iie,]
tecars <- cars4[iie,]
#
# standardize the x-variables; the test set is scaled with the TRAINING
# means and sds, so the training-set coefficients apply to it directly
xx <- as.matrix(trcars[,-1])
trm <- apply(xx, 2, mean)
trs <- apply(xx, 2, sd)
xxs <- scale(xx, center=trm, scale=trs)
xxn <- as.matrix(tecars[,-1])
xxns <- scale(xxn, center=trm, scale=trs)
#
ppr <- prcomp(xxs)
newxxs <- xxs %*% ppr$rotation      # training-set component scores
newxxns <- xxns %*% ppr$rotation    # test-set component scores
#
mm1 <- lm(trcars[,1] ~ xxs)
mm2 <- lm(trcars[,1] ~ newxxs)
mm3 <- lm(trcars[,1] ~ newxxs[,1:2])
#
pred1 <- xxns %*% as.vector(mm1$coef[2:length(mm1$coef)]) + mm1$coef[1]
pred2 <- newxxns %*% as.vector(mm2$coef[2:length(mm2$coef)]) + mm2$coef[1]
pred3 <- newxxns[,1:2] %*% as.vector(mm3$coef[2:length(mm3$coef)]) + mm3$coef[1]
#
pe1 <- sum((tecars[,1] - pred1)^2)
pe2 <- sum((tecars[,1] - pred2)^2)
pe3 <- sum((tecars[,1] - pred3)^2)
print("Prediction errors")
print(c("standard regression ", pe1))
print(c("on principal components ", pe2))
print(c("on first 2 components", pe3))
# You can also consider doing model selection on the PCs
p <- locator()
#
# let's try ridge regression; ridge is fit without an intercept, so center
# the responses - the test response is centered by the TRAINING mean
trcars2 <- trcars
tecars2 <- tecars
trym <- mean(trcars2[,1])
trcars2[,1] <- trcars2[,1] - trym
tecars2[,1] <- tecars2[,1] - trym
#
gm1 <- lm.ridge(trcars2[,1] ~ xxs, lambda=1)
gm2 <- lm.ridge(trcars2[,1] ~ xxs, lambda=3)
gm3 <- lm.ridge(trcars2[,1] ~ xxs, lambda=10)
# make sure to try other lambda values on your data
#
gpred1 <- xxns %*% as.vector(gm1$coef)
gpred2 <- xxns %*% as.vector(gm2$coef)
gpred3 <- xxns %*% as.vector(gm3$coef)
#
gpe1 <- sum((tecars2[,1] - gpred1)^2)
gpe2 <- sum((tecars2[,1] - gpred2)^2)
gpe3 <- sum((tecars2[,1] - gpred3)^2)
print("Prediction errors - Ridge")
print(c("lambda=1 ", gpe1))
print(c("lambda=3 ", gpe2))
print(c("lambda=10 ", gpe3))
p <- locator()
# Try other lambda values ... perhaps even bigger than 10 is better...
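#
# A minimal sketch of scanning a whole grid of lambda values instead of trying
# them one at a time (assumes the xxs/trcars2/xxns/tecars2 objects from above;
# lm.ridge() accepts a vector of lambdas, and MASS's select() reports the
# HKB, L-W and GCV choices of lambda):
gmgrid <- lm.ridge(trcars2[,1] ~ xxs, lambda=seq(0, 50, by=0.5))
plot(gmgrid)      # coefficient traces across the lambda grid
p <- locator()
select(gmgrid)    # prints the HKB, L-W and GCV-minimizing lambdas
ibest <- which.min(gmgrid$GCV)            # gmgrid$GCV is named by lambda value
gpredbest <- xxns %*% as.vector(gmgrid$coef[, ibest])
print(c("GCV-chosen lambda ", names(gmgrid$GCV)[ibest]))
print(c("ridge test error at that lambda ", sum((tecars2[,1] - gpredbest)^2)))
p <- locator()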
#
# lasso via the lars package
lm1 <- lars(xxs, trcars2[,1], type='lasso', trace=TRUE)
plm1 <- predict(lm1, xxns)    # predictions at every step of the regularization path
par(mfrow=c(1,1))
plot(lm1)
p <- locator()
# predict() gives fits for the whole regularization path - i.e. for each step
# at which a variable is dropped from or added to the model.
# Read the figure from left to right: we start with a really strong penalty
# on the betas, so most parameters are set to 0.
# As we go toward the right, the penalty gets smaller and smaller and more
# parameters are allowed to stay in the model.
#
yy <- matrix(rep(tecars2[,1], dim(plm1$fit)[2]), length(tecars2[,1]), dim(plm1$fit)[2])
lpe1 <- apply((yy - plm1$fit)^2, 2, sum)
print("Prediction errors - Lasso")
acts <- as.numeric(unlist(lm1$actions))   # order of entry (as.real() is defunct)
cdf <- as.data.frame(rbind(acts, lpe1[-1]))
names(cdf) <- names(tecars2)[1 + acts]
print(cdf)
# this gives you the prediction errors for the different models - from left
# to right, the variables in the order they are added to the model.
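#
# A minimal sketch of picking the amount of shrinkage by K-fold cross-validation
# on the training set, instead of reading it off the test-error table above
# (assumes the xxs/trcars2/xxns/tecars2/lm1 objects from this script; cv.lars()
# is part of the lars package - note that older lars versions return $fraction
# where newer ones return $index):
set.seed(1)    # make the CV folds reproducible
cvl <- cv.lars(xxs, trcars2[,1], K=10, type='lasso')
sbest <- cvl$index[which.min(cvl$cv)]     # best fraction of the full L1 norm
plmcv <- predict(lm1, xxns, s=sbest, mode='fraction')
print(c("CV-chosen fraction ", sbest))
print(c("lasso test error at that fraction ", sum((tecars2[,1] - plmcv$fit)^2)))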