#install.packages('mda')    # if you haven't installed the mda package, this command will do so
#install.packages('lars')
#
library(lars)
library(MASS)    # lm.ridge() lives in MASS
#library(mda)    # mda is not actually used below
#
cars4 <- read.table('cars4.txt', sep='\t', header=TRUE)
print(names(cars4))
#
# standardize the x-variables
xx <- as.matrix(cars4[,-1])
xxs <- apply(xx, 2, function(z) (z - mean(z))/sd(z))
ppr <- prcomp(xxs)    # PCA of the standardized predictors; ppr$rotation holds the loadings
#
par(mfrow=c(1,1))
plot(ppr$sdev^2/sum(ppr$sdev^2), main="Percent variance explained", xlab="Components")
p <- locator()
plot(ppr$rotation[,1], main="loading factors in the PC decomposition")
abline(h=0)
#
print(cbind(names(cars4[,-1]), ppr$rotation[,1]))
p <- locator()
#
par(mfrow=c(2,2))
plot(cars4[,2], cars4[,1], xlab="city", ylab="price")
plot(cars4[,3], cars4[,1], xlab="hwy", ylab="price")
plot(cars4[,6], cars4[,1], xlab="fueltank", ylab="price")
p <- locator()
par(mfrow=c(2,2))
plot(cars4[,7], cars4[,1], xlab="length", ylab="price")
plot(cars4[,8], cars4[,1], xlab="width", ylab="price")
plot(cars4[,9], cars4[,1], xlab="uturn", ylab="price")
plot(cars4[,10], cars4[,1], xlab="luggage", ylab="price")
p <- locator()
plot(cars4[,4], cars4[,1], xlab="rpm.at.max", ylab="price")
plot(cars4[,5], cars4[,1], xlab="engrev.high", ylab="price")
p <- locator()
#
print(round(cor(cars4), 2))
p <- locator()
#
# So it looks like the first principal component is an average of the
# "weight"-type variables, minus an average of the "rpm" measurements.
# Of course, we don't know yet how well these components can predict price...
#
mm1 <- lm(cars4[,1] ~ xxs)
par(mfrow=c(2,2))
plot(mm1)
print(summary(mm1))
p <- locator()
#
# regression on the principal component scores
newxxs <- xxs %*% ppr$rotation
mm2 <- lm(cars4[,1] ~ newxxs)
par(mfrow=c(2,2))
plot(mm2)
print(summary(mm2))
p <- locator()
# we need only the first two components - they explain almost all of the variance...
mm3 <- lm(cars4[,1] ~ newxxs[,1:2])
par(mfrow=c(2,2))
plot(mm3)
print(summary(mm3))
p <- locator()
#
# prediction....
#
iie <- sample(seq(1, dim(cars4)[1]), 40)    # hold out 40 cars as a test set
trcars <- cars4[-iie,]
tecars <- cars4[iie,]
#
# standardize the x-variables; the test set is scaled with the TRAINING
# means and sds, so the training-set coefficients apply to it directly
xx <- as.matrix(trcars[,-1])
trm <- apply(xx, 2, mean)
trs <- apply(xx, 2, sd)
xxs <- scale(xx, center=trm, scale=trs)
xxn <- as.matrix(tecars[,-1])
xxns <- scale(xxn, center=trm, scale=trs)
#
ppr <- prcomp(xxs)
newxxs <- xxs %*% ppr$rotation      # training-set component scores
newxxns <- xxns %*% ppr$rotation    # test-set component scores
#
mm1 <- lm(trcars[,1] ~ xxs)
mm2 <- lm(trcars[,1] ~ newxxs)
mm3 <- lm(trcars[,1] ~ newxxs[,1:2])
#
pred1 <- xxns %*% as.vector(mm1$coef[2:length(mm1$coef)]) + mm1$coef[1]
pred2 <- newxxns %*% as.vector(mm2$coef[2:length(mm2$coef)]) + mm2$coef[1]
pred3 <- newxxns[,1:2] %*% as.vector(mm3$coef[2:length(mm3$coef)]) + mm3$coef[1]
#
pe1 <- sum((tecars[,1] - pred1)^2)
pe2 <- sum((tecars[,1] - pred2)^2)
pe3 <- sum((tecars[,1] - pred3)^2)
print("Prediction errors")
print(c("standard regression ", pe1))
print(c("on principal components ", pe2))
print(c("on first 2 components", pe3))
# You can also consider doing model selection on the PCs
p <- locator()
#
# let's try ridge regression; ridge is fit without an intercept, so center
# the responses - the test response is centered by the TRAINING mean
trcars2 <- trcars
tecars2 <- tecars
trym <- mean(trcars2[,1])
trcars2[,1] <- trcars2[,1] - trym
tecars2[,1] <- tecars2[,1] - trym
#
gm1 <- lm.ridge(trcars2[,1] ~ xxs, lambda=1)
gm2 <- lm.ridge(trcars2[,1] ~ xxs, lambda=3)
gm3 <- lm.ridge(trcars2[,1] ~ xxs, lambda=10)
# make sure to try other lambda values on your data
#
gpred1 <- xxns %*% as.vector(gm1$coef)
gpred2 <- xxns %*% as.vector(gm2$coef)
gpred3 <- xxns %*% as.vector(gm3$coef)
#
gpe1 <- sum((tecars2[,1] - gpred1)^2)
gpe2 <- sum((tecars2[,1] - gpred2)^2)
gpe3 <- sum((tecars2[,1] - gpred3)^2)
print("Prediction errors - Ridge")
print(c("lambda=1 ", gpe1))
print(c("lambda=3 ", gpe2))
print(c("lambda=10 ", gpe3))
p <- locator()
# Try other lambda values ... perhaps even bigger than 10 is better...
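#
# A minimal sketch of scanning a whole grid of lambda values instead of trying
# them one at a time (assumes the xxs/trcars2/xxns/tecars2 objects from above;
# lm.ridge() accepts a vector of lambdas, and MASS's select() reports the
# HKB, L-W and GCV choices of lambda):
gmgrid <- lm.ridge(trcars2[,1] ~ xxs, lambda=seq(0, 50, by=0.5))
plot(gmgrid)      # coefficient traces across the lambda grid
p <- locator()
select(gmgrid)    # prints the HKB, L-W and GCV-minimizing lambdas
ibest <- which.min(gmgrid$GCV)            # gmgrid$GCV is named by lambda value
gpredbest <- xxns %*% as.vector(gmgrid$coef[, ibest])
print(c("GCV-chosen lambda ", names(gmgrid$GCV)[ibest]))
print(c("ridge test error at that lambda ", sum((tecars2[,1] - gpredbest)^2)))
p <- locator()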
#
# lasso via the lars package
lm1 <- lars(xxs, trcars2[,1], type='lasso', trace=TRUE)
plm1 <- predict(lm1, xxns)    # predictions at every step of the regularization path
par(mfrow=c(1,1))
plot(lm1)
p <- locator()
# predict() gives fits for the whole regularization path - i.e. for each step
# at which a variable is dropped from or added to the model.
# Read the figure from left to right: we start with a really strong penalty
# on the betas, so most parameters are set to 0.
# As we go toward the right, the penalty gets smaller and smaller and more
# parameters are allowed to stay in the model.
#
yy <- matrix(rep(tecars2[,1], dim(plm1$fit)[2]), length(tecars2[,1]), dim(plm1$fit)[2])
lpe1 <- apply((yy - plm1$fit)^2, 2, sum)
print("Prediction errors - Lasso")
acts <- as.numeric(unlist(lm1$actions))   # order of entry (as.real() is defunct)
cdf <- as.data.frame(rbind(acts, lpe1[-1]))
names(cdf) <- names(tecars2)[1 + acts]
print(cdf)
# this gives you the prediction errors for the different models - from left
# to right, the variables in the order they are added to the model.
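#
# A minimal sketch of picking the amount of shrinkage by K-fold cross-validation
# on the training set, instead of reading it off the test-error table above
# (assumes the xxs/trcars2/xxns/tecars2/lm1 objects from this script; cv.lars()
# is part of the lars package - note that older lars versions return $fraction
# where newer ones return $index):
set.seed(1)    # make the CV folds reproducible
cvl <- cv.lars(xxs, trcars2[,1], K=10, type='lasso')
sbest <- cvl$index[which.min(cvl$cv)]     # best fraction of the full L1 norm
plmcv <- predict(lm1, xxns, s=sbest, mode='fraction')
print(c("CV-chosen fraction ", sbest))
print(c("lasso test error at that fraction ", sum((tecars2[,1] - plmcv$fit)^2)))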