# X Salt Property Value Data (Chapter 3 in book) X<-read.table("B:/www/S5600S06/Data/LESLIE_SALT.txt",header=T) dim(X) X[1:10,] hist(X$Price, freq=F) lines(density(X$Price, bw="SJ"), col="red") hist(log(X$Price), freq=F) lines(density(log(X$Price), bw="SJ"), col="red") X <- cbind(logPrice = log(X$Price),X) head(X) cor(X) # Simple linear regression plot(logPrice ~ Date, data=X, pch=16) reg1 <- lm(logPrice ~ Date, data=X) abline(reg1) reg1 summary(reg1) # Adding a second independent variable pairs(X[,c(1,5,7)]) library(rgl) plot3d( X$Date, X$Elevation, X$logPrice, type="s", size=.25) rgl.snapshot("C:\\Documents and Settings\\minnotte\\My Documents\\Stat5600\\reg1.png") reg2 <- lm(logPrice ~ Date + Elevation, data=X) # Note how we add a variable to our model summary(reg2) corners <- data.frame(Date=c(0,0,-110,-100), Elevation=c(0,20,20,0)) predict(reg2, corners) quads3d(corners$Date, corners$Elevation, predict(reg2, corners), col="green", alpha=.5) # Model with additional main effects pairs(X[,c(1, 5, 7, 8, 9)]) reg3 <- lm(logPrice ~ Date + Elevation + Flood + Distance, data=X) summary(reg3) # Examine Residual Plot plot(fitted.values(reg3), residuals(reg3), pch=16) abline(h=0, col="grey") # Look for autocorrelation (time series data) plot(residuals(reg3), type="o", pch=16) abline(h=0, col="grey") # Measures of influential points influence.measures(reg3) plot(hatvalues(reg3), type="h") plot(rstudent(reg3), type="h") matplot(dfbetas(reg3), pch=1:5) legend("bottomright", colnames(dfbetas(reg3)), pch=1:5, col=1:5) # Partial Regression Leverage Plots termplot(reg3,partial=T,col.res="black") # Slope Interaction Terms X$County <- factor(X$County, labels=c("San Mateo","Santa Clara")) plot(logPrice ~ Elevation, col = unclass(County), data = X, pch=unclass(County)) legend("bottomright", levels(X$County), pch=1:2, col=1:2) abline(lm(logPrice ~ Elevation, data = X, subset= County=="San Mateo")) abline(lm(logPrice ~ Elevation, data = X, subset= County=="Santa Clara"), col=2) reg4a <- lm(logPrice ~ Date + Elevation + Flood + Distance + County + County:Elevation, data=X) # Note County:Elevation is interaction term reg4a reg4 <- lm(logPrice ~ Date + Flood + Distance + Elevation*County, data=X) # Elevation*County includes both main effects, plus interaction summary(reg4) summary(reg3) # Testing the difference of models anova(reg4, reg3) # Forecasting X0 <- data.frame(County = "Santa Clara", Elevation = 0, Flood = 0, Distance = 0, Date = 0) X0 # Values for Leslie Salt property Yhat <- predict(reg4, X0, se.fit=T, interval="prediction") Yhat exp(Yhat$fit) # Naive back-transformation sigma2 <- summary(reg4)$sigma^2 exp(Yhat$fit[1]+sigma2/2) # Better; E(exp(Yhat)) (lognormal) # Note: Your book assumes the same formula works for base-10 logs; it doesn't. # You can get the same results using base 10 as follows: X$log10Price <- log10(X$Price) reg5 <- lm(log10Price ~ Date + Flood + Distance + Elevation*County, data=X) Yhat.10 <- predict(reg5, X0, se.fit=T, interval="prediction") 10^Yhat.10$fit # Naive, compare to naive above sigma2.10 <- summary(reg6)$sigma^2 10^(Yhat.10$fit[1]+sigma2.10/2) # Book’s “corrected” answer exp(log(10)*Yhat.10$fit[1]+log(10)^2*sigma2.10/2) # Actually correct