R6 Principal Components Analysis options(digits=4) # Eigenvalues and eigenvectors A <- matrix(sample(9), nrow=3) A A.e <- eigen(A) A.e A %*% A.e$vectors[,1] A.e$values[1] * A.e$vectors[,1] A %*% A.e$vectors[,2] A.e$values[2] * A.e$vectors[,2] A %*% A.e$vectors[,3] A.e$values[3] * A.e$vectors[,3] # PCA Example X<-read.table("F:\\Stat 5600\\Data\\PCA_EXAMPLE.txt") mean(X) sd(X) # already standardized pairs(X) library(rgl) plot3d(X, type="s", radius=.1, col=rainbow(300)[rank(X[,1])]) rgl.snapshot("C:\\Documents and Settings\\minnotte\\My Documents\\Stat5600\\pca1.png") X.cor <- cor(X) X.cor X.e <- eigen(X.cor) X.e X.svd <- svd(X.cor) X.svd U <- X.e$vectors # Get rotation matrix of eigenvectors t(U) %*% U zapsmall(t(U) %*% U) Z <- X %*% U # Rotated data matrix Z <- as.matrix(X) %*% U pairs(Z) zapsmall(cor(Z)) plot3d(Z, type="s", radius=.1, col=rainbow(300)[rank(X[,1])]) rgl.snapshot("C:\\Documents and Settings\\minnotte\\My Documents\\Stat5600\\pca2.png") plot3d(Z[,1], Z[,2], rep(0,300), type="s", radius=.1, col=rainbow(300)[rank(X[,1])]) # First two principal components rgl.snapshot("C:\\Documents and Settings\\minnotte\\My Documents\\Stat5600\\pca3.png") plot3d(Z[,1], rep(0,300), rep(0,300), type="s", radius=.1, col=rainbow(300)[rank(X[,1])]) # First principal component rgl.snapshot("C:\\Documents and Settings\\minnotte\\My Documents\\Stat5600\\pca4.png") # Rotate back to X-space Z1 <- cbind(Z[,1], rep(0,300), rep(0,300)) X1 <- Z1 %*% t(U) plot3d(X, type="s", radius=.1, col=rainbow(300)[rank(X[,1])], alpha=.3) # Original data spheres3d(X1[,1],X1[,2],X1[,3], radius=.1, color=rainbow(300)[rank(X[,1])]) # First PC rgl.snapshot("C:\\Documents and Settings\\minnotte\\My Documents\\Stat5600\\pca5.png") Z2 <- cbind(rep(0,300), Z[,2], rep(0,300)) X2 <- Z2 %*% t(U) spheres3d(X2[,1],X2[,2],X2[,3], radius=.1, color=rainbow(300)[rank(X[,1])]) # Second PC rgl.snapshot("C:\\Documents and Settings\\minnotte\\My Documents\\Stat5600\\pca6.png") Z3 <- cbind(rep(0,300),rep(0,300), Z[,3]) X3 <- Z3 %*% t(U) spheres3d(X3[,1],X3[,2],X3[,3], radius=.1, color=rainbow(300)[rank(X[,1])]) # Third PC rgl.snapshot("C:\\Documents and Settings\\minnotte\\My Documents\\Stat5600\\pca7.png") Z12 <- cbind(Z[,1:2],rep(0, 300)) X12 <- Z12 %*% t(U) plot3d(X, type="s", radius=.1, col=rainbow(300)[rank(X[,1])], alpha=.3) # Original data spheres3d(X12[,1],X12[,2],X12[,3], radius=.1, color=rainbow(300)[rank(X[,1])]) # First two PCs rgl.snapshot("C:\\Documents and Settings\\minnotte\\My Documents\\Stat5600\\pca8.png") # Fractions of variance lambda <- X.e$values # Eigenvalues of cor(X) = variances of PCs lambda zapsmall(cov(Z)) zapsmall(cor(Z)) diag(cov(Z)) sum(diag(cov(Z))) # Note: sum = number of dimensions = sum of original correlations # The prcomp( ) function X.pc <- prcomp(X, scale=T) X.pc X.pc$rotation # Compare to U above - same except for possible sign reversal U X.pc$sdev # Component sds X.pc$sdev^2 # Component Variances - compare to lambda above lambda pairs(X.pc$x) # Same as Z above, again except for possible sign change # New example: State economic data data(state) gsp.raw<-read.table("F:\\Stat 5600\\Data\\GSP_RAW.txt", header=T) rownames(gsp.raw) <- state.abb pairs(gsp.raw) stars(gsp.raw, key.loc=c(15, 1.5)) cor(gsp.raw) # Strong correlation gsp.raw.pc <- prcomp(gsp.raw, scale=T) gsp.raw.pc$sdev^2 # Variances - almost everything in PC 1 gsp.raw.pc$rotation[,1:2] # Loadings for PCs 1 and 2 # Loadings for PC 1 almost all equal - simply a measure of economic size # COnvert each state's earnings into a percentage for each sector gsp.share <- gsp.raw/apply(gsp.raw, 1, sum)*100 pairs(gsp.share) stars(gsp.share, key.loc=c(15, 1.5)) cor(gsp.share) gsp.share.pc <- prcomp(gsp.share, scale=T) zapsmall(gsp.share.pc$sdev^2) # Variances - much more balanced pairs(gsp.share.pc$x[,1:6]) # First 6 PCs plot(gsp.share.pc$x[,1:2], pch=16) # First 2 plot(gsp.share.pc$x[,1:2], type='n') text(gsp.share.pc$x[,1:2], state.abb) # Loadings - first 6 PCs gsp.share.pc$rotation[,1:6] # Values of some outliers in original units gsp.share[c(2,8,11,28,50),] # biplot - loadings on 1st 2 PCs and plot of first 2 PCs biplot(gsp.share.pc) # How many components? plot(gsp.share.pc) # Scree plot plot(gsp.share.pc$sdev^2, type="o", pch=16) # Alternate Version zapsmall(gsp.share.pc$sdev^2) sum(gsp.share.pc$sdev>1) # Kaiser's Rule - keep components with variance > 1 abline(h=1,col="grey") summary(gsp.share.pc) # Horn's procedure - compare to randomly permuted (bootstrapped) data gsp.boot <- gsp.share for (i in 1:13) { gsp.boot[,i] <- sample(gsp.share[,i], replace=T) } gsp.boot.pc <- prcomp(gsp.boot, scale=T) points(gsp.boot.pc$sdev^2, type="o") # Add to alternate scree plot; compare # Repeat; add different runs in different colors # Another example - planetary data planets1<-read.table("H:/S5600/planets.txt") planets1 pairs(planets1) # Log transform values except rings (add one to number Moons) planets <- cbind(logDist=log(planets1$Dist), logRadius = log(planets1$Radius), logMass =log(planets1$Mass), logDensity=log(planets1$Density), logMoons=log(planets1$Moons+1), Rings=unclass(planets1$Rings)-1) rownames(planets)<-rownames(planets1) planets pairs(planets) planets.pc <- prcomp(planets, scale=T) planets.pc$sdev^2 plot(planets.pc$sdev^2, type="o", pch=16) abline(h=1,col="grey") plot(planets.pc$x[,1:2], pch=16) text(planets.pc$x[,1], planets.pc$x[,2], rownames(planets)) # Note the three clusters planets.pc$rotation[,1:2] biplot(planets.pc)