R13 Canonical Correlation # Belly Dancer Ratings (Artificial) # Top "shimmies/circles" vs Bottom "shimmies/circles" dance <- read.table("E:/S5600/Data sets/B_DANCE.txt", header=T) dance cor(dance) pairs(dance) X <- scale(dance[,1:2]) # Start with standardized data Y <- scale(dance[,3:4]) dance.cc <- cancor(X,Y) dance.cc A <- dance.cc$xcoef*sqrt(7) # sqrt(n-1) gives standardized coefficients B <- dance.cc$ycoef*sqrt(7) # var(U) = var(T) = 1 A B U <- X%*%A # X Variates T <- Y%*%B # Y Variates U T pairs(cbind(U,T)) cov(U) zapsmall(cov(U)) zapsmall(cov(T)) zapsmall(cov(U,T)) zapsmall(cov(X, U)) # Loadings zapsmall(cov(Y, T)) f <- cor(X)%*%A # Alternative Calculation of Loadings f g <- cor(Y)%*%B g pvx <- apply(f^2,2,sum)/2 # Proportion variance of X explained pvx # Per variate sum(pvx) # total pvy <- apply(g^2,2,sum)/2 pvy sum(pvy) rd.xy <- pvy*dance.cc$cor^2 # Redundancy - how much of Y's variance rd.xy # is explained by X variates sum(rd.xy) rd.yx <- pvx*dance.cc$cor^2 # Redundancy - how much of X's variance rd.yx # is explained by Y variates sum(rd.yx) # Same Data - Starting with correlation matrix dance.cor <- cor(dance) # Starting with just correlation matrix Rxx <- dance.cor[1:2,1:2] Ryy <- dance.cor[3:4,3:4] Rxy <- dance.cor[1:2,3:4] Ryx <- dance.cor[3:4,1:2] Ryy.svd <- svd(Ryy) Ryy.irt <- Ryy.svd$u %*% diag(1/sqrt(Ryy.svd$d)) %*% t(Ryy.svd$v) Rxx.svd <- svd(Rxx) Rxx.irt <- Rxx.svd$u %*% diag(1/sqrt(Rxx.svd$d)) %*% t(Rxx.svd$v) K <- Rxx.irt %*% Rxy %*% Ryy.irt K K.svd <- svd(K) cc <- K.svd$d # Canonical Correlations cc A <- Rxx.irt %*% K.svd$u # Standardized Coefficients B <- Ryy.irt %*% K.svd$v f <- Rxx %*% A # Loadings f g <- Ryy %*% B g pvx <- apply(f^2,2,sum)/2 # Proportion variance of X explained pvx pvy <- apply(g^2,2,sum)/2 pvy rd.xy <- pvy*cc^2 # Redundancy - how much of Y's variance rd.xy # is explained by X variates rd.yx <- pvx*cc^2 # Redundancy - how much of X's variance rd.yx # is explained by Y variates # Marketing Data - Structural Features of Groceries vs # Promotional Features market <- read.table("F:/S5600/Data sets/FACTBOOK.txt") head(market) dim(market) cor(market) pairs(market) image(1:10, 1:10, cor(market), zlim=c(-1,1), col=cm.colors(21)) struct <- scale(market[,1:5]) promot <- scale(market[,6:10]) market.cc <- cancor(struct, promot) market.cc$cor n <- nrow(struct) n U <- struct %*% market.cc$xcoef * sqrt(n-1) T <- promot %*% market.cc$ycoef * sqrt(n-1) zapsmall(cov(U,T)) # Wilks's Lambda Lambda <- prod(1-market.cc$cor^2) Lambda # Barlett's Test of Model Significance p <- ncol(struct) q <- ncol(promot) V <- -((n-1)-(p+q+1)/2)*log(Lambda) V p*q pchisq(V, p*q, lower=F) # Bartlett's Test for second and higher pairs Lambda <- prod(1-market.cc$cor[-1]^2) Lambda p <- ncol(struct)-1 q <- ncol(promot)-1 V <- -((n-1)-(p+q+1)/2)*log(Lambda) V p*q pchisq(V, p*q, lower=F) # Bartlett's Test for third and higher pairs Lambda <- prod(1-market.cc$cor[-(1:2)]^2) Lambda p <- ncol(struct)-2 q <- ncol(promot)-2 V <- -((n-1)-(p+q+1)/2)*log(Lambda) V p*q pchisq(V, p*q, lower=F) # Bartlett's Test for fourth and fifth pairs Lambda <- prod(1-market.cc$cor[-(1:3)]^2) Lambda p <- ncol(struct)-3 q <- ncol(promot)-3 V <- -((n-1)-(p+q+1)/2)*log(Lambda) V p*q pchisq(V, p*q, lower=F) # Alternative: "Scree" plot of r^2 for pairs plot(market.cc$cor^2, type="o", pch=16) U <- U[,1:3] T <- T[,1:3] f <- cor(struct, U) f g <- cor(promot, T) g plot(U[,1],T[,1], pch=16) identify(U[,1],T[,1],rownames(U)) plot(U[,2],T[,2], pch=16) identify(U[,2],T[,2],rownames(U)) pvx <- apply(f^2,2,sum)/5 # Proportion variance of X explained pvx # Per variate sum(pvx) # total pvy <- apply(g^2,2,sum)/5 pvy sum(pvy) rd.xy <- pvy*market.cc$cor[1:3]^2 # Redundancy - how much of Y's variance rd.xy # is explained by X variates sum(rd.xy) rd.yx <- pvx*market.cc$cor[1:3]^2 # Redundancy - how much of X's variance rd.yx # is explained by Y variates sum(rd.yx) # Exam Scores - Closed- vs. Open-book library(bootstrap) # must be installed - only used for exam dataset data(scor) head(scor) dim(scor) pairs(scor) closed <- scale(scor[,1:2]) open <- scale(scor[,3:5]) scor.cc <- cancor(closed, open) scor.cc$cor n <- nrow(closed) n U <- closed %*% scor.cc$xcoef*sqrt(n-1) T <- open %*% scor.cc$ycoef*sqrt(n-1) zapsmall(cov(U,T)) f <- cor(closed, U) f g <- cor(open, T) g plot(U[,1],T[,1], pch=16) plot(U[,2],T[,2], pch=16) Lambda <- prod(1-scor.cc$cor^2) # Test first pair Lambda p <- ncol(closed) q <- ncol(open) V <- -((n-1)-(p+q+1)/2)*log(Lambda) V p*q pchisq(V, p*q, lower=F) Lambda <- prod(1-scor.cc$cor[-1]^2) # Test second pair Lambda p <- ncol(closed)-1 q <- ncol(open)-1 V <- -((n-1)-(p+q+1)/2)*log(Lambda) V p*q pchisq(V, p*q, lower=F) pvx <- apply(f^2,2,sum)/2 # Proportion variance of X explained pvx # Per variate pvy <- apply(g^2,2,sum)/3 pvy rd.xy <- pvy[1:2]*scor.cc$cor^2 # Redundancy - Y from X rd.xy rd.yx <- pvx*scor.cc$cor^2 # Redundancy - X from Y rd.yx # Same Data - Starting with correlation matrix scor.cor <- cor(scor) # Starting with just correlation matrix Rxx <- scor.cor[1:2,1:2] Ryy <- scor.cor[3:5,3:5] Rxy <- scor.cor[1:2,3:5] Ryx <- scor.cor[3:5,1:2] Ryy.svd <- svd(Ryy) Ryy.irt <- Ryy.svd$u %*% diag(1/sqrt(Ryy.svd$d)) %*% t(Ryy.svd$v) Rxx.svd <- svd(Rxx) Rxx.irt <- Rxx.svd$u %*% diag(1/sqrt(Rxx.svd$d)) %*% t(Rxx.svd$v) K <- Rxx.irt %*% Rxy %*% Ryy.irt K K.svd <- svd(K) cc <- K.svd$d # Canonical Correlations cc A <- Rxx.irt %*% K.svd$u # Standardized Coefficients B <- Ryy.irt %*% K.svd$v f <- Rxx %*% A # Loadings f g <- Ryy %*% B g Lambda <- prod(1-cc^2) # Test first pair Lambda p <- 2 q <- 3 n <- 88 V <- -((n-1)-(p+q+1)/2)*log(Lambda) V p*q pchisq(V, p*q, lower=F) Lambda <- prod(1-cc[-1]^2) # Test second pair Lambda p <- 2-1 q <- 3-1 V <- -((n-1)-(p+q+1)/2)*log(Lambda) V p*q pchisq(V, p*q, lower=F) pvx <- apply(f^2,2,sum)/2 # Proportion variance of X explained pvx pvy <- apply(g^2,2,sum)/3 pvy rd.xy <- pvy*cc^2 # Redundancy - Y from X rd.xy rd.yx <- pvx*cc^2 # Redundancy - X from Y rd.yx