#######################################################################
# #
# Data Image S-plus Functions #
# #
# Save as text, then source into S-plus #
# #
# Copyright 1998, Michael C. Minnotte #
# #
# For assistance, or to report bugs, please contact: #
# #
# minnotte@math.usu.edu #
# #
# Last updated, November 18, 1998 #
# #
# A data image is a way to look at moderately high-dimensional data #
# sets for structure across many variables. It was developed by Mike #
# Minnotte and Webster West, and owes a lot to the color histogram #
# ideas of Ed Wegman. #
# #
# For more information about data images, see: #
# #
# "The Data Image: A Tool for Exploring High Dimensional Data" #
# #
# Functions include: #
# #
# dataimage - main function; calculates (and optionally plots) #
# data image based on any of several sorting schemes, for both #
# observations and variables #
# diplot - data image plotting function; usually called automatically #
# from dataimage #
# distmat - calculate a distance matrix #
# hco - calculate a heirarchical clustering ordering #
# isto - calculate an ordering based on spanning tours using furthest #
# or nearest insertion #
# #
#######################################################################
dataimage<-function(data,obs.sort="complete",var.sort="complete",std=T,
obs.met="euclidean",var.met="euclidean",doplot=T,
maxv=apply(data,2,max),minv=apply(data,2,min),var.l=T,obs.l=T,
lab.l=dimnames(data)[[2]],lab.r=dimnames(data)[[2]],
lab.t=dimnames(data)[[1]],lab.b=dimnames(data)[[1]])
{# calculate (and optionally plot) sortings for dataimage (color histogram)
# data - matrix or data frame of observations (rows) and variables (columns)
# obs.sort - method of sorting observations. One of:
# "none" - leave in original ordering
# "complete" - heirarchical clustering, where cluster distances are
# measured as max of point distances
# "single" - heirarchical clustering, where cluster distances are
# measured as min of point distances
# "average" - heirarchical clustering, where cluster distances are
# measured as average of point distances
# "farthest" - farthest insertion spanning tour
# "nearest" - nearest insertion spanning tour
# k (numeric) - sort on variable k
# var.sort - method of sorting variables, as obs.sort
# std - if T, standardize all variables before sorting
# obs.met - distance metric for observations. One of:
# "euclidean" - L2
# "manhattan" - L1
# "maximum" - L-Infinity
# var.met - distance metric for variables, as obs.met
# doplot - if T, plot data image. If false, return list consisting of
# obs.ord: vector of observation orderings
# var.ord: vector of variable orderings
# data: original data matrix
# maxv - vector of maximums for each variable
# minv - vector of minimums for each variable
# maxv, minv may be passed to keep color scale the same between images
# var.l - if T, label variables in plot
# obs.l - if T, label observations in plot
# lab.l - character vector for variable labels (left side)
# lab.r - character vector for variable labels (right side)
# lab.t - character vector for observation labels (top)
# lab.b - character vector for observation labels (bottom)
no<-nrow(data)
nv<-ncol(data)
dat2<-data
if (std)
{means<-apply(data,2,mean)
sds<-sqrt(apply(data,2,var))
for (i in 1:nv)
if (sds[i] > 0)
{dat2[,i]<-(dat2[,i]-means[i])/sds[i]}
else
{dat2[,i]<-rep(0,no)}
}
om<-charmatch(obs.met,c("euclidean","maximum","manhattan"),nomatch=-1)
if (om<1)
{cat("obs.met not matched. Using Euclidean distances.\n")
obs.met<-"euclidean"}
if (is.numeric(obs.sort) & obs.sort>0 & obs.sort<=nv)
{os<-7
obs.sort<-ceiling(obs.sort)}
else
{os<-charmatch(as.character(obs.sort),c("none","complete","single",
"average","farthest","nearest"),nomatch=-1)
if (os < 1)
{cat("obs.sort not matched. Using complete linkage clustering.\n")
os<-2}
}
if (os > 1 & os < 7) dm<-distmat(dat2,obs.met)
obs.ord<-switch(os,1:no,hco(dm,"c"),hco(dm,"s"),hco(dm,"a"),
isto(dm,"f"),isto(dm,"n"),order(dat2[,obs.sort]))
vm<-charmatch(var.met,c("euclidean","maximum","manhattan"),nomatch=-1)
if (vm<1)
{cat("var.met not matched. Using Euclidean distances.\n")
var.met<-"euclidean"}
if (is.numeric(var.sort) & var.sort>0 & var.sort<=no)
{vs<-7
var.sort<-ceiling(var.sort)}
else
{vs<-charmatch(var.sort,c("none","complete","single","average",
"farthest","nearest"),nomatch=-1)
if (vs < 1)
{cat("var.sort not matched. Using complete linkage clustering.\n")
vs<-2}
}
if (vs > 1 & vs < 7) dm<-distmat(t(dat2),var.met)
var.ord<-switch(vs,1:nv,hco(dm,"c"),hco(dm,"s"),hco(dm,"a"),
isto(dm,"f"),isto(dm,"n"),order(dat2[var.sort,]))
if (doplot)
{diplot(list(obs.ord=obs.ord,var.ord=var.ord,data=data),
maxv,minv,var.l,obs.l,lab.l,lab.r,lab.t,lab.b)
return()}
else
return(obs.ord=obs.ord,var.ord=var.ord,data=data)
}
#######################################################################
diplot<-function(dimage,maxv=apply(dimage$data,2,max),
minv=apply(dimage$data,2,min),
var.l=T,obs.l=T,lab.l=NULL,lab.r=NULL,lab.t=NULL,
lab.b=NULL,spc=rep(1,4))
{# plot dataimage (color histogram)
# dimage - output from dataimage function
# maxv - vector of maximums for each variable
# minv - vector of minimums for each variable
# maxv, minv may be passed to keep color scale the same between images
# var.l - if T, label variables in plot
# obs.l - if T, label observations in plot
# lab.l - character vector for variable labels (left side)
# lab.r - character vector for variable labels (right side)
# lab.t - character vector for observation labels (top)
# lab.b - character vector for observation labels (bottom)
# spc - vector of space to leave on each side of labels on (bottom, left, top,
# right)
arrangex<-dimage$obs.ord
arrangev<-dimage$var.ord
data<-dimage$data
if (is.null(lab.t))
lab.t<-dimnames(data)[[1]]
if (is.null(lab.b))
lab.b<-dimnames(data)[[1]]
if (is.null(lab.l))
lab.l<-dimnames(data)[[2]]
if (is.null(lab.r))
lab.r<-dimnames(data)[[2]]
nx<-nrow(data)
nv<-ncol(data)
if (length(lab.t)< 1) lab.t<-1:nx
if (length(lab.b)< 1) lab.b<-1:nx
if (length(lab.l)< 1) lab.l<-1:nv
if (length(lab.r)< 1) lab.r<-1:nv
if (!var.l)
{lab.l<-rep("",nv)
lab.r<-rep("",nv)}
if (!obs.l)
{lab.t<-rep("",nx)
lab.b<-rep("",nx)}
if (spc[1]>0)
{add<-NULL
for (i in 1:spc[1]) add<-paste(add," ",sep="")
lab.b<-paste(add,lab.b,add,sep="")
}
if (spc[2]>0)
{add<-NULL
for (i in 1:spc[2]) add<-paste(add," ",sep="")
lab.l<-paste(add,lab.l,add,sep="")
}
if (spc[3]>0)
{add<-NULL
for (i in 1:spc[3]) add<-paste(add," ",sep="")
lab.t<-paste(add,lab.t,add,sep="")
}
if (spc[4]>0)
{add<-NULL
for (i in 1:spc[4]) add<-paste(add," ",sep="")
lab.r<-paste(add,lab.r,add,sep="")
}
lab.t<-as.character(lab.t)
ltf<-(nchar(lab.t))
ltn<-max(ltf)
for (i in 1:length(lab.t))
lab.t[i]<-paste(substring(lab.t[i],1:nchar(lab.t[i]),1:nchar(lab.t[i])),collapse="\n")
lab.b<-as.character(lab.b)
lbn<-max(nchar(lab.b))
for (i in 1:length(lab.b))
lab.b[i]<-paste(substring(lab.b[i],1:nchar(lab.b[i]),1:nchar(lab.b[i])),collapse="\n")
lab.l<-as.character(lab.l)
lln<-max(nchar(lab.l))
lab.r<-as.character(lab.r)
lrn<-max(nchar(lab.r))
for (i in 1:nv)
if (maxv[i]!=minv[i])
{data[,i]<-(data[,i]-minv[i])/(maxv[i]-minv[i])}
else
{only<-(data[,i]==maxv[i])
data[,i]<-rep(2,nx)
data[only,i]<-0.5}
par(mar=c(lbn,lln/2,ltn,lrn/2),xaxs="i",yaxs="i")
image(0:nx,0:nv,data[arrangex,arrangev],lab=c(0,0,0),bty="n",axes=F,zlim=c(0,1))
mtext(lab.b[arrangex],1,at=1:nx-.5)
for (i in 1:nx)
mtext(lab.t[arrangex[i]],3,at=i-.5,line=ltf[arrangex[i]]-1)
mtext(lab.l[arrangev],2,at=1:nv-.5,adj=1)
mtext(lab.r[arrangev],4,at=1:nv-.5,adj=0)
}
#######################################################################
distmat<-function(data,met="euclidean")
{# use function dist to compute nrow(data)*nrow(data) matrix of interpoint
# distances.
distvec<-dist(data,metric=met)
n <- attr(distvec, "Size")
full <- matrix(0, n, n)
full[lower.tri(full)] <- distvec
return(full + t(full))}
#######################################################################
hco<-function(dm,method="complete")
{#heirarchical clustering ordering for points with distance matrix dm
# method should be one of "complete", "single", or "average"
# see dataimage for differences
meth<-charmatch(method,c("single","average","complete"),nomatch=-1)
if (meth < 1) meth <- 3
maxdist<-2*max(dm)
pointdist<-dm
clustdist<-dm
n<-ncol(dm)
clustdist[cbind(1:n,1:n)]<-maxdist
height<-NULL
clust<-as.list(1:n)
for (i in 1:(n-1))
{
lt<-clustdist[lower.tri(clustdist)]
mindist<-min(lt)
newmerge<-cbind(row(clustdist)[clustdist==mindist],
col(clustdist)[clustdist==mindist])[1,]
height<-c(height,mindist)
# merge clusters newmerge[1], newmerge[2]
oldclust1<-clust[[newmerge[1]]]
oldclust2<-clust[[newmerge[2]]]
len1<-length(oldclust1)
len2<-length(oldclust2)
linkdist<-pointdist[c(oldclust1[1],oldclust1[len1]),
c(oldclust2[1],oldclust2[len2])]
minlink<-min(linkdist)
if (linkdist[1,1]==minlink)
newclust<-c(rev(oldclust1),oldclust2)
else if (linkdist[2,1]==minlink)
newclust<-c(oldclust1,oldclust2)
else if (linkdist[1,2]==minlink)
newclust<-c(rev(oldclust1),rev(oldclust2))
else
newclust<-c(oldclust1,rev(oldclust2))
clust<-c(clust[-newmerge],list(newclust))
if(i<(n-2))
{oldclustdist<-clustdist[-newmerge,-newmerge]
if (meth==1)
newclustdist<-apply(clustdist[-newmerge,newmerge],1,min)
else if (meth==3)
newclustdist<-apply(clustdist[-newmerge,newmerge],1,max)
else
{newclustdist<-rep(0,n-i-1)
for (j in 1:(n-i-1))
newclustdist[j]<-mean(pointdist[newclust,clust[[j]]])}
clustdist<-rbind(cbind(oldclustdist,newclustdist),
cbind(t(newclustdist),maxdist))}
else if(i==(n-2))
{if (meth==1)
newclustdist<-min(clustdist[-newmerge,newmerge])
else if (meth==3)
newclustdist<-max(clustdist[-newmerge,newmerge])
else
newclustdist<-mean(pointdist[newclust,-newclust])
clustdist<-matrix(c(maxdist,rep(newclustdist,2),maxdist),2,2)}
}
return(clust[[1]])}
#######################################################################
isto<-function(dm,method="farthest")
{# spanning tour orderings for points with distance matrix dm
# based on furthest or nearest insertion method
# method should be one of "farthest" or "nearest"
# see dataimage for differences
meth<-charmatch(method,c("farthest", "nearest"),nomatch=-1)
if (meth < 1) meth <- 1
n<-nrow(dm)
dist0<-sqrt(apply(dm^2,1,sum))
arrange<-min((1:n)[dist0==max(dist0)])
for (i in 1:(n-1))
{if (meth==2)
new<-min((1:n)[-arrange][apply(matrix(dm[arrange,-arrange],nrow=i),
2,min)==min(apply(matrix(dm[arrange,-arrange],nrow=i),2,min))])
else
new<-min((1:n)[-arrange][apply(matrix(dm[arrange,-arrange],nrow=i),
2,min)==max(apply(matrix(dm[arrange,-arrange],nrow=i),2,min))])
if (i > 1)
{distadd<-dm[new,arrange]+dm[new,arrange[c(i,1:(i-1))]]-
dm[cbind(arrange,arrange[c(i,1:i-1)])]
place<-min((1:i)[distadd==min(distadd)])
if (place==1)
arrange<-c(new,arrange)
else
arrange<-c(arrange[1:(place-1)],new,arrange[place:i])
}
else
arrange<-c(arrange,new)
}
dist<-dm[cbind(arrange,arrange[c(n,1:(n-1))])]
big<-(1:n)[dist==max(dist)]
if (big > 1)
arrange<-arrange[c(big:n,1:(big-1))]
return(arrange)
}