###############################################################################
# #
# Mode Tree S-plus Functions #
# #
# Save this file as text, then source into S-plus. #
# #
# These functions will allow the user to generate univariate mode trees, as #
# described in Minnotte and Scott, (1993) "The Mode Tree: A Tool for #
# Visualization of Nonparametric Density Features," Journal of Computational #
# and Graphical Statistics, 2, 51-68. #
# #
# The primary functions are the first four. #
# #
# modetree - generates a basic mode tree including only modes and branchings #
# enhmtree - generates an enhanced mode tree including modes, branchings, #
# mode sizes, bumps, and antimodes #
# modetreeplot - plots a basic mode tree (if doplot=T, called automatically #
# from modetree) #
# enhmtreeplot - plots an enhanced mode tree (if doplot=T, called #
# automatically from enhmtree) #
# #
# The remaining functions exist primarily to be called from the above #
# functions, but may be useful themselves. #
# #
# bin1 - bins a data set into nbin bins for use in nash1 #
# nash1 - approximates a kernel density estimate with normal kernel using #
# an appropriately weighted average shifted histogram (ASH) #
# nicerange - range of a data set extended on each side (default: 10%) #
# matcher - matches elements from two lists as described in the appendix #
# to Minnotte and Scott #
# quadest - given three ordered pairs, finds the extremum of the interpolated #
# parabola #
# findmodes - given sequence of values representing a density function, #
# identify number of modes, locations of modes, and indices of modes #
# and antimodes #
# findbumps - given sequence of values representing a density function, #
# identify bumps (regions of negative second derivative) #
# #
# All functions are copyright 1993, Michael C. Minnotte except bin1 and #
# nicerange, copyright 1986, David W. Scott, and nash1, copyright 1993 David #
# W. Scott and Michael C. Minnotte. #
# #
# The authors may be contacted at minnotte@math.usu.edu and scottdw@rice.edu. #
###############################################################################
modetree<-
function(X, ran = c(diff(nicerange(X))/4, diff(nicerange(X))/400),
maintit = "", xlb="", nbin = 500, nh = 200, doplot = T, docat=T)
# modetree - generates a basic mode tree including only modes and branchings
# X - raw (unbinned) data; ran - largest and smallest bandwidths; maintit -
# main title on plot; xlb - x label on plot; nbin - number of bins for data
# and ASH; nh - number of different bandwidths; doplot - plot tree on
# current device if TRUE; docat - indicate status if TRUE
# Copyright 1993, Michael C. Minnotte
{
tree <- NULL
modes <- NULL
bins <- bin1(X, nbin = nbin)
maxh <- ran[1]
minh <- ran[2]
loghlist <- seq(log(maxh), log(minh), length = nh)
n <- length(X)
hlist <- exp(loghlist)
dh<- hlist[2]/hlist[1]
est <- nash1(bins, hlist[1])
oldmodes <- findmodes(est$x, est$y)
tree <- cbind(oldmodes$modes[, 1], rep(hlist[1], oldmodes$nm), rep(
0, oldmodes$nm), rep(0, oldmodes$nm))
count <- 0
for(i in 2:length(hlist)) {
if(docat)
cat("h number ", i, " starting.\n")
est <- nash1(bins, hlist[i])
newmodes <- findmodes(est$x, est$y)
mout <- matcher(newmodes$modes[, 1], oldmodes$modes[, 1])
for(j in 1:newmodes$nm) {
if(mout[j, 1] <= oldmodes$nm)
tree <- rbind(tree, c(newmodes$modes[j, 1],
hlist[i], count + mout[j, 1], 1))
else {
oldfrom <- (1:oldmodes$nm)[newmodes$modes[
j, 2] > oldmodes$am[, 1] & newmodes$
modes[j, 2] < oldmodes$am[, 2]]
newfrom <- mout[oldfrom, 2]
tree <- rbind(tree, c(newmodes$modes[j, 1],
hlist[i], count + oldmodes$nm + newfrom,
2))
}
}
count <- count + oldmodes$nm
oldmodes <- newmodes
}
if(doplot)
modetreeplot(tree, maintit, xlb)
return(tree)
}
###############################################################################
enhmtree<-
function(X, ran = c(diff(nicerange(X))/4, diff(nicerange(X))/200),
maintit = "", xlb = "", nbin = 500, nh = 200, doplot = T, docat = T)
# enhmtree - generates an enhanced mode tree including modes, branchings,
# mode sizes, bumps, and antimodes
# X - raw (unbinned) data; ran - largest and smallest bandwidths; maintit -
# main title on plot; xlb - x label on plot; nbin - number of bins for data
# and ASH; nh - number of different bandwidths; doplot - plot tree on
# current device if TRUE; docat - indicate status if TRUE
# Copyright 1993, Michael C. Minnotte
{
tree <- NULL
am <- NULL
bins <- bin1(X, nbin = nbin)
maxh <- ran[1]
minh <- ran[2]
loghlist <- seq(log(maxh), log(minh), length = nh)
hlist <- exp(loghlist)
n <- length(X)
est <- nash1(bins, hlist[1])
d <- est$x[2] - est$x[1]
oldbumps <- findbumps(est$x, est$y)
nob <- nrow(oldbumps)
shade <- cbind(oldbumps, rep(hlist[1], nob), rep(0, nob), rep(0, nob))
oldmodes <- findmodes(est$x, est$y)
if(oldmodes$nm == 1)
tree <- c(oldmodes$modes[1, 1], hlist[1], 0, 1)
else for(i in 1:oldmodes$nm) {
temp<-est$y
exclev<-max(temp[oldmodes$am[i,1:2]])
change<-(1:nbin)[(1:nbin)>oldmodes$am[i,1] &
(1:nbin) exclev]
temp[change]<-exclev
wd <- sum(est$y - temp) * d
tree <- rbind(tree, c(oldmodes$modes[i, 1], hlist[
1], 0, wd))
}
onm1<-oldmodes$nm-1
if(oldmodes$nm > 1){
oldam<-rep(0,onm1)
for(j in 1:onm1)
oldam[j] <- quadest(est$x[(oldmodes$am[j + 1,
1] - 1):(oldmodes$am[j + 1, 1] + 1)], est$y[
(oldmodes$am[j + 1, 1] - 1):(oldmodes$am[j +
1, 1] + 1)])
am<-cbind(oldam,rep(hlist[1],onm1),rep(0,onm1),
rep(0,onm1))}
count <- 0
countb<-0
countam <- 0
for(i in 2:length(hlist)) {
if(docat)
cat("h number ", i, " starting.\n")
est <- nash1(bins, hlist[i])
newbumps <- findbumps(est$x, est$y)
nnb <- nrow(newbumps)
mblout <- matcher(newbumps[,1],oldbumps[,1])[1:nnb,1]
mblout[mblout>nob] <- -countb
mbrout <- matcher(newbumps[,2],oldbumps[,2])[1:nnb,1]
mbrout[mbrout>nob] <- -countb
newbumps <- cbind(newbumps, rep(hlist[i], nnb),
countb + mblout, countb + mbrout)
shade <- rbind(shade, newbumps)
countb <- countb + nob
nob <- nnb
oldbumps <- newbumps
# shade <- rbind(shade, cbind(bumps, rep(hlist[i], nrow(bumps))))
newmodes <- findmodes(est$x, est$y)
mout <- matcher(newmodes$modes[, 1], oldmodes$modes[, 1])
for(j in 1:newmodes$nm) {
if(newmodes$nm == 1)
wd <- 1
else {
temp<-est$y
exclev<-max(temp[newmodes$am[j,1:2]])
change<-(1:nbin)[(1:nbin)>newmodes$am[j,1] &
(1:nbin) exclev]
temp[change]<-exclev
wd <- sum(est$y - temp) * d
}
if(mout[j, 1] <= oldmodes$nm)
tree <- rbind(tree, c(newmodes$modes[j, 1],
hlist[i], count + mout[j, 1], wd))
else {
oldfrom <- (1:oldmodes$nm)[newmodes$modes[
j, 2] > oldmodes$am[, 1] & newmodes$
modes[j, 2] < oldmodes$am[, 2]]
newfrom <- mout[oldfrom, 2]
tree <- rbind(tree, c(newmodes$modes[j, 1],
hlist[i], count + oldmodes$nm + newfrom,
-1 * wd))
}
}
count <- count + oldmodes$nm
if(newmodes$nm > 1) {
nnm1<-newmodes$nm-1
newam<-rep(0,nnm1)
for(j in 1:nnm1)
newam[j] <- quadest(est$x[(newmodes$am[j + 1,
1] - 1):(newmodes$am[j + 1, 1] + 1)], est$y[
(newmodes$am[j + 1, 1] - 1):(newmodes$am[j +
1, 1] + 1)])
if (is.null(am))
am<-cbind(newam,rep(hlist[i],nnm1),rep(0,nnm1),
rep(0,nnm1))
else {
mout<-matcher(newam,oldam)
for (j in 1:nnm1)
if (mout[j,1]<=onm1)
am<-rbind(am,c(newam[j],
hlist[i],countam+
mout[j,1],1))
else
am<-rbind(am,c(newam[j],
hlist[i],0,0))
}
countam<-countam + onm1
onm1<-nnm1
oldam<-newam
}
oldmodes <- newmodes
}
out <- list(tree = tree, shade = shade, am = am)
if(doplot)
enhmtreeplot(out, maintit, xlb=xlb)
return(out)
}
###############################################################################
modetreeplot<-
function(tree, maintit = "", xlb="",add=F)
# modetreeplot - plots a basic mode tree (if doplot=T, called automatically
# from modetree)
# tree - mode tree (output from modetree); maintit - main title; xlb - x label
# Copyright 1993, Michael C. Minnotte
{
nr <- nrow(tree)
if (!add)
plot(tree[floor(nr/2), 1], tree[floor(nr/2), 2], type = "n", xlim =
nicerange(tree[, 1]), ylim = log(range(tree[, 2])),
xlab = xlb,ylab = "h", main = maintit,tck=0,axes=F)
xtck<-pretty(nicerange(tree[,1]))
yt1<-floor(log10(min(tree[,2])))
yt2<-ceiling(log10(max(tree[,2])))
ytck<-sort(c(10^(yt1:yt2),5*10^(yt1:(yt2-1))))
axis(1,at=xtck,labels=xtck)
axis(2,at=log(ytck),labels=ytck,srt=90)
box()
t1 <- (1:nrow(tree))[tree[, 4] == 1]
t2 <- (1:nrow(tree))[tree[, 4] == 2]
segments(tree[t1, 1], log(tree[t1, 2]), tree[tree[t1, 3], 1],
log(tree[tree[t1, 3], 2]), lty = 1)
segments(tree[t2, 1], log(tree[t2, 2]), tree[tree[t2, 3], 1],
log(tree[tree[t2, 3], 2]), lty = 2)
}
###############################################################################
enhmtreeplot<-
function(enhmtree, maintit = "", xlb="", wd = 0, cols=c(16,1,2,1,0,1), amp=3)
# enhmtreeplot - plots an enhanced mode tree (if doplot=T, called
# automatically from enhmtree)
# tree - mode tree (output from modetree); maintit - main title; xlb - x label;
# wd - width of line indicating mode consisting of entire density (default of
# 0 calculates so that no mode line goes beyond its bump's shaded area);
# cols - colors: [1] - modes (polygon), [2] - modes (segments), [3] - bumps
# (polygons), [4] - antimodes (points/segments), [5] - background/antibumps
# (polygon), [6] - labels/box (lines/text), selected values look good for
# X11 graphics mode, for openlook cyan-to-red I like c(5,5,2,1,6,1), and
# for (b&w) postscript I like c(1,1,2,1,0,1); amp - antimode points, if
# positive one dot for every amp h values will be plotted, if nonpositive,
# line type -amp will be used.
# Copyright 1993, Michael C. Minnotte
{
par(las = 1)
tree <- enhmtree$tree
shade <- enhmtree$shade
am <- enhmtree$am
nr <- nrow(tree)
plot(tree[floor(nr/2), 1], tree[floor(nr/2), 2], type = "n", xlim =
range(shade[, 1:2]), ylim = range(shade[, 3]), log = "y", xlab
= xlb, ylab = "h", main = maintit, col=cols[6])
h <- unique(shade[, 3])
lh <- length(h)
logsep <- log(h[2]) - log(h[1])
logsep2 <- logsep/2
# factor <- sqrt(h[1]/h[2])
# polygon(c(min(shade[, 1]), rep(max(shade[, 2]), 2), min(shade[, 1])),
# c(rep(min(shade[, 3])/factor, 2), rep(max(shade[, 3]) * factor,
# 2)), border = F, col = cols[5])
polygon(c(min(shade[, 1]), rep(max(shade[, 2]), 2), min(shade[, 1])),
c(rep(min(shade[, 3]), 2), rep(max(shade[, 3]),
2)), border = F, col = cols[5])
s1 <- (1:nrow(shade))[shade[,4] > 0 & shade[,5] > 0]
x1 <- cbind(shade[s1, 1], shade[s1, 2], shade[shade[s1,5], 2],
shade[shade[s1,4], 1], rep(NA,length(s1)))
y1 <- cbind(shade[s1,3], shade[s1,3], shade[shade[s1,5],3],
shade[shade[s1,4],3], rep(NA,length(s1)))
x2 <- as.vector(t(x1))
y2 <- as.vector(t(y1))
polygon(x2, y2, border = F, col = cols[3])
s1 <- (1:nrow(shade))[shade[,4] > 0 & shade[,5] == 0]
x1 <- cbind(shade[s1, 1], shade[s1, 2],
(shade[s1,2] + shade[s1+1,1])/2,
shade[shade[s1,4], 1], rep(NA,length(s1)))
y1 <- cbind(shade[s1,3], shade[s1,3], shade[shade[s1,4],3],
shade[shade[s1,4],3], rep(NA,length(s1)))
x2 <- as.vector(t(x1))
y2 <- as.vector(t(y1))
polygon(x2, y2, border = F, col = cols[3])
s1 <- (1:nrow(shade))[shade[,4] == 0 & shade[,5] > 0]
x1 <- cbind(shade[s1, 1], shade[s1, 2], shade[shade[s1,5],2],
(shade[s1,1] + shade[s1-1,2])/2, rep(NA,length(s1)))
y1 <- cbind(shade[s1,3], shade[s1,3], shade[shade[s1,5],3],
shade[shade[s1,5],3], rep(NA,length(s1)))
x2 <- as.vector(t(x1))
y2 <- as.vector(t(y1))
polygon(x2, y2, border = F, col = cols[3])
if(wd == 0) {
gt <- (1:nrow(tree))[tree[, 4] < 0]
mt <- tree[gt, 3]
ms <- rep(0, length(mt))
for(i in 1:length(mt)) {
ms[i] <- (1:nrow(shade))[shade[, 3] == tree[mt[i],
2] & shade[, 1] < tree[mt[i], 1] & shade[,
2] > tree[mt[i], 1]]
}
wd <- min((shade[ms, 2] - tree[mt, 1])/abs(tree[mt, 4]), (
tree[mt, 1] - shade[ms, 1])/abs(tree[mt, 4])) * .9
}
gt <- (1:nrow(tree))[tree[, 4] > 0 & tree[, 3] > 0]
nt <- length(gt)
x1 <- cbind(tree[gt, 1] - wd * abs(tree[gt, 4]), tree[gt, 1] + wd * abs(
tree[gt, 4]), tree[tree[gt, 3], 1] + wd * abs(tree[tree[gt, 3],
4]), tree[tree[gt, 3], 1] - wd * abs(tree[tree[gt, 3], 4]), rep(
NA, nt))
y1 <- cbind(tree[gt, 2], tree[gt, 2], tree[tree[gt, 3], 2], tree[tree[
gt, 3], 2], rep(NA, nt))
x2 <- as.vector(t(x1))
y2 <- as.vector(t(y1))
polygon(x2, y2, border = F, col = cols[1])
t2 <- (1:nrow(tree))[tree[, 4] < 0]
gt <- tree[t2, 3]
nt <- length(gt)
x1 <- cbind(tree[gt, 1] - wd * abs(tree[tree[gt, 3], 4]), tree[gt, 1] +
wd * abs(tree[tree[gt, 3], 4]), tree[tree[gt, 3], 1] + wd * abs(
tree[tree[gt, 3], 4]), tree[tree[gt, 3], 1] - wd * abs(tree[
tree[gt, 3], 4]), rep(NA, nt))
y1 <- cbind(tree[gt, 2], tree[gt, 2], tree[tree[gt, 3], 2], tree[tree[
gt, 3], 2], rep(NA, nt))
x2 <- as.vector(t(x1))
y2 <- as.vector(t(y1))
polygon(x2, y2, border = F, col = cols[1])
gt <- (1:nrow(tree))[tree[, 4] > 0 & tree[, 3] > 0]
segments(tree[gt, 1], tree[gt, 2], tree[tree[gt, 3], 1],
tree[tree[gt, 3], 2], col=cols[2])
segments(tree[t2, 1], tree[t2, 2], tree[tree[t2, 3], 1], tree[tree[
t2, 3], 2], lty = 3, col=cols[2])
if (amp > 0) {
h<-unique(am[,2])
h<-c(h,rep(0,ceiling(length(h)/amp)*amp-length(h)))
h<-matrix(h,amp)
goodh<-h[1,]
goodam<-NULL
for (i in 1:length(goodh))
goodam<-rbind(goodam,am[am[,2]==goodh[i],])
points(goodam[, 1], goodam[, 2], pch = ".",col=cols[4])
} else {
a1 <- (1:nrow(am))[am[, 4] == 1]
segments(am[a1, 1], am[a1, 2], am[am[a1, 3], 1],
am[am[a1, 3], 2], col = cols[4], lty = -amp)
# a2 <- (1:nrow(am))[am[, 2] == am[nrow(am), 2]]
# segments(am[a2, 1], am[a2, 2], am[a2, 1], am[a2, 2]/factor,
# col = cols[4], lty = -amp)
}
}
###############################################################################
bin1<-
function(x, ab = nicerange(x), nbin = 500)
# bin1 - bins a data set into nbin bins for use in nash1
# x - (unbinned) data; ab - range for bins; nbin - number of bins
# Copyright 1986, David W. Scott
{
if(ab[1] >= ab[2])
fatal("Interval vector has negative orientation")
if(nbin <= 0)
fatal("Number of bin intervals nonpositive")
bc<-rep(0,nbin)
d<-(ab[2]-ab[1])/nbin
k<-floor((x-ab[1])/d) + 1
nskip<-length(k[k<1 | k > nbin])
k<-k[k>=1 & k <= nbin]
for (i in 1:nbin)
bc[i]<-length(k[k==i])
return(list(bc = bc, ab = ab, nskip = nskip))
}
###############################################################################
nash1<-
function(bins, h)
# nash1 - approximates a kernel density estimate with normal kernel using
# an appropriately weighted average shifted histogram (ASH)
# bins - binned data; h - bandwidth (s.d. of normal kernel)
# Copyright 1993, David W. Scott and Michael C. Minnotte
{
if(h <= 0)
stop("Smoothing parameter m nonpositive")
bc <- bins$bc
ab <- bins$ab
nbin <- length(bc)
delta<-(ab[2]-ab[1])/nbin
w<-dnorm((0:(nbin-1))*delta,0,h)
nonempt<-(1:nbin)[bc > 0]
x<-ab[1]+delta*(1:nbin-.5)
y<-rep(0,nbin)
for (i in nonempt)
y<-y+bc[i]*w[abs((1:nbin)-i)+1]
return(list(x = x, y = y, ab = ab, h = h, n = sum(bc), nskip = bins
$nskip))
}
###############################################################################
nicerange<-
function(x, beta = 0.2)
# nicerange - range of a data set extended on each side (default: 10%)
# x - unbinned data; beta - total fraction of data range to add (split between
# the two sides)
# Copyright 1986, David W. Scott
{
ab <- range(x) # interval surrounding data
del <- ((ab[2] - ab[1]) * beta)/2
return(c(ab + c(-1*del, del)))
}
###############################################################################
matcher<-
function(alist, blist)
# matcher - matches elements from two lists as described in the appendix
# to Minnotte and Scott
# alist - first vector of sorted values; blist - second vector of sorted
# values
# Copyright 1993, Michael C. Minnotte
{
alist<-unique(sort(alist))
blist<-unique(sort(blist))
na <- length(alist)
nb <- length(blist)
na1<-na+1
nb1<-nb+1
dist<-t(log(exp(-blist)%*%t(exp(alist))))
snd<-dist/abs(dist)
alpha<-matrix(nb1,na1,2)
beta<-matrix(na1,nb1,2)
for (i in 1:na)
{alpha[i,1]<-min((1:nb)[abs(dist[i,])==min(abs(dist[i,]))])
alpha[i,2]<-alpha[i,1]+snd[i,alpha[i,1]]}
alpha[alpha==0]<-nb1
for (i in 1:nb)
{beta[i,1]<-min((1:na)[abs(dist[,i])==min(abs(dist[,i]))])
beta[i,2]<-beta[i,1]-snd[beta[i,1],i]}
beta[beta==0]<-na1
mout<-matrix(nb1,max(na1,nb1),2)
mout[,2]<-na1
fit1<-(1:na)[beta[alpha[1:na,1],1]==(1:na)]
mout[fit1,1]<-alpha[fit1,1]
mout[alpha[fit1,1],2]<-fit1
beta[alpha[fit1,1],]<-na1
alpha[fit1,]<-nb1
fit2<-(1:na)[beta[alpha[1:na,1],2]==(1:na)]
mout[fit2,1]<-alpha[fit2,1]
mout[alpha[fit2,1],2]<-fit2
beta[alpha[fit2,1],]<-na1
alpha[fit2,]<-nb1
fit3<-(1:na)[beta[alpha[1:na,2],1]==(1:na)]
mout[fit3,1]<-alpha[fit3,2]
mout[alpha[fit3,2],2]<-fit3
beta[alpha[fit3,2],]<-na1
alpha[fit3,]<-nb1
fit4<-(1:na)[beta[alpha[1:na,2],2]==(1:na)]
mout[fit4,1]<-alpha[fit4,2]
mout[alpha[fit4,2],2]<-fit4
beta[alpha[fit4,2],]<-na1
alpha[fit4,]<-nb1
return(mout)
}
###############################################################################
quadest<-
function(x, y)
# quadest - given three ordered pairs, finds the extremum of the interpolated
# parabola
# x - 3-vector of x values; y - 3-vector of y values
# Copyright 1993, Michael C. Minnotte
{
a <- ((y[1] - y[2])/(x[1] - x[2]) - (y[2] - y[3])/(x[2] - x[3]))/(
x[1] - x[3])
b <- (y[1] - y[2])/(x[1] - x[2]) - a * (x[1] + x[2])
quad <- - b/(2 * a)
return(quad)
}
###############################################################################
findmodes<-
function(x, y)
# findmodes - given sequence of values representing a density function,
# identify number of modes, locations of modes, and indices of modes
# and antimodes
# x - x-values returned by nash1; y - density values returned by nash1
# Copyright 1993, Michael C. Minnotte
{
nbin <- length(x)
modes <- (2:(nbin-1))[y[2:(nbin-1)] > y[1:(nbin-2)] &
y[2:(nbin-1)] >= y[3:nbin]]
nm<-length(modes)
modes<- cbind(rep(0,nm),modes)
for (i in 1:nm)
modes[i,1]<-quadest(x[modes[i,2]+(-1):1],y[modes[i,2]+(-1):1])
am <- (2:(nbin-1))[y[2:(nbin-1)] < y[1:(nbin-2)] &
y[2:(nbin-1)] <= y[3:nbin]]
am<-cbind(c(1,am),c(am,nbin))
return(nm = nm, modes = modes, am = am)
}
###############################################################################
findbumps<-
function(x, y)
# findbumps - given sequence of values representing a density function, #
# identify bumps (regions of negative second derivative) #
# x - x-values returned by nash1; y - density values returned by nash1
# Copyright 1993, Michael C. Minnotte
{
n <- length(y)
f2 <- c(1, y[1:(n - 2)] - 2 * y[2:(n - 1)] + y[3:n], 1)
s <- ifelse(f2 < 0, -1, 1)
s2 <- s[2:n] - s[1:(n - 1)]
i1 <- (1:(n - 1))[s2 == -2] + 1
i2 <- (1:(n - 1))[s2 == 2]
return(cbind(x[i1], x[i2]))
}