############################################################################### # # # Mode Tree R Functions # # # # Save this file as text, then source into R. # # # # These functions will allow the user to generate univariate mode trees, as # # described in Minnotte and Scott, (1993) "The Mode Tree: A Tool for # # Visualization of Nonparametric Density Features," Journal of Computational # # and Graphical Statistics, 2, 51-68. # # # # The primary functions are the first four. # # # # modetree - generates a basic mode tree including only modes and branchings # # enhmtree - generates an enhanced mode tree including modes, branchings, # # mode sizes, bumps, and antimodes # # modetreeplot - plots a basic mode tree (if doplot=T, called automatically # # from modetree) # # enhmtreeplot - plots an enhanced mode tree (if doplot=T, called # # automatically from enhmtree) # # # # The remaining functions exist primarily to be called from the above # # functions, but may be useful themselves. # # # # bin1 - bins a data set into nbin bins for use in nash1 # # nash1 - approximates a kernel density estimate with normal kernel using # # an appropriately weighted average shifted histogram (ASH) # # nicerange - range of a data set extended on each side (default: 10%) # # matcher - matches elements from two lists as described in the appendix # # to Minnotte and Scott # # quadest - given three ordered pairs, finds the extremum of the interpolated # # parabola # # findmodes - given sequence of values representing a density function, # # identify number of modes, locations of modes, and indices of modes # # and antimodes # # findbumps - given sequence of values representing a density function, # # identify bumps (regions of negative second derivative) # # # # All functions are copyright 1993, Michael C. Minnotte except bin1 and # # nicerange, copyright 1986, David W. Scott, and nash1, copyright 1993 David # # W. Scott and Michael C. Minnotte. # # # # The authors may be contacted at minnotte@math.usu.edu and scottdw@rice.edu. # ############################################################################### modetree<- function(X, ran = c(diff(nicerange(X))/4, diff(nicerange(X))/400), maintit = "", xlb="", nbin = 500, nh = 200, doplot = T, docat=F) # modetree - generates a basic mode tree including only modes and branchings # X - raw (unbinned) data; ran - largest and smallest bandwidths; maintit - # main title on plot; xlb - x label on plot; nbin - number of bins for data # and ASH; nh - number of different bandwidths; doplot - plot tree on # current device if TRUE; docat - indicate status if TRUE # Copyright 1993, Michael C. Minnotte { tree <- NULL modes <- NULL bins <- bin1(X, nbin = nbin) maxh <- ran[1] minh <- ran[2] loghlist <- seq(log(maxh), log(minh), length = nh) n <- length(X) hlist <- exp(loghlist) dh<- hlist[2]/hlist[1] est <- nash1(bins, hlist[1]) oldmodes <- findmodes(est$x, est$y) tree <- cbind(oldmodes$modes[, 1], rep(hlist[1], oldmodes$nm), rep( 0, oldmodes$nm), rep(0, oldmodes$nm)) count <- 0 for(i in 2:length(hlist)) { if(docat) cat("h number ", i, " starting.\n") est <- nash1(bins, hlist[i]) newmodes <- findmodes(est$x, est$y) mout <- matcher(newmodes$modes[, 1], oldmodes$modes[, 1]) for(j in 1:newmodes$nm) { if(mout[j, 1] <= oldmodes$nm) tree <- rbind(tree, c(newmodes$modes[j, 1], hlist[i], count + mout[j, 1], 1)) else { oldfrom <- (1:oldmodes$nm)[newmodes$modes[ j, 2] >= oldmodes$am[, 1] & newmodes$ modes[j, 2] < oldmodes$am[, 2]] newfrom <- mout[oldfrom, 2] tree <- rbind(tree, c(newmodes$modes[j, 1], hlist[i], count + oldmodes$nm + newfrom, 2)) } } count <- count + oldmodes$nm oldmodes <- newmodes } if(doplot) modetreeplot(tree, maintit, xlb) invisible(tree) } ############################################################################### enhmtree<- function(X, ran = c(diff(nicerange(X))/4, diff(nicerange(X))/200), maintit = "", xlb = "", nbin = 500, nh = 200, doplot = T, docat = F) # enhmtree - generates an enhanced mode tree including modes, branchings, # mode sizes, bumps, and antimodes # X - raw (unbinned) data; ran - largest and smallest bandwidths; maintit - # main title on plot; xlb - x label on plot; nbin - number of bins for data # and ASH; nh - number of different bandwidths; doplot - plot tree on # current device if TRUE; docat - indicate status if TRUE # Copyright 1993, Michael C. Minnotte { tree <- NULL am <- NULL bins <- bin1(X, nbin = nbin) maxh <- ran[1] minh <- ran[2] loghlist <- seq(log(maxh), log(minh), length = nh) hlist <- exp(loghlist) n <- length(X) est <- nash1(bins, hlist[1]) d <- est$x[2] - est$x[1] oldbumps <- findbumps(est$x, est$y) nob <- nrow(oldbumps) shade <- cbind(oldbumps, rep(hlist[1], nob), rep(0, nob), rep(0, nob)) oldmodes <- findmodes(est$x, est$y) if(oldmodes$nm == 1) tree <- c(oldmodes$modes[1, 1], hlist[1], 0, 1) else for(i in 1:oldmodes$nm) { temp<-est$y exclev<-max(temp[oldmodes$am[i,1:2]]) change<-(1:nbin)[(1:nbin)>oldmodes$am[i,1] & (1:nbin) exclev] temp[change]<-exclev wd <- sum(est$y - temp) * d tree <- rbind(tree, c(oldmodes$modes[i, 1], hlist[ 1], 0, wd)) } onm1<-oldmodes$nm-1 if(oldmodes$nm > 1){ oldam<-rep(0,onm1) for(j in 1:onm1) oldam[j] <- quadest(est$x[(oldmodes$am[j + 1, 1] - 1):(oldmodes$am[j + 1, 1] + 1)], est$y[ (oldmodes$am[j + 1, 1] - 1):(oldmodes$am[j + 1, 1] + 1)]) am<-cbind(oldam,rep(hlist[1],onm1),rep(0,onm1), rep(0,onm1))} count <- 0 countb<-0 countam <- 0 for(i in 2:length(hlist)) { if(docat) cat("h number ", i, " starting.\n") est <- nash1(bins, hlist[i]) newbumps <- findbumps(est$x, est$y) nnb <- nrow(newbumps) mblout <- matcher(newbumps[,1],oldbumps[,1])[1:nnb,1] mblout[mblout>nob] <- -countb mbrout <- matcher(newbumps[,2],oldbumps[,2])[1:nnb,1] mbrout[mbrout>nob] <- -countb newbumps <- cbind(newbumps, rep(hlist[i], nnb), countb + mblout, countb + mbrout) shade <- rbind(shade, newbumps) countb <- countb + nob nob <- nnb oldbumps <- newbumps # shade <- rbind(shade, cbind(bumps, rep(hlist[i], nrow(bumps)))) newmodes <- findmodes(est$x, est$y) mout <- matcher(newmodes$modes[, 1], oldmodes$modes[, 1]) for(j in 1:newmodes$nm) { if(newmodes$nm == 1) wd <- 1 else { temp<-est$y exclev<-max(temp[newmodes$am[j,1:2]]) change<-(1:nbin)[(1:nbin)>newmodes$am[j,1] & (1:nbin) exclev] temp[change]<-exclev wd <- sum(est$y - temp) * d } if(mout[j, 1] <= oldmodes$nm) tree <- rbind(tree, c(newmodes$modes[j, 1], hlist[i], count + mout[j, 1], wd)) else { oldfrom <- (1:oldmodes$nm)[newmodes$modes[ j, 2] > oldmodes$am[, 1] & newmodes$ modes[j, 2] < oldmodes$am[, 2]] newfrom <- mout[oldfrom, 2] tree <- rbind(tree, c(newmodes$modes[j, 1], hlist[i], count + oldmodes$nm + newfrom, -1 * wd)) } } count <- count + oldmodes$nm if(newmodes$nm > 1) { nnm1<-newmodes$nm-1 newam<-rep(0,nnm1) for(j in 1:nnm1) newam[j] <- quadest(est$x[(newmodes$am[j + 1, 1] - 1):(newmodes$am[j + 1, 1] + 1)], est$y[ (newmodes$am[j + 1, 1] - 1):(newmodes$am[j + 1, 1] + 1)]) if (is.null(am)) am<-cbind(newam,rep(hlist[i],nnm1),rep(0,nnm1), rep(0,nnm1)) else { mout<-matcher(newam,oldam) for (j in 1:nnm1) if (mout[j,1]<=onm1) am<-rbind(am,c(newam[j], hlist[i],countam+ mout[j,1],1)) else am<-rbind(am,c(newam[j], hlist[i],0,0)) } countam<-countam + onm1 onm1<-nnm1 oldam<-newam } oldmodes <- newmodes } out <- list(tree = tree, shade = shade, am = am) if(doplot) enhmtreeplot(out, maintit, xlb=xlb) invisible(out) } ############################################################################### modetreeplot<- function(tree, maintit = "", xlb="",add=F) # modetreeplot - plots a basic mode tree (if doplot=T, called automatically # from modetree) # tree - mode tree (output from modetree); maintit - main title; xlb - x label # Copyright 1993, Michael C. Minnotte { nr <- nrow(tree) if (!add) plot(tree[floor(nr/2), 1], tree[floor(nr/2), 2], type = "n", xlim = nicerange(tree[, 1]), ylim = log(range(tree[, 2])), xlab = xlb,ylab = "h", main = maintit,tck=0,axes=F) xtck<-pretty(nicerange(tree[,1])) yt1<-floor(log10(min(tree[,2]))) yt2<-ceiling(log10(max(tree[,2]))) ytck<-sort(c(10^(yt1:yt2),5*10^(yt1:(yt2-1)))) axis(1,at=xtck,labels=xtck) axis(2,at=log(ytck),labels=ytck,srt=90) box() t1 <- (1:nrow(tree))[tree[, 4] == 1] t2 <- (1:nrow(tree))[tree[, 4] == 2] segments(tree[t1, 1], log(tree[t1, 2]), tree[tree[t1, 3], 1], log(tree[tree[t1, 3], 2]), lty = 1) segments(tree[t2, 1], log(tree[t2, 2]), tree[tree[t2, 3], 1], log(tree[tree[t2, 3], 2]), lty = 2) } ############################################################################### enhmtreeplot<- function(enhmtree, maintit = "", xlb="", wd = 0, cols=c("black","black","grey","black","white","black"), amp=3) # enhmtreeplot - plots an enhanced mode tree (if doplot=T, called # automatically from enhmtree) # tree - mode tree (output from modetree); maintit - main title; xlb - x label; # wd - width of line indicating mode consisting of entire density (default of # 0 calculates so that no mode line goes beyond its bump's shaded area); # cols - colors: [1] - modes (polygon), [2] - modes (segments), [3] - bumps # (polygons), [4] - antimodes (points/segments), [5] - background/antibumps # (polygon), [6] - labels/box (lines/text), selected values look good for # X11 graphics mode, for openlook cyan-to-red I like c(5,5,2,1,6,1), and # for (b&w) postscript I like c(1,1,2,1,0,1); amp - antimode points, if # positive one dot for every amp h values will be plotted, if nonpositive, # line type -amp will be used. # Copyright 1993, Michael C. Minnotte { par(las = 1) tree <- enhmtree$tree shade <- enhmtree$shade am <- enhmtree$am nr <- nrow(tree) plot(tree[floor(nr/2), 1], tree[floor(nr/2), 2], type = "n", xlim = range(shade[, 1:2]), ylim = range(shade[, 3]), log = "y", xlab = xlb, ylab = "h", main = maintit, col=cols[6]) h <- unique(shade[, 3]) lh <- length(h) logsep <- log(h[2]) - log(h[1]) logsep2 <- logsep/2 # factor <- sqrt(h[1]/h[2]) # polygon(c(min(shade[, 1]), rep(max(shade[, 2]), 2), min(shade[, 1])), # c(rep(min(shade[, 3])/factor, 2), rep(max(shade[, 3]) * factor, # 2)), border = F, col = cols[5]) polygon(c(min(shade[, 1]), rep(max(shade[, 2]), 2), min(shade[, 1])), c(rep(min(shade[, 3]), 2), rep(max(shade[, 3]), 2)), border = F, col = cols[5]) s1 <- (1:nrow(shade))[shade[,4] > 0 & shade[,5] > 0] x1 <- cbind(shade[s1, 1], shade[s1, 2], shade[shade[s1,5], 2], shade[shade[s1,4], 1], rep(NA,length(s1))) y1 <- cbind(shade[s1,3], shade[s1,3], shade[shade[s1,5],3], shade[shade[s1,4],3], rep(NA,length(s1))) x2 <- as.vector(t(x1)) y2 <- as.vector(t(y1)) polygon(x2, y2, border = F, col = cols[3]) s1 <- (1:nrow(shade))[shade[,4] > 0 & shade[,5] == 0] x1 <- cbind(shade[s1, 1], shade[s1, 2], (shade[s1,2] + shade[s1+1,1])/2, shade[shade[s1,4], 1], rep(NA,length(s1))) y1 <- cbind(shade[s1,3], shade[s1,3], shade[shade[s1,4],3], shade[shade[s1,4],3], rep(NA,length(s1))) x2 <- as.vector(t(x1)) y2 <- as.vector(t(y1)) polygon(x2, y2, border = F, col = cols[3]) s1 <- (1:nrow(shade))[shade[,4] == 0 & shade[,5] > 0] x1 <- cbind(shade[s1, 1], shade[s1, 2], shade[shade[s1,5],2], (shade[s1,1] + shade[s1-1,2])/2, rep(NA,length(s1))) y1 <- cbind(shade[s1,3], shade[s1,3], shade[shade[s1,5],3], shade[shade[s1,5],3], rep(NA,length(s1))) x2 <- as.vector(t(x1)) y2 <- as.vector(t(y1)) polygon(x2, y2, border = F, col = cols[3]) if(wd == 0) { gt <- (1:nrow(tree))[tree[, 4] < 0] mt <- tree[gt, 3] ms <- rep(0, length(mt)) for(i in 1:length(mt)) { ms[i] <- (1:nrow(shade))[shade[, 3] == tree[mt[i], 2] & shade[, 1] < tree[mt[i], 1] & shade[, 2] > tree[mt[i], 1]] } wd <- min((shade[ms, 2] - tree[mt, 1])/abs(tree[mt, 4]), ( tree[mt, 1] - shade[ms, 1])/abs(tree[mt, 4])) * .9 } gt <- (1:nrow(tree))[tree[, 4] > 0 & tree[, 3] > 0] nt <- length(gt) x1 <- cbind(tree[gt, 1] - wd * abs(tree[gt, 4]), tree[gt, 1] + wd * abs( tree[gt, 4]), tree[tree[gt, 3], 1] + wd * abs(tree[tree[gt, 3], 4]), tree[tree[gt, 3], 1] - wd * abs(tree[tree[gt, 3], 4]), rep( NA, nt)) y1 <- cbind(tree[gt, 2], tree[gt, 2], tree[tree[gt, 3], 2], tree[tree[ gt, 3], 2], rep(NA, nt)) x2 <- as.vector(t(x1)) y2 <- as.vector(t(y1)) polygon(x2, y2, border = F, col = cols[1]) t2 <- (1:nrow(tree))[tree[, 4] < 0] gt <- tree[t2, 3] nt <- length(gt) x1 <- cbind(tree[gt, 1] - wd * abs(tree[tree[gt, 3], 4]), tree[gt, 1] + wd * abs(tree[tree[gt, 3], 4]), tree[tree[gt, 3], 1] + wd * abs( tree[tree[gt, 3], 4]), tree[tree[gt, 3], 1] - wd * abs(tree[ tree[gt, 3], 4]), rep(NA, nt)) y1 <- cbind(tree[gt, 2], tree[gt, 2], tree[tree[gt, 3], 2], tree[tree[ gt, 3], 2], rep(NA, nt)) x2 <- as.vector(t(x1)) y2 <- as.vector(t(y1)) polygon(x2, y2, border = F, col = cols[1]) gt <- (1:nrow(tree))[tree[, 4] > 0 & tree[, 3] > 0] segments(tree[gt, 1], tree[gt, 2], tree[tree[gt, 3], 1], tree[tree[gt, 3], 2], col=cols[2]) segments(tree[t2, 1], tree[t2, 2], tree[tree[t2, 3], 1], tree[tree[ t2, 3], 2], lty = 3, col=cols[2]) if (amp > 0) { h<-unique(am[,2]) h<-c(h,rep(0,ceiling(length(h)/amp)*amp-length(h))) h<-matrix(h,amp) goodh<-h[1,] goodam<-NULL for (i in 1:length(goodh)) goodam<-rbind(goodam,am[am[,2]==goodh[i],]) points(goodam[, 1], goodam[, 2], pch = ".",col=cols[4]) } else { a1 <- (1:nrow(am))[am[, 4] == 1] segments(am[a1, 1], am[a1, 2], am[am[a1, 3], 1], am[am[a1, 3], 2], col = cols[4], lty = -amp) # a2 <- (1:nrow(am))[am[, 2] == am[nrow(am), 2]] # segments(am[a2, 1], am[a2, 2], am[a2, 1], am[a2, 2]/factor, # col = cols[4], lty = -amp) } } ############################################################################### bin1<- function(x, ab = nicerange(x), nbin = 500) # bin1 - bins a data set into nbin bins for use in nash1 # x - (unbinned) data; ab - range for bins; nbin - number of bins # Copyright 1986, David W. Scott { if(ab[1] >= ab[2]) fatal("Interval vector has negative orientation") if(nbin <= 0) fatal("Number of bin intervals nonpositive") bc<-rep(0,nbin) d<-(ab[2]-ab[1])/nbin k<-floor((x-ab[1])/d) + 1 nskip<-length(k[k<1 | k > nbin]) k<-k[k>=1 & k <= nbin] for (i in 1:nbin) bc[i]<-length(k[k==i]) return(list(bc = bc, ab = ab, nskip = nskip)) } ############################################################################### nash1<- function(bins, h) # nash1 - approximates a kernel density estimate with normal kernel using # an appropriately weighted average shifted histogram (ASH) # bins - binned data; h - bandwidth (s.d. of normal kernel) # Copyright 1993, David W. Scott and Michael C. Minnotte { if(h <= 0) stop("Smoothing parameter m nonpositive") bc <- bins$bc ab <- bins$ab nbin <- length(bc) delta<-(ab[2]-ab[1])/nbin w<-dnorm((0:(nbin-1))*delta,0,h) nonempt<-(1:nbin)[bc > 0] x<-ab[1]+delta*(1:nbin-.5) y<-rep(0,nbin) for (i in nonempt) y<-y+bc[i]*w[abs((1:nbin)-i)+1] return(list(x = x, y = y, ab = ab, h = h, n = sum(bc), nskip = bins $nskip)) } ############################################################################### nicerange<- function(x, beta = 0.2) # nicerange - range of a data set extended on each side (default: 10%) # x - unbinned data; beta - total fraction of data range to add (split between # the two sides) # Copyright 1986, David W. Scott { ab <- range(x) # interval surrounding data del <- ((ab[2] - ab[1]) * beta)/2 return(c(ab + c(-1*del, del))) } ############################################################################### matcher<- function(alist, blist) # matcher - matches elements from two lists as described in the appendix # to Minnotte and Scott # alist - first vector of sorted values; blist - second vector of sorted # values # Copyright 1993, Michael C. Minnotte { alist<-unique(sort(alist)) blist<-unique(sort(blist)) m <- max(abs(alist),abs(blist)) alist <- alist/m blist <- blist/m na <- length(alist) nb <- length(blist) na1<-na+1 nb1<-nb+1 dist<-t(log(exp(-blist)%*%t(exp(alist)))) snd<-dist/abs(dist) alpha<-matrix(nb1,na1,2) beta<-matrix(na1,nb1,2) for (i in 1:na) {alpha[i,1]<-min((1:nb)[abs(dist[i,])==min(abs(dist[i,]))]) alpha[i,2]<-alpha[i,1]+snd[i,alpha[i,1]]} alpha[alpha==0]<-nb1 for (i in 1:nb) {beta[i,1]<-min((1:na)[abs(dist[,i])==min(abs(dist[,i]))]) beta[i,2]<-beta[i,1]-snd[beta[i,1],i]} beta[beta==0]<-na1 mout<-matrix(nb1,max(na1,nb1),2) mout[,2]<-na1 fit1<-(1:na)[beta[alpha[1:na,1],1]==(1:na)] mout[fit1,1]<-alpha[fit1,1] mout[alpha[fit1,1],2]<-fit1 beta[alpha[fit1,1],]<-na1 alpha[fit1,]<-nb1 fit2<-(1:na)[beta[alpha[1:na,1],2]==(1:na)] mout[fit2,1]<-alpha[fit2,1] mout[alpha[fit2,1],2]<-fit2 beta[alpha[fit2,1],]<-na1 alpha[fit2,]<-nb1 fit3<-(1:na)[beta[alpha[1:na,2],1]==(1:na)] mout[fit3,1]<-alpha[fit3,2] mout[alpha[fit3,2],2]<-fit3 beta[alpha[fit3,2],]<-na1 alpha[fit3,]<-nb1 fit4<-(1:na)[beta[alpha[1:na,2],2]==(1:na)] mout[fit4,1]<-alpha[fit4,2] mout[alpha[fit4,2],2]<-fit4 beta[alpha[fit4,2],]<-na1 alpha[fit4,]<-nb1 return(mout) } ############################################################################### quadest<- function(x, y) # quadest - given three ordered pairs, finds the extremum of the interpolated # parabola # x - 3-vector of x values; y - 3-vector of y values # Copyright 1993, Michael C. Minnotte { a <- ((y[1] - y[2])/(x[1] - x[2]) - (y[2] - y[3])/(x[2] - x[3]))/( x[1] - x[3]) b <- (y[1] - y[2])/(x[1] - x[2]) - a * (x[1] + x[2]) quad <- - b/(2 * a) return(quad) } ############################################################################### findmodes<- function(x, y) # findmodes - given sequence of values representing a density function, # identify number of modes, locations of modes, and indices of modes # and antimodes # x - x-values returned by nash1; y - density values returned by nash1 # Copyright 1993, Michael C. Minnotte { nbin <- length(x) modes <- (2:(nbin-1))[y[2:(nbin-1)] > y[1:(nbin-2)] & y[2:(nbin-1)] >= y[3:nbin]] nm<-length(modes) modes<- cbind(rep(0,nm),modes) for (i in 1:nm) modes[i,1]<-quadest(x[modes[i,2]+(-1):1],y[modes[i,2]+(-1):1]) am <- (2:(nbin-1))[y[2:(nbin-1)] < y[1:(nbin-2)] & y[2:(nbin-1)] <= y[3:nbin]] am<-cbind(c(1,am),c(am,nbin)) return(list(nm = nm, modes = modes, am = am)) } ############################################################################### findbumps<- function(x, y) # findbumps - given sequence of values representing a density function, # # identify bumps (regions of negative second derivative) # # x - x-values returned by nash1; y - density values returned by nash1 # Copyright 1993, Michael C. Minnotte { n <- length(y) f2 <- c(1, y[1:(n - 2)] - 2 * y[2:(n - 1)] + y[3:n], 1) s <- ifelse(f2 < 0, -1, 1) s2 <- s[2:n] - s[1:(n - 1)] i1 <- (1:(n - 1))[s2 == -2] + 1 i2 <- (1:(n - 1))[s2 == 2] return(cbind(x[i1], x[i2])) }