R/dist.topo.R

   1 ## dist.topo.R (2013-02-09)
   2
   3 ##      Topological Distances, Tree Bipartitions,
   4 ##   Consensus Trees, and Bootstrapping Phylogenies
   5
   6 ## Copyright 2005-2013 Emmanuel Paradis
   7
   8 ## This file is part of the R-package `ape'.
   9 ## See the file ../COPYING for licensing issues.
  10
  11 dist.topo <- function(x, y, method = "PH85")
  12 {
  13     if (method == "score" && (is.null(x$edge.length) || is.null(y$edge.length)))
  14         stop("trees must have branch lengths for branch score distance.")
  15     nx <- length(x$tip.label)
  16     x <- unroot(x)
  17     y <- unroot(y)
  18     bp1 <- .Call("bipartition", x$edge, nx, x$Nnode, PACKAGE = "ape")
  19     bp1 <- lapply(bp1, function(xx) sort(x$tip.label[xx]))
  20     ny <- length(y$tip.label) # fix by Otto Cordero
  21     ## fix by Tim Wallstrom:
  22     bp2.tmp <- .Call("bipartition", y$edge, ny, y$Nnode, PACKAGE = "ape")
  23     bp2 <- lapply(bp2.tmp, function(xx) sort(y$tip.label[xx]))
  24     bp2.comp <- lapply(bp2.tmp, function(xx) setdiff(1:ny, xx))
  25     bp2.comp <- lapply(bp2.comp, function(xx) sort(y$tip.label[xx]))
  26     ## End
  27     q1 <- length(bp1)
  28     q2 <- length(bp2)
  29     if (method == "PH85") {
  30         p <- 0
  31         for (i in 1:q1) {
  32             for (j in 1:q2) {
  33                 if (identical(bp1[[i]], bp2[[j]]) | identical(bp1[[i]], bp2.comp[[j]])) {
  34                     p <- p + 1
  35                     break
  36                 }
  37             }
  38         }
  39         dT <- q1 + q2 - 2 * p # same than:
  40         ##dT <- if (q1 == q2) 2*(q1 - p) else 2*(min(q1, q2) - p) + abs(q1 - q2)
  41     }
  42     if (method == "score") {
  43         dT <- 0
  44         found1 <- FALSE
  45         found2 <- logical(q2)
  46         found2[1] <- TRUE
  47         for (i in 2:q1) {
  48             for (j in 2:q2) {
  49                 if (identical(bp1[[i]], bp2[[j]]) | identical(bp1[[i]], bp2.comp[[j]])) {
  50                     dT <- dT + (x$edge.length[which(x$edge[, 2] == nx + i)] -
  51                                 y$edge.length[which(y$edge[, 2] == ny + j)])^2
  52                     found1 <- found2[j] <- TRUE
  53                     break
  54                 }
  55             }
  56             if (found1) found1 <- FALSE
  57             else dT <- dT + (x$edge.length[which(x$edge[, 2] == nx + i)])^2
  58         }
  59         if (!all(found2))
  60             dT <- dT + sum((y$edge.length[y$edge[, 2] %in% (ny + which(!found2))])^2)
  61         dT <- sqrt(dT)
  62     }
  63     dT
  64 }
  65
  66 .compressTipLabel <- function(x)
  67 {
  68     ## 'x' is a list of objects of class "phylo" possibly with no class
  69     if (!is.null(attr(x, "TipLabel"))) return(x)
  70     ref <- x[[1]]$tip.label
  71     n <- length(ref)
  72     if (length(unique(ref)) != n)
  73         stop("some tip labels are duplicated in tree no. 1")
  74
  75     ## serious improvement by Joseph W. Brown!
  76     relabel <- function (y) {
  77         label <- y$tip.label
  78         if (!identical(label, ref)) {
  79             if (length(label) != length(ref))
  80                 stop(paste("tree ", y, "has a different number of tips"))
  81             ilab <- match(label, ref)
  82             if (any(is.na(ilab)))
  83                 stop(paste("tree ", y, "has different tip labels"))
  84             ie <- match(1:n, y$edge[, 2])
  85             y$edge[ie, 2] <- ilab
  86         }
  87         y$tip.label <- NULL
  88         y
  89     }
  90     x <- unclass(x) # another killer improvement by Tucson's hackathon (1/2/2013)
  91     x <- lapply(x, relabel)
  92     attr(x, "TipLabel") <- ref
  93     class(x) <- "multiPhylo"
  94     x
  95 }
  96
  97 prop.part <- function(..., check.labels = TRUE)
  98 {
  99     obj <- list(...)
 100     if (length(obj) == 1 && class(obj[[1]]) != "phylo")
 101         obj <- obj[[1]]
 102     ## <FIXME>
 103     ## class(obj) <- NULL # needed? apparently not, see below (2010-11-18)
 104     ## </FIXME>
 105     ntree <- length(obj)
 106     if (ntree == 1) check.labels <- FALSE
 107     if (check.labels) obj <- .compressTipLabel(obj) # fix by Klaus Schliep (2011-02-21)
 108     for (i in 1:ntree) storage.mode(obj[[i]]$Nnode) <- "integer"
 109     ## <FIXME>
 110     ## The 1st must have tip labels
 111     ## Maybe simply pass the number of tips to the C code??
 112     obj <- .uncompressTipLabel(obj) # fix a bug (2010-11-18)
 113     ## </FIXME>
 114     clades <- .Call("prop_part", obj, ntree, TRUE, PACKAGE = "ape")
 115     attr(clades, "number") <- attr(clades, "number")[1:length(clades)]
 116     attr(clades, "labels") <- obj[[1]]$tip.label
 117     class(clades) <- "prop.part"
 118     clades
 119 }
 120
 121 print.prop.part <- function(x, ...)
 122 {
 123     if (is.null(attr(x, "labels"))) {
 124         for (i in 1:length(x)) {
 125             cat("==>", attr(x, "number")[i], "time(s):")
 126             print(x[[i]], quote = FALSE)
 127         }
 128     } else {
 129         for (i in 1:length(attr(x, "labels")))
 130           cat(i, ": ", attr(x, "labels")[i], "\n", sep = "")
 131         cat("\n")
 132         for (i in 1:length(x)) {
 133             cat("==>", attr(x, "number")[i], "time(s):")
 134             print(x[[i]], quote = FALSE)
 135         }
 136     }
 137 }
 138
 139 summary.prop.part <- function(object, ...) attr(object, "number")
 140
 141 plot.prop.part <- function(x, barcol = "blue", leftmar = 4, ...)
 142 {
 143     if (is.null(attr(x, "labels")))
 144       stop("cannot plot this partition object; see ?prop.part for details.")
 145     L <- length(x)
 146     n <- length(attr(x, "labels"))
 147     layout(matrix(1:2, 2, 1), heights = c(1, 3))
 148     par(mar = c(0.1, leftmar, 0.1, 0.1))
 149     plot(1:L, attr(x, "number"), type = "h", col = barcol, xlim = c(1, L),
 150          xlab = "", ylab = "Frequency", xaxt = "n", bty = "n")
 151     plot(0, type = "n", xlim = c(1, L), ylim = c(1, n),
 152          xlab = "", ylab = "", xaxt = "n", yaxt = "n")
 153     for (i in 1:L) points(rep(i, length(x[[i]])), x[[i]], ...)
 154     mtext(attr(x, "labels"), side = 2, at = 1:n, las = 1)
 155 }
 156
 157 prop.clades <- function(phy, ..., part = NULL, rooted = FALSE)
 158 {
 159     if (is.null(part)) {
 160         ## <FIXME>
 161         ## Are we going to keep the '...' way of passing trees?
 162         obj <- list(...)
 163         if (length(obj) == 1 && class(obj[[1]]) != "phylo")
 164             obj <- unlist(obj, recursive = FALSE)
 165         ## </FIXME>
 166         part <- prop.part(obj, check.labels = TRUE)
 167     }
 168
 169     ## until ape 3.0-7 it was assumed implicitly that the labels in phy
 170     ## are in the same order than in 'part' (bug report by Rupert Collins)
 171     if (!identical(phy$tip.label, attr(part, "labels"))) {
 172         i <- match(phy$tip.label, attr(part, "labels"))
 173         j <- match(seq_len(Ntip(phy)), phy$edge[, 2])
 174         phy$edge[j, 2] <- i
 175         phy$tip.label <- attr(part, "labels")
 176     }
 177     bp <- prop.part(phy)
 178     if (!rooted) {
 179         bp <- postprocess.prop.part(bp)
 180         part <- postprocess.prop.part(part) # fix by Klaus Schliep
 181         ## actually the above line in not needed if called from boot.phylo()
 182     }
 183
 184     n <- numeric(phy$Nnode)
 185     for (i in seq_along(bp)) {
 186         for (j in seq_along(part)) {
 187             ## we rely on the fact the values returned by prop.part are
 188             ## sorted and without attributes, so identical can be used:
 189             if (identical(bp[[i]], part[[j]])) {
 190                 n[i] <- attr(part, "number")[j]
 191                 done <-  TRUE
 192                 break
 193             }
 194         }
 195     }
 196     n
 197 }
 198
 199 boot.phylo <- function(phy, x, FUN, B = 100, block = 1,
 200                        trees = FALSE, quiet = FALSE, rooted = FALSE)
 201 {
 202     if (is.list(x) && !is.data.frame(x)) {
 203         if (inherits(x, "DNAbin")) x <- as.matrix(x)
 204         else {
 205             nm <- names(x)
 206             n <- length(x)
 207             x <- unlist(x)
 208             nL <- length(x)
 209             x <- matrix(x, n, nL/n, byrow = TRUE)
 210             rownames(x) <- nm
 211         }
 212     }
 213     boot.tree <- vector("list", B)
 214     if (!quiet) # suggestion by Alastair Potts
 215         progbar <- utils::txtProgressBar(style = 3)
 216     for (i in 1:B) {
 217         if (block > 1) {
 218             y <- seq(block, ncol(x), block)
 219             boot.i <- sample(y, replace = TRUE)
 220             boot.samp <- numeric(ncol(x))
 221             boot.samp[y] <- boot.i
 222             for (j in 1:(block - 1))
 223                 boot.samp[y - j] <- boot.i - j
 224         } else boot.samp <- sample(ncol(x), replace = TRUE)
 225         boot.tree[[i]] <- FUN(x[, boot.samp])
 226         if (!quiet) utils::setTxtProgressBar(progbar, i/B)
 227     }
 228     if (!quiet) close(progbar)
 229     for (i in 1:B) storage.mode(boot.tree[[i]]$Nnode) <- "integer"
 230     storage.mode(phy$Nnode) <- "integer"
 231
 232     pp <- prop.part(boot.tree)
 233     if (!rooted) pp <- postprocess.prop.part(pp)
 234     ans <- prop.clades(phy, part = pp, rooted = rooted)
 235
 236     ##ans <- attr(.Call("prop_part", c(list(phy), boot.tree),
 237     ##                  B + 1, FALSE, PACKAGE = "ape"), "number") - 1
 238     if (trees) {
 239         class(boot.tree) <- "multiPhylo"
 240         ans <- list(BP = ans, trees = boot.tree)
 241     }
 242     ans
 243 }
 244
 245 ### The next function transforms an object of class "prop.part" so
 246 ### that the vectors which are identical in terms of split are aggregated.
 247 ### For instance if n = 5 tips, 1:2 and 3:5 actually represent the same
 248 ### split though they are different clades. The aggregation is done
 249 ### arbitrarily. The call to ONEwise() insures that all splits include
 250 ### the first tip.
 251 postprocess.prop.part <- function(x)
 252 {
 253     n <- length(x[[1]])
 254     N <- length(x)
 255     w <- attr(x, "number")
 256
 257     drop <- logical(N)
 258     V <- numeric(n)
 259     for (i in 2:(N - 1)) {
 260         if (drop[i]) next
 261         A <- x[[i]]
 262         for (j in (i + 1):N) {
 263             if (drop[j]) next
 264             B <- x[[j]]
 265             if (length(A) + length(B) != n) next
 266             V[] <- 0L
 267             V[A] <- 1L
 268             V[B] <- 1L
 269             if (all(V == 1L)) {
 270                 drop[j] <- TRUE
 271                 w[i] <- w[i] + w[j]
 272             }
 273         }
 274     }
 275     if (any(drop)) {
 276         labels <- attr(x, "labels")
 277         x <- x[!drop]
 278         w <- w[!drop]
 279         attr(x, "number") <- w
 280         attr(x, "labels") <- labels
 281         class(x) <- "prop.part"
 282     }
 283     ONEwise(x)
 284 }
 285
 286 ### This function changes an object of class "prop.part" so that they
 287 ### all include the first tip. For instance if n = 5 tips, 3:5 is
 288 ### changed to 1:2.
 289 ONEwise <- function(x)
 290 {
 291     n <- length(x[[1L]])
 292     v <- 1:n
 293     for (i in 2:length(x)) {
 294         y <- x[[i]]
 295         if (y[1] != 1) x[[i]] <- v[-y]
 296     }
 297     x
 298 }
 299
 300 consensus <- function(..., p = 1, check.labels = TRUE)
 301 {
 302     foo <- function(ic, node) {
 303         ## ic: index of 'pp'
 304         ## node: node number in the final tree
 305         pool <- pp[[ic]]
 306         if (ic < m) {
 307             for (j in (ic + 1):m) {
 308                 wh <- match(pp[[j]], pool)
 309                 if (!any(is.na(wh))) {
 310                     edge[pos, 1] <<- node
 311                     pool <- pool[-wh]
 312                     edge[pos, 2] <<- nextnode <<- nextnode + 1L
 313                     pos <<- pos + 1L
 314                     foo(j, nextnode)
 315                 }
 316             }
 317         }
 318         size <- length(pool)
 319         if (size) {
 320             ind <- pos:(pos + size - 1)
 321             edge[ind, 1] <<- node
 322             edge[ind, 2] <<- pool
 323             pos <<- pos + size
 324         }
 325     }
 326     obj <- list(...)
 327     if (length(obj) == 1) {
 328         ## better than unlist(obj, recursive = FALSE)
 329         ## because "[[" keeps the class of 'obj':
 330         obj <- obj[[1]]
 331         if (class(obj) == "phylo") return(obj)
 332     }
 333     if (!is.null(attr(obj, "TipLabel")))
 334         labels <- attr(obj, "TipLabel")
 335     else {
 336         labels <- obj[[1]]$tip.label
 337         if (check.labels) obj <- .compressTipLabel(obj)
 338     }
 339     ntree <- length(obj)
 340     ## Get all observed partitions and their frequencies:
 341     pp <- prop.part(obj, check.labels = FALSE)
 342     ## Drop the partitions whose frequency is less than 'p':
 343     if (p == 0.5) p <- 0.5000001 # avoid incompatible splits
 344     pp <- pp[attr(pp, "number") >= p * ntree]
 345     ## Get the order of the remaining partitions by decreasing size:
 346     ind <- sort(unlist(lapply(pp, length)), decreasing = TRUE,
 347                 index.return = TRUE)$ix
 348     pp <- pp[ind]
 349     n <- length(labels)
 350     m <- length(pp)
 351     edge <- matrix(0L, n + m - 1, 2)
 352     if (m == 1) {
 353         edge[, 1] <- n + 1L
 354         edge[, 2] <- 1:n
 355     } else {
 356         nextnode <- n + 1L
 357         pos <- 1L
 358         foo(1, nextnode)
 359     }
 360     structure(list(edge = edge, tip.label = labels,
 361               Nnode = m), class = "phylo")
 362 }