bug fix in root()

[ape.git] / R / root.R
diff --git a/R/root.R b/R/root.R

index ba80e4a06dde46345423d8a5d3e6f58006f56849..8526b8f3d4f74add15240b88eca14a8267785496 100644 (file)
--- a/R/root.R
+++ b/R/root.R
@@ -1,29 +1,28 @@
-## root.R (2007-12-21)
+## root.R (2011-08-05)
  
  ##   Root of Phylogenetic Trees
  
-## Copyright 2004-2007 Emmanuel Paradis
+## Copyright 2004-2011 Emmanuel Paradis
  
  ## This file is part of the R-package `ape'.
  ## See the file ../COPYING for licensing issues.
  
  is.rooted <- function(phy)
  {
-    if (!"phylo" %in% class(phy))
-      stop('object "phy" is not of class "phylo"')
-    if (!is.null(phy$root.edge)) return(TRUE)
+    if (!inherits(phy, "phylo"))
+        stop('object "phy" is not of class "phylo"')
+    if (!is.null(phy$root.edge)) TRUE
      else
-      if (tabulate(phy$edge[, 1])[length(phy$tip.label) + 1] > 2)
-        return(FALSE)
-      else return(TRUE)
+        if (tabulate(phy$edge[, 1])[length(phy$tip.label) + 1] > 2)
+            FALSE else TRUE
  }
  
  unroot <- function(phy)
  {
-    if (class(phy) != "phylo")
-      stop('object "phy" is not of class "phylo"')
+    if (!inherits(phy, "phylo"))
+        stop('object "phy" is not of class "phylo"')
      if (dim(phy$edge)[1] < 3)
-      stop("cannot unroot a tree with two edges.")
+        stop("cannot unroot a tree with less than three edges.")
      ## delete FIRST the root.edge (in case this is sufficient to
      ## unroot the tree, i.e. there is a multichotomy at the root)
      if (!is.null(phy$root.edge)) phy$root.edge <- NULL
@@ -36,11 +35,11 @@ unroot <- function(phy)
      ## nodes by adding 1, except the root (this remains the
      ## origin of the tree).
      nb.tip <- length(phy$tip.label)
-    ROOT <- nb.tip + 1
+    ROOT <- nb.tip + 1L
      EDGEROOT <- which(phy$edge[, 1] == ROOT)
      ## j: the target where to stick the edge
      ## i: the edge to delete
-    if (phy$edge[EDGEROOT[1], 2] == ROOT + 1) {
+    if (phy$edge[EDGEROOT[1], 2] == ROOT + 1L) {
          j <- EDGEROOT[2]
          i <- EDGEROOT[1]
      } else {
@@ -51,98 +50,258 @@ unroot <- function(phy)
      ## cladewise order.
      phy$edge <- phy$edge[-i, ]
      nodes <- phy$edge > ROOT # renumber all nodes except the root
-    phy$edge[nodes] <- phy$edge[nodes] - 1
+    phy$edge[nodes] <- phy$edge[nodes] - 1L
      if (!is.null(phy$edge.length)) {
          phy$edge.length[j] <- phy$edge.length[j] + phy$edge.length[i]
          phy$edge.length <- phy$edge.length[-i]
      }
-    phy$Nnode <- phy$Nnode - 1
+    phy$Nnode <- phy$Nnode - 1L
      if (!is.null(phy$node.label))
        phy$node.label <- phy$node.label[-2]
      phy
  }
  
-root <- function(phy, outgroup)
+root <- function(phy, outgroup, node = NULL,
+                 resolve.root = FALSE, interactive = FALSE)
  {
-    if (class(phy) != "phylo")
-      stop('object "phy" is not of class "phylo"')
-    if (is.character(outgroup))
-      outgroup <- which(phy$tip.label %in% outgroup)
-    nb.tip <- length(phy$tip.label)
-    if (length(outgroup) == nb.tip) return(phy)
-
-    ## First check that the outgroup is monophyletic--
-    ## unless there's only one tip specified of course
-    ROOT <- nb.tip + 1
-    if (length(outgroup) > 1) {
-        msg <- "the specified outgroup is not monophyletic!"
-        ## If all tips in `tip' are not contiguous, then
-        ## no need to go further:
-        if (!all(diff(outgroup) == 1)) stop(msg)
-        seq.nod <- .Call("seq_root2tip", phy$edge, nb.tip,
-                         phy$Nnode, PACKAGE = "ape")
-        sn <- seq.nod[outgroup]
-        ## We go from the root to the tips: the sequence of nodes
-        ## is identical until the MRCA:
-        newroot <- ROOT
-        i <- 2 # we start at the 2nd position since the root
-               # of the tree is a common ancestor to all tips
+    if (!inherits(phy, "phylo"))
+        stop('object "phy" is not of class "phylo"')
+    phy <- reorder(phy)
+    n <- length(phy$tip.label)
+    ROOT <- n + 1L
+    if (interactive) {
+        node <- identify(phy)$nodes
+        cat("You have set resolve.root =", resolve.root, "\n")
+    }
+    if (!is.null(node)) {
+        if (node <= n)
+            stop("incorrect node#: should be greater than the number of taxa")
+        outgroup <- NULL
+        newroot <- node
+    } else {
+        if (is.numeric(outgroup)) {
+            if (any(outgroup > n))
+                stop("incorrect taxa#: should not be greater than the number of taxa")
+            outgroup <- sort(outgroup) # used below
+        }
+        if (is.character(outgroup))
+            outgroup <- which(phy$tip.label %in% outgroup)
+        if (length(outgroup) == n) return(phy)
+
+        ## First check that the outgroup is monophyletic--
+        ## unless there's only one tip specified of course
+        if (length(outgroup) > 1) {
+            pp <- prop.part(phy)
+            ingroup <- (1:n)[-outgroup]
+            newroot <- 0L
+            for (i in 2:phy$Nnode) {
+                if (identical(pp[[i]], ingroup)) {
+                    newroot <- i + n
+                    break
+                }
+                if (identical(pp[[i]], outgroup)) {
+                    newroot <- phy$edge[which(phy$edge[, 2] == i + n), 1]
+                    break
+                }
+            }
+            if (!newroot)
+                stop("the specified outgroup is not monophyletic")
+        } else newroot <- phy$edge[which(phy$edge[, 2] == outgroup), 1]
+    }
+    N <- Nedge(phy)
+    oldNnode <- phy$Nnode
+    if (newroot == ROOT) {
+        if (resolve.root) {
+            snw <- which(phy$edge[, 1] == newroot)
+            if (length(snw) > 2) {
+                a <- snw[1]:(snw[2] - 1)
+                b <- snw[2]:N
+                newnod <- oldNnode + n + 1
+                phy$edge[snw[-1], 1] <- newnod
+                phy$edge <- rbind(phy$edge[a, ], c(ROOT, newnod),
+                                  phy$edge[b, ])
+                if (!is.null(phy$edge.length))
+                phy$edge.length <-
+                    c(phy$edge.length[a], 0, phy$edge.length[b])
+                phy$Nnode <- phy$Nnode + 1L
+                ## node renumbering (see comments below)
+                newNb <- integer(n + oldNnode)
+                newNb[newroot] <- n + 1L
+                sndcol <- phy$edge[, 2] > n
+                phy$edge[sndcol, 2] <- newNb[phy$edge[sndcol, 2]] <-
+                    (n + 2):(n + phy$Nnode)
+                phy$edge[, 1] <- newNb[phy$edge[, 1]]
+            }
+        }
+        return(phy)
+    }
+
+    phy$root.edge <- NULL # just in case...
+    Nclade <- tabulate(phy$edge[, 1])[ROOT] # degree of the root node
+    ## if only 2 edges connect to the root, we have to fuse them:
+    fuseRoot <- Nclade == 2
+
+    start <- which(phy$edge[, 1] == ROOT)
+    end <- c(start[-1] - 1, N)
+    o <- integer(N)
+    INV <- logical(N)
+
+    w <- which(phy$edge[, 2] == newroot)
+    nod <- phy$edge[w, 1]
+    i <- w
+    NEXT <- 1L
+
+    ## The loop below starts from the new root and goes up in the edge
+    ## matrix reversing the edges that need to be as well as well
+    ## inverting their order. The other edges must not be changed, so
+    ## their indices are stored in `stack'.
+    ## We then bind the other edges in a straightforward way.
+
+    if (nod != ROOT) {
+        ## it is important that the 3 next lines
+        ## are inside this "if" statement
+        o[NEXT] <- w
+        NEXT <- NEXT + 1L
+        INV[w] <- TRUE
+        i <- w - 1L
+        stack <- 0L
          repeat {
-            x <- unique(unlist(lapply(sn, "[", i)))
-            if (length(x) != 1) break
-            newroot <- x
-            i <- i + 1
+            if (phy$edge[i, 2] == nod) {
+                if (stack) {
+                    o[NEXT:(NEXT + stack - 1L)] <- (i + 1L):(i + stack)
+                    NEXT <- NEXT + stack
+                    stack <- 0L
+                }
+                if (phy$edge[i, 1] == ROOT) break
+                o[NEXT] <- i
+                NEXT <- NEXT + 1L
+                INV[i] <- TRUE
+                nod <- phy$edge[i, 1]
+            } else stack <- stack + 1L
+            i <- i - 1L
+        }
+    }
+
+    ## we keep the edge leading to the old root if needed:
+    if (!fuseRoot) {
+        o[NEXT] <- i
+        INV[i] <- TRUE
+        NEXT <- NEXT + 1L
+    }
+
+    endOfOutgroup <- which(phy$edge[(w+1):N, 1] < newroot)[1] + w - 1
+    if (is.na(endOfOutgroup)) endOfOutgroup <- N
+    endOfClade <- end[end >= endOfOutgroup][1]
+
+    ## bind the other clades...
+    for (j in 1:Nclade) {
+        if (end[j] == endOfClade) next
+        ## do we have to fuse the two basal edges?
+        if (fuseRoot) {
+            phy$edge[start[j], 1] <- phy$edge[i, 2]
+            if (!is.null(phy$edge.length))
+                phy$edge.length[start[j]] <- phy$edge.length[start[j]] +
+                    phy$edge.length[i]
+        } #else {
+          #  o[NEXT] <- i#start[j]
+          #  NEXT <- NEXT + 1L
+        #}
+        s <- start[j]:end[j]
+        ne <- length(s)
+        o[NEXT:(NEXT + ne - 1L)] <- s
+        NEXT <- NEXT + ne
+    }
+
+    ## possibly bind the edges below the outgroup till the end of the clade
+    if (all(endOfOutgroup != end)) {
+        j <- (endOfOutgroup + 1L):endOfClade
+        ## we must take care that the branch inversions done above
+        ## may have changed the hierarchy of clades here, so we
+        ## travel from the bottom of this series of edges
+        stack <- 0L
+        inverted <- phy$edge[INV, 1] # <- fails if ', 2]' is used
+        for (k in rev(j)) {
+            if (any(phy$edge[k, 1] == inverted)) {
+                o[NEXT] <- k
+                NEXT <- NEXT + 1L
+                if (stack){
+                    o[NEXT:(NEXT + stack - 1L)] <- (k + 1L):(k + stack)
+                    NEXT <- NEXT + stack
+                    stack <- 0L
+                }
+            } else stack <- stack + 1L
+        }
+    }
+
+    ## ... and the outgroup
+    s <- (w + 1L):endOfOutgroup
+    ne <- length(s)
+    o[NEXT:(NEXT + ne - 1L)] <- s
+
+    if (fuseRoot) {
+        phy$Nnode <- oldNnode - 1L
+        N <- N - 1L
+    }
+    phy$edge[INV, ] <- phy$edge[INV, 2:1]
+    phy$edge <- phy$edge[o, ]
+    if (!is.null(phy$edge.length))
+        phy$edge.length <- phy$edge.length[o]
+
+    if (resolve.root) {
+        newnod <- oldNnode + n + 1
+        if (length(outgroup) == 1L) {
+            wh <- which(phy$edge[, 2] == outgroup)
+            phy$edge[1] <- newnod
+            phy$edge <-
+                rbind(c(newroot, newnod), phy$edge[-wh, ], phy$edge[wh, ])
+            snw <- which(phy$edge[, 1] == newroot)
+            phy$edge[snw[length(snw) - 1], 1] <- newnod
+            if (!is.null(phy$edge.length)) {
+                phy$edge.length <-
+                    c(0, phy$edge.length[-wh], phy$edge.length[wh])
+            }
+        } else {
+            wh <- which(phy$edge[, 1] == newroot)
+            phy$edge[wh[-1], 1] <- newnod
+            s1 <- 1:(wh[2] - 1)
+            s2 <- wh[2]:N
+            phy$edge <-
+                rbind(phy$edge[s1, ], c(newroot, newnod), phy$edge[s2, ])
+            if (!is.null(phy$edge.length)) {
+                tmp <- phy$edge.length[1]
+                phy$edge.length[1] <- 0
+                phy$edge.length <-
+                    c(phy$edge.length[s1], tmp, phy$edge.length[s2])
+            }
          }
-        ## Check that all descendants of this node
-        ## are included in the outgroup
-        ## (1st solution... there may be something smarter)
-        desc <- which(unlist(lapply(seq.nod,
-                                    function(x) any(x %in% newroot))))
-        if (length(outgroup) != length(desc)) stop(msg)
-        if (!all(sort(outgroup) == sort(desc))) stop(msg)
-
-    } else newroot <- phy$edge[which(phy$edge[, 2] == outgroup), 1]
-
-    if (newroot == ROOT) return(phy)
-
-### <FIXME>
-### The remaining part of the code has not been improved; this
-### does not seem obvious. This is delayed...     (2006-09-23)
-### </FIXME>
-
-    ## Invert all branches from the new root to the old one
-    i <- which(phy$edge[, 2] == newroot)
-    nod <- phy$edge[i, 1]
-    while (nod != ROOT) {
-        j <- which(phy$edge[, 2] == nod)
-        phy$edge[i, 1] <- phy$edge[i, 2]
-        phy$edge[i, 2] <- nod
-        i <- j
-        nod <- phy$edge[i, 1]
+        ## N <- N + 1L ... not needed
+        phy$Nnode <- phy$Nnode + 1L
      }
  
-    i.oroot <- which(phy$edge[, 1] == ROOT)
-    ## Unroot the tree if there's a basal dichotomy...
-    if (length(i.oroot) == 2) {
-        j <- i.oroot[which(i.oroot != i)]
-        phy$edge[j, 1] <- phy$edge[i, 2]
-        phy$edge <- phy$edge[-i, ]
-        if (!is.null(phy$edge.length)) {
-            phy$edge.length[j] <- phy$edge.length[j] + phy$edge.length[i]
-            phy$edge.length <- phy$edge.length[-i]
+    ## The block below renumbers the nodes so that they conform
+    ## to the "phylo" format
+    newNb <- integer(n + oldNnode)
+    newNb[newroot] <- n + 1L
+    sndcol <- phy$edge[, 2] > n
+    ## executed from right to left, so newNb is modified before phy$edge:
+    phy$edge[sndcol, 2] <- newNb[phy$edge[sndcol, 2]] <-
+        (n + 2):(n + phy$Nnode)
+    phy$edge[, 1] <- newNb[phy$edge[, 1]]
+
+    if (!is.null(phy$node.label)) {
+        #browser()
+        newNb <- newNb[-(1:n)]
+        if (fuseRoot) {
+            newNb <- newNb[-1]
+            phy$node.label <- phy$node.label[-1]
+        }
+        phy$node.label <- phy$node.label[order(newNb)]
+        if (resolve.root) {
+            phy$node.label[is.na(phy$node.label)] <- phy$node.label[1]
+            phy$node.label[1] <- NA
+            ##phy$node.label <- c(phy$node.label[1], NA, phy$node.label[-1])
+            ##phy$node.label <- c("NA", phy$node.label)
          }
-        phy$edge[which(phy$edge == newroot)] <- ROOT
-    } else {
-        ## ... otherwise just invert the root with the newroot
-        phy$edge[which(phy$edge == newroot)] <- ROOT
-        phy$edge[i.oroot] <- newroot
-        ## ... and invert finally! (fixed 2005-11-07)
-        phy$edge[i, ] <- rev(phy$edge[i, ])
      }
-    if (!is.null(phy$node.label))
-      ## It's important to not delete the label of the newroot
-      ## to keep the positions of the other nodes
-      phy$node.label[1] <- phy$node.label[newroot - nb.tip]
-    ## Not needed: phy$Nnode <- phy$Nnode - 1
-    read.tree(text = write.tree(phy))
+    phy
  }