final commit for ape 3.0

[ape.git] / R / chronopl.R
diff --git a/R/chronopl.R b/R/chronopl.R

index 1f111b2e152801583f1c3a004c291de5b344b6ef..98f27fb8c1ebe85bd7bb2f359442a4aa8a239bdd 100644 (file)
--- a/R/chronopl.R
+++ b/R/chronopl.R
@@ -1,33 +1,60 @@
-## chronopl.R (2007-01-17)
+## chronopl.R (2012-02-09)
  
  ##   Molecular Dating With Penalized Likelihood
  
-## Copyright 2005-2007 Emmanuel Paradis
+## Copyright 2005-2012 Emmanuel Paradis
  
  ## This file is part of the R-package `ape'.
  ## See the file ../COPYING for licensing issues.
  
-chronopl <- function(phy, lambda, node.age = 1, node = "root",
-                     CV = FALSE)
+chronopl <-
+    function(phy, lambda, age.min = 1, age.max = NULL,
+             node = "root", S = 1, tol = 1e-8,
+             CV = FALSE, eval.max = 500, iter.max = 500, ...)
  {
      n <- length(phy$tip.label)
-    n.node <- phy$Nnode
-    if (n != n.node + 1)
-      stop("the tree must be rooted AND dichotomous")
-    if (any(phy$edge.length == 0))
-      stop("some branch lengths are equal to zero;
-you must remove them beforehand.")
-    N <- dim(phy$edge)[1]
-    ROOT <- n + 1
-    if (node == "root") node <- ROOT
-    ini.rate <- phy$edge.length
-    ## `known.ages' contains the index of all nodes (internal and
-    ## terminal) of known age:
-    known.ages <- c(1:n, node)
-    ## `unknown.ages' contains the index of the nodes of unknown age:
-    unknown.ages <- ((n + 1):(n + n.node))[-(node - n)]
-    ## `basal' contains the indices of the basal edges (ie, linked to the root):
-    basal <- which(phy$edge[, 1] == ROOT)
+    ROOT <- n + 1L
+    if (identical(node, "root")) node <- ROOT
+    if (any(node <= n))
+        stop("node numbers should be greater than the number of tips")
+    zerobl <- which(phy$edge.length <= 0)
+    if (length(zerobl)) {
+        if (any(phy$edge[zerobl, 2] <= n))
+            stop("at least one terminal branch is of length zero:
+  you should remove it to have a meaningful estimation.")
+        else {
+            warning("at least one internal branch is of length zero:
+  it was collapsed and some nodes have been deleted.")
+            if (length(node) == 1 && node == ROOT)
+                phy <- di2multi(phy)
+            else {
+                tmp <- FALSE
+                if (is.null(phy$node.label)) {
+                    tmp <- !tmp
+                    phy$node.label <- paste("node", 1:phy$Nnode)
+                }
+                node.lab <- phy$node.label[node - n]
+                phy <- di2multi(phy)
+                node <- match(node.lab, phy$node.label) + n
+                if (tmp) phy$node.label <- NULL
+            }
+        }
+    }
+    m <- phy$Nnode
+    el <- phy$edge.length
+    e1 <- phy$edge[, 1L]
+    e2 <- phy$edge[, 2L]
+    N <- length(e1)
+    TIPS <- 1:n
+    EDGES <- 1:N
+
+    ini.rate <- el
+    el <- el/S
+
+    ## `basal' contains the indices of the basal edges
+    ## (ie, linked to the root):
+    basal <- which(e1 == ROOT)
+    Nbasal <- length(basal)
  
      ## `ind' contains in its 1st column the index of all nonbasal
      ## edges, and in its second column the index of the edges
@@ -40,63 +67,161 @@ you must remove them beforehand.")
      ##        |           | c | a |
      ##        |___c___    |   |   |
  
-    ind <- matrix(NA, N - 2, 2)
-    j <- 1
-    for (i in 1:N) {
-        if (phy$edge[i, 1] == ROOT) next
-        ind[j, 1] <- i
-        ind[j, 2] <- which(phy$edge[, 2] == phy$edge[i, 1])
-        j <- j + 1
+    ind <- matrix(0L, N - Nbasal, 2)
+    ind[, 1] <- EDGES[-basal]
+    ind[, 2] <- match(e1[EDGES[-basal]], e2)
+
+    age <- numeric(n + m)
+
+#############################################################################
+### This bit sets 'ini.time' and should result in no negative branch lengths
+
+    seq.nod <- .Call("seq_root2tip", phy$edge, n, phy$Nnode, PACKAGE = "ape")
+
+    ini.time <- age
+    ini.time[ROOT:(n + m)] <- NA
+    ini.time[node] <- if (is.null(age.max)) age.min else (age.min + age.max) / 2
+
+    ## if no age given for the root, find one approximately:
+    if (is.na(ini.time[ROOT]))
+        ini.time[ROOT] <- if (is.null(age.max)) 3 * max(age.min) else 3 * max(age.max)
+
+    ISnotNA.ALL <- unlist(lapply(seq.nod, function(x) sum(!is.na(ini.time[x]))))
+    o <- order(ISnotNA.ALL, decreasing = TRUE)
+
+    for (y in seq.nod[o]) {
+        ISNA <- is.na(ini.time[y])
+        if (any(ISNA)) {
+            i <- 2L # we know the 1st value is not NA, so we start at the 2nd one
+            while (i <= length(y)) {
+                if (ISNA[i]) { # we stop at the next NA
+                    j <- i + 1L
+                    while (ISNA[j]) j <- j + 1L # look for the next non-NA
+                    nb.val <- j - i
+                    by <- (ini.time[y[i - 1L]] - ini.time[y[j]]) / (nb.val + 1)
+                    ini.time[y[i:(j - 1L)]] <- ini.time[y[i - 1L]] - by * seq_len(nb.val)
+                    i <- j + 1L
+                } else i <- i + 1L
+            }
+        }
+    }
+
+    real.edge.length <- ini.time[e1] - ini.time[e2]
+
+    if (any(real.edge.length <= 0))
+        stop("some initial branch lengths are zero or negative;
+  maybe you need to adjust the given dates -- see '?chronopl' for details")
+#############################################################################
+
+    ## because if (!is.null(age.max)), 'node' is modified,
+    ## so we copy it in case CV = TRUE:
+    node.bak <- node
+
+    ## `unknown.ages' will contain the index of the nodes of unknown age:
+    unknown.ages <- n + 1:m
+
+    ## define the bounds for the node ages:
+    lower <- rep(tol, length(unknown.ages))
+    upper <- rep(1/tol, length(unknown.ages))
+
+    if (!is.null(age.max)) { # are some nodes known within some intervals?
+        lower[node - n] <- age.min
+        upper[node - n] <- age.max
+        ## find nodes known within an interval:
+        interv <- which(age.min != age.max)
+        ## drop them from the 'node' since they will be estimated:
+        node <- node[-interv]
+        if (length(node)) age[node] <- age.min[-interv] # update 'age'
+    } else age[node] <- age.min
+
+    if (length(node)) {
+        unknown.ages <- unknown.ages[n - node] # 'n - node' is simplification for '-(node - n)'
+        lower <- lower[n - node]
+        upper <- upper[n - node]
      }
  
-    age <- rep(0, 2*n - 1)
-    age[node] <- node.age
-
-    tmp <- reorder(phy, "pruningwise")
-    ini.time <- .C("node_depth", as.integer(n), as.integer(n.node),
-                   as.integer(tmp$edge[, 1]), as.integer(tmp$edge[, 2]),
-                   as.integer(N), double(n + n.node), DUP = FALSE,
-                   PACKAGE = "ape")[[6]][-(1:n)] - 1
-    ini.time <- ini.time/max(ini.time)
-    ini.time <- ini.time*node.age/ini.time[known.ages[-(1:n)] - n]
-    ## check that there are no negative branch lengths:
-    ini.time[known.ages[-(1:n)] - n] <- node.age
-    it <- c(age[1:n], ini.time)
-    ibl <- it[phy$edge[, 1]] - it[phy$edge[, 2]]
-    if (any(ibl < 0)) {
-        for (i in which(ibl < 0))
-          if (phy$edge[i, 1] %in% node)
-            ini.time[phy$edge[i, 2]] <- ini.time[phy$edge[i, 1]] - 1e-3
-          else
-            ini.time[phy$edge[i, 1]] <- ini.time[phy$edge[i, 2]] + 1e-3
+    ## `known.ages' contains the index of all nodes (internal and
+    ## terminal) of known age:
+    known.ages <- c(TIPS, node)
+
+    ## concatenate the bounds for the rates:
+    lower <- c(rep(tol, N), lower)
+    upper <- c(rep(1 - tol, N), upper)
+
+    minusploglik.gr <- function(rate, node.time) {
+        grad <- numeric(N + length(unknown.ages))
+        age[unknown.ages] <- node.time
+        real.edge.length <- age[e1] - age[e2]
+        if (any(real.edge.length < 0)) {
+            grad[] <- 0
+            return(grad)
+        }
+        ## gradient for the rates:
+        ## the parametric part can be calculated without a loop:
+        grad[EDGES] <- real.edge.length - el/rate
+        if (Nbasal == 2) { # the simpler formulae if there's a basal dichotomy
+            grad[basal[1]] <-
+                grad[basal[1]] + lambda*(rate[basal[1]] - rate[basal[2]])
+            grad[basal[2]] <-
+                grad[basal[2]] + lambda*(rate[basal[2]] - rate[basal[1]])
+        } else { # the general case
+            for (i in 1:Nbasal)
+                grad[basal[i]] <- grad[basal[i]] +
+                    lambda*(2*rate[basal[i]]*(1 - 1/Nbasal) -
+                            2*sum(rate[basal[-i]])/Nbasal)/(Nbasal - 1)
+        }
+
+        for (i in EDGES) {
+            ii <- c(which(e2 == e1[i]), which(e1 == e2[i]))
+            if (!length(ii)) next
+            grad[i] <- grad[i] + lambda*(2*length(ii)*rate[i] - 2*sum(rate[ii]))
+        }
+
+        ## gradient for the 'node times'
+        for (i in 1:length(unknown.ages)) {
+            nd <- unknown.ages[i]
+            ii <- which(e1 == nd)
+            grad[i + N] <-
+                sum(rate[ii] - el[ii]/real.edge.length[ii])#, na.rm = TRUE)
+            if (nd != ROOT) {
+                ii <- which(e2 == nd)
+                grad[i + N] <- grad[i + N] -
+                    rate[ii] + el[ii]/real.edge.length[ii]
+            }
+        }
+        grad
      }
  
-    ploglik <- function(rate, node.time) {
+    minusploglik <- function(rate, node.time) {
          age[unknown.ages] <- node.time
-        real.edge.length <- age[phy$edge[, 1]] - age[phy$edge[, 2]]
+        real.edge.length <- age[e1] - age[e2]
+        if (any(real.edge.length < 0)) return(1e50)
          B <- rate*real.edge.length
-        loglik <- sum(-B + phy$edge.length*log(B) -
-                      lfactorial(phy$edge.length))
-        loglik - lambda * (sum((rate[ind[, 1]] - rate[ind[, 2]])^2)
-                           + var(rate[basal]))
+        loglik <- sum(-B + el*log(B) - lfactorial(el))
+        -(loglik - lambda*(sum((rate[ind[, 1]] - rate[ind[, 2]])^2)
+                           + var(rate[basal])))
      }
  
-    out <- nlm(function(p) -ploglik(p[1:N], p[-(1:N)]),
-               p = c(ini.rate, ini.time[unknown.ages - n]),
-               iterlim = 500)
+    out <- nlminb(c(ini.rate, ini.time[unknown.ages]),
+                  function(p) minusploglik(p[EDGES], p[-EDGES]),
+                  function(p) minusploglik.gr(p[EDGES], p[-EDGES]),
+                  control = list(eval.max = eval.max, iter.max = iter.max, ...),
+                  lower = lower, upper = upper)
  
-    attr(phy, "ploglik") <- -out$minimum
-    attr(phy, "rates") <- out$estimate[1:N]
-    age[unknown.ages] <- out$estimate[-(1:N)]
+    attr(phy, "ploglik") <- -out$objective
+    attr(phy, "rates") <- out$par[EDGES]
+    attr(phy, "message") <- out$message
+    age[unknown.ages] <- out$par[-EDGES]
      if (CV) ophy <- phy
-    phy$edge.length <- age[phy$edge[, 1]] - age[phy$edge[, 2]]
-    if (CV)
-      attr(phy, "D2") <-
-        chronopl.cv(ophy, lambda, node.age, node, n)
+    phy$edge.length <- age[e1] - age[e2]
+    if (CV) attr(phy, "D2") <-
+        chronopl.cv(ophy, lambda, age.min, age.max, node.bak,
+                    n, S, tol, eval.max, iter.max, ...)
      phy
  }
  
-chronopl.cv <- function(ophy, lambda, node.age, nodes, n)
+chronopl.cv <- function(ophy, lambda, age.min, age.max, nodes,
+                        n, S, tol, eval.max, iter.max, ...)
  ### ophy: the original phylogeny
  ### n: number of tips
  ### Note that we assume here that the order of the nodes
@@ -107,22 +232,23 @@ chronopl.cv <- function(ophy, lambda, node.age, nodes, n)
      D2 <- numeric(n)
  
      for (i in 1:n) {
-        cat("  dropping tip", i, "\n")
+        cat("\r  dropping tip ", i, " / ", n, sep = "")
          tr <- drop.tip(ophy, i)
          j <- which(ophy$edge[, 2] == i)
          if (ophy$edge[j, 1] %in% nodes) {
              k <- which(nodes == ophy$edge[j, 1])
-            nodes <- nodes[-k]
-            node.age <- node.age[-k]
-        }
-        if (length(nodes)) {
-            chr <- chronopl(tr, lambda, node.age, nodes)
-            ## <FIXME> à vérifier:
-            ## tmp <- BT[as.character(ophy$edge[j, 1])]
-            tmp <- BT[-(ophy$edge[j, 1] - n)]
-            ## </FIXME>
+            node <- nodes[-k]
+            agemin <- age.min[-k]
+            agemax <- age.max[-k]
+        } else node <- nodes
+        if (length(node)) {
+            chr <- chronopl(tr, lambda, age.min, age.max, node,
+                            S, tol, FALSE, eval.max, iter.max, ...)
+            tmp <-
+                if (Nnode(chr) == Nnode(ophy)) BT else BT[-(ophy$edge[j, 1] - n)]
              D2[i] <- sum((tmp - branching.times(chr))^2 / tmp)
          } else D2[i] <- 0
      }
+    cat("\n")
      D2
  }