--- /dev/null
+## chronos.R (2013-01-03)
+
+## Molecular Dating With Penalized and Maximum Likelihood
+
+## Copyright 2013 Emmanuel Paradis
+
+## This file is part of the R-package `ape'.
+## See the file ../COPYING for licensing issues.
+
+## Default control parameters for chronos(): convergence tolerance,
+## optimiser iteration/evaluation limits, number of rate categories for
+## model = "discrete", and the cap on the alternating rate/date
+## optimisation rounds. User copies are made via chronos.control().
+.chronos.ctrl <-
+ list(tol = 1e-8, iter.max = 1e4, eval.max = 1e4, nb.rate.cat = 10,
+ dual.iter.max = 20)
+
+makeChronosCalib <-
+    function(phy, node = "root", age.min = 1, age.max = age.min,
+             interactive = FALSE, soft.bounds = FALSE)
+{
+    ## Build the calibration data frame expected by chronos(), with one
+    ## row per calibrated node and columns: node, age.min, age.max,
+    ## soft.bounds. Either the arguments are used directly, or, when
+    ## 'interactive = TRUE', the tree is plotted and the constraints are
+    ## entered by clicking on nodes and typing ages on the keyboard.
+    n <- Ntip(phy)
+    if (!interactive) {
+        ## "root" is a shortcut for the root node number:
+        if (identical(node, "root")) node <- n + 1L
+    } else {
+        plot(phy)
+        cat("Click close to a node and enter the ages (right-click to exit)\n\n")
+        node <- integer()
+        age.min <- age.max <- numeric()
+        repeat {
+            ans <- identify(phy, quiet = TRUE)
+            if (is.null(ans)) break # right-click: stop collecting
+            picked <- ans$nodes
+            nodelabels(node = picked, col = "white", bg = "blue")
+            cat("constraints for node ", picked, sep = "")
+            cat("\n youngest age: ")
+            lo <- as.numeric(readLines(n = 1))
+            cat(" oldest age (ENTER if not applicable): ")
+            hi <- as.numeric(readLines(n = 1))
+            node <- c(node, picked)
+            age.min <- c(age.min, lo)
+            age.max <- c(age.max, hi)
+        }
+        ## an empty answer for the oldest age means a point calibration:
+        no.max <- is.na(age.max)
+        if (any(no.max)) age.max[no.max] <- age.min[no.max]
+    }
+
+    if (any(node <= n))
+        stop("node numbers should be greater than the number of tips")
+
+    ## sanity check: each interval must satisfy age.min <= age.max
+    bad <- which(age.max < age.min)
+    if (length(bad)) {
+        msg <- "'old age' less than 'young age' for node"
+        if (length(bad) > 1) msg <- paste(msg, "s", sep = "")
+        stop(paste(msg, paste(node[bad], collapse = ", ")))
+    }
+
+    data.frame(node, age.min, age.max, soft.bounds = soft.bounds)
+}
+
+chronos.control <- function(...)
+{
+    ## Return the list of control parameters for chronos(), starting
+    ## from the package defaults (.chronos.ctrl) and overriding them
+    ## with any matching named arguments; unknown names are dropped
+    ## with a warning.
+    opts <- .chronos.ctrl
+    user <- list(...)
+    if (length(user)) {
+        known <- names(user) %in% names(opts)
+        if (!all(known)) {
+            warning("some control parameter names do not match: they were ignored")
+            user <- user[known]
+        }
+        opts[names(user)] <- user
+    }
+    opts
+}
+
+chronos <-
+    function(phy, lambda = 1, model = "correlated", quiet = FALSE,
+             calibration = makeChronosCalib(phy),
+             control = chronos.control())
+{
+### Molecular dating of the tree 'phy' by penalised likelihood
+### (model = "correlated" or "relaxed") or maximum likelihood
+### (model = "discrete"), given the calibration points in
+### 'calibration'. Returns a chronogram of class c("chronos", "phylo")
+### with the fit details stored as attributes.
+    model <- match.arg(tolower(model), c("correlated", "relaxed", "discrete"))
+    n <- Ntip(phy)
+    ROOT <- n + 1L
+    m <- phy$Nnode
+    el <- phy$edge.length
+    if (any(el < 0)) stop("some branch lengths are negative")
+    e1 <- phy$edge[, 1L]
+    e2 <- phy$edge[, 2L]
+    N <- length(e1)
+    TIPS <- 1:n
+    EDGES <- 1:N
+
+    tol <- control$tol
+
+    node <- calibration$node
+    age.min <- calibration$age.min
+    age.max <- calibration$age.max
+
+    if (model == "correlated") {
+### `basal' contains the indices of the basal edges
+### (ie, linked to the root):
+        basal <- which(e1 == ROOT)
+        Nbasal <- length(basal)
+
+### 'ind1' contains the index of all nonbasal edges, and 'ind2' the
+### index of the edges where these edges come from (ie, they contain
+### pairs of contiguous edges), eg:
+
+###         ___b___    ind1  ind2
+###        |           |  | |  |
+### ___a___|           | b| | a|
+###        |           | c| | a|
+###        |___c___    |  | |  |
+
+        ind1 <- EDGES[-basal]
+        ind2 <- match(e1[EDGES[-basal]], e2)
+    }
+
+    age <- numeric(n + m)
+
+### This bit sets 'ini.time' and should result in no negative branch lengths
+
+    if (!quiet) cat("\nSetting initial dates...\n")
+    seq.nod <- .Call("seq_root2tip", phy$edge, n, phy$Nnode, PACKAGE = "ape")
+
+    ii <- 1L
+    repeat {
+        ini.time <- age
+        ini.time[ROOT:(n + m)] <- NA
+
+        ## calibrated nodes: drawn uniformly within their calibration
+        ## interval (a fixed value when age.min == age.max):
+        ini.time[node] <-
+            if (is.null(age.max)) age.min
+            else runif(length(node), age.min, age.max)
+
+        ## if no age given for the root, find one approximately:
+        if (is.na(ini.time[ROOT]))
+            ini.time[ROOT] <- if (is.null(age.max)) 3 * max(age.min) else 3 * max(age.max)
+
+        ## interpolate the missing node ages along each root-to-tip
+        ## path, treating the most constrained paths first:
+        ISnotNA.ALL <- unlist(lapply(seq.nod, function(x) sum(!is.na(ini.time[x]))))
+        o <- order(ISnotNA.ALL, decreasing = TRUE)
+
+        for (y in seq.nod[o]) {
+            ISNA <- is.na(ini.time[y])
+            if (any(ISNA)) {
+                i <- 2L # we know the 1st value is not NA, so we start at the 2nd one
+                while (i <= length(y)) {
+                    if (ISNA[i]) { # we stop at the next NA
+                        j <- i + 1L
+                        while (ISNA[j]) j <- j + 1L # look for the next non-NA
+                        nb.val <- j - i
+                        ## linear interpolation between the two known ages:
+                        by <- (ini.time[y[i - 1L]] - ini.time[y[j]]) / (nb.val + 1)
+                        ini.time[y[i:(j - 1L)]] <- ini.time[y[i - 1L]] - by * seq_len(nb.val)
+                        i <- j + 1L
+                    } else i <- i + 1L
+                }
+            }
+        }
+        ## accept this set of starting dates only if every branch gets a
+        ## nonnegative implied length:
+        if (all(ini.time[e1] - ini.time[e2] >= 0)) break
+        ii <- ii + 1L
+        if (ii > 1000)
+            stop("cannot find reasonable starting dates after 1000 tries:
+maybe you need to adjust the calibration dates")
+    }
+### 'ini.time' set
+
+### Setting 'ini.rate'
+    ini.rate <- el/(ini.time[e1] - ini.time[e2])
+
+    if (model == "discrete") {
+        Nb.rates <- control$nb.rate.cat
+        minmax <- range(ini.rate)
+        if (Nb.rates == 1) {
+            ini.rate <- sum(minmax)/2
+        } else {
+            ## midpoints of Nb.rates equal-width classes spanning the
+            ## range of the initial rates:
+            inc <- diff(minmax)/Nb.rates
+            ini.rate <- seq(minmax[1] + inc/2, minmax[2] - inc/2, inc)
+            ## Nb.rates - 1 free frequencies; the last one is 1 - sum:
+            ini.freq <- rep(1/Nb.rates, Nb.rates - 1)
+            lower.freq <- rep(0, Nb.rates - 1)
+            upper.freq <- rep(1, Nb.rates - 1)
+        }
+    } else Nb.rates <- N
+## 'ini.rate' set
+
+### Setting bounds for the node ages
+
+    ## `unknown.ages' will contain the index of the nodes of unknown age:
+    unknown.ages <- 1:m + n
+
+    ## initialize vectors for all nodes:
+    lower.age <- rep(tol, m)
+    upper.age <- rep(1/tol, m)
+
+    lower.age[node - n] <- age.min
+    upper.age[node - n] <- age.max
+
+    ## find nodes known within an interval:
+    ii <- which(age.min != age.max)
+    ## drop them from 'node' since they will be estimated:
+    if (length(ii)) {
+        node <- node[-ii]
+        if (length(node))
+            age[node] <- age.min[-ii] # update 'age'
+    } else age[node] <- age.min
+
+    ## finally adjust the 3 vectors:
+    if (length(node)) {
+        unknown.ages <- unknown.ages[n - node] # 'n - node' is simplification for '-(node - n)'
+        lower.age <- lower.age[n - node]
+        upper.age <- upper.age[n - node]
+    }
+### Bounds for the node ages set
+
+    ## 'known.ages' contains the index of all nodes (internal and
+    ## terminal) of known age (kept for reference; not used below):
+    known.ages <- c(TIPS, node)
+
+    ## the bounds for the rates:
+    lower.rate <- rep(tol, Nb.rates)
+    upper.rate <- rep(100 - tol, Nb.rates) # needs to be adjusted to higher values?
+
+### Gradient
+    degree_node <- tabulate(phy$edge)
+    eta_i <- degree_node[e1]
+    eta_i[e2 <= n] <- 1L
+    ## eta_i[i] is the number of contiguous branches for branch 'i'
+
+    ## use of a list of indices is slightly faster than an incidence matrix
+    ## and takes much less memory (60 Kb vs. 8 Mb for n = 500)
+    X <- vector("list", N)
+    for (i in EDGES) {
+        j <- integer()
+        if (e1[i] != ROOT) j <- c(j, which(e2 == e1[i]))
+        if (e2[i] > n) j <- c(j, which(e1 == e2[i])) # tips have no descendant edge
+        X[[i]] <- j
+    }
+    ## X is a list whose i-th element gives the indices of the branches
+    ## that are contiguous to branch 'i'
+
+    ## D_ki and A_ki are defined in the SI of the paper
+    D_ki <- match(unknown.ages, e2)
+    A_ki <- lapply(unknown.ages, function(x) which(x == e1))
+
+    ## gradient of the Poisson log-likelihood w.r.t. the rates and the
+    ## unknown node ages:
+    gradient.poisson <- function(rate, node.time) {
+        age[unknown.ages] <- node.time
+        real.edge.length <- age[e1] - age[e2]
+
+        ## gradient for the rates:
+        gr <- el/rate - real.edge.length
+
+        ## gradient for the dates:
+        tmp <- el/real.edge.length - rate
+        gr.dates <- sapply(A_ki, function(x) sum(tmp[x])) - tmp[D_ki]
+
+        c(gr, gr.dates)
+    }
+
+    ## gradient of the penalized lik (must be multiplied by -1 before calling nlminb)
+    gradient <-
+        switch(model,
+               "correlated" =
+               function(rate, node.time) {
+                   gr <- gradient.poisson(rate, node.time)
+
+                   ## contribution of the penalty for the rates:
+                   gr[RATE] <- gr[RATE] - lambda * 2 * (eta_i * rate - sapply(X, function(x) sum(rate[x])))
+                   ## the contribution of the root variance term:
+                   if (Nbasal == 2) { # the simpler formulae if there's a basal dichotomy
+                       i <- basal[1]
+                       j <- basal[2]
+                       gr[i] <- gr[i] - lambda * (rate[i] - rate[j])
+                       gr[j] <- gr[j] - lambda * (rate[j] - rate[i])
+                   } else { # the general case
+                       ## BUG FIX: the loop body must be braced so the
+                       ## update is applied to every basal edge, not
+                       ## only after the last iteration:
+                       for (i in 1:Nbasal) {
+                           j <- basal[i]
+                           gr[j] <- gr[j] -
+                               lambda*2*(rate[j]*(1 - 1/Nbasal) - sum(rate[basal[-i]])/Nbasal)/(Nbasal - 1)
+                       }
+                   }
+                   gr
+               },
+               "relaxed" =
+               function(rate, node.time) {
+                   gr <- gradient.poisson(rate, node.time)
+
+                   ## contribution of the penalty for the rates:
+                   mean.rate <- mean(rate)
+                   ## rank(rate)/Nb.rates is the same than ecdf(rate)(rate) but faster
+                   gr[RATE] <- gr[RATE] + lambda*2*dgamma(rate, mean.rate)*(rank(rate)/Nb.rates - pgamma(rate, mean.rate))
+                   gr
+               },
+               "discrete" = NULL) # no analytical gradient for this model
+
+    ## Poisson log-likelihood of the rates and dates given the observed
+    ## branch lengths; -1e100 if some implied branch length is negative:
+    log.lik.poisson <- function(rate, node.time) {
+        age[unknown.ages] <- node.time
+        real.edge.length <- age[e1] - age[e2]
+        if (isTRUE(any(real.edge.length < 0))) return(-1e100)
+        B <- rate * real.edge.length
+        sum(el * log(B) - B - lfactorial(el))
+    }
+
+### penalized log-likelihood
+    penal.loglik <-
+        switch(model,
+               "correlated" =
+               function(rate, node.time) {
+                   loglik <- log.lik.poisson(rate, node.time)
+                   if (!is.finite(loglik)) return(-1e100)
+                   loglik - lambda * (sum((rate[ind1] - rate[ind2])^2)
+                                      + var(rate[basal]))
+               },
+               "relaxed" =
+               function(rate, node.time) {
+                   loglik <- log.lik.poisson(rate, node.time)
+                   if (!is.finite(loglik)) return(-1e100)
+                   ## penalty: distance between the empirical CDF of the
+                   ## rates and a Gamma distribution
+                   loglik - lambda * sum((1:N/N - pgamma(sort(rate), mean(rate)))^2)
+               },
+               "discrete" =
+               if (Nb.rates == 1)
+                   function(rate, node.time) log.lik.poisson(rate, node.time)
+               else function(rate, node.time, freq) {
+                   if (isTRUE(sum(freq) > 1)) return(-1e100)
+                   ## frequency-weighted overall rate:
+                   rate.freq <- sum(c(freq, 1 - sum(freq)) * rate)
+                   log.lik.poisson(rate.freq, node.time)
+               })
+
+    opt.ctrl <- list(eval.max = control$eval.max, iter.max = control$iter.max)
+
+    ## the following capitalized vectors give the indices of
+    ## the parameters once they are concatenated in 'p'
+    RATE <- 1:Nb.rates
+    AGE <- Nb.rates + 1:length(unknown.ages)
+
+    if (model == "discrete") {
+        if (Nb.rates == 1) {
+            start.para <- c(ini.rate, ini.time[unknown.ages])
+            f <- function(p) -penal.loglik(p[RATE], p[AGE])
+            g <- NULL
+            LOW <- c(lower.rate, lower.age)
+            UP <- c(upper.rate, upper.age)
+        } else {
+            FREQ <- length(RATE) + length(AGE) + 1:(Nb.rates - 1)
+            start.para <- c(ini.rate, ini.time[unknown.ages], ini.freq)
+            f <- function(p) -penal.loglik(p[RATE], p[AGE], p[FREQ])
+            g <- NULL
+            LOW <- c(lower.rate, lower.age, lower.freq)
+            UP <- c(upper.rate, upper.age, upper.freq)
+        }
+    } else {
+        start.para <- c(ini.rate, ini.time[unknown.ages])
+        f <- function(p) -penal.loglik(p[RATE], p[AGE])
+        g <- function(p) -gradient(p[RATE], p[AGE])
+        LOW <- c(lower.rate, lower.age)
+        UP <- c(upper.rate, upper.age)
+    }
+
+    k <- length(LOW) # number of free parameters
+
+    if (!quiet) cat("Fitting in progress... get a first set of estimates\n")
+
+    out <- nlminb(start.para, f, g,
+                  control = opt.ctrl, lower = LOW, upper = UP)
+
+    ## conditional objective functions for the alternating ("dual")
+    ## optimisation below: each optimises one parameter set while the
+    ## others are held at their current values:
+    if (model == "discrete") {
+        if (Nb.rates == 1) {
+            f.rates <- function(p) -penal.loglik(p, current.ages)
+            f.ages <- function(p) -penal.loglik(current.rates, p)
+        } else {
+            f.rates <- function(p) -penal.loglik(p, current.ages, current.freqs)
+            f.ages <- function(p) -penal.loglik(current.rates, p, current.freqs)
+            f.freqs <- function(p) -penal.loglik(current.rates, current.ages, p)
+            g.freqs <- NULL
+        }
+        g.rates <- NULL
+        g.ages <- NULL
+    } else {
+        f.rates <- function(p) -penal.loglik(p, current.ages)
+        g.rates <- function(p) -gradient(p, current.ages)[RATE]
+        f.ages <- function(p) -penal.loglik(current.rates, p)
+        g.ages <- function(p) -gradient(current.rates, p)[AGE]
+    }
+
+    current.ploglik <- -out$objective
+    current.rates <- out$par[RATE]
+    current.ages <- out$par[AGE]
+    if (model == "discrete" && Nb.rates > 1) current.freqs <- out$par[FREQ]
+
+    dual.iter.max <- control$dual.iter.max
+    i <- 0L
+
+    if (!quiet) cat(" Penalised log-lik =", current.ploglik, "\n")
+
+    ## alternate optimisation of rates (and frequencies) and dates
+    ## until the penalised log-likelihood stops improving:
+    repeat {
+        if (dual.iter.max < 1) break
+        if (!quiet) cat("Optimising rates...")
+        out.rates <- nlminb(current.rates, f.rates, g.rates,
+                            control = list(eval.max = 1000, iter.max = 1000,
+                                           step.min = 1e-8, step.max = .1),
+                            lower = lower.rate, upper = upper.rate)
+        new.rates <- out.rates$par
+        if (-out.rates$objective > current.ploglik)
+            current.rates <- new.rates
+
+        if (model == "discrete" && Nb.rates > 1) {
+            if (!quiet) cat(" frequencies...")
+            out.freqs <- nlminb(current.freqs, f.freqs,
+                                control = list(eval.max = 1000, iter.max = 1000,
+                                               step.min = .001, step.max = .5),
+                                lower = lower.freq, upper = upper.freq)
+            new.freqs <- out.freqs$par
+        }
+
+        if (!quiet) cat(" dates...")
+        out.ages <- nlminb(current.ages, f.ages, g.ages,
+                           control = list(eval.max = 1000, iter.max = 1000,
+                                          step.min = .001, step.max = 100),
+                           lower = lower.age, upper = upper.age)
+        new.ploglik <- -out.ages$objective
+
+        if (!quiet) cat("", current.ploglik, "\n")
+
+        if (new.ploglik - current.ploglik > 1e-6 && i <= dual.iter.max) {
+            current.ploglik <- new.ploglik
+            current.rates <- new.rates
+            current.ages <- out.ages$par
+            if (model == "discrete" && Nb.rates > 1) current.freqs <- new.freqs
+            out <- out.ages
+            i <- i + 1L
+        } else break
+    }
+
+    if (!quiet) cat("\nDone.\n")
+
+    ## compute the log-likelihood and the PHIIC (see References):
+    if (model == "discrete") {
+        ## BUG FIX: the overall rate is the frequency-weighted *sum* of
+        ## the category rates, as in penal.loglik above, not their mean:
+        rate.freq <-
+            if (Nb.rates == 1) current.rates
+            else sum(c(current.freqs, 1 - sum(current.freqs)) * current.rates)
+        logLik <- log.lik.poisson(rate.freq, current.ages)
+        PHIIC <- list(logLik = logLik, k = k, PHIIC = - 2 * logLik + 2 * k)
+    } else {
+        logLik <- log.lik.poisson(current.rates, current.ages)
+        PHI <- switch(model,
+                      "correlated" = (current.rates[ind1] - current.rates[ind2])^2 + var(current.rates[basal]),
+                      "relaxed" = (1:N/N - pgamma(sort(current.rates), mean(current.rates)))^2) # Gamma law
+        PHIIC <- list(logLik = logLik, k = k, lambda = lambda,
+                      PHIIC = - 2 * logLik + 2 * k + lambda * svd(PHI)$d)
+    }
+
+    ## return the tree with the estimated dates and fit attributes:
+    attr(phy, "call") <- match.call()
+    attr(phy, "ploglik") <- -out$objective
+    attr(phy, "rates") <- current.rates
+    if (model == "discrete" && Nb.rates > 1)
+        attr(phy, "frequencies") <- current.freqs
+    attr(phy, "message") <- out$message
+    attr(phy, "PHIIC") <- PHIIC
+    age[unknown.ages] <- current.ages
+    phy$edge.length <- age[e1] - age[e2]
+    class(phy) <- c("chronos", class(phy))
+    phy
+}
+
+print.chronos <- function(x, ...)
+{
+    ## Print method for "chronos" objects: show a header and the
+    ## original call, then fall through to the next method ("phylo")
+    ## for the usual tree summary.
+    the.call <- attr(x, "call")
+    cat("\n Chronogram\n\n")
+    cat("Call: ")
+    print(the.call)
+    cat("\n")
+    NextMethod("print")
+}
--- /dev/null
+\name{chronos}
+\alias{chronos}
+\alias{makeChronosCalib}
+\alias{chronos.control}
+\alias{print.chronos}
+\title{Molecular Dating by Penalised Likelihood and Maximum Likelihood}
+\description{
+ \code{chronos} is the main function fitting a chronogram to a
+ phylogenetic tree whose branch lengths are in numbers of
+ substitutions per site.
+
+ \code{makeChronosCalib} is a tool to prepare data frames with the
+ calibration points of the phylogenetic tree.
+
+ \code{chronos.control} creates a list of parameters to be passed
+ to \code{chronos}.
+}
+\usage{
+chronos(phy, lambda = 1, model = "correlated", quiet = FALSE,
+ calibration = makeChronosCalib(phy),
+ control = chronos.control())
+\method{print}{chronos}(x, ...)
+makeChronosCalib(phy, node = "root", age.min = 1,
+ age.max = age.min, interactive = FALSE, soft.bounds = FALSE)
+chronos.control(...)
+}
+\arguments{
+ \item{phy}{an object of class \code{"phylo"}.}
+ \item{lambda}{value of the smoothing parameter.}
+ \item{model}{a character string specifying the model of substitution
+ rate variation among branches. The possible choices are:
+ ``correlated'', ``relaxed'', ``discrete'', or an unambiguous
+ abbreviation of these.}
+ \item{quiet}{a logical value; by default the calculation progress is
+ displayed.}
+ \item{calibration}{a data frame (see details).}
+ \item{control}{a list of parameters controlling the optimisation
+ procedure (see details).}
+ \item{x}{an object of class \code{c("chronos", "phylo")}.}
+ \item{node}{a vector of integers giving the node numbers for which a
+ calibration point is given. The default is a short-cut for the
+ root.}
+ \item{age.min, age.max}{vectors of numerical values giving the minimum
+ and maximum ages of the nodes specified in \code{node}.}
+ \item{interactive}{a logical value. If \code{TRUE}, then \code{phy} is
+ plotted and the user is asked to click close to a node and enter the
+ ages on the keyboard.}
+ \item{soft.bounds}{a logical value specifying whether the calibration
+ bounds are soft (currently unused).}
+ \item{\dots}{in the case of \code{chronos.control}: one of the five
+ parameters controlling optimisation (unused in the case of
+ \code{print.chronos}).}
+}
+\details{
+ \code{chronos} replaces \code{chronopl} but with a different interface
+ and some extensions (see References).
+
+ The known dates (argument \code{calibration}) must be given in a data
+ frame with the following column names: node, age.min, age.max, and
+ soft.bounds (the last one is yet unused). For each row, these are,
+ respectively: the number of the node in the ``phylo'' coding standard,
+ the minimum age for this node, the maximum age, and a logical value
+ specifying whether the bounds are soft. If age.min = age.max, this
+ means that the age is exactly known. This data frame can be built with
+ \code{makeChronosCalib} which returns by default a data frame with a
+ single row giving age = 1 for the root. The data frame can be built
+ interactively by clicking on the plotted tree.
+
+ The argument \code{control} allows one to change some parameters of
+ the optimisation procedure. This must be a list with names. The
+ available options with their default values are:
+
+ \describe{
+ \item{tol = 1e-8: }{tolerance for the estimation of the substitution
+ rates.}
+ \item{iter.max = 1e4: }{the maximum number of iterations at each
+ optimization step.}
+ \item{eval.max = 1e4: }{the maximum number of function evaluations at
+ each optimization step.}
+ \item{nb.rate.cat = 10: }{the number of rate categories if \code{model
+ = "discrete"} (set this parameter to 1 to fit a strict clock
+ model).}
+ \item{dual.iter.max = 20: }{the maximum number of alternative
+ iterations between rates and dates.}
+ }
+
+ The command \code{chronos.control()} returns a list with the default
+ values of these parameters. They may be modified by passing them to
+ this function, or directly in the list.
+}
+\value{
+ \code{chronos} returns an object of class \code{c("chronos",
+ "phylo")}. There is a print method for it. There are additional
+ attributes which can be visualised with \code{str} or extracted with
+ \code{attr}.
+
+ \code{makeChronosCalib} returns a data frame.
+
+ \code{chronos.control} returns a list.
+}
+\references{
+ Kim, J. and Sanderson, M. J. (2008) Penalized likelihood phylogenetic
+ inference: bridging the parsimony-likelihood gap. \emph{Systematic
+ Biology}, \bold{57}, 665--674.
+
+ Paradis, E. (2012) Molecular dating of phylogenies by likelihood
+ methods: a comparison of models and a new information
+ criterion. \emph{Manuscript}.
+
+ Sanderson, M. J. (2002) Estimating absolute rates of molecular
+ evolution and divergence times: a penalized likelihood
+ approach. \emph{Molecular Biology and Evolution}, \bold{19},
+ 101--109.
+}
+\author{Emmanuel Paradis}
+\seealso{
+ \code{\link{chronoMPL}}
+}
+\examples{
+tr <- rtree(10)
+### the default is the correlated rate model:
+chr <- chronos(tr)
+### strict clock model:
+ctrl <- chronos.control(nb.rate.cat = 1)
+chr.clock <- chronos(tr, model = "discrete", control = ctrl)
+### How different are the rates?
+attr(chr, "rates")
+attr(chr.clock, "rates")
+\dontrun{
+cal <- makeChronosCalib(tr, interactive = TRUE)
+cal
+### if you made mistakes, you can edit the data frame with:
+### fix(cal)
+chr <- chronos(tr, calibration = cal)
+}
+}
+\keyword{models}