1 ## chronos.R (2013-01-03)
3 ## Molecular Dating With Penalized and Maximum Likelihood
5 ## Copyright 2013 Emmanuel Paradis
7 ## This file is part of the R-package `ape'.
8 ## See the file ../COPYING for licensing issues.
## List of control values; presumably the defaults consumed by
## chronos.control() -- the assignment target and the end of this list() call
## are not visible in this listing (the line ends with a trailing comma).
## Visible entries: 'tol', 'iter.max', 'eval.max' and 'nb.rate.cat'.
## NOTE(review): 'dual.iter.max' is read from the control list by the fitting
## code but is not among the visible entries; presumably it sits on the hidden
## continuation line -- confirm.
11 list(tol = 1e-8, iter.max = 1e4, eval.max = 1e4, nb.rate.cat = 10,
## Build a table of node age constraints (calibrations).
## NOTE(review): the line naming this function is not visible in this listing;
## it is presumably 'makeChronosCalib', which is used as a default argument
## elsewhere in this file -- confirm.
##
## Arguments:
##   phy          tree; passed to identify() in the interactive branch.
##   node         node number(s) to constrain, or "root" (the default).
##   age.min      youngest age of each constrained node.
##   age.max      oldest age of each constrained node; defaults to age.min.
##   interactive  presumably selects the click-and-type branch below; the
##                test itself is not visible -- confirm.
##   soft.bounds  copied unchanged into the 'soft.bounds' column of the result.
## The last visible expression is a data.frame with columns node, age.min,
## age.max and soft.bounds.
15 function(phy, node = "root", age.min = 1, age.max = age.min,
16 interactive = FALSE, soft.bounds = FALSE)
## --- interactive input: the user clicks nodes and types their ages ---
21 cat("Click close to a node and enter the ages (right-click to exit)\n\n")
## start from empty vectors; one element is appended per selected node
23 age.min <- age.max <- numeric()
## a NULL result from identify() ends the input loop
25 ans <- identify(phy, quiet = TRUE)
26 if (is.null(ans)) break
## highlight the selected node ('NODE' is assigned on a line not visible here)
28 nodelabels(node = NODE, col = "white", bg = "blue")
29 cat("constraints for node ", NODE, sep = "")
30 cat("\n youngest age: ")
## read one line of input and convert it to a number
31 AGE.MIN <- as.numeric(readLines(n = 1))
32 cat(" oldest age (ENTER if not applicable): ")
## an empty line converts to NA
33 AGE.MAX <- as.numeric(readLines(n = 1))
35 age.min <- c(age.min, AGE.MIN)
36 age.max <- c(age.max, AGE.MAX)
## 's' is defined on a line not visible here; presumably it flags the nodes
## with a missing oldest age, which then get age.max = age.min -- confirm
39 if (any(s)) age.max[s] <- age.min[s]
## "root" is translated to node number n + 1 ('n' is set on a hidden line;
## presumably the number of tips -- confirm)
41 if (identical(node, "root")) node <- n + 1L
## the condition guarding this error is not visible
45 stop("node numbers should be greater than the number of tips")
## reject constraints whose oldest age is smaller than the youngest age;
## the message lists every offending node number
47 diff.age <- which(age.max < age.min)
48 if (length(diff.age)) {
49 msg <- "'old age' less than 'young age' for node"
50 if (length(diff.age) > 1) msg <- paste(msg, "s", sep = "")
51 stop(paste(msg, paste(node[diff.age], collapse = ", ")))
## one row per constrained node
54 data.frame(node, age.min, age.max, soft.bounds = soft.bounds)
## Build the list of control parameters from the arguments given in '...'.
## 'dots' and 'x' are created on lines not visible in this listing; presumably
## 'dots' holds the supplied arguments and 'x' the default control list --
## confirm.
57 chronos.control <- function(...)
## TRUE for each supplied name that is also a name of 'x'
62 chk.nms <- names(dots) %in% names(x)
## the condition and the step that discards the unmatched names are not visible
64 warning("some control parameter names do not match: they were ignored")
## overwrite the corresponding entries of 'x' with the supplied values
67 x[names(dots)] <- dots
## Main dating routine. The line naming this function is not visible in this
## listing; presumably 'chronos', the class given to the result -- confirm.
##
## Arguments, as far as the visible lines show:
##   phy          tree; its 'edge' and 'Nnode' components are read further
##                down and 'edge.length' is overwritten in the result.
##   lambda       multiplier of the rate penalty in the penalised
##                log-likelihood.
##   model        one of "correlated", "relaxed" or "discrete"; lower-cased
##                before being matched.
##   quiet        if TRUE, the progress messages written with cat() are
##                suppressed.
##   calibration  table with columns 'node', 'age.min' and 'age.max'.
##   control      list of control parameters; 'eval.max', 'iter.max',
##                'nb.rate.cat' and 'dual.iter.max' are read in the visible code.
73 function(phy, lambda = 1, model = "correlated", quiet = FALSE,
74 calibration = makeChronosCalib(phy),
75 control = chronos.control())
77 model <- match.arg(tolower(model), c("correlated", "relaxed", "discrete"))
## 'el' is defined on a line not visible here; presumably the branch lengths
## of 'phy' -- confirm
82 if (any(el < 0)) stop("some branch lengths are negative")
## unpack the calibration table into plain vectors
91 node <- calibration$node
92 age.min <- calibration$age.min
93 age.max <- calibration$age.max
## Index vectors used by the penalty of the "correlated" model.
## 'e1', 'e2', 'ROOT' and 'EDGES' are defined on lines not visible here;
## presumably the parent and child node of each edge, the root node number and
## the vector of edge indices -- confirm.
95 if (model == "correlated") {
96 ### `basal' contains the indices of the basal edges
97 ### (ie, linked to the root):
98 basal <- which(e1 == ROOT)
99 Nbasal <- length(basal)
101 ### 'ind1' contains the index of all nonbasal edges, and 'ind2' the
102 ### index of the edges where these edges come from (ie, they contain
103 ### pairs of contiguous edges), eg:
## (the diagram below is only partly visible in this listing)
105 ### ___b___ ind1 ind2
107 ### ___a___| | b || a |
111 ind1 <- EDGES[-basal]
## for each non-basal edge, the position in 'e2' of its parent node, ie the
## edge that ends where this edge starts
112 ind2 <- match(e1[EDGES[-basal]], e2)
## Starting values for the node dates ('ini.time') and the rates ('ini.rate').
## 'n' and 'm' are set on lines not visible here; presumably the numbers of
## tips and of internal nodes -- confirm.
## one slot per node, initialised to 0
115 age <- numeric(n + m)
117 ### This bit sets 'ini.time' and should result in no negative branch lengths
119 if (!quiet) cat("\nSetting initial dates...\n")
## compiled helper from 'ape'; presumably returns, for each tip, the node
## numbers on the path from the root to that tip -- confirm
120 seq.nod <- .Call("seq_root2tip", phy$edge, n, phy$Nnode, PACKAGE = "ape")
## mark every internal node as not yet dated
125 ini.time[ROOT:(n + m)] <- NA
## value given to the calibrated nodes (the assignment target is not visible):
## the minimum age if there is no maximum, else a uniform draw in
## [age.min, age.max]
128 if (is.null(age.max)) age.min
129 else runif(length(node), age.min, age.max) # (age.min + age.max) / 2
131 ## if no age given for the root, find one approximately:
132 if (is.na(ini.time[ROOT]))
133 ini.time[ROOT] <- if (is.null(age.max)) 3 * max(age.min) else 3 * max(age.max)
## number of already dated nodes in each element of 'seq.nod'; the elements
## with the most dated nodes are processed first
135 ISnotNA.ALL <- unlist(lapply(seq.nod, function(x) sum(!is.na(ini.time[x]))))
136 o <- order(ISnotNA.ALL, decreasing = TRUE)
## fill each run of undated nodes with equally spaced dates between the dated
## nodes on either side of it
138 for (y in seq.nod[o]) {
139 ISNA <- is.na(ini.time[y])
141 i <- 2L # we know the 1st value is not NA, so we start at the 2nd one
142 while (i <= length(y)) {
143 if (ISNA[i]) { # we stop at the next NA
## 'j' is initialised on a line not visible here
145 while (ISNA[j]) j <- j + 1L # look for the next non-NA
## 'nb.val' is set on a hidden line; presumably the number of undated nodes
## in the run, i.e. j - i -- confirm
147 by <- (ini.time[y[i - 1L]] - ini.time[y[j]]) / (nb.val + 1)
148 ini.time[y[i:(j - 1L)]] <- ini.time[y[i - 1L]] - by * seq_len(nb.val)
## accept the dates once no edge has a parent younger than its child
154 if (all(ini.time[e1] - ini.time[e2] >= 0)) break
## the retry loop and its counter are not visible; the message gives the limit
157 stop("cannot find reasonable starting dates after 1000 tries:
158 maybe you need to adjust the calibration dates")
162 #ini.time[ROOT:(n+m)] <- branching.times(chr.dis)
163 ## ini.time[ROOT:(n+m)] <- ini.time[ROOT:(n+m)] + rnorm(m, 0, 5)
167 ### Setting 'ini.rate'
## branch length divided by the initial duration of the branch
168 ini.rate <- el/(ini.time[e1] - ini.time[e2])
## Starting values specific to the "discrete" model: 'Nb.rates' rate
## categories and, when there are several, their frequencies.
170 if (model == "discrete") {
171 Nb.rates <- control$nb.rate.cat
172 minmax <- range(ini.rate)
## Two alternative initialisations follow; the condition choosing between them
## is not visible (presumably Nb.rates == 1 for the first -- confirm).
## single value: the midpoint of the range of the per-branch initial rates
174 ini.rate <- sum(minmax)/2
## several values: the range is cut into Nb.rates bins of width 'inc' and the
## bin midpoints are used
176 inc <- diff(minmax)/Nb.rates
177 ini.rate <- seq(minmax[1] + inc/2, minmax[2] - inc/2, inc)
## only Nb.rates - 1 frequencies are parameters; the likelihood code further
## down takes the last one as 1 - sum(freq)
178 ini.freq <- rep(1/Nb.rates, Nb.rates - 1)
179 lower.freq <- rep(0, Nb.rates - 1)
180 upper.freq <- rep(1, Nb.rates - 1)
185 ### Setting bounds for the node ages
## ('tol' is set on a line not visible here; presumably control$tol -- confirm)
187 ## `unknown.ages' will contain the index of the nodes of unknown age:
188 unknown.ages <- 1:m + n
190 ## initialize vectors for all nodes:
191 lower.age <- rep(tol, m)
192 upper.age <- rep(1/tol, m)
## 'node - n' turns a node number into a position among the m internal nodes
194 lower.age[node - n] <- age.min
195 upper.age[node - n] <- age.max
197 ## find nodes known within an interval:
198 ii <- which(age.min != age.max)
199 ## drop them from 'node' since they will be estimated:
## (the statements doing this, and the test on 'ii', are not visible)
203 age[node] <- age.min[-ii] # update 'age'
204 } else age[node] <- age.min
206 ## finally adjust the 3 vectors:
## negative indices: the nodes left in 'node' are removed from the vectors
208 unknown.ages <- unknown.ages[n - node] # 'n - node' is a simplification of '-(node - n)'
209 lower.age <- lower.age[n - node]
210 upper.age <- upper.age[n - node]
212 ### Bounds for the node ages set
214 ## 'known.ages' contains the index of all nodes
215 ## (internal and terminal) of known age:
216 known.ages <- c(TIPS, node)
218 ## the bounds for the rates:
## NOTE(review): 'Nb.rates' is visibly assigned only in the "discrete" branch;
## presumably a hidden line sets it for the other models -- confirm
219 lower.rate <- rep(tol, Nb.rates)
220 upper.rate <- rep(100 - tol, Nb.rates) # needs to be adjusted to higher values?
## Structures describing which branches touch each other; they are used by the
## gradient functions defined below.
## number of times each node number occurs in the edge matrix
223 degree_node <- tabulate(phy$edge)
## that count, taken at the parent node of each edge
224 eta_i <- degree_node[e1]
226 ## eta_i[i] is the number of contiguous branches for branch 'i'
228 ## use of a list of indices is slightly faster than an incidence matrix
229 ## and takes much less memory (60 Kb vs. 8 Mb for n = 500)
## ('N' is set on a line not visible here; presumably the number of edges --
## confirm. The loop header and the initial value of 'j' are not visible.)
230 X <- vector("list", N)
## add the edge that ends at the parent node of edge 'i', unless that is the root
233 if (e1[i] != ROOT) j <- c(j, which(e2 == e1[i]))
## add the edges that start from the child node of edge 'i'
## NOTE(review): '>= n' also lets tip number n through; this is harmless only
## if no edge starts from a tip -- confirm
234 if (e2[i] >= n) j <- c(j, which(e1 == e2[i]))
237 ## X is a list whose i-th element gives the indices of the branches
238 ## that are contiguous to branch 'i'
240 ## D_ki and A_ki are defined in the SI of the paper
## D_ki: for each node of unknown age, the edge ending at that node;
## A_ki: for each node of unknown age, the edges starting from that node
241 D_ki <- match(unknown.ages, e2)
242 A_ki <- lapply(unknown.ages, function(x) which(x == e1))
## Gradient of the unpenalised log-likelihood computed by log.lik.poisson().
##   rate       rate for each branch (or one shared value, which is recycled)
##   node.time  dates of the nodes listed in 'unknown.ages'
## The statement returning the result is not visible; presumably 'gr' followed
## by 'gr.dates' -- confirm.
244 gradient.poisson <- function(rate, node.time) {
## local copy of 'age' with the unknown dates filled in
245 age[unknown.ages] <- node.time
## duration of each branch: age of the parent node minus age of the child node
246 real.edge.length <- age[e1] - age[e2]
247 #if (any(real.edge.length < 0))
248 # return(numeric(N + length(unknown.ages)))
249 ## gradient for the rates:
250 gr <- el/rate - real.edge.length
252 ## gradient for the dates:
## 'tmp' is the derivative with respect to the duration of each branch. A
## node's date adds to the duration of the branches starting from it (A_ki)
## and subtracts from the duration of the branch ending at it (D_ki).
253 tmp <- el/real.edge.length - rate
254 gr.dates <- sapply(A_ki, function(x) sum(tmp[x])) - tmp[D_ki]
259 ## gradient of the penalized lik (must be multiplied by -1 before calling nlminb)
## Two alternative definitions follow. The binding (presumably 'gradient') and
## the condition selecting between them are not visible in this listing; the
## first uses the structures built for the "correlated" model and the second
## the gamma distribution used by the "relaxed" penalty -- confirm.
263 function(rate, node.time) {
264 gr <- gradient.poisson(rate, node.time)
265 #if (all(gr == 0)) return(gr)
267 ## contribution of the penalty for the rates:
## ('RATE' is set on a line not visible here; presumably the positions of the
## rate parameters -- confirm)
268 gr[RATE] <- gr[RATE] - lambda * 2 * (eta_i * rate - sapply(X, function(x) sum(rate[x])))
269 ## the contribution of the root variance term:
270 if (Nbasal == 2) { # the simpler formulae if there's a basal dichotomy
## 'i' and 'j' are set on lines not visible here; presumably the two basal
## edges -- confirm
273 gr[i] <- gr[i] - lambda * (rate[i] - rate[j])
274 gr[j] <- gr[j] - lambda * (rate[j] - rate[i])
275 } else { # the general case
## the loop over the basal edges and the left-hand side of this expression are
## not visible
279 lambda*2*(rate[j]*(1 - 1/Nbasal) - sum(rate[basal[-i]])/Nbasal)/(Nbasal - 1)
284 function(rate, node.time) {
285 gr <- gradient.poisson(rate, node.time)
286 #if (all(gr == 0)) return(gr)
288 ## contribution of the penalty for the rates:
289 mean.rate <- mean(rate)
290 ## rank(rate)/Nb.rates is the same as ecdf(rate)(rate) but faster
291 gr[RATE] <- gr[RATE] + lambda*2*dgamma(rate, mean.rate)*(rank(rate)/Nb.rates - pgamma(rate, mean.rate))
## Unpenalised log-likelihood: each branch length 'el' is treated as a Poisson
## count with mean rate * duration.
##   rate       rate for each branch (or one shared value, which is recycled)
##   node.time  dates of the nodes listed in 'unknown.ages'
296 log.lik.poisson <- function(rate, node.time) {
## local copy of 'age' with the unknown dates filled in
297 age[unknown.ages] <- node.time
298 real.edge.length <- age[e1] - age[e2]
## a parent younger than its child is impossible: return a very low value
## instead of failing; isTRUE() makes an NA test result count as "no"
299 if (isTRUE(any(real.edge.length < 0))) return(-1e100)
## expected value for each branch
300 B <- rate * real.edge.length
## sum of the Poisson log-probabilities
301 sum(el * log(B) - B - lfactorial(el))
304 ### penalized log-likelihood
## Several alternative definitions follow. The binding (presumably
## 'penal.loglik', the name called by the optimiser wrappers further down) and
## the conditions selecting between them are not visible in this listing.
## Every variant returns -1e100 instead of a non-finite or invalid value.
##
## variant using 'ind1'/'ind2' (built for the "correlated" model): the penalty
## includes the squared rate differences between contiguous branches; the end
## of the expression is not visible
308 function(rate, node.time) {
309 loglik <- log.lik.poisson(rate, node.time)
310 if (!is.finite(loglik)) return(-1e100)
311 loglik - lambda * (sum((rate[ind1] - rate[ind2])^2)
## variant comparing the sorted rates with a gamma distribution (compare the
## "relaxed" entry of the switch further down): the penalty is the squared
## distance between 1:N/N and the gamma distribution function
315 function(rate, node.time) {
316 loglik <- log.lik.poisson(rate, node.time)
317 if (!is.finite(loglik)) return(-1e100)
319 ## loglik - lambda * sum((1:N/N - pbeta(sort(rate), mu/(1 + mu), 1))^2) # with the beta distribution
320 ## loglik - lambda * sum((1:N/N - pcauchy(sort(rate)))^2) # with the Cauchy distribution
321 loglik - lambda * sum((1:N/N - pgamma(sort(rate), mean(rate)))^2) # with the gamma distribution
## variants without a penalty; presumably for the "discrete" model, the first
## one when there is a single rate category -- confirm
325 function(rate, node.time) log.lik.poisson(rate, node.time)
326 else function(rate, node.time, freq) {
## 'freq' holds all frequencies but the last; their sum cannot exceed 1
327 if (isTRUE(sum(freq) > 1)) return(-1e100)
## one rate for the whole tree: the frequency-weighted sum of the category
## rates, the last frequency being 1 - sum(freq)
328 rate.freq <- sum(c(freq, 1 - sum(freq)) * rate)
329 log.lik.poisson(rate.freq, node.time)
## Layout of the parameter vector 'p', starting values, bounds, and the first
## joint fit of all parameters.
332 opt.ctrl <- list(eval.max = control$eval.max, iter.max = control$iter.max)
334 ## the following capitalized vectors give the indices of
335 ## the parameters once they are concatenated in 'p'
## ('RATE' is set on a line not visible here)
## NOTE(review): 1:length(x) gives c(1, 0) when 'unknown.ages' is empty;
## seq_along() would be safer -- confirm whether that case can occur
337 AGE <- Nb.rates + 1:length(unknown.ages)
339 if (model == "discrete") {
## The two parameter sets below are alternatives; the condition between them is
## not visible (presumably Nb.rates == 1 for the first -- confirm). No
## assignment of 'g' is visible for either.
## rates and dates only
341 start.para <- c(ini.rate, ini.time[unknown.ages])
342 f <- function(p) -penal.loglik(p[RATE], p[AGE])
344 LOW <- c(lower.rate, lower.age)
345 UP <- c(upper.rate, upper.age)
## rates, dates and the Nb.rates - 1 free frequencies
## NOTE(review): 1:(Nb.rates - 1) counts down when Nb.rates is 1, so this
## must only run with at least 2 categories -- confirm
347 FREQ <- length(RATE) + length(AGE) + 1:(Nb.rates - 1)
348 start.para <- c(ini.rate, ini.time[unknown.ages], ini.freq)
349 f <- function(p) -penal.loglik(p[RATE], p[AGE], p[FREQ])
351 LOW <- c(lower.rate, lower.age, lower.freq)
352 UP <- c(upper.rate, upper.age, upper.freq)
## other models (the 'else' line is not visible): an analytical gradient is
## supplied; 'f' and 'g' are negated because nlminb() minimises
355 start.para <- c(ini.rate, ini.time[unknown.ages])
356 f <- function(p) -penal.loglik(p[RATE], p[AGE])
357 g <- function(p) -gradient(p[RATE], p[AGE])
358 LOW <- c(lower.rate, lower.age)
359 UP <- c(upper.rate, upper.age)
362 k <- length(LOW) # number of free parameters
364 if (!quiet) cat("Fitting in progress... get a first set of estimates\n")
## joint optimisation of all parameters within the box [LOW, UP]
366 out <- nlminb(start.para, f, g,
367 control = opt.ctrl, lower = LOW, upper = UP)
## Objective functions for the alternating optimisation: each one varies a
## single block of parameters and reads the others from the 'current.*'
## variables, which are looked up when the function is called.
369 if (model == "discrete") {
## The two sets below are alternatives; the condition between them is not
## visible (presumably Nb.rates == 1 for the first -- confirm). No gradient
## function is visibly defined for the "discrete" model.
371 f.rates <- function(p) -penal.loglik(p, current.ages)
372 f.ages <- function(p) -penal.loglik(current.rates, p)
374 f.rates <- function(p) -penal.loglik(p, current.ages, current.freqs)
375 f.ages <- function(p) -penal.loglik(current.rates, p, current.freqs)
376 f.freqs <- function(p) -penal.loglik(current.rates, current.ages, p)
## other models: objectives plus the matching slices of the full gradient
382 f.rates <- function(p) -penal.loglik(p, current.ages)
383 g.rates <- function(p) -gradient(p, current.ages)[RATE]
384 f.ages <- function(p) -penal.loglik(current.rates, p)
385 g.ages <- function(p) -gradient(current.rates, p)[AGE]
## start the alternating optimisation from the result of the first joint fit;
## the sign is flipped back because the objective was negated
388 current.ploglik <- -out$objective
389 current.rates <- out$par[RATE]
390 current.ages <- out$par[AGE]
391 if (model == "discrete" && Nb.rates > 1) current.freqs <- out$par[FREQ]
## Alternating optimisation: rates, then (for the "discrete" model with several
## categories) frequencies, then dates. The loop header and its counter 'i' are
## not visible in this listing.
393 dual.iter.max <- control$dual.iter.max
396 if (!quiet) cat(" Penalised log-lik =", current.ploglik, "\n")
## dual.iter.max < 1 skips the alternating optimisation
399 if (dual.iter.max < 1) break
400 if (!quiet) cat("Optimising rates...")
## NOTE(review): 'g.rates' and 'g.ages' are visibly defined only for the
## non-discrete models; presumably hidden lines set them for "discrete" --
## confirm
401 out.rates <- nlminb(current.rates, f.rates, g.rates,# h.rates,
402 control = list(eval.max = 1000, iter.max = 1000,
403 step.min = 1e-8, step.max = .1),
404 lower = lower.rate, upper = upper.rate)
405 new.rates <- out.rates$par
## use the new rates in the following steps only if they improve the fit
406 if (-out.rates$objective > current.ploglik)
407 current.rates <- new.rates
409 if (model == "discrete" && Nb.rates > 1) {
410 if (!quiet) cat(" frequencies...")
## no gradient function is passed for the frequencies
411 out.freqs <- nlminb(current.freqs, f.freqs,
412 control = list(eval.max = 1000, iter.max = 1000,
413 step.min = .001, step.max = .5),
414 lower = lower.freq, upper = upper.freq)
415 new.freqs <- out.freqs$par
418 if (!quiet) cat(" dates...")
419 out.ages <- nlminb(current.ages, f.ages, g.ages,# h.ages,
420 control = list(eval.max = 1000, iter.max = 1000,
421 step.min = .001, step.max = 100),
422 lower = lower.age, upper = upper.age)
423 new.ploglik <- -out.ages$objective
## NOTE(review): in the visible code 'current.ploglik' is only updated below,
## so this prints the value from before this iteration -- confirm that is
## intended
425 if (!quiet) cat("", current.ploglik, "\n")
## keep the new estimates only if the gain exceeds 1e-6 and the iteration
## limit is not reached; what happens otherwise is not visible
427 if (new.ploglik - current.ploglik > 1e-6 && i <= dual.iter.max) {
428 current.ploglik <- new.ploglik
429 current.rates <- new.rates
430 current.ages <- out.ages$par
431 if (model == "discrete" && Nb.rates > 1) current.freqs <- new.freqs
437 if (!quiet) cat("\nDone.\n")
## Information criterion stored with the result: a list with the unpenalised
## log-likelihood, the number of free parameters 'k' and the criterion itself.
441 if (model == "discrete") {
## value assigned to 'rate.freq' (the left-hand side is not visible)
443 if (Nb.rates == 1) current.rates
## NOTE(review): this uses mean() where the discrete penal.loglik() uses sum()
## for the frequency-weighted rate, so the log-likelihood computed here uses a
## rate divided by Nb.rates compared with the one that was optimised -- confirm
444 else mean(c(current.freqs, 1 - sum(current.freqs)) * current.rates)
445 logLik <- log.lik.poisson(rate.freq, current.ages)
446 PHIIC <- list(logLik = logLik, k = k, PHIIC = - 2 * logLik + 2 * k)
## other models (the 'else' line is not visible)
448 logLik <- log.lik.poisson(current.rates, current.ages)
## per-branch penalty terms; presumably assigned to 'PHI' through a switch on
## 'model' whose opening line is not visible -- confirm
450 "correlated" = (current.rates[ind1] - current.rates[ind2])^2 + var(current.rates[basal]),
451 "relaxed" = (1:N/N - pgamma(sort(current.rates), mean(current.rates)))^2) # with the gamma distribution
## NOTE(review): if 'PHI' is a plain vector, svd(PHI)$d is a single value, its
## Euclidean norm -- confirm that this is the intended penalty term
452 PHIIC <- list(logLik = logLik, k = k, lambda = lambda,
453 PHIIC = - 2 * logLik + 2 * k + lambda * svd(PHI)$d)
## Store the results as attributes of the tree, replace its branch lengths by
## the estimated durations and add the class "chronos". The statement returning
## 'phy' is not visible in this listing.
456 attr(phy, "call") <- match.call()
## NOTE(review): in the visible code 'out' is the result of the first joint
## fit, so this value may differ from 'current.ploglik' after the alternating
## optimisation -- confirm. The same applies to "message" below.
457 attr(phy, "ploglik") <- -out$objective
458 attr(phy, "rates") <- current.rates #out$par[EDGES]
459 if (model == "discrete" && Nb.rates > 1)
460 attr(phy, "frequencies") <- current.freqs
461 attr(phy, "message") <- out$message
462 attr(phy, "PHIIC") <- PHIIC
## final dates of the nodes of unknown age
463 age[unknown.ages] <- current.ages #out$par[-EDGES]
## each branch length becomes the age of the parent node minus the age of the
## child node
464 phy$edge.length <- age[e1] - age[e2]
465 class(phy) <- c("chronos", class(phy))
## Print method for objects of class "chronos". The visible lines write a
## "Chronogram" heading and print the call stored in the "call" attribute; the
## rest of the body is not visible in this listing.
##   x    object to print
##   ...  not used in the visible lines
469 print.chronos <- function(x, ...)
471 cat("\n Chronogram\n\n")
473 print(attr(x, "call"))