new chronos files and a bunch of various improvements

author paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>

Fri, 4 Jan 2013 12:48:57 +0000 (12:48 +0000)

committer paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>

Fri, 4 Jan 2013 12:48:57 +0000 (12:48 +0000)
author paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>
Fri, 4 Jan 2013 12:48:57 +0000 (12:48 +0000)
committer paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>
Fri, 4 Jan 2013 12:48:57 +0000 (12:48 +0000)
diff --git a/DESCRIPTION b/DESCRIPTION

index e84ea2eea6493cb7ea21757b928a0a1bdbeb5aab..8d17b0b0a986314bb2d46b6ecb8cf5c74df2ba5d 100644 (file)
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
  Package: ape
  Version: 3.0-7
-Date: 2012-12-27
+Date: 2013-01-03
  Title: Analyses of Phylogenetics and Evolution
  Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne
  Maintainer: Emmanuel Paradis <Emmanuel.Paradis@ird.fr>
diff --git a/NEWS b/NEWS

index 57d1924cccdeffb9e41b43d9a2934a04ebe6aa68..1f2cd220f396a79f3bc7b53b25555fa3bf2a683c 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,8 @@
  
  NEW FEATURES
  
+      The new function chronos ..........
+
      o The new function 'where' searches patterns in DNA sequences.
  
      o pic() gains an option 'rescaled.tree = FALSE' to return the tree
@@ -34,6 +36,10 @@ OTHER CHANGES
  
      o base.freq() is now faster with lists.
  
+      as.matrix.DNAbin() ............ should be faster.... now accepts vectors,
+
+      read.dna() has a new for FASTA files............. faster C code.
+
  
  
                 CHANGES IN APE VERSION 3.0-6
diff --git a/R/DNA.R b/R/DNA.R

index d1b3625f0da247fe8da35377d3762a81575add2b..48e605484f877964815eef05fe055bbe51a69587 100644 (file)
--- a/R/DNA.R
+++ b/R/DNA.R
@@ -1,8 +1,8 @@
-## DNA.R (2012-12-27)
+## DNA.R (2013-01-04)
  
  ##   Manipulations and Comparisons of DNA Sequences
  
-## Copyright 2002-2012 Emmanuel Paradis
+## Copyright 2002-2013 Emmanuel Paradis
  
  ## This file is part of the R-package `ape'.
  ## See the file ../COPYING for licensing issues.
@@ -65,17 +65,21 @@ as.alignment <- function(x)
  
  as.matrix.DNAbin <- function(x, ...)
  {
-    if (is.list(x)) {
-        if (length(unique(unlist(lapply(x, length)))) != 1)
-          stop("DNA sequences in list not of the same length.")
-        nms <- names(x)
-        n <- length(x)
-        s <- length(x[[1]])
-        x <- matrix(unlist(x), n, s, byrow = TRUE)
-        rownames(x) <- nms
-        class(x) <- "DNAbin"
+    if (is.matrix(x)) return(x)
+    if (is.vector(x)) {
+        dim(x) <- c(1, length(x))
+        return(x)
      }
-    x
+    s <- unique(unlist(lapply(x, length)))
+    if (length(s) != 1)
+        stop("DNA sequences in list not of the same length.")
+    nms <- names(x)
+    n <- length(x)
+    y <- matrix(as.raw(0), n, s)
+    for (i in seq_len(n)) y[i, ] <- x[[i]]
+    rownames(y) <- nms
+    class(y) <- "DNAbin"
+    y
  }
  
  as.list.DNAbin <- function(x, ...)
@@ -392,7 +396,10 @@ dist.dna <- function(x, model = "K80", variance = FALSE, gamma = FALSE,
      Ndist <- if (imod == 11) n*n else n*(n - 1)/2
      var <- if (variance) double(Ndist) else 0
      if (!gamma) gamma <- alpha <- 0
-    else alpha <- gamma <- 1
+    else {
+        alpha <- gamma
+        gamma <- 1
+    }
      d <- .C("dist_dna", x, as.integer(n), as.integer(s), imod,
              double(Ndist), BF, as.integer(pairwise.deletion),
              as.integer(variance), var, as.integer(gamma),
diff --git a/R/chronos.R b/R/chronos.R

new file mode 100644 (file)

index 0000000..e651822
--- /dev/null
+++ b/R/chronos.R
@@ -0,0 +1,476 @@
+## chronos.R (2013-01-03)
+
+##   Molecular Dating With Penalized and Maximum Likelihood
+
+## Copyright 2013 Emmanuel Paradis
+
+## This file is part of the R-package `ape'.
+## See the file ../COPYING for licensing issues.
+
+.chronos.ctrl <-
+    list(tol = 1e-8, iter.max = 1e4, eval.max = 1e4, nb.rate.cat = 10,
+         dual.iter.max = 20)
+
+makeChronosCalib <-
+    function(phy, node = "root", age.min = 1, age.max = age.min,
+             interactive = FALSE, soft.bounds = FALSE)
+{
+    n <- Ntip(phy)
+    if (interactive) {
+        plot(phy)
+        cat("Click close to a node and enter the ages (right-click to exit)\n\n")
+        node <- integer()
+        age.min <- age.max <- numeric()
+        repeat {
+            ans <- identify(phy, quiet = TRUE)
+            if (is.null(ans)) break
+            NODE <- ans$nodes
+            nodelabels(node = NODE, col = "white", bg = "blue")
+            cat("constraints for node ", NODE, sep = "")
+            cat("\n  youngest age: ")
+            AGE.MIN <- as.numeric(readLines(n = 1))
+            cat("  oldest age (ENTER if not applicable): ")
+            AGE.MAX <- as.numeric(readLines(n = 1))
+            node <- c(node, NODE)
+            age.min <- c(age.min, AGE.MIN)
+            age.max <- c(age.max, AGE.MAX)
+        }
+        s <- is.na(age.max)
+        if (any(s)) age.max[s] <- age.min[s]
+    } else {
+        if (identical(node, "root")) node <- n + 1L
+    }
+
+    if (any(node <= n))
+        stop("node numbers should be greater than the number of tips")
+
+    diff.age <- which(age.max < age.min)
+    if (length(diff.age)) {
+        msg <- "'old age' less than 'young age' for node"
+        if (length(diff.age) > 1) msg <- paste(msg, "s", sep = "")
+        stop(paste(msg, paste(node[diff.age], collapse = ", ")))
+    }
+
+    data.frame(node, age.min, age.max, soft.bounds = soft.bounds)
+}
+
+chronos.control <- function(...)
+{
+    dots <- list(...)
+    x <- .chronos.ctrl
+    if (length(dots)) {
+        chk.nms <- names(dots) %in% names(x)
+        if (any(!chk.nms)) {
+            warning("some control parameter names do not match: they were ignored")
+            dots <- dots[chk.nms]
+        }
+        x[names(dots)] <- dots
+    }
+    x
+}
+
+chronos <-
+    function(phy, lambda = 1, model = "correlated", quiet = FALSE,
+             calibration = makeChronosCalib(phy),
+             control = chronos.control())
+{
+    model <- match.arg(tolower(model), c("correlated", "relaxed", "discrete"))
+    n <- Ntip(phy)
+    ROOT <- n + 1L
+    m <- phy$Nnode
+    el <- phy$edge.length
+    if (any(el < 0)) stop("some branch lengths are negative")
+    e1 <- phy$edge[, 1L]
+    e2 <- phy$edge[, 2L]
+    N <- length(e1)
+    TIPS <- 1:n
+    EDGES <- 1:N
+
+    tol <- control$tol
+
+    node <- calibration$node
+    age.min <- calibration$age.min
+    age.max <- calibration$age.max
+
+    if (model == "correlated") {
+### `basal' contains the indices of the basal edges
+### (ie, linked to the root):
+        basal <- which(e1 == ROOT)
+        Nbasal <- length(basal)
+
+### 'ind1' contains the index of all nonbasal edges, and 'ind2' the
+### index of the edges where these edges come from (ie, they contain
+### pairs of contiguous edges), eg:
+
+###         ___b___    ind1  ind2
+###        |           |   ||   |
+### ___a___|           | b || a |
+###        |           | c || a |
+###        |___c___    |   ||   |
+
+        ind1 <- EDGES[-basal]
+        ind2 <- match(e1[EDGES[-basal]], e2)
+    }
+
+    age <- numeric(n + m)
+
+### This bit sets 'ini.time' and should result in no negative branch lengths
+
+    if (!quiet) cat("\nSetting initial dates...\n")
+    seq.nod <- .Call("seq_root2tip", phy$edge, n, phy$Nnode, PACKAGE = "ape")
+
+    ii <- 1L
+    repeat {
+        ini.time <- age
+        ini.time[ROOT:(n + m)] <- NA
+
+        ini.time[node] <-
+            if (is.null(age.max)) age.min
+            else runif(length(node), age.min, age.max) # (age.min + age.max) / 2
+
+        ## if no age given for the root, find one approximately:
+        if (is.na(ini.time[ROOT]))
+            ini.time[ROOT] <- if (is.null(age.max)) 3 * max(age.min) else 3 * max(age.max)
+
+        ISnotNA.ALL <- unlist(lapply(seq.nod, function(x) sum(!is.na(ini.time[x]))))
+        o <- order(ISnotNA.ALL, decreasing = TRUE)
+
+        for (y in seq.nod[o]) {
+            ISNA <- is.na(ini.time[y])
+            if (any(ISNA)) {
+                i <- 2L # we know the 1st value is not NA, so we start at the 2nd one
+                while (i <= length(y)) {
+                    if (ISNA[i]) { # we stop at the next NA
+                        j <- i + 1L
+                        while (ISNA[j]) j <- j + 1L # look for the next non-NA
+                        nb.val <- j - i
+                        by <- (ini.time[y[i - 1L]] - ini.time[y[j]]) / (nb.val + 1)
+                        ini.time[y[i:(j - 1L)]] <- ini.time[y[i - 1L]] - by * seq_len(nb.val)
+                        i <- j + 1L
+                    } else i <- i + 1L
+                }
+            }
+        }
+        if (all(ini.time[e1] - ini.time[e2] >= 0)) break
+        ii <- ii + 1L
+        if (ii > 1000)
+            stop("cannot find reasonable starting dates after 1000 tries:
+maybe you need to adjust the calibration dates")
+    }
+### 'ini.time' set
+
+    #ini.time[ROOT:(n+m)] <- branching.times(chr.dis)
+    ## ini.time[ROOT:(n+m)] <- ini.time[ROOT:(n+m)] + rnorm(m, 0, 5)
+    #print(ini.time)
+
+
+### Setting 'ini.rate'
+    ini.rate <- el/(ini.time[e1] - ini.time[e2])
+
+    if (model == "discrete") {
+        Nb.rates <- control$nb.rate.cat
+        minmax <- range(ini.rate)
+        if (Nb.rates == 1) {
+            ini.rate <- sum(minmax)/2
+        } else {
+            inc <- diff(minmax)/Nb.rates
+            ini.rate <- seq(minmax[1] + inc/2, minmax[2] - inc/2, inc)
+            ini.freq <- rep(1/Nb.rates, Nb.rates - 1)
+            lower.freq <- rep(0, Nb.rates - 1)
+            upper.freq <- rep(1, Nb.rates - 1)
+        }
+    } else Nb.rates <- N
+## 'ini.rate' set
+
+### Setting bounds for the node ages
+
+    ## `unknown.ages' will contain the index of the nodes of unknown age:
+    unknown.ages <- 1:m + n
+
+    ## initialize vectors for all nodes:
+    lower.age <- rep(tol, m)
+    upper.age <- rep(1/tol, m)
+
+    lower.age[node - n] <- age.min
+    upper.age[node - n] <- age.max
+
+    ## find nodes known within an interval:
+    ii <- which(age.min != age.max)
+    ## drop them from 'node' since they will be estimated:
+    if (length(ii)) {
+        node <- node[-ii]
+        if (length(node))
+            age[node] <- age.min[-ii] # update 'age'
+    } else age[node] <- age.min
+
+    ## finally adjust the 3 vectors:
+    if (length(node)) {
+        unknown.ages <- unknown.ages[n - node] # 'n - node' is simplification for '-(node - n)'
+        lower.age <- lower.age[n - node]
+        upper.age <- upper.age[n - node]
+    }
+### Bounds for the node ages set
+
+    ## 'known.ages' contains the index of all nodes
+    ## (internal and terminal) of known age:
+    known.ages <- c(TIPS, node)
+
+    ## the bounds for the rates:
+    lower.rate <- rep(tol, Nb.rates)
+    upper.rate <- rep(100 - tol, Nb.rates) # needs to be adjusted to higher values?
+
+### Gradient
+    degree_node <- tabulate(phy$edge)
+    eta_i <- degree_node[e1]
+    eta_i[e2 <= n] <- 1L
+    ## eta_i[i] is the number of contiguous branches for branch 'i'
+
+    ## use of a list of indices is slightly faster than an incidence matrix
+    ## and takes much less memory (60 Kb vs. 8 Mb for n = 500)
+    X <- vector("list", N)
+    for (i in EDGES) {
+        j <- integer()
+        if (e1[i] != ROOT) j <- c(j, which(e2 == e1[i]))
+        if (e2[i] >= n) j <- c(j, which(e1 == e2[i]))
+        X[[i]] <- j
+    }
+    ## X is a list whose i-th element gives the indices of the branches
+    ## that are contiguous to branch 'i'
+
+    ## D_ki and A_ki are defined in the SI of the paper
+    D_ki <- match(unknown.ages, e2)
+    A_ki <- lapply(unknown.ages, function(x) which(x == e1))
+
+    gradient.poisson <- function(rate, node.time) {
+        age[unknown.ages] <- node.time
+        real.edge.length <- age[e1] - age[e2]
+        #if (any(real.edge.length < 0))
+        #    return(numeric(N + length(unknown.ages)))
+        ## gradient for the rates:
+        gr <- el/rate - real.edge.length
+
+        ## gradient for the dates:
+        tmp <- el/real.edge.length - rate
+        gr.dates <- sapply(A_ki, function(x) sum(tmp[x])) - tmp[D_ki]
+
+        c(gr, gr.dates)
+    }
+
+    ## gradient of the penalized lik (must be multiplied by -1 before calling nlminb)
+    gradient <-
+        switch(model,
+               "correlated" =
+               function(rate, node.time) {
+                   gr <- gradient.poisson(rate, node.time)
+                   #if (all(gr == 0)) return(gr)
+
+                   ## contribution of the penalty for the rates:
+                   gr[RATE] <- gr[RATE] - lambda * 2 * (eta_i * rate - sapply(X, function(x) sum(rate[x])))
+                   ## the contribution of the root variance term:
+                   if (Nbasal == 2) { # the simpler formulae if there's a basal dichotomy
+                       i <- basal[1]
+                       j <- basal[2]
+                       gr[i] <- gr[i] - lambda * (rate[i] - rate[j])
+                       gr[j] <- gr[j] - lambda * (rate[j] - rate[i])
+                   } else { # the general case
+                       for (i in 1:Nbasal)
+                           j <- basal[i]
+                           gr[j] <- gr[j] -
+                               lambda*2*(rate[j]*(1 - 1/Nbasal) - sum(rate[basal[-i]])/Nbasal)/(Nbasal - 1)
+                   }
+                   gr
+               },
+               "relaxed" =
+               function(rate, node.time) {
+                   gr <- gradient.poisson(rate, node.time)
+                   #if (all(gr == 0)) return(gr)
+
+                   ## contribution of the penalty for the rates:
+                   mean.rate <- mean(rate)
+                   ## rank(rate)/Nb.rates is the same than ecdf(rate)(rate) but faster
+                   gr[RATE] <- gr[RATE] + lambda*2*dgamma(rate, mean.rate)*(rank(rate)/Nb.rates - pgamma(rate, mean.rate))
+                   gr
+               },
+               "discrete" = NULL)
+
+    log.lik.poisson <- function(rate, node.time) {
+        age[unknown.ages] <- node.time
+        real.edge.length <- age[e1] - age[e2]
+        if (isTRUE(any(real.edge.length < 0))) return(-1e100)
+        B <- rate * real.edge.length
+        sum(el * log(B) - B - lfactorial(el))
+    }
+
+### penalized log-likelihood
+    penal.loglik <-
+        switch(model,
+               "correlated" =
+               function(rate, node.time) {
+                   loglik <- log.lik.poisson(rate, node.time)
+                   if (!is.finite(loglik)) return(-1e100)
+                   loglik - lambda * (sum((rate[ind1] - rate[ind2])^2)
+                                      + var(rate[basal]))
+               },
+               "relaxed" =
+               function(rate, node.time) {
+                   loglik <- log.lik.poisson(rate, node.time)
+                   if (!is.finite(loglik)) return(-1e100)
+                   mu <- mean(rate)
+                   ## loglik - lambda * sum((1:N/N - pbeta(sort(rate), mu/(1 + mu), 1))^2) # avec loi beta
+                   ## loglik - lambda * sum((1:N/N - pcauchy(sort(rate)))^2) # avec loi Cauchy
+                   loglik - lambda * sum((1:N/N - pgamma(sort(rate), mean(rate)))^2) # avec loi Gamma
+               },
+               "discrete" =
+               if (Nb.rates == 1)
+                   function(rate, node.time) log.lik.poisson(rate, node.time)
+               else function(rate, node.time, freq) {
+                   if (isTRUE(sum(freq) > 1)) return(-1e100)
+                   rate.freq <- sum(c(freq, 1 - sum(freq)) * rate)
+                   log.lik.poisson(rate.freq, node.time)
+               })
+
+    opt.ctrl <- list(eval.max = control$eval.max, iter.max = control$iter.max)
+
+    ## the following capitalized vectors give the indices of
+    ## the parameters once they are concatenated in 'p'
+    RATE <- 1:Nb.rates
+    AGE <- Nb.rates + 1:length(unknown.ages)
+
+    if (model == "discrete") {
+        if (Nb.rates == 1) {
+            start.para <- c(ini.rate, ini.time[unknown.ages])
+            f <- function(p) -penal.loglik(p[RATE], p[AGE])
+            g <- NULL
+            LOW <- c(lower.rate, lower.age)
+            UP <- c(upper.rate, upper.age)
+        } else {
+            FREQ <- length(RATE) + length(AGE) + 1:(Nb.rates - 1)
+            start.para <- c(ini.rate, ini.time[unknown.ages], ini.freq)
+            f <- function(p) -penal.loglik(p[RATE], p[AGE], p[FREQ])
+            g <- NULL
+            LOW <- c(lower.rate, lower.age, lower.freq)
+            UP <- c(upper.rate, upper.age, upper.freq)
+        }
+    } else {
+        start.para <- c(ini.rate, ini.time[unknown.ages])
+        f <- function(p) -penal.loglik(p[RATE], p[AGE])
+        g <- function(p) -gradient(p[RATE], p[AGE])
+        LOW <- c(lower.rate, lower.age)
+        UP <- c(upper.rate, upper.age)
+    }
+
+    k <- length(LOW) # number of free parameters
+
+    if (!quiet) cat("Fitting in progress... get a first set of estimates\n")
+
+    out <- nlminb(start.para, f, g,
+                  control = opt.ctrl, lower = LOW, upper = UP)
+
+    if (model == "discrete") {
+        if (Nb.rates == 1) {
+            f.rates <- function(p) -penal.loglik(p, current.ages)
+            f.ages <- function(p) -penal.loglik(current.rates, p)
+        } else {
+            f.rates <- function(p) -penal.loglik(p, current.ages, current.freqs)
+            f.ages <- function(p) -penal.loglik(current.rates, p, current.freqs)
+            f.freqs <- function(p) -penal.loglik(current.rates, current.ages, p)
+            g.freqs <- NULL
+        }
+        g.rates <- NULL
+        g.ages <- NULL
+    } else {
+        f.rates <- function(p) -penal.loglik(p, current.ages)
+        g.rates <- function(p) -gradient(p, current.ages)[RATE]
+        f.ages <- function(p) -penal.loglik(current.rates, p)
+        g.ages <- function(p) -gradient(current.rates, p)[AGE]
+    }
+
+    current.ploglik <- -out$objective
+    current.rates <- out$par[RATE]
+    current.ages <- out$par[AGE]
+    if (model == "discrete" && Nb.rates > 1) current.freqs <- out$par[FREQ]
+
+    dual.iter.max <- control$dual.iter.max
+    i <- 0L
+
+    if (!quiet) cat("         Penalised log-lik =", current.ploglik, "\n")
+
+    repeat {
+        if (dual.iter.max < 1) break
+        if (!quiet) cat("Optimising rates...")
+        out.rates <- nlminb(current.rates, f.rates, g.rates,# h.rates,
+                            control = list(eval.max = 1000, iter.max = 1000,
+                                           step.min = 1e-8, step.max = .1),
+                            lower = lower.rate, upper = upper.rate)
+        new.rates <- out.rates$par
+        if (-out.rates$objective > current.ploglik)
+            current.rates <- new.rates
+
+        if (model == "discrete" && Nb.rates > 1) {
+            if (!quiet) cat(" frequencies...")
+            out.freqs <- nlminb(current.freqs, f.freqs,
+                                control = list(eval.max = 1000, iter.max = 1000,
+                                               step.min = .001, step.max = .5),
+                                lower = lower.freq, upper = upper.freq)
+            new.freqs <- out.freqs$par
+        }
+
+        if (!quiet) cat(" dates...")
+        out.ages <- nlminb(current.ages, f.ages, g.ages,# h.ages,
+                           control = list(eval.max = 1000, iter.max = 1000,
+                                          step.min = .001, step.max = 100),
+                           lower = lower.age, upper = upper.age)
+        new.ploglik <- -out.ages$objective
+
+        if (!quiet) cat("", current.ploglik, "\n")
+
+        if (new.ploglik - current.ploglik > 1e-6 && i <= dual.iter.max) {
+            current.ploglik <- new.ploglik
+            current.rates <- new.rates
+            current.ages <- out.ages$par
+            if (model == "discrete" && Nb.rates > 1) current.freqs <- new.freqs
+            out <- out.ages
+            i <- i + 1L
+        } else break
+    }
+
+    if (!quiet) cat("\nDone.\n")
+
+#    browser()
+
+    if (model == "discrete") {
+        rate.freq <-
+            if (Nb.rates == 1) current.rates
+            else mean(c(current.freqs, 1 - sum(current.freqs)) * current.rates)
+        logLik <- log.lik.poisson(rate.freq, current.ages)
+        PHIIC <- list(logLik = logLik, k = k, PHIIC = - 2 * logLik + 2 * k)
+    } else {
+        logLik <- log.lik.poisson(current.rates, current.ages)
+        PHI <- switch(model,
+                      "correlated" = (current.rates[ind1] - current.rates[ind2])^2 + var(current.rates[basal]),
+                      "relaxed" = (1:N/N - pgamma(sort(current.rates), mean(current.rates)))^2) # avec loi Gamma
+        PHIIC <- list(logLik = logLik, k = k, lambda = lambda,
+                      PHIIC = - 2 * logLik + 2 * k + lambda * svd(PHI)$d)
+    }
+
+    attr(phy, "call") <- match.call()
+    attr(phy, "ploglik") <- -out$objective
+    attr(phy, "rates") <- current.rates #out$par[EDGES]
+    if (model == "discrete" && Nb.rates > 1)
+        attr(phy, "frequencies") <- current.freqs
+    attr(phy, "message") <- out$message
+    attr(phy, "PHIIC") <- PHIIC
+    age[unknown.ages] <- current.ages #out$par[-EDGES]
+    phy$edge.length <- age[e1] - age[e2]
+    class(phy) <- c("chronos", class(phy))
+    phy
+}
+
+print.chronos <- function(x, ...)
+{
+    cat("\n    Chronogram\n\n")
+    cat("Call: ")
+    print(attr(x, "call"))
+    cat("\n")
+    NextMethod("print")
+}
diff --git a/R/read.dna.R b/R/read.dna.R

index 9e46cba67bbc81b5eed7624232e5c8185519e5c2..4b2ddd8e2a65dd7b664071251fd17824320a160e 100644 (file)
--- a/R/read.dna.R
+++ b/R/read.dna.R
@@ -1,4 +1,4 @@
-## read.dna.R (2012-12-27)
+## read.dna.R (2013-01-04)
  
  ##   Read DNA Sequences in a File
  
@@ -47,7 +47,6 @@ read.dna <- function(file, format = "interleaved", skip = 0,
      } else {
          X <- scan(file = file, what = "", sep = "\n", quiet = TRUE,
                    skip = skip, nlines = nlines, comment.char = comment.char)
-
          if (format %in% formats[1:2]) {
              ## need to remove the possible leading spaces and/or tabs in the first line
              fl <- gsub("^[[:blank:]]+", "", X[1])
@@ -106,9 +105,10 @@ read.dna <- function(file, format = "interleaved", skip = 0,
                     for (i in 2:n)
                         obj[i, ] <- getNucleotide(X[seq(i, nl, n + 1)])
                 })
-
+    }
      if (format != "fasta") {
          rownames(obj) <- taxa
+        if (!as.character) obj <- as.DNAbin(obj)
      } else {
          LENGTHS <- unique(unlist(lapply(obj, length)))
          allSameLength <- length(LENGTHS) == 1
@@ -119,10 +119,15 @@ read.dna <- function(file, format = "interleaved", skip = 0,
              as.matrix <- allSameLength
          }
          if (as.matrix) {
-            obj <- matrix(unlist(obj), ncol = LENGTHS, byrow = TRUE)
+            taxa <- names(obj)
+            n <- length(obj)
+            y <- matrix(as.raw(0), n, LENGTHS)
+            for (i in seq_len(n)) y[i, ] <- obj[[i]]
+            obj <- y
              rownames(obj) <- taxa
+            class(obj) <- "DNAbin"
          }
+        if (as.character) obj <- as.character(obj)
      }
-    if (!as.character) obj <- as.DNAbin(obj)
      obj
  }
diff --git a/man/chronopl.Rd b/man/chronopl.Rd

index 13f183fbb35ced0abc013a0f4183680873653225..339a2849012249ecea4b0a4135d20cba504181a6 100644 (file)
--- a/man/chronopl.Rd
+++ b/man/chronopl.Rd
@@ -101,6 +101,10 @@ chronopl(phy, lambda, age.min = 1, age.max = NULL,
    \item{D2}{the influence of each observation on overall date
      estimates (if \code{CV = TRUE}).}
  }
+\note{
+  The new function \code{\link{chronos}} replaces the present one which
+  is no more maintained.
+}
  \references{
    Sanderson, M. J. (2002) Estimating absolute rates of molecular
    evolution and divergence times: a penalized likelihood
@@ -109,6 +113,6 @@ chronopl(phy, lambda, age.min = 1, age.max = NULL,
  }
  \author{Emmanuel Paradis}
  \seealso{
-  \code{\link{chronoMPL}}
+  \code{\link{chronos}}, \code{\link{chronoMPL}}
  }
  \keyword{models}
diff --git a/man/chronos.Rd b/man/chronos.Rd

new file mode 100644 (file)

index 0000000..22c15f1
--- /dev/null
+++ b/man/chronos.Rd
@@ -0,0 +1,136 @@
+\name{chronos}
+\alias{chronos}
+\alias{makeChronosCalib}
+\alias{chronos.control}
+\alias{print.chronos}
+\title{Molecular Dating by Penalised Likelihood and Maximum Likelihood}
+\description{
+  \code{chronos} is the main function fitting a chronogram to a
+  phylogenetic tree whose branch lengths are in number of substitution
+  per sites.
+
+  \code{makeChronosCalib} is a tool to prepare data frames with the
+  calibration points of the phylogenetic tree.
+
+  \code{chronos.control} creates a list of parameters to be passed
+  to \code{chronos}.
+}
+\usage{
+chronos(phy, lambda = 1, model = "correlated", quiet = FALSE,
+        calibration = makeChronosCalib(phy),
+        control = chronos.control())
+\method{print}{chronos}(x, ...)
+makeChronosCalib(phy, node = "root", age.min = 1,
+   age.max = age.min, interactive = FALSE, soft.bounds = FALSE)
+chronos.control(...)
+}
+\arguments{
+  \item{phy}{an object of class \code{"phylo"}.}
+  \item{lambda}{value of the smoothing parameter.}
+  \item{model}{a character string specifying the model of substitution
+    rate variation among branches. The possible choices are:
+    ``correlated'', ``relaxed'', ``discrete'', or an unambiguous
+    abbreviation of these.}
+  \item{quiet}{a logical value; by default the calculation progress are
+    displayed.}
+  \item{calibration}{a data frame (see details).}
+  \item{control}{a list of parameters controlling the optimisation
+    procedure (see details).}
+  \item{x}{an object of class \code{c("chronos", "phylo")}.}
+  \item{node}{a vector of integers giving the node numbers for which a
+    calibration point is given. The default is a short-cut for the
+    root.}
+  \item{age.min, age.max}{vectors of numerical values giving the minimum
+    and maximum ages of the nodes specified in \code{node}.}
+  \item{interactive}{a logical value. If \code{TRUE}, then \code{phy} is
+    plotted and the user is asked to click close to a node and enter the
+    ages on the keyboard.}
+  \item{soft.bounds}{(currently unused)}
+  \item{\dots}{in the case of \code{chronos.control}: one of the five
+    parameters controlling optimisation (unused in the case of
+    \code{print.chronos}).}
+}
+\details{
+  \code{chronos} replaces \code{chronopl} but with a different interface
+  and some extensions (see References).
+
+  The known dates (argument \code{calibration}) must be given in a data
+  frame with the following column names: node, age.min, age.max, and
+  soft.bounds (the last one is yet unused). For each row, these are,
+  respectively: the number of the node in the ``phylo'' coding standard,
+  the minimum age for this node, the maximum age, and a logical value
+  specifying whether the bounds are soft. If age.min = age.max, this
+  means that the age is exactly known. This data frame can be built with
+  \code{makeChronosCalib} which returns by default a data frame with a
+  single row giving age = 1 for the root. The data frame can be built
+  interactively by clicking on the plotted tree.
+
+  The argument \code{control} allows one to change some parameters of
+  the optimisation procedure. This must be a list with names. The
+  available options with their default values are:
+
+  \itemize{
+    \item{tol = 1e-8: }{tolerance for the estimation of the substitution
+      rates.}
+    \item{iter.max = 1e4: }{the maximum number of iterations at each
+      optimization step.}
+    \item{eval.max = 1e4: }{the maximum number of function evaluations at
+      each optimization step.}
+    \item{nb.rate.cat = 10: }{the number of rate categories if \code{model
+       = "discrete"} (set this parameter to 1 to fit a strict clock
+      model).}
+    \item{dual.iter.max = 20: }{the maximum number of alternative
+      iterations between rates and dates.}
+  }
+
+  The command \code{chronos.control()} returns a list with the default
+  values of these parameters. They may be modified by passing them to
+  this function, or directly in the list.
+}
+\value{
+  \code{chronos} returns an object of class \code{c("chronos",
+  "phylo")}. There is a print method for it. There are additional
+  attributes which can be visualised with \code{str} or extracted with
+  \code{attr}.
+
+  \code{makeChronosCalib} returns a data frame.
+
+  \code{chronos.control} returns a list.
+}
+\references{
+  Kim, J. and Sanderson, M. J. (2008) Penalized likelihood phylogenetic
+  inference: bridging the parsimony-likelihood gap. \emph{Systematic
+    Biology}, \bold{57}, 665--674.
+
+  Paradis, E. (2012) Molecular dating of phylogenies by likelihood
+  methods: a comparison of models and a new information
+  criterion. \emph{Manuscript}.
+
+  Sanderson, M. J. (2002) Estimating absolute rates of molecular
+  evolution and divergence times: a penalized likelihood
+  approach. \emph{Molecular Biology and Evolution}, \bold{19},
+  101--109.
+}
+\author{Emmanuel Paradis}
+\seealso{
+  \code{\link{chronoMPL}}
+}
+\examples{
+tr <- rtree(10)
+### the default is the correlated rate model:
+chr <- chronos(tr)
+### strict clock model:
+ctrl <- chronos.control(nb.rate.cat = 1)
+chr.clock <- chronos(tr, model = "discrete", control = ctrl)
+### How different are the rates?
+attr(chr, "rates")
+attr(chr.clock, "rates")
+\dontrun{
+cal <- makeChronosCalib(tr, interactive = TRUE)
+cal
+### if you made mistakes, you can edit the data frame with:
+### fix(cal)
+chr <- chronos(tr, calibration = cal)
+}
+}
+\keyword{models}
diff --git a/man/plot.phylo.Rd b/man/plot.phylo.Rd

index 6d08946052aa33925df254d2116ccb35f729c204..371266945e3a489ffb2ee6f6b3b8749c9305c8e8 100644 (file)
--- a/man/plot.phylo.Rd
+++ b/man/plot.phylo.Rd
@@ -97,12 +97,12 @@
    \item{rotate.tree}{for "fan", "unrooted", or "radial" trees: the
      rotation of the whole tree in degrees (negative values are
      accepted).}
-  \item{layout}{the number of trees to be plotted simultaneously.}
-  \item{\dots}{further arguments to be passed to \code{plot} or to
-    \code{plot.phylo}.}
    \item{open.angle}{if \code{type = "f"}, the angle in degrees left
      blank. Use a non-zero value if you want to call
      \code{\link{axisPhylo}} after the tree is plotted.}
+  \item{layout}{the number of trees to be plotted simultaneously.}
+  \item{\dots}{further arguments to be passed to \code{plot} or to
+    \code{plot.phylo}.}
  }
  \description{
    These functions plot phylogenetic trees on the current graphical
diff --git a/man/read.dna.Rd b/man/read.dna.Rd

index 19bab40bf68733b013ca7d6b2fa893632d6a9e1e..4cf664ea68b2292797bae09f7c6934827d5fd45a 100644 (file)
--- a/man/read.dna.Rd
+++ b/man/read.dna.Rd
@@ -2,6 +2,13 @@
  \alias{read.dna}
  \alias{read.FASTA}
  \title{Read DNA Sequences in a File}
+\description{
+  These functions read DNA sequences in a file, and returns a matrix or a
+  list of DNA sequences with the names of the taxa read in the file as
+  rownames or names, respectively. By default, the sequences are stored
+  in binary format, otherwise (if \code{as.character = "TRUE"}) in lower
+  case.
+}
  \usage{
  read.dna(file, format = "interleaved", skip = 0,
           nlines = 0, comment.char = "#",
@@ -16,11 +23,11 @@ read.FASTA(file)
      \code{"sequential"}, \code{"clustal"}, or \code{"fasta"}, or any
      unambiguous abbreviation of these.}
    \item{skip}{the number of lines of the input file to skip before
-    beginning to read data.}
+    beginning to read data (ignored for FASTA files; see below).}
    \item{nlines}{the number of lines to be read (by default the file is
-    read untill its end).}
+    read untill its end; ignored for FASTA files)).}
    \item{comment.char}{a single character, the remaining of the line
-    after this character is ignored.}
+    after this character is ignored (ignored for FASTA files).}
    \item{as.character}{a logical controlling whether to return the
      sequences as an object of class \code{"DNAbin"} (the default).}
    \item{as.matrix}{(used if \code{format = "fasta"}) one of the three
@@ -30,15 +37,8 @@ read.FASTA(file)
      are of different lengths; (iii) \code{FALSE}: always returns the
      sequences in a list.}
  }
-\description{
-  This function reads DNA sequences in a file, and returns a matrix or a
-  list of DNA sequences with the names of the taxa read in the file as
-  rownames or names, respectively. By default, the sequences are stored
-  in binary format, otherwise (if \code{as.character = "TRUE"}) in lower
-  case.
-}
  \details{
-  This function follows the interleaved and sequential formats defined
+  \code{read.dna} follows the interleaved and sequential formats defined
    in PHYLIP (Felsenstein, 1993) but with the original feature than there
    is no restriction on the lengths of the taxa names. For these two
    formats, the first line of the file must contain the dimensions of the
@@ -72,9 +72,9 @@ read.FASTA(file)
  
    \item{FASTA:}{This looks like the sequential format but the taxa names
      (or rather a description of the sequence) are on separate lines
-    beginning with a `greater than' character ``>'' (there may be
+    beginning with a `greater than' character `>' (there may be
      leading spaces before this character). These lines are taken as taxa
-    names after removing the ``>'' and the possible leading and trailing
+    names after removing the `>' and the possible leading and trailing
      spaces. All the data in the file before the first sequence is ignored.}
  }}
  \value{
@@ -82,7 +82,7 @@ read.FASTA(file)
    stored in binary format, or of mode character (if \code{as.character =
      "TRUE"}).
  
-  \code{read.FASTA} always returns a list.
+  \code{read.FASTA} always returns a list of class \code{"DNAbin"}.
  }
  \references{
    Anonymous. FASTA format description.
diff --git a/src/read_dna.c b/src/read_dna.c

index 6d44c6665603d19c0bcd0f6c8aea2409c94e671b..01a4db32c5d2b1de517d2f659496a73c1d1b9c13 100644 (file)
--- a/src/read_dna.c
+++ b/src/read_dna.c
@@ -1,6 +1,6 @@
-/* read_dna.c       2012-12-27 */
+/* read_dna.c       2013-01-04 */
  
-/* Copyright 2008-2012 Emmanuel Paradis */
+/* Copyright 2013 Emmanuel Paradis */
  
  /* This file is part of the R-package `ape'. */
  /* See the file ../COPYING for licensing issues. */
@@ -81,7 +81,7 @@ static const unsigned char lineFeed = 0x0a;
  
  SEXP rawStreamToDNAbin(SEXP x)
  {
-       int N, i, j, k, n;
+       int N, i, j, k, n, startOfSeq;
         unsigned char *xr, *rseq, *buffer, tmp;
         SEXP obj, nms, seq;
  
@@ -89,15 +89,25 @@ SEXP rawStreamToDNAbin(SEXP x)
         N = LENGTH(x);
         xr = RAW(x);
  
-/* do a 1st pass to find the number of sequences */
+/* do a 1st pass to find the number of sequences
  
-       n = 0;
-       j = 0; /* use j as a flag */
-       for (i = 0; i < N; i++) {
+   this code should be robust to '>' present inside
+   a label or in the header text before the sequences */
+
+       n = j = 0; /* use j as a flag */
+       if (xr[0] == hook) {
+               j = 1;
+               startOfSeq = 0;
+       }
+       i = 1;
+       for (i = 1; i < N; i++) {
                 if (j && xr[i] == lineFeed) {
                         n++;
                         j = 0;
-               } else if (xr[i] == hook) j = 1;
+               } else if (xr[i] == hook) {
+                       if (!n) startOfSeq = i;
+                       j = 1;
+               }
         }
  
         PROTECT(obj = allocVector(VECSXP, n));
@@ -106,16 +116,15 @@ SEXP rawStreamToDNAbin(SEXP x)
  /* Refine the way the size of the buffer is set? */
         buffer = (unsigned char *)R_alloc(N, sizeof(unsigned char *));
  
-       i = j = 0;
+       i = startOfSeq;
+       j = 0; /* gives the index of the sequence */
         while (i < N) {
                 /* 1st read the label... */
-               if (xr[i] == hook) {
-                       i++;
-                       k = 0;
-                       while (xr[i] != lineFeed) buffer[k++] = xr[i++];
-                       buffer[k] = '\0';
-                       SET_STRING_ELT(nms, j, mkChar(buffer));
-               }
+               i++;
+               k = 0;
+               while (xr[i] != lineFeed) buffer[k++] = xr[i++];
+               buffer[k] = '\0';
+               SET_STRING_ELT(nms, j, mkChar(buffer));
                 /* ... then read the sequence */
                 n = 0;
                 while (xr[i] != hook && i < N) {
@@ -125,7 +134,7 @@ SEXP rawStreamToDNAbin(SEXP x)
     then the following check would not be needed; additionally the size
     of tab_trans could be restriced to 0-121. This check has the
     advantage that all invalid characters are simply ignored without
-   causing error. */
+   causing error -- except if '>' occurs in the middle of a sequence. */
                         if (tmp) buffer[n++] = tmp;
                 }
                 PROTECT(seq = allocVector(RAWSXP, n));
author	paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>
	Fri, 4 Jan 2013 12:48:57 +0000 (12:48 +0000)
committer	paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>
	Fri, 4 Jan 2013 12:48:57 +0000 (12:48 +0000)
DESCRIPTION		patch \| blob \| history
NEWS		patch \| blob \| history
R/DNA.R		patch \| blob \| history
R/chronos.R	[new file with mode: 0644]	patch \| blob
R/read.dna.R		patch \| blob \| history
man/chronopl.Rd		patch \| blob \| history
man/chronos.Rd	[new file with mode: 0644]	patch \| blob
man/plot.phylo.Rd		patch \| blob \| history
man/read.dna.Rd		patch \| blob \| history
src/read_dna.c		patch \| blob \| history