From b9e04a6e6af3beda74b916eda00b42ac38875563 Mon Sep 17 00:00:00 2001 From: paradis Date: Tue, 2 Oct 2012 04:00:29 +0000 Subject: [PATCH 01/16] some updates for ape 3.0-6 git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@196 6e262413-ae40-0410-9e79-b911bd7a66b7 --- NEWS | 17 +++++++++++++-- R/CDF.birth.death.R | 11 ++++++---- R/DNA.R | 16 +++++++------- R/dbd.R | 6 +---- R/evonet.R | 6 ++--- R/ladderize.R | 4 ++-- R/me.R | 22 +++++++++++-------- R/pic.R | 6 ++--- R/plot.phylo.R | 53 ++++++++++++++++++++++++++++++++++++++++++++- R/read.nexus.R | 4 +++- R/read.tree.R | 5 +++-- R/rtree.R | 3 ++- man/ape-internal.Rd | 2 -- man/dbd.Rd | 9 ++++---- man/node.depth.Rd | 22 ++++++++++++++----- man/rTraitCont.Rd | 2 +- src/plot_phylo.c | 21 +++--------------- 17 files changed, 138 insertions(+), 71 deletions(-) diff --git a/NEWS b/NEWS index 46af03f..729e3ed 100644 --- a/NEWS +++ b/NEWS @@ -5,13 +5,18 @@ NEW FEATURES o reorder.phylo() has a new order, "postorder", and a new option index.only = TRUE to return only the vector of indices (the tree - is unmodified; see ?reorder.phylo for details). + is unmodified, see ?reorder.phylo for details). + + o The three new functions node.depth.length, node.height, and + node.height.clado make some internal code available from R. See + ?node.depth (which was already available and documented) for + details. BUG FIXES o reorder(, "pruningwise") made R crash if the rows of the edge - matrix are in random order. + matrix are in random order: this is now fixed. OTHER CHANGES @@ -22,6 +27,14 @@ OTHER CHANGES visible for small trees (n < 1000) but this can be more than 1000 faster for big trees (n >= 1e4). + o The attribute "order" of the objects of class "phylo" is now + strongly recommended, though not mandatory. Most functions in + ape should return a tree with this attribute correctly set. + + o dbd() is now vectorized on both arguments 'x' (number of species + in clade) and 't' (clade age) to make likelihood calculations + easier and faster. + CHANGES IN APE VERSION 3.0-5 diff --git a/R/CDF.birth.death.R b/R/CDF.birth.death.R index 847c37d..7ba5f5e 100644 --- a/R/CDF.birth.death.R +++ b/R/CDF.birth.death.R @@ -1,9 +1,9 @@ -## CDF.birth.death.R (2010-09-27) +## CDF.birth.death.R (2012-09-14) -## Functions to simulate and fit +## Functions to Simulate and Fit ## Time-Dependent Birth-Death Models -## Copyright 2010 Emmanuel Paradis +## Copyright 2010-2012 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -158,7 +158,9 @@ CDF.birth.death <- if (!case %in% c(1, 3, 6)) Pi <- Vectorize(Pi) - denom <- if (fast) integrateTrapeze(Pi, 0, Tmax) else integrate(Pi, 0, Tmax)$value + denom <- + if (fast) integrateTrapeze(Pi, 0, Tmax) + else integrate(Pi, 0, Tmax)$value n <- length(x) p <- numeric(n) if (fast) { @@ -178,6 +180,7 @@ CDF.birth.death <- phy <- list(edge = edge, edge.length = edge.length, tip.label = paste("t", 1:(i + 1), sep = ""), Nnode = i) class(phy) <- "phylo" + attr(phy, "order") <- "cladewise" phy } diff --git a/R/DNA.R b/R/DNA.R index 2bd7520..b426dbd 100644 --- a/R/DNA.R +++ b/R/DNA.R @@ -1,4 +1,4 @@ -## DNA.R (2012-06-19) +## DNA.R (2012-09-13) ## Manipulations and Comparisons of DNA Sequences @@ -295,7 +295,7 @@ base.freq <- function(x, freq = FALSE, all = FALSE) { if (is.list(x)) x <- unlist(x) n <- length(x) - BF <-.C("BaseProportion", x, n, double(17), + BF <-.C("BaseProportion", x, as.integer(n), double(17), DUP = FALSE, NAOK = TRUE, PACKAGE = "ape")[[3]] names(BF) <- c("a", "c", "g", "t", "r", "m", "w", "s", "k", "y", "v", "h", "d", "b", "n", "-", "?") @@ -351,8 +351,8 @@ seg.sites <- function(x) n <- n[1] } if (n == 1) return(integer(0)) - ans <- .C("SegSites", x, n, s, integer(s), - DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") + ans <- .C("SegSites", x, as.integer(n), as.integer(s), + integer(s), DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") which(as.logical(ans[[4]])) } @@ -397,10 +397,10 @@ dist.dna <- function(x, model = "K80", variance = FALSE, gamma = FALSE, var <- if (variance) double(Ndist) else 0 if (!gamma) gamma <- alpha <- 0 else alpha <- gamma <- 1 - d <- .C("dist_dna", x, n, s, imod, double(Ndist), BF, - as.integer(pairwise.deletion), as.integer(variance), - var, as.integer(gamma), alpha, DUP = FALSE, NAOK = TRUE, - PACKAGE = "ape") + d <- .C("dist_dna", x, as.integer(n), as.integer(s), imod, + double(Ndist), BF, as.integer(pairwise.deletion), + as.integer(variance), var, as.integer(gamma), + alpha, DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") if (variance) var <- d[[9]] d <- d[[5]] if (imod == 11) { diff --git a/R/dbd.R b/R/dbd.R index 3a325dc..b0beb7a 100644 --- a/R/dbd.R +++ b/R/dbd.R @@ -1,4 +1,4 @@ -## dbd.R (2012-03-19) +## dbd.R (2012-09-14) ## Probability Density Under Birth--Death Models @@ -27,10 +27,6 @@ dbd <- function(x, lambda, mu, t, conditional = FALSE, log = FALSE) mu <- mu[1] warning("only the first value of 'mu' was considered") } - if (length(t) > 1) { - t <- t[1] - warning("only the first value of 't' was considered") - } if (mu == 0) return(dyule(x, lambda, t, log)) diff --git a/R/evonet.R b/R/evonet.R index 9dbaaa3..7f6b45f 100644 --- a/R/evonet.R +++ b/R/evonet.R @@ -1,8 +1,8 @@ -## evonet.R (2011-06-09) +## evonet.R (2012-09-14) ## Evolutionary Networks -## Copyright 2011 Emmanuel Paradis +## Copyright 2011-2012 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -67,7 +67,7 @@ as.networx.evonet <- function(x, weight = NA, ...) { if (any(x$reticulation <= Ntip(x))) stop("some tips are involved in reticulations: cannot convert to \"networx\"") - x <- reorder(x, "p") + x <- reorder(x, "pruningwise") ned <- Nedge(x) nrt <- nrow(x$reticulation) x$edge <- rbind(x$edge, x$reticulation) diff --git a/R/ladderize.R b/R/ladderize.R index 8d2e94a..596a8fd 100644 --- a/R/ladderize.R +++ b/R/ladderize.R @@ -23,7 +23,7 @@ ladderize <- function(phy, right = TRUE) start <- start[o] neworder[newpos] <<- start for (i in 1:Nclade) - if (desc[i] > nb.tip) foo(desc[i], end[i], newpos[i] + 1) + if (desc[i] > nb.tip) foo(desc[i], end[i], newpos[i] + 1) } nb.tip <- length(phy$tip.label) nb.node <- phy$Nnode @@ -37,6 +37,6 @@ ladderize <- function(phy, right = TRUE) foo(nb.tip + 1, nb.edge, 1) phy$edge <- phy$edge[neworder, ] if (!is.null(phy$edge.length)) - phy$edge.length <- phy$edge.length[neworder] + phy$edge.length <- phy$edge.length[neworder] phy } diff --git a/R/me.R b/R/me.R index 1006c64..533c40e 100644 --- a/R/me.R +++ b/R/me.R @@ -1,4 +1,4 @@ -## me.R (2012-04-30) +## me.R (2012-09-14) ## Tree Estimation Based on Minimum Evolution Algorithm @@ -20,10 +20,12 @@ fastme.bal <- function(X, nni = TRUE, spr = TRUE, tbr = TRUE) labels <- attr(X, "Labels") if (is.null(labels)) labels <- as.character(1:N) labels <- labels[ans[[3]]] - structure(list(edge = cbind(ans[[7]], ans[[8]]), - edge.length = ans[[9]], - tip.label = labels, Nnode = N - 2L), - class = "phylo") + obj <- list(edge = cbind(ans[[7]], ans[[8]]), + edge.length = ans[[9]], + tip.label = labels, Nnode = N - 2L) + class(obj) <- "phylo" + attr(obj, "order") <- "cladewise" + obj } fastme.ols <- function(X, nni = TRUE) @@ -37,10 +39,12 @@ fastme.ols <- function(X, nni = TRUE) labels <- attr(X, "Labels") if (is.null(labels)) labels <- as.character(1:N) labels <- labels[ans[[3]]] - structure(list(edge = cbind(ans[[5]], ans[[6]]), - edge.length = ans[[7]], - tip.label = labels, Nnode = N - 2L), - class = "phylo") + obj <- list(edge = cbind(ans[[5]], ans[[6]]), + edge.length = ans[[7]], + tip.label = labels, Nnode = N - 2L) + class(obj) <- "phylo" + attr(obj, "order") <- "cladewise" + obj } bionj <- function(X) diff --git a/R/pic.R b/R/pic.R index 816a09b..b1cb20b 100644 --- a/R/pic.R +++ b/R/pic.R @@ -1,8 +1,8 @@ -## pic.R (2011-03-01) +## pic.R (2012-09-11) ## Phylogenetically Independent Contrasts -## Copyright 2002-2011 Emmanuel Paradis +## Copyright 2002-2012 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -22,7 +22,7 @@ pic <- function(x, phy, scaled = TRUE, var.contrasts = FALSE) if (any(is.na(x))) stop("missing data in 'x': you may consider removing the species with missing data from your tree with the function 'drop.tip'.") - phy <- reorder(phy, "pruningwise") + phy <- reorder(phy, "postorder") phenotype <- numeric(nb.tip + nb.node) if (is.null(names(x))) { diff --git a/R/plot.phylo.R b/R/plot.phylo.R index 3fd0637..862291d 100644 --- a/R/plot.phylo.R +++ b/R/plot.phylo.R @@ -1,4 +1,4 @@ -## plot.phylo.R (2012-03-22) +## plot.phylo.R (2012-10-02) ## Plot Phylogenies @@ -595,6 +595,57 @@ node.depth <- function(phy) as.integer(N), double(n + m), DUP = FALSE, PACKAGE = "ape")[[6]] } +node.depth.edgelength <- function(phy) +{ + n <- length(phy$tip.label) + m <- phy$Nnode + N <- dim(phy$edge)[1] + phy <- reorder(phy, order = "pruningwise") + .C("node_depth_edgelength", as.integer(n), as.integer(n), + as.integer(phy$edge[, 1]), as.integer(phy$edge[, 2]), + as.integer(N), as.double(phy$edge.length), double(n + m), + DUP = FALSE, PACKAGE = "ape")[[7]] +} + +node.height <- function(phy) +{ + n <- length(phy$tip.label) + m <- phy$Nnode + N <- dim(phy$edge)[1] + phy <- reorder(phy, order = "pruningwise") + + e1 <- phy$edge[, 1] + e2 <- phy$edge[, 2] + + yy <- numeric(n + m) + TIPS <- e2[e2 <= n] + yy[TIPS] <- 1:n + + .C("node_height", as.integer(n), as.integer(m), + as.integer(e1), as.integer(e2), as.integer(N), + as.double(yy), DUP = FALSE, PACKAGE = "ape")[[6]] +} + +node.height.clado <- function(phy) +{ + n <- length(phy$tip.label) + m <- phy$Nnode + N <- dim(phy$edge)[1] + phy <- reorder(phy, order = "pruningwise") + + e1 <- phy$edge[, 1] + e2 <- phy$edge[, 2] + + yy <- numeric(n + m) + TIPS <- e2[e2 <= n] + yy[TIPS] <- 1:n + + .C("node_height_clado", as.integer(n), as.integer(m), + as.integer(e1), as.integer(e2), as.integer(N), + double(n + m), as.double(yy), DUP = FALSE, + PACKAGE = "ape")[[7]] +} + plot.multiPhylo <- function(x, layout = 1, ...) { if (layout > 1) diff --git a/R/read.nexus.R b/R/read.nexus.R index 0268e68..2f4177e 100644 --- a/R/read.nexus.R +++ b/R/read.nexus.R @@ -1,4 +1,4 @@ -## read.nexus.R (2012-02-09) +## read.nexus.R (2012-09-28) ## Read Tree File in Nexus Format @@ -16,6 +16,7 @@ names(phy) <- nms if (all(phy$node.label == "")) phy$node.label <- NULL class(phy) <- "phylo" + attr(phy, "order") <- "cladewise" phy } @@ -97,6 +98,7 @@ clado.build <- function(tp) if (all(obj$node.label == "NA")) NULL else gsub("^NA", "", obj$node.label) class(obj) <- "phylo" + attr(obj, "order") <- "cladewise" obj } diff --git a/R/read.tree.R b/R/read.tree.R index cba51d1..47a432c 100644 --- a/R/read.tree.R +++ b/R/read.tree.R @@ -1,8 +1,8 @@ -## read.tree.R (2010-09-27) +## read.tree.R (2012-09-14) ## Read Tree Files in Parenthetic Format -## Copyright 2002-2010 Emmanuel Paradis, Daniel Lawson and Klaus Schliep +## Copyright 2002-2012 Emmanuel Paradis, Daniel Lawson and Klaus Schliep ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -95,6 +95,7 @@ tree.build <- function(tp) if (any(nzchar(node.label))) obj$node.label <- node.label if (!is.na(root.edge)) obj$root.edge <- root.edge class(obj) <- "phylo" + attr(obj, "order") <- "cladewise" obj } diff --git a/R/rtree.R b/R/rtree.R index 0df3596..eb00b60 100644 --- a/R/rtree.R +++ b/R/rtree.R @@ -1,4 +1,4 @@ -## rtree.R (2012-02-09) +## rtree.R (2012-09-14) ## Generates Trees @@ -100,6 +100,7 @@ rtree <- function(n, rooted = TRUE, tip.label = NULL, br = runif, ...) } phy$Nnode <- n - 2L + rooted class(phy) <- "phylo" + attr(phy, "order") <- "cladewise" phy } diff --git a/man/ape-internal.Rd b/man/ape-internal.Rd index 838a3a3..c923093 100644 --- a/man/ape-internal.Rd +++ b/man/ape-internal.Rd @@ -13,8 +13,6 @@ \alias{circular.plot} \alias{unrooted.plot} \alias{unrooted.xy} -\alias{node.height} -\alias{node.height.clado} \alias{birth.step} \alias{death.step} \alias{ht.move} diff --git a/man/dbd.Rd b/man/dbd.Rd index ad2e620..4f5ba15 100644 --- a/man/dbd.Rd +++ b/man/dbd.Rd @@ -43,11 +43,12 @@ dbdTime(x, birth, death, t, conditional = FALSE, \code{dbdTime} is for time-varying \code{lambda} and \code{mu} specified as \R functions. - Only \code{dyule} is vectorized simultaneously on its three arguments + \code{dyule} is vectorized simultaneously on its three arguments \code{x}, \code{lambda}, and \code{t}, according to \R's rules of - recycling arguments. The two others are vectorized only on \code{x}; - the other arguments are eventually shortened with a warning if - necessary. + recycling arguments. \code{dbd} is vectorized simultaneously \code{x} + and \code{t} (to make likelihood calculations easy), and + \code{dbdTime} is vectorized only on \code{x}; the other arguments are + eventually shortened with a warning if necessary. The returned value is, logically, zero for values of \code{x} out of range, i.e., negative or zero for \code{dyule} or if \code{conditional diff --git a/man/node.depth.Rd b/man/node.depth.Rd index aced56a..e4c7bf9 100644 --- a/man/node.depth.Rd +++ b/man/node.depth.Rd @@ -1,19 +1,31 @@ \name{node.depth} \alias{node.depth} -\title{Depth of Nodes and Tips} +\alias{node.depth.length} +\alias{node.height} +\alias{node.height.clado} +\title{Depth and Heights of Nodes and Tips} \description{ - This function returns the depth of nodes and tips given by the number - of descendants (1 is returned for tips). + These functions return the depth or height of nodes and tips. } \usage{ node.depth(phy) +node.depth.length(phy) +node.height(phy) +node.height.clado(phy) } \arguments{ \item{phy}{an object of class "phylo".} } \details{ - The depth of a node is computed as the number of tips which are its - descendants. The value of 1 is given to the tips. + \code{node.depth} computes the depth of a node as the number of tips + which are its descendants. The value of 1 is given to the tips. + + \code{node.depth.length} does the same but using branch lengths. + + \code{node.height} computes the heights of nodes and tips as plotted + by a phylogram. + + \code{node.height.clado} does the same but for a cladogram. } \value{ A numeric vector indexed with the node numbers of the matrix `edge' of diff --git a/man/rTraitCont.Rd b/man/rTraitCont.Rd index 9e96f9e..2251c85 100644 --- a/man/rTraitCont.Rd +++ b/man/rTraitCont.Rd @@ -73,7 +73,7 @@ rTraitCont(phy, model = "BM", sigma = 0.1, alpha = 1, theta = 0, data(bird.orders) rTraitCont(bird.orders) # BM with sigma = 0.1 ### OU model with two optima: -tr <- reorder(bird.orders, "p") +tr <- reorder(bird.orders, "postorder") plot(tr) edgelabels() theta <- rep(0, Nedge(tr)) diff --git a/src/plot_phylo.c b/src/plot_phylo.c index d8666a2..9b38d2a 100644 --- a/src/plot_phylo.c +++ b/src/plot_phylo.c @@ -1,6 +1,6 @@ -/* plot_phylo.c (2011-06-23) */ +/* plot_phylo.c (2012-10-01) */ -/* Copyright 2004-2011 Emmanuel Paradis +/* Copyright 2004-2012 Emmanuel Paradis /* This file is part of the R-package `ape'. */ /* See the file ../COPYING for licensing issues. */ @@ -35,7 +35,7 @@ void node_depth(int *ntip, int *nnode, int *edge1, int *edge2, } void node_height(int *ntip, int *nnode, int *edge1, int *edge2, - int *nedge, double *yy) + int *nedge, double *yy) { int i, n; double S; @@ -77,18 +77,3 @@ void node_height_clado(int *ntip, int *nnode, int *edge1, int *edge2, } } } - -void get_single_index_integer(int *x, int *val, int *index) -{ - while (x[*index] != *val) (*index)++; - *index += 1; -} - -void get_two_index_integer(int *x, int *val, int *index) -{ - while (x[index[0]] != *val) index[0]++; - index[1] = index[0] + 1; - while (x[index[1]] != *val) index[1]++; - index[0] += 1; - index[1] += 1; -} -- 2.39.2 From da67dccb93d35408baa48b141fcda921772c8b9c Mon Sep 17 00:00:00 2001 From: paradis Date: Fri, 12 Oct 2012 07:56:32 +0000 Subject: [PATCH 02/16] some changes for ape 3.0-6 git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@197 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 2 +- NEWS | 5 ++++- R/drop.tip.R | 14 ++++++------- R/reorder.phylo.R | 7 ++++--- man/ace.Rd | 44 ++++++++++++++++++++--------------------- man/collapse.singles.Rd | 2 +- man/dist.dna.Rd | 17 ++++++++-------- man/mantel.test.Rd | 3 +-- man/node.depth.Rd | 6 +++--- man/nodelabels.Rd | 3 +-- man/plot.phylo.Rd | 6 +++--- man/print.phylo.Rd | 2 +- man/read.caic.Rd | 7 +------ man/read.nexus.Rd | 28 ++++---------------------- man/woodmouse.Rd | 6 +++--- man/yule.Rd | 2 +- 16 files changed, 65 insertions(+), 89 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 03b6257..32f124c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 3.0-6 -Date: 2012-08-17 +Date: 2012-10-12 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/NEWS b/NEWS index 729e3ed..6ba9bc4 100644 --- a/NEWS +++ b/NEWS @@ -7,7 +7,7 @@ NEW FEATURES index.only = TRUE to return only the vector of indices (the tree is unmodified, see ?reorder.phylo for details). - o The three new functions node.depth.length, node.height, and + o The three new functions node.depth.edgelength, node.height, and node.height.clado make some internal code available from R. See ?node.depth (which was already available and documented) for details. @@ -18,6 +18,9 @@ BUG FIXES o reorder(, "pruningwise") made R crash if the rows of the edge matrix are in random order: this is now fixed. + o drop.tip() sometimes shuffled node labels (thanks to Rebecca Best + for the report). + OTHER CHANGES diff --git a/R/drop.tip.R b/R/drop.tip.R index 5b635f6..e9bd5c0 100644 --- a/R/drop.tip.R +++ b/R/drop.tip.R @@ -1,8 +1,8 @@ -## drop.tip.R (2011-11-21) +## drop.tip.R (2012-10-06) ## Remove Tips in a Phylogenetic Tree -## Copyright 2003-2011 Emmanuel Paradis +## Copyright 2003-2012 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -208,14 +208,10 @@ drop.tip <- phy$tip.label <- c(phy$tip.label, new.tip.label) } - ## update node.label if needed: - if (!is.null(phy$node.label)) - phy$node.label <- phy$node.label[sort(unique(phy$edge[, 1])) - Ntip] - phy$Nnode <- dim(phy$edge)[1] - n + 1L # update phy$Nnode ## The block below renumbers the nodes so that they conform - ## to the "phylo" format -- same as in root() + ## to the "phylo" format, same as in root() newNb <- integer(n + phy$Nnode) newNb[NEWROOT] <- n + 1L sndcol <- phy$edge[, 2] > n @@ -224,6 +220,10 @@ drop.tip <- (n + 2):(n + phy$Nnode) phy$edge[, 1] <- newNb[phy$edge[, 1]] storage.mode(phy$edge) <- "integer" + if (!is.null(phy$node.label)) { # update node.label if needed + newNb[is.na(newNb)] <- 0L + phy$node.label <- phy$node.label[order(newNb[newNb > 0])] + } collapse.singles(phy) } diff --git a/R/reorder.phylo.R b/R/reorder.phylo.R index ee9e9e0..e717eb2 100644 --- a/R/reorder.phylo.R +++ b/R/reorder.phylo.R @@ -1,4 +1,4 @@ -## reorder.phylo.R (2012-09-03) +## reorder.phylo.R (2012-10-12) ## Internal Reordering of Trees @@ -13,12 +13,13 @@ reorder.phylo <- function(x, order = "cladewise", index.only = FALSE, ...) io <- pmatch(order, ORDER) if (is.na(io)) stop("ambiguous order") order <- ORDER[io] + nb.edge <- dim(x$edge)[1] if (!is.null(attr(x, "order"))) - if (attr(x, "order") == order) return(x) + if (attr(x, "order") == order) + if (index.only) return(1:nb.edge) else return(x) nb.node <- x$Nnode if (nb.node == 1) return(x) nb.tip <- length(x$tip.label) - nb.edge <- dim(x$edge)[1] if (io == 3) { x <- reorder(x) neworder <- diff --git a/man/ace.Rd b/man/ace.Rd index 9686022..15bd387 100644 --- a/man/ace.Rd +++ b/man/ace.Rd @@ -6,6 +6,22 @@ \alias{AIC.ace} \alias{anova.ace} \title{Ancestral Character Estimation} +\description{ + This function estimates ancestral character states, and the associated + uncertainty, for continuous and discrete characters. + + \code{logLik}, \code{deviance}, and \code{AIC} are generic functions + used to extract the log-likelihood, the deviance, or the Akaike + information criterion of a fitted object. If no such values are + available, \code{NULL} is returned. + + \code{anova} is another generic function which is used to compare + nested models: the significance of the additional parameter(s) is + tested with likelihood ratio tests. You must ensure that the models + are effectively nested (if they are not, the results will be + meaningless). It is better to list the models from the smallest to the + largest. +} \usage{ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, model = if (type == "continuous") "BM" else "ER", @@ -47,22 +63,6 @@ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, information criterion.} \item{\dots}{further arguments passed to or from other methods.} } -\description{ - This function estimates ancestral character states, and the associated - uncertainty, for continuous and discrete characters. - - \code{logLik}, \code{deviance}, and \code{AIC} are generic functions - used to extract the log-likelihood, the deviance (-2*logLik), or the - Akaike information criterion of a tree. If no such values are - available, \code{NULL} is returned. - - \code{anova} is another generic function which is used to compare - nested models: the significance of the additional parameter(s) is - tested with likelihood ratio tests. You must ensure that the models - are effectively nested (if they are not, the results will be - meaningless). It is better to list the models from the smallest to the - largest. -} \details{ If \code{type = "continuous"}, the default model is Brownian motion where characters evolve randomly following a random walk. This model @@ -74,8 +74,8 @@ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, given through a correlation structure with the option \code{corStruct}. - In the default setting (i.e., \code{method = "ML"} and \code{model = - "BM"}) the maximum likelihood estimation is done simultaneously on the + In the default setting (\code{method = "ML"} and \code{model = "BM"}) + the maximum likelihood estimation is done simultaneously on the ancestral values and the variance of the Brownian motion process; these estimates are then used to compute the confidence intervals in the standard way. The REML method first estimates the ancestral value @@ -89,8 +89,8 @@ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, It could be shown that, with a continous character, REML results in unbiased estimates of the variance of the Brownian motion process - while ML gives a downward bias. Therefore, the former is recommanded - over the latter, even though it is not the default. + while ML gives a downward bias. Therefore the former is recommanded, + even though it is not the default. For discrete characters (\code{type = "discrete"}), only maximum likelihood estimation is available (Pagel 1994). The model is @@ -110,7 +110,7 @@ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, is determined from the data. } \value{ - a list with the following elements: + an object of class \code{"ace"} with the following elements: \item{ace}{if \code{type = "continuous"}, the estimates of the ancestral character values.} @@ -153,7 +153,7 @@ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, Likelihood of ancestor states in adaptive radiation. \emph{Evolution}, \bold{51}, 1699--1711. } -\author{Emmanuel Paradis, Ben Bolker \email{bolker@zoo.ufl.edu}} +\author{Emmanuel Paradis, Ben Bolker} \seealso{ \code{\link{corBrownian}}, \code{\link{corGrafen}}, \code{\link{corMartins}}, \code{\link{compar.ou}}, diff --git a/man/collapse.singles.Rd b/man/collapse.singles.Rd index cd6e8b6..ecfa0ec 100644 --- a/man/collapse.singles.Rd +++ b/man/collapse.singles.Rd @@ -14,7 +14,7 @@ collapse.singles(tree) \value{ an object of class \code{"phylo"}. } -\author{Ben Bolker \email{bolker@zoo.ufl.edu}} +\author{Ben Bolker} \seealso{ \code{\link{plot.phylo}}, \code{\link{read.tree}} } diff --git a/man/dist.dna.Rd b/man/dist.dna.Rd index bc5d485..8168603 100644 --- a/man/dist.dna.Rd +++ b/man/dist.dna.Rd @@ -10,7 +10,7 @@ dist.dna(x, model = "K80", variance = FALSE, \item{x}{a matrix or a list containing the DNA sequences; this must be of class \code{"DNAbin"} (use \code{\link{as.DNAbin}} is they are stored as character).} - \item{model}{a character string specifying the evlutionary model to be + \item{model}{a character string specifying the evolutionary model to be used; must be one of \code{"raw"}, \code{"N"}, \code{"TS"}, \code{"TV"}, \code{"JC69"}, \code{"K80"} (the default), \code{"F81"}, \code{"K81"}, \code{"F84"}, \code{"BH87"}, @@ -19,16 +19,15 @@ dist.dna(x, model = "K80", variance = FALSE, \item{variance}{a logical indicating whether to compute the variances of the distances; defaults to \code{FALSE} so the variances are not computed.} - \item{gamma}{a value for the gamma parameter which is possibly used to - apply a gamma correction to the distances (by default \code{gamma = - FALSE} so no correction is applied).} + \item{gamma}{a value for the gamma parameter possibly used to apply a + correction to the distances (by default no correction is applied).} \item{pairwise.deletion}{a logical indicating whether to delete the sites with missing data in a pairwise way. The default is to delete the sites with at least one missing data for all sequences (ignored if \code{model = "indel"} or \code{"indelblock"}).} \item{base.freq}{the base frequencies to be used in the computations - (if applicable, i.e. if \code{method = "F84"}). By default, the - base frequencies are computed from the whole sample of sequences.} + (if applicable). By default, the base frequencies are computed from + the whole set of sequences.} \item{as.matrix}{a logical indicating whether to return the results as a matrix. The default is to return an object of class \link[stats]{dist}.} @@ -123,12 +122,12 @@ dist.dna(x, model = "K80", variance = FALSE, \item{\code{paralin}: }{Lake (1994) developed the paralinear distance which can be viewed as another variant of the Barry--Hartigan distance.} - \item{\code{indel}: }{this counts the number of sites where there an + \item{\code{indel}: }{this counts the number of sites where there is an insertion/deletion gap in one sequence and not in the other.} \item{\code{indelblock}: }{same than before but contiguous gaps are - counted as a single unit. Note that the distance between `-A-' and - `A--' is 3 because there are three different blocks of gaps, whereas + counted as a single unit. Note that the distance between \code{-A-} and + \code{A--} is 3 because there are three different blocks of gaps, whereas the ``indel'' distance will be 2.} }} \value{ diff --git a/man/mantel.test.Rd b/man/mantel.test.Rd index 949678e..07dfc39 100644 --- a/man/mantel.test.Rd +++ b/man/mantel.test.Rd @@ -55,8 +55,7 @@ mantel.test(m1, m2, nperm = 999, graph = FALSE, London: Chapman & Hall. } \author{ - Original code in S by Ben Bolker \email{bolker@zoo.ufl.edu}, ported - to \R by Julien Claude \email{claude@isem.univ-montp2.fr} + Original code in S by Ben Bolker, ported to \R by Julien Claude } \examples{ q1 <- matrix(runif(36), nrow = 6) diff --git a/man/node.depth.Rd b/man/node.depth.Rd index e4c7bf9..0741c7a 100644 --- a/man/node.depth.Rd +++ b/man/node.depth.Rd @@ -1,6 +1,6 @@ \name{node.depth} \alias{node.depth} -\alias{node.depth.length} +\alias{node.depth.edgelength} \alias{node.height} \alias{node.height.clado} \title{Depth and Heights of Nodes and Tips} @@ -9,7 +9,7 @@ } \usage{ node.depth(phy) -node.depth.length(phy) +node.depth.edgelength(phy) node.height(phy) node.height.clado(phy) } @@ -20,7 +20,7 @@ node.height.clado(phy) \code{node.depth} computes the depth of a node as the number of tips which are its descendants. The value of 1 is given to the tips. - \code{node.depth.length} does the same but using branch lengths. + \code{node.depth.edgelength} does the same but using branch lengths. \code{node.height} computes the heights of nodes and tips as plotted by a phylogram. diff --git a/man/nodelabels.Rd b/man/nodelabels.Rd index c499694..5c44f3f 100644 --- a/man/nodelabels.Rd +++ b/man/nodelabels.Rd @@ -88,8 +88,7 @@ edgelabels(text, edge, adj = c(0.5, 0.5), frame = "rect", \code{show.tip.label}) of \code{plot.phylo} in most cases (see the examples). } -\author{Emmanuel Paradis, Ben Bolker \email{bolker@zoo.ufl.edu}, and Jim - Lemon} +\author{Emmanuel Paradis, Ben Bolker, and Jim Lemon} \seealso{ \code{\link{plot.phylo}}, \code{\link{edges}}, \code{\link{mixedFontLabel}} diff --git a/man/plot.phylo.Rd b/man/plot.phylo.Rd index f19db41..ee89287 100644 --- a/man/plot.phylo.Rd +++ b/man/plot.phylo.Rd @@ -94,12 +94,12 @@ \item{plot}{a logical controlling whether to draw the tree. If \code{FALSE}, the graphical device is set as if the tree was plotted, and the coordinates are saved as well.} - \item{layout}{the number of trees to be plotted simultaneously.} - \item{\dots}{further arguments to be passed to \code{plot} or to - \code{plot.phylo}.} \item{rotate.tree}{for "fan", "unrooted", or "radial" trees: the rotation of the whole tree in degrees (negative values are accepted).} + \item{layout}{the number of trees to be plotted simultaneously.} + \item{\dots}{further arguments to be passed to \code{plot} or to + \code{plot.phylo}.} } \description{ These functions plot phylogenetic trees on the current graphical diff --git a/man/print.phylo.Rd b/man/print.phylo.Rd index 91917f8..f1ad306 100644 --- a/man/print.phylo.Rd +++ b/man/print.phylo.Rd @@ -23,7 +23,7 @@ \value{ NULL. } -\author{Ben Bolker \email{bolker@zoo.ufl.edu} and Emmanuel Paradis} +\author{Ben Bolker and Emmanuel Paradis} \seealso{ \code{\link{read.tree}}, \code{\link{summary.phylo}}, \code{\link[base]{print}} for the generic \R function diff --git a/man/read.caic.Rd b/man/read.caic.Rd index 016d8a6..39bc2a1 100644 --- a/man/read.caic.Rd +++ b/man/read.caic.Rd @@ -19,12 +19,7 @@ read.caic(file, brlen = NULL, skip = 0, comment.char = "#", ...) Read a tree from a file in the format used by the CAIC and MacroCAIc program. } \value{ -an object of class "phylo" with the following components: -\item{edge}{a two-column matrix of mode character where each row represents an edge of the tree; the nodes and the tips are symbolized with numbers (these numbers are not treated as numeric, hence the mode character); the nodes are represented with negative numbers (the root being "-1"), and the tips are represented with positive numbers. For each row, the first column gives the ancestor. This representation allows an easy manipulation of the tree, particularly if it is rooted.} -\item{edge.length}{a numeric vector giving the lengths of the branches given by edge.} -\item{tip.label}{a vector of mode character giving the names of the tips; the order of the names in this vector corresponds to the (positive) number in edge.} -\item{node.label}{(optional) a vector of mode character giving the names of the nodes (set to NULL if not available in the file).} -\item{root.edge}{(optional) a numeric value giving the length of the branch at the root is it exists (NULL otherwise).} + an object of class \code{"phylo"}. } \references{ Purvis, A. and Rambaut, A. (1995) Comparative analysis by independent diff --git a/man/read.nexus.Rd b/man/read.nexus.Rd index 8a668d1..5594e0e 100644 --- a/man/read.nexus.Rd +++ b/man/read.nexus.Rd @@ -8,7 +8,8 @@ read.nexus(file, tree.names = NULL) \item{file}{a file name specified by either a variable of mode character, or a double-quoted string.} \item{tree.names}{if there are several trees to be read, a vector of - mode character that gives names to the individual trees.} + mode character giving names to the individual trees (by default, + this uses the labels in the NEXUS file if these are present).} } \description{ This function reads one or several trees in a NEXUS file. @@ -18,8 +19,7 @@ read.nexus(file, tree.names = NULL) NEXUS standard (but see the restriction below on TRANSLATION tables). Only the block ``TREES'' is read; the other data can be read with other functions (e.g., \code{\link{read.dna}}, - \code{\link[utils]{read.table}}, ...). A trace of the original data is - kept with the attribute \code{"origin"} (see below). + \code{\link[utils]{read.table}}, \dots). If a TRANSLATION table is present it is assumed that only the tip labels are translated and they are all translated with integers @@ -39,27 +39,7 @@ read.nexus(file, tree.names = NULL) message is issued. } \value{ - an object of class \code{"phylo"} with the following components: - \item{edge}{a two-column matrix of mode character where each row - represents an edge of the tree; the nodes and the tips are - symbolized with numbers (these numbers are not treated as numeric, - hence the mode character); the nodes are represented with negative - numbers (the root being \code{"-1"}), and the tips are represented with - positive numbers. For each row, the first column gives the - ancestor. This representation allows an easy manipulation of the - tree, particularly if it is rooted.} - \item{edge.length}{a numeric vector giving the lengths of the - branches given by \code{edge}.} - \item{tip.label}{a vector of mode character giving the names of the - tips; the order of the names in this vector corresponds to the - (positive) number in \code{edge}.} - \item{node.label}{(optional) a vector of mode character giving the - names of the nodes (set to \code{NULL} if not available in the file).} - \item{root.edge}{(optional) a numeric value giving the length of the - branch at the root is it exists (\code{NULL} otherwise).} - - If several trees are read in the file, the returned object is of class - \code{"multiPhylo"}, and is a list of objects of class \code{"phylo"}. + an object of class \code{"phylo"} or of class \code{"multiPhylo"}. } \references{ Maddison, D. R., Swofford, D. L. and Maddison, W. P. (1997) NEXUS: an diff --git a/man/woodmouse.Rd b/man/woodmouse.Rd index be62ae0..d90ce66 100644 --- a/man/woodmouse.Rd +++ b/man/woodmouse.Rd @@ -17,9 +17,9 @@ data(woodmouse) } \source{ Michaux, J. R., Magnanou, E., Paradis, E., Nieberding, C. and Libois, - R. (2003) Mitochondrial phylogeography of the Woodmouse (Apodemus - sylvaticus) in the Western Palearctic region. \emph{Molecular - Ecology}, \bold{12}, 685--697. + R. (2003) Mitochondrial phylogeography of the Woodmouse + (\emph{Apodemus sylvaticus}) in the Western Palearctic region. + \emph{Molecular Ecology}, \bold{12}, 685--697. } \seealso{ \code{\link{read.dna}}, \code{\link{DNAbin}}, \code{\link{dist.dna}} diff --git a/man/yule.Rd b/man/yule.Rd index 40998f9..c9a07de 100644 --- a/man/yule.Rd +++ b/man/yule.Rd @@ -10,7 +10,7 @@ yule(phy, use.root.edge = FALSE) edge in the calculations.} } \description{ - This function fits by maximum likelihood a Yule model, i.e. a + This function fits by maximum likelihood a Yule model, i.e., a birth-only model to the branching times computed from a phylogenetic tree. } -- 2.39.2 From 2014b83971be4b9cd1644d6127837df798e9335c Mon Sep 17 00:00:00 2001 From: paradis Date: Mon, 22 Oct 2012 10:13:38 +0000 Subject: [PATCH 03/16] final corrections for ape 3.0-6 git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@198 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 2 +- NEWS | 14 ++++++++++---- R/drop.tip.R | 3 ++- R/plot.phylo.R | 8 ++++---- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 32f124c..3904cc8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 3.0-6 -Date: 2012-10-12 +Date: 2012-10-20 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/NEWS b/NEWS index 6ba9bc4..e90cec1 100644 --- a/NEWS +++ b/NEWS @@ -9,8 +9,7 @@ NEW FEATURES o The three new functions node.depth.edgelength, node.height, and node.height.clado make some internal code available from R. See - ?node.depth (which was already available and documented) for - details. + ?node.depth (which was already documented) for details. BUG FIXES @@ -18,8 +17,15 @@ BUG FIXES o reorder(, "pruningwise") made R crash if the rows of the edge matrix are in random order: this is now fixed. - o drop.tip() sometimes shuffled node labels (thanks to Rebecca Best - for the report). + o drop.tip() sometimes shuffled node labels (thanks to Rebecca + Best for the report). + + o drop.tip(phy, "") returned a tree with zero-length tip labels: + it now returns the tree unchanged (thanks to Brian Anacker for + the report). + + o plot.phylo() made R crash if the tree has zero-length tip + labels: it now returns NULL (thanks again to Brian Anacker). OTHER CHANGES diff --git a/R/drop.tip.R b/R/drop.tip.R index e9bd5c0..a720a7c 100644 --- a/R/drop.tip.R +++ b/R/drop.tip.R @@ -1,4 +1,4 @@ -## drop.tip.R (2012-10-06) +## drop.tip.R (2012-10-20) ## Remove Tips in a Phylogenetic Tree @@ -101,6 +101,7 @@ drop.tip <- if (is.character(tip)) tip <- which(phy$tip.label %in% tip) } + if (!length(tip)) return(phy) if (any(tip > Ntip)) warning("some tip numbers were higher than the number of tips") diff --git a/R/plot.phylo.R b/R/plot.phylo.R index 862291d..4c23c6f 100644 --- a/R/plot.phylo.R +++ b/R/plot.phylo.R @@ -1,4 +1,4 @@ -## plot.phylo.R (2012-10-02) +## plot.phylo.R (2012-10-20) ## Plot Phylogenies @@ -18,8 +18,8 @@ plot.phylo <- tip.color = "black", plot = TRUE, rotate.tree = 0, ...) { Ntip <- length(x$tip.label) - if (Ntip == 1) { - warning("found only one tip in the tree") + if (Ntip < 2) { + warning("found less than 2 tips in the tree") return(NULL) } if (any(tabulate(x$edge[, 1]) == 1)) @@ -421,7 +421,7 @@ phylogram.plot <- function(edge, Ntip, Nnode, xx, yy, horizontal, yy <- xx xx <- tmp } - ## un trait vertical à chaque noeud... + ## un trait vertical a chaque noeud... x0v <- xx[nodes] y0v <- y1v <- numeric(Nnode) ## store the index of each node in the 1st column of edge: -- 2.39.2 From 12b407de3b6d3a160eb2ebd48d005da328735206 Mon Sep 17 00:00:00 2001 From: paradis Date: Thu, 22 Nov 2012 07:56:13 +0000 Subject: [PATCH 04/16] fix in drop.tip() and new option in pic() git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@199 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 4 ++-- NEWS | 23 +++++++++++++++++++---- R/drop.tip.R | 10 ++++------ R/pic.R | 26 +++++++------------------- R/summary.phylo.R | 2 +- man/pic.Rd | 19 +++++++++++++------ 6 files changed, 46 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3904cc8..83eab43 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape -Version: 3.0-6 -Date: 2012-10-20 +Version: 3.0-7 +Date: 2012-11-22 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/NEWS b/NEWS index e90cec1..4425348 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,18 @@ + CHANGES IN APE VERSION 3.0-7 + + +NEW FEATURES + + o pic() gains an option 'rescaled.tree = FALSE' to return the tree + with its branch lengths rescaled for the PICs calculation. + + +BUG FIXES + + o drop.tip() shuffled node labels on some trees. + + + CHANGES IN APE VERSION 3.0-6 @@ -1978,14 +1993,14 @@ BUG FIXES o A bug was fixed in phymltest(): the executable couldn't be found in some cases. - o Three bug have been fixed in ace(): computing the likelihood of + o Three bugs have been fixed in ace(): computing the likelihood of ancestral states of discrete characters failed, custom models did not work, and the function failed with a null gradient (a warning message is now returned; this latter bug was also present in yule.cov() as well and is now fixed). - o pic() hanged out when missing data were present: a message error - is now returned. + o pic() hanged out when missing data were present: an error is now + returned. o A small bug was fixed in dist.dna() where the gamma correction was not always correctly dispatched. @@ -2049,7 +2064,7 @@ NEW FEATURES DNA sequences by specifying model = "raw". o dist.phylo() has a new option `full' to possibly compute the - distances among all tips and nodes of the tree. The default if + distances among all tips and nodes of the tree. The default is `full = FALSE'. diff --git a/R/drop.tip.R b/R/drop.tip.R index a720a7c..f1b413f 100644 --- a/R/drop.tip.R +++ b/R/drop.tip.R @@ -1,4 +1,4 @@ -## drop.tip.R (2012-10-20) +## drop.tip.R (2012-11-22) ## Remove Tips in a Phylogenetic Tree @@ -213,7 +213,7 @@ drop.tip <- ## The block below renumbers the nodes so that they conform ## to the "phylo" format, same as in root() - newNb <- integer(n + phy$Nnode) + newNb <- integer(Ntip + Nnode) newNb[NEWROOT] <- n + 1L sndcol <- phy$edge[, 2] > n ## executed from right to left, so newNb is modified before phy$edge: @@ -221,10 +221,8 @@ drop.tip <- (n + 2):(n + phy$Nnode) phy$edge[, 1] <- newNb[phy$edge[, 1]] storage.mode(phy$edge) <- "integer" - if (!is.null(phy$node.label)) { # update node.label if needed - newNb[is.na(newNb)] <- 0L - phy$node.label <- phy$node.label[order(newNb[newNb > 0])] - } + if (!is.null(phy$node.label)) # update node.label if needed + phy$node.label <- phy$node.label[which(newNb > 0) - Ntip] collapse.singles(phy) } diff --git a/R/pic.R b/R/pic.R index b1cb20b..bc54067 100644 --- a/R/pic.R +++ b/R/pic.R @@ -1,4 +1,4 @@ -## pic.R (2012-09-11) +## pic.R (2012-11-20) ## Phylogenetically Independent Contrasts @@ -7,7 +7,7 @@ ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. -pic <- function(x, phy, scaled = TRUE, var.contrasts = FALSE) +pic <- function(x, phy, scaled = TRUE, var.contrasts = FALSE, rescaled.tree = FALSE) { if (!inherits(phy, "phylo")) stop("object 'phy' is not of class \"phylo\"") @@ -35,10 +35,10 @@ pic <- function(x, phy, scaled = TRUE, var.contrasts = FALSE) warning("the names of argument 'x' and the tip labels of the tree did not match: the former were ignored in the analysis.") } } + ## No need to copy the branch lengths: they are rescaled ## in the C code, so it's important to leave the default ## `DUP = TRUE' of .C. - ans <- .C("pic", as.integer(nb.tip), as.integer(nb.node), as.integer(phy$edge[, 1]), as.integer(phy$edge[, 2]), as.double(phy$edge.length), as.double(phenotype), @@ -46,22 +46,6 @@ pic <- function(x, phy, scaled = TRUE, var.contrasts = FALSE) as.integer(var.contrasts), as.integer(scaled), PACKAGE = "ape") - ## The "old" R code: - ##for (i in seq(from = 1, by = 2, length.out = nb.node)) { - ## j <- i + 1 - ## anc <- phy$edge[i, 1] - ## des1 <- phy$edge[i, 2] - ## des2 <- phy$edge[j, 2] - ## sumbl <- bl[i] + bl[j] - ## ic <- anc - nb.tip - ## contr[ic] <- phenotype[des1] - phenotype[des2] - ## if (scaled) contr[ic] <- contr[ic]/sqrt(sumbl) - ## if (var.contrasts) var.con[ic] <- sumbl - ## phenotype[anc] <- (phenotype[des1]*bl[j] + phenotype[des2]*bl[i])/sumbl - ## k <- which(phy$edge[, 2] == anc) - ## bl[k] <- bl[k] + bl[i]*bl[j]/sumbl - ## - ##} contr <- ans[[7]] lbls <- if (is.null(phy$node.label)) as.character(1:nb.node + nb.tip) @@ -70,6 +54,10 @@ pic <- function(x, phy, scaled = TRUE, var.contrasts = FALSE) contr <- cbind(contr, ans[[8]]) dimnames(contr) <- list(lbls, c("contrasts", "variance")) } else names(contr) <- lbls + if (rescaled.tree) { + phy$edge.length <- ans[[5]] + contr <- list(contr = contr, rescaled.tree = phy) + } contr } diff --git a/R/summary.phylo.R b/R/summary.phylo.R index 715021b..f390b24 100644 --- a/R/summary.phylo.R +++ b/R/summary.phylo.R @@ -85,7 +85,7 @@ print.phylo <- function(x, printlen = 6,...) collapse=", "), ", ...\n", sep = "")) } else print(x$tip.label) if (!is.null(x$node.label)) { - cat("\tNode labels:\n") + cat("Node labels:\n") if (nb.node > printlen) { cat(paste("\t", paste(x$node.label[1:printlen], collapse=", "), ", ...\n", sep = "")) diff --git a/man/pic.Rd b/man/pic.Rd index f0d57da..80a421e 100644 --- a/man/pic.Rd +++ b/man/pic.Rd @@ -2,7 +2,8 @@ \alias{pic} \title{Phylogenetically Independent Contrasts} \usage{ -pic(x, phy, scaled = TRUE, var.contrasts = FALSE) +pic(x, phy, scaled = TRUE, var.contrasts = FALSE, + rescaled.tree = FALSE) } \arguments{ \item{x}{a numeric vector.} @@ -10,7 +11,10 @@ pic(x, phy, scaled = TRUE, var.contrasts = FALSE) \item{scaled}{logical, indicates whether the contrasts should be scaled with their expected variances (default to \code{TRUE}).} \item{var.contrasts}{logical, indicates whether the expected - variances of the contrasts should be returned (default to \code{FALSE}).} + variances of the contrasts should be returned (default to + \code{FALSE}).} + \item{rescaled.tree}{logical, if \code{TRUE} the rescaled tree is + returned together with the main results.} } \description{ Compute the phylogenetically independent contrasts using the method @@ -22,10 +26,9 @@ pic(x, phy, scaled = TRUE, var.contrasts = FALSE) than the tip labels of \code{phy}. The user must be careful here since the function requires that both - series of names perfectly match, so this operation may fail if there - is a typing or syntax error. If both series of names do not match, the - values in the \code{x} are taken to be in the same order than the tip - labels of \code{phy}, and a warning message is issued. + series of names perfectly match. If both series of names do not match, + the values in the \code{x} are taken to be in the same order than the + tip labels of \code{phy}, and a warning message is issued. } \value{ either a vector of phylogenetically independent contrasts (if @@ -34,6 +37,10 @@ pic(x, phy, scaled = TRUE, var.contrasts = FALSE) expected variance in the second column (if \code{var.contrasts = TRUE}). If the tree has node labels, these are used as labels of the returned object. + + If \code{rescaled.tree = TRUE}, a list is returned with two elements + named ``contr'' with the above results and ``rescaled.tree'' with the + tree and its rescaled branch lengths (see Felsenstein 1985). } \references{ Felsenstein, J. (1985) Phylogenies and the comparative method. -- 2.39.2 From dff741171e7afe3f9aaa2d9cb19c2f91995e8623 Mon Sep 17 00:00:00 2001 From: paradis Date: Wed, 28 Nov 2012 10:02:30 +0000 Subject: [PATCH 05/16] adding the new function 'where' git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@200 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 2 +- NEWS | 2 ++ R/DNA.R | 28 +++++++++++++++++++++++++++- man/consensus.Rd | 4 ++++ man/where.Rd | 35 +++++++++++++++++++++++++++++++++++ src/dist_dna.c | 23 ++++++++++++++++++++++- 6 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 man/where.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 83eab43..a81d208 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 3.0-7 -Date: 2012-11-22 +Date: 2012-11-28 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/NEWS b/NEWS index 4425348..9d57433 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,8 @@ NEW FEATURES + o The new function 'where' searches patterns in DNA sequences. + o pic() gains an option 'rescaled.tree = FALSE' to return the tree with its branch lengths rescaled for the PICs calculation. diff --git a/R/DNA.R b/R/DNA.R index b426dbd..1d0ce79 100644 --- a/R/DNA.R +++ b/R/DNA.R @@ -1,4 +1,4 @@ -## DNA.R (2012-09-13) +## DNA.R (2012-11-28) ## Manipulations and Comparisons of DNA Sequences @@ -471,3 +471,29 @@ image.DNAbin <- function(x, what, col, bg = "white", xlab = "", ylab = "", horiz = TRUE, xpd = TRUE) } } + +where <- function(x, pattern) +{ + pat <- as.DNAbin(strsplit(pattern, NULL)[[1]]) + p <- as.integer(length(pat)) + + foo <- function(x, pat, p) { + s <- as.integer(length(x)) + if (s < p) stop("sequence shorter than the pattern") + ans <- .C("where", x, pat, s, p, integer(s), 0L, + DUP = FALSE, NAOK = TRUE, PACKAGE = "apex") + n <- ans[[6]] + if (n) ans[[5]][seq_len(n)] - p + 2L else integer() + } + + if (is.list(x)) return(lapply(x, foo, pat = pat, p = p)) + if (is.matrix(x)) { + n <- nrow(x) + res <- vector("list", n) + for (i in seq_along(n)) + res[[i]] <- foo(x[i, , drop = TRUE], pat, p) + names(res) <- rownames(x) + return(res) + } + foo(x, pat, p) # if x is a vector +} diff --git a/man/consensus.Rd b/man/consensus.Rd index 2f7bdeb..ffaa58f 100644 --- a/man/consensus.Rd +++ b/man/consensus.Rd @@ -31,6 +31,10 @@ consensus(..., p = 1, check.labels = TRUE) \value{ an object of class \code{"phylo"}. } +\references{ + Felsenstein, J. (2004) \emph{Inferring Phylogenies}. Sunderland: + Sinauer Associates. +} \author{Emmanuel Paradis} \seealso{ \code{\link{prop.part}}, \code{\link{dist.topo}} diff --git a/man/where.Rd b/man/where.Rd new file mode 100644 index 0000000..faddeba --- /dev/null +++ b/man/where.Rd @@ -0,0 +1,35 @@ +\names{where} +\alias{where} +\title{Find Patterns in DNA Sequences} +\description{ + This function finds patterns in a single or a set of DNA sequences. +} +\usage{ +where(x, pattern) +} +\arguments{ + \item{x}{an object of class \code{"DNAbin"}.} + \item{pattern}{a character string to be searched in \code{x}.} +} +\details{ + If \code{x} is a vector, the function returns a single vector giving + the position(s) where the pattern was found. If \code{x} is a matrix + or a list, it returns a list with the positions of the pattern for + each sequence. + + Patterns may be overlapping. For instance, if \code{pattern = "tata"} + and the sequence starts with `tatata', then the vector returned will + be c(1, 3). +} +\value{ + a vector of integers or a list of such vectors. +} +\author{Emmanuel Paradis} +\seealso{ + \code{\link{DNAbin}}, \code{\link{image.DNAbin}} +} +\examples{ +data(woodmouse) +where(woodmouse, "tata") +} +\keyword{manip} diff --git a/src/dist_dna.c b/src/dist_dna.c index 67522ff..c087212 100644 --- a/src/dist_dna.c +++ b/src/dist_dna.c @@ -1,4 +1,4 @@ -/* dist_dna.c 2012-02-14 */ +/* dist_dna.c 2012-11-28 */ /* Copyright 2005-2012 Emmanuel Paradis @@ -1143,3 +1143,24 @@ void dist_dna(unsigned char *x, int *n, int *s, int *model, double *d, case 17 : distDNA_indelblock(x, n, s, d); break; } } + +void where(unsigned char *x, unsigned char *pat, int *s, int *p, + int *ans, int *n) +{ + int i, j, k, ln; + + ln = 0; /* local n */ + + for (i = 0; i <= *s - *p; i++) { + k = i; j = 0; + while (1) { + if (x[k] != pat[j]) break; + j++; k++; + if (j == *p) { + ans[ln++] = k - 1; + break; + } + } + } + *n = ln; +} -- 2.39.2 From a2fc961dffe9f9b7994ed880e68c03b2334dc341 Mon Sep 17 00:00:00 2001 From: paradis Date: Wed, 19 Dec 2012 07:35:43 +0000 Subject: [PATCH 06/16] a few changes.... git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@201 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 2 +- NEWS | 17 ++++++++++++++++- R/DNA.R | 2 +- R/clustal.R | 20 +++++++++++++------- R/cophyloplot.R | 10 +++++----- R/dist.topo.R | 42 +++++++++++++++++++----------------------- R/drop.tip.R | 6 +++--- R/plot.phylo.R | 18 ++++++++++-------- R/scales.R | 40 +++++++++++++++++++++++++++++++++++++--- man/clustal.Rd | 10 +++++++--- man/plot.phylo.Rd | 5 ++++- man/where.Rd | 2 +- 12 files changed, 117 insertions(+), 57 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a81d208..c2302fd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 3.0-7 -Date: 2012-11-28 +Date: 2012-12-19 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/NEWS b/NEWS index 9d57433..f6acbff 100644 --- a/NEWS +++ b/NEWS @@ -6,13 +6,28 @@ NEW FEATURES o The new function 'where' searches patterns in DNA sequences. o pic() gains an option 'rescaled.tree = FALSE' to return the tree - with its branch lengths rescaled for the PICs calculation. + with its branch lengths rescaled for the PIC calculation. + + o clustal(), muscle(), and tcoffee() gain an option + 'original.ordering = TRUE' to ease the comparisons of + alignments. + + o plot.phylo() has a new option, open.angle, used when plotting + circular trees. BUG FIXES o drop.tip() shuffled node labels on some trees. + o axisPhylo() now works correctly with circular trees, and gives a + sensible error message when type = "r" or "u". + + +OTHER CHANGES + + o .compressTipLabel() is 10 times faster thanks to Joseph Brown. + CHANGES IN APE VERSION 3.0-6 diff --git a/R/DNA.R b/R/DNA.R index 1d0ce79..31ef4bb 100644 --- a/R/DNA.R +++ b/R/DNA.R @@ -481,7 +481,7 @@ where <- function(x, pattern) s <- as.integer(length(x)) if (s < p) stop("sequence shorter than the pattern") ans <- .C("where", x, pat, s, p, integer(s), 0L, - DUP = FALSE, NAOK = TRUE, PACKAGE = "apex") + DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") n <- ans[[6]] if (n) ans[[5]][seq_len(n)] - p + 2L else integer() } diff --git a/R/clustal.R b/R/clustal.R index 656ecb2..c777945 100644 --- a/R/clustal.R +++ b/R/clustal.R @@ -1,4 +1,4 @@ -## clustal.R (2012-01-12) +## clustal.R (2012-11-28) ## Multiple Sequence Alignment with External Applications @@ -9,7 +9,7 @@ clustal <- function(x, pw.gapopen = 10, pw.gapext = 0.1, gapopen = 10, gapext = 0.2, exec = NULL, - MoreArgs = "", quiet = TRUE) + MoreArgs = "", quiet = TRUE, original.ordering = TRUE) { os <- Sys.info()[1] if (is.null(exec)) { @@ -32,10 +32,12 @@ clustal <- function(x, pw.gapopen = 10, pw.gapext = 0.1, opts <- paste(prefix, suffix, sep = "=", collapse = " ") opts <- paste(opts, MoreArgs) system(paste(exec, opts), ignore.stdout = quiet) - read.dna(outf, "clustal") + res <- read.dna(outf, "clustal") + if (original.ordering) res <- res[labels(x), ] + res } -muscle <- function(x, exec = "muscle", MoreArgs = "", quiet = TRUE) +muscle <- function(x, exec = "muscle", MoreArgs = "", quiet = TRUE, original.ordering = TRUE) { if (missing(x)) { system(exec) @@ -50,10 +52,12 @@ muscle <- function(x, exec = "muscle", MoreArgs = "", quiet = TRUE) if (quiet) opts <- paste(opts, "-quiet") opts <- paste(opts, MoreArgs) system(paste(exec, opts)) - read.dna(outf, "fasta") + res <- read.dna(outf, "fasta") + if (original.ordering) res <- res[labels(x), ] + res } -tcoffee <- function(x, exec = "t_coffee", MoreArgs = "", quiet = TRUE) +tcoffee <- function(x, exec = "t_coffee", MoreArgs = "", quiet = TRUE, original.ordering = TRUE) { if (missing(x)) { system(exec) @@ -68,5 +72,7 @@ tcoffee <- function(x, exec = "t_coffee", MoreArgs = "", quiet = TRUE) opts <- paste(inf, MoreArgs) if (quiet) opts <- paste(opts, "-quiet=nothing") system(paste(exec, opts)) - read.dna("input_tcoffee.aln", "clustal") + res <- read.dna("input_tcoffee.aln", "clustal") + if (original.ordering) res <- res[labels(x), ] + res } diff --git a/R/cophyloplot.R b/R/cophyloplot.R index 85f091f..29e5b3c 100644 --- a/R/cophyloplot.R +++ b/R/cophyloplot.R @@ -42,11 +42,11 @@ cophyloplot <- } plotCophylo2 <- - function (x, y, assoc = assoc, use.edge.length = use.edge.length, - space = space, length.line = length.line, gap = gap, - type = type, return = return, col = col, lwd=lwd, lty=lty, - show.tip.label = show.tip.label, - font = font, ...) + function(x, y, assoc = assoc, use.edge.length = use.edge.length, + space = space, length.line = length.line, gap = gap, + type = type, return = return, col = col, lwd=lwd, lty=lty, + show.tip.label = show.tip.label, + font = font, ...) { res <- list() ###choice of the minimum space between the trees diff --git a/R/dist.topo.R b/R/dist.topo.R index 5777482..230f009 100644 --- a/R/dist.topo.R +++ b/R/dist.topo.R @@ -1,4 +1,4 @@ -## dist.topo.R (2012-03-13) +## dist.topo.R (2012-12-12) ## Topological Distances, Tree Bipartitions, ## Consensus Trees, and Bootstrapping Phylogenies @@ -68,32 +68,28 @@ dist.topo <- function(x, y, method = "PH85") ## 'x' is a list of objects of class "phylo" possibly with no class if (!is.null(attr(x, "TipLabel"))) return(x) ref <- x[[1]]$tip.label - if (any(table(ref) != 1)) - stop("some tip labels are duplicated in tree no. 1") n <- length(ref) - Ntree <- length(x) - if (Ntree > 1) { - for (i in 2:Ntree) { - label <- x[[i]]$tip.label - if (!identical(label, ref)) { - if (length(label) != length(ref)) - stop(paste("tree no.", i, "has a different number of tips")) - ilab <- match(label, ref) - ## can use tabulate here because 'ilab' contains integers - if (any(is.na(ilab))) - stop(paste("tree no.", i, "has different tip labels")) -### the test below does not seem useful anymore -### if (any(tabulate(ilab) > 1)) -### stop(paste("some tip labels are duplicated in tree no.", i)) -### - ie <- match(1:n, x[[i]]$edge[, 2]) - x[[i]]$edge[ie, 2] <- ilab - } - x[[i]]$tip.label <- NULL + if (length(unique(ref)) != n) + stop("some tip labels are duplicated in tree no. 1") + + ## serious improvement by Joseph W. Brown! + relabel <- function (y) { + label <- y$tip.label + if (!identical(label, ref)) { + if (length(label) != length(ref)) + stop(paste("tree ", y, "has a different number of tips")) + ilab <- match(label, ref) + if (any(is.na(ilab))) + stop(paste("tree ", y, "has different tip labels")) + ie <- match(1:n, y$edge[, 2]) + y$edge[ie, 2] <- ilab } + y$tip.label <- NULL + y } - x[[1]]$tip.label <- NULL + x <- lapply(x, relabel) attr(x, "TipLabel") <- ref + class(x) <- "multiPhylo" x } diff --git a/R/drop.tip.R b/R/drop.tip.R index f1b413f..113c448 100644 --- a/R/drop.tip.R +++ b/R/drop.tip.R @@ -1,4 +1,4 @@ -## drop.tip.R (2012-11-22) +## drop.tip.R (2012-11-29) ## Remove Tips in a Phylogenetic Tree @@ -204,8 +204,8 @@ drop.tip <- if (is.null(phy$node.label)) rep("NA", length(node2tip)) else phy$node.label[node2tip - Ntip] } - if (!is.null(phy$node.label)) - phy$node.label <- phy$node.label[-(node2tip - Ntip)] +# if (!is.null(phy$node.label)) +# phy$node.label <- phy$node.label[-(node2tip - Ntip)] phy$tip.label <- c(phy$tip.label, new.tip.label) } diff --git a/R/plot.phylo.R b/R/plot.phylo.R index 4c23c6f..96d9aa4 100644 --- a/R/plot.phylo.R +++ b/R/plot.phylo.R @@ -1,4 +1,4 @@ -## plot.phylo.R (2012-10-20) +## plot.phylo.R (2012-12-19) ## Plot Phylogenies @@ -15,7 +15,8 @@ plot.phylo <- adj = NULL, srt = 0, no.margin = FALSE, root.edge = FALSE, label.offset = 0, underscore = FALSE, x.lim = NULL, y.lim = NULL, direction = "rightwards", lab4ut = "horizontal", - tip.color = "black", plot = TRUE, rotate.tree = 0, ...) + tip.color = "black", plot = TRUE, rotate.tree = 0, + open.angle = 0, ...) { Ntip <- length(x$tip.label) if (Ntip < 2) { @@ -122,13 +123,15 @@ plot.phylo <- xx <- .nodeDepthEdgelength(Ntip, Nnode, z$edge, Nedge, z$edge.length) } } else { - rotate.tree <- 2 * pi * rotate.tree/360 + twopi <- 2 * pi + rotate.tree <- twopi * rotate.tree/360 switch(type, "fan" = { ## if the tips are not in the same order in tip.label ## and in edge[, 2], we must reorder the angles: we ## use `xx' to store temporarily the angles TIPS <- x$edge[which(x$edge[, 2] <= Ntip), 2] - xx <- seq(0, 2*pi*(1 - 1/Ntip), 2*pi/Ntip) + xx <- seq(0, twopi * (1 - 1/Ntip) - twopi * open.angle/360, + length.out = Ntip) theta <- double(Ntip) theta[TIPS] <- xx theta <- c(theta, numeric(Nnode)) @@ -140,8 +143,8 @@ plot.phylo <- r <- 1/r } theta <- theta + rotate.tree - xx <- r*cos(theta) - yy <- r*sin(theta) + xx <- r * cos(theta) + yy <- r * sin(theta) }, "unrooted" = { nb.sp <- .nodeDepth(Ntip, Nnode, z$edge, Nedge) XY <- if (use.edge.length) @@ -157,7 +160,7 @@ plot.phylo <- ## radius: X <- 1 - X/Ntip ## angle (1st compute the angles for the tips): - yy <- c((1:Ntip)*2*pi/Ntip, rep(0, Nnode)) + yy <- c((1:Ntip)*twopi/Ntip, rep(0, Nnode)) Y <- .nodeHeight(Ntip, Nnode, z$edge, Nedge, yy) xx <- X * cos(Y + rotate.tree) yy <- X * sin(Y + rotate.tree) @@ -375,7 +378,6 @@ if (plot) { if (type %in% c("fan", "radial")) { xx.tips <- xx[1:Ntip] yy.tips <- yy[1:Ntip] - ## using atan2 considerably facilitates things compared to acos... angle <- atan2(yy.tips, xx.tips) # in radians if (label.offset) { xx.tips <- xx.tips + label.offset * cos(angle) diff --git a/R/scales.R b/R/scales.R index 7176d1f..c2e9d61 100644 --- a/R/scales.R +++ b/R/scales.R @@ -1,4 +1,4 @@ -## scales.R (2011-05-31) +## scales.R (2012-12-19) ## Add a Scale Bar or Axis to a Phylogeny Plot @@ -76,7 +76,14 @@ add.scale.bar <- function(x, y, length = NULL, ask = FALSE, axisPhylo <- function(side = 1, ...) { lastPP <- get("last_plot.phylo", envir = .PlotPhyloEnv) - if (lastPP$type %in% c("phylogram", "cladogram")) { + type <- lastPP$type + + if (type == "unrooted") + stop("axisPhylo() not available for unrooted plots; try add.scale.bar()") + if (type == "radial") + stop("axisPhylo() not meaningful for this type of plot") + + if (type %in% c("phylogram", "cladogram")) { if (lastPP$direction %in% c("rightwards", "leftwards")) { x <- pretty(lastPP$xx) if (lastPP$direction == "rightwards") maxi <- max(lastPP$xx) @@ -92,6 +99,33 @@ axisPhylo <- function(side = 1, ...) x <- -x } } + axis(side = side, at = c(maxi - x), labels = abs(x), ...) + } else { # type == "fan" + n <- lastPP$Ntip + xx <- lastPP$xx[1:n]; yy <- lastPP$yy[1:n] + r0 <- max(sqrt(xx^2 + yy^2)) + firstandlast <- c(1, n) + theta0 <- mean(atan2(yy[firstandlast], xx[firstandlast])) + x0 <- r0 * cos(theta0); y0 <- r0 * sin(theta0) + inc <- diff(pretty(c(0, r0))[1:2]) + srt <- 360*theta0/(2*pi) + coef <- -1 + if (abs(srt) > 90) { + srt <- srt + 180 + coef <- 1 + } + len <- 0.025 * r0 # the length of tick marks + r <- r0 + while (r > 1e-8) { + x <- r * cos(theta0); y <- r * sin(theta0) + if (len/r < 1) { + ra <- sqrt(len^2 + r^2); thetaa <- theta0 + coef * asin(len/r) + xa <- ra * cos(thetaa); ya <- ra * sin(thetaa) + segments(xa, ya, x, y) + text(xa, ya, r0 - r, srt = srt, adj = c(0.5, 1.1), ...) + } + r <- r - inc + } + segments(x, y, x0, y0) } - axis(side = side, at = c(maxi - x), labels = abs(x), ...) } diff --git a/man/clustal.Rd b/man/clustal.Rd index 0e00198..afb8d73 100644 --- a/man/clustal.Rd +++ b/man/clustal.Rd @@ -10,9 +10,11 @@ \usage{ clustal(x, pw.gapopen = 10, pw.gapext = 0.1, gapopen = 10, gapext = 0.2, exec = NULL, - MoreArgs = "", quiet = TRUE) -muscle(x, exec = "muscle", MoreArgs = "", quiet = TRUE) -tcoffee(x, exec = "t_coffee", MoreArgs = "", quiet = TRUE) + MoreArgs = "", quiet = TRUE, original.ordering = TRUE) +muscle(x, exec = "muscle", MoreArgs = "", quiet = TRUE, + original.ordering = TRUE) +tcoffee(x, exec = "t_coffee", MoreArgs = "", quiet = TRUE, + original.ordering = TRUE) } \arguments{ \item{x}{an object of class \code{"DNAbin"}.} @@ -25,6 +27,8 @@ tcoffee(x, exec = "t_coffee", MoreArgs = "", quiet = TRUE) \item{MoreArgs}{a character string giving additional options.} \item{quiet}{a logical: the default is to not print on \R's console the messages from the external program.} + \item{original.ordering}{a logical specifying whether to return the + aligned sequences in the same order than in \code{x}.} } \details{ \code{clustal} tries to guess the name of the executable program diff --git a/man/plot.phylo.Rd b/man/plot.phylo.Rd index ee89287..6d08946 100644 --- a/man/plot.phylo.Rd +++ b/man/plot.phylo.Rd @@ -10,7 +10,7 @@ root.edge = FALSE, label.offset = 0, underscore = FALSE, x.lim = NULL, y.lim = NULL, direction = "rightwards", lab4ut = "horizontal", tip.color = "black", plot = TRUE, - rotate.tree = 0, ...) + rotate.tree = 0, open.angle = 0, ...) \method{plot}{multiPhylo}(x, layout = 1, ...) } \arguments{ @@ -100,6 +100,9 @@ \item{layout}{the number of trees to be plotted simultaneously.} \item{\dots}{further arguments to be passed to \code{plot} or to \code{plot.phylo}.} + \item{open.angle}{if \code{type = "f"}, the angle in degrees left + blank. Use a non-zero value if you want to call + \code{\link{axisPhylo}} after the tree is plotted.} } \description{ These functions plot phylogenetic trees on the current graphical diff --git a/man/where.Rd b/man/where.Rd index faddeba..09e0ab8 100644 --- a/man/where.Rd +++ b/man/where.Rd @@ -1,4 +1,4 @@ -\names{where} +\name{where} \alias{where} \title{Find Patterns in DNA Sequences} \description{ -- 2.39.2 From be6a044200152fd83d0b72f348012a0adc2593c5 Mon Sep 17 00:00:00 2001 From: paradis Date: Thu, 27 Dec 2012 09:48:23 +0000 Subject: [PATCH 07/16] new code for reading FASTA files git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@202 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 2 +- NEWS | 6 ++ R/DNA.R | 36 ++++++------ R/read.dna.R | 149 ++++++++++++++++++++++++------------------------ man/read.dna.Rd | 4 ++ src/read_dna.c | 141 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 244 insertions(+), 94 deletions(-) create mode 100644 src/read_dna.c diff --git a/DESCRIPTION b/DESCRIPTION index c2302fd..e84ea2e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 3.0-7 -Date: 2012-12-19 +Date: 2012-12-27 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/NEWS b/NEWS index f6acbff..57d1924 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,10 @@ NEW FEATURES o plot.phylo() has a new option, open.angle, used when plotting circular trees. + o The new function read.FASTA reads FASTA files much faster and + more efficiently. It is called internally by read.dna(, "fasta") + or can be called directly. + BUG FIXES @@ -28,6 +32,8 @@ OTHER CHANGES o .compressTipLabel() is 10 times faster thanks to Joseph Brown. + o base.freq() is now faster with lists. + CHANGES IN APE VERSION 3.0-6 diff --git a/R/DNA.R b/R/DNA.R index 31ef4bb..d1b3625 100644 --- a/R/DNA.R +++ b/R/DNA.R @@ -1,4 +1,4 @@ -## DNA.R (2012-11-28) +## DNA.R (2012-12-27) ## Manipulations and Comparisons of DNA Sequences @@ -58,21 +58,8 @@ as.alignment <- function(x) "[.DNAbin" <- function(x, i, j, drop = FALSE) { - oc <- oldClass(x) - class(x) <- NULL - if (is.matrix(x)) { - if (nargs() == 2 && !missing(i)) ans <- x[i] - else { - nd <- dim(x) - if (missing(i)) i <- 1:nd[1] - if (missing(j)) j <- 1:nd[2] - ans <- x[i, j, drop = drop] - } - } else { - if (missing(i)) i <- 1:length(x) - ans <- x[i] - } - class(ans) <- oc + ans <- NextMethod("[", drop = drop) + class(ans) <- "DNAbin" ans } @@ -293,10 +280,19 @@ as.character.DNAbin <- function(x, ...) base.freq <- function(x, freq = FALSE, all = FALSE) { - if (is.list(x)) x <- unlist(x) - n <- length(x) - BF <-.C("BaseProportion", x, as.integer(n), double(17), - DUP = FALSE, NAOK = TRUE, PACKAGE = "ape")[[3]] + f <- function(x) + .C("BaseProportion", x, as.integer(length(x)), double(17), + DUP = FALSE, NAOK = TRUE, PACKAGE = "ape")[[3]] + + if (is.list(x)) { + BF <- rowSums(sapply(x, f)) + n <- sum(sapply(x, length)) + } else { + n <- length(x) + BF <-.C("BaseProportion", x, as.integer(n), double(17), + DUP = FALSE, NAOK = TRUE, PACKAGE = "ape")[[3]] + } + names(BF) <- c("a", "c", "g", "t", "r", "m", "w", "s", "k", "y", "v", "h", "d", "b", "n", "-", "?") if (all) { diff --git a/R/read.dna.R b/R/read.dna.R index ca4da7a..9e46cba 100644 --- a/R/read.dna.R +++ b/R/read.dna.R @@ -1,4 +1,4 @@ -## read.dna.R (2012-05-03) +## read.dna.R (2012-12-27) ## Read DNA Sequences in a File @@ -7,6 +7,19 @@ ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. +read.FASTA <- function(file) +{ + sz <- file.info(file)$size + x <- readBin(file, "raw", sz) + if (Sys.info()[1] == "Windows") { + icr <- which(x == as.raw(0x0d)) # CR + x <- x[-icr] + } + res <- .Call("rawStreamToDNAbin", x, PACKAGE = "ape") + class(res) <- "DNAbin" + res +} + read.dna <- function(file, format = "interleaved", skip = 0, nlines = 0, comment.char = "#", as.character = FALSE, as.matrix = NULL) @@ -29,84 +42,74 @@ read.dna <- function(file, format = "interleaved", skip = 0, } formats <- c("interleaved", "sequential", "fasta", "clustal") format <- match.arg(format, formats) - phylip <- if (format %in% formats[1:2]) TRUE else FALSE - X <- scan(file = file, what = "", sep = "\n", quiet = TRUE, - skip = skip, nlines = nlines, comment.char = comment.char) + if (format == "fasta") { + obj <- read.FASTA(file) + } else { + X <- scan(file = file, what = "", sep = "\n", quiet = TRUE, + skip = skip, nlines = nlines, comment.char = comment.char) - if (phylip) { - ## need to remove the possible leading spaces and/or tabs in the first line - fl <- gsub("^[[:blank:]]+", "", X[1]) - fl <- as.numeric(unlist(strsplit(fl, "[[:blank:]]+"))) - if (length(fl) != 2 || any(is.na(fl))) - stop("the first line of the file must contain the dimensions of the data") - n <- fl[1] - s <- fl[2] - obj <- matrix("", n, s) - X <- X[-1] - } - switch(format, - "interleaved" = { - start.seq <- findFirstNucleotide(X[1]) - one2n <- 1:n - taxa <- getTaxaNames(substr(X[one2n], 1, start.seq - 1)) - X[one2n] <- substr(X[one2n], start.seq, nchar(X[one2n])) - nl <- length(X) - for (i in one2n) - obj[i, ] <- getNucleotide(X[seq(i, nl, n)]) - }, - "sequential" = { - taxa <- character(n) - j <- 1L # line number - for (i in 1:n) { - start.seq <- findFirstNucleotide(X[j]) - taxa[i] <- getTaxaNames(substr(X[j], 1, start.seq - 1)) - sequ <- getNucleotide(substr(X[j], start.seq, nchar(X[j]))) - j <- j + 1L - while (length(sequ) < s) { - sequ <- c(sequ, getNucleotide(X[j])) - j <- j + 1L - } - obj[i, ] <- sequ + if (format %in% formats[1:2]) { + ## need to remove the possible leading spaces and/or tabs in the first line + fl <- gsub("^[[:blank:]]+", "", X[1]) + fl <- as.numeric(unlist(strsplit(fl, "[[:blank:]]+"))) + if (length(fl) != 2 || any(is.na(fl))) + stop("the first line of the file must contain the dimensions of the data") + n <- fl[1] + s <- fl[2] + obj <- matrix("", n, s) + X <- X[-1] } - taxa <- getTaxaNames(taxa) - }, - "fasta" = { - start <- grep("^ {0,}>", X) - taxa <- X[start] - taxa <- sub("^ {0,}>", "", taxa) # remove the hook and the spaces before - taxa <- getTaxaNames(taxa) - n <- length(taxa) - obj <- vector("list", n) - start <- c(start, length(X) + 1) # this avoids the following to crash when `i = n' - for (i in 1:n) - obj[[i]] <- getNucleotide(X[(start[i] + 1):(start[i + 1] - 1)]) - }, - "clustal" = { - X <- X[-1] # drop the line with "Clustal bla bla..." - ## find where the 1st sequence starts - start.seq <- findFirstNucleotide(X[1]) - ## find the lines with *********.... - nspaces <- paste("^ {", start.seq - 1, "}", sep = "", collapse = "") - stars <- grep(nspaces, X) - ## we now know how many sequences in the file: - n <- stars[1] - 1 - taxa <- getTaxaNames(substr(X[1:n], 1, start.seq - 1)) - ## need to remove the sequence names before getting the sequences: - X <- substr(X, start.seq, nchar(X)) - nl <- length(X) - ## find the length of the 1st sequence: - tmp <- getNucleotide(X[seq(1, nl, n + 1)]) - s <- length(tmp) - obj <- matrix("", n, s) - obj[1, ] <- tmp - for (i in 2:n) - obj[i, ] <- getNucleotide(X[seq(i, nl, n + 1)]) - }) + switch(format, + "interleaved" = { + start.seq <- findFirstNucleotide(X[1]) + one2n <- 1:n + taxa <- getTaxaNames(substr(X[one2n], 1, start.seq - 1)) + X[one2n] <- substr(X[one2n], start.seq, nchar(X[one2n])) + nl <- length(X) + for (i in one2n) + obj[i, ] <- getNucleotide(X[seq(i, nl, n)]) + }, + "sequential" = { + taxa <- character(n) + j <- 1L # line number + for (i in 1:n) { + start.seq <- findFirstNucleotide(X[j]) + taxa[i] <- getTaxaNames(substr(X[j], 1, start.seq - 1)) + sequ <- getNucleotide(substr(X[j], start.seq, nchar(X[j]))) + j <- j + 1L + while (length(sequ) < s) { + sequ <- c(sequ, getNucleotide(X[j])) + j <- j + 1L + } + obj[i, ] <- sequ + } + taxa <- getTaxaNames(taxa) + }, + "clustal" = { + X <- X[-1] # drop the line with "Clustal bla bla..." + ## find where the 1st sequence starts + start.seq <- findFirstNucleotide(X[1]) + ## find the lines with *********.... + nspaces <- paste("^ {", start.seq - 1, "}", sep = "", collapse = "") + stars <- grep(nspaces, X) + ## we now know how many sequences in the file: + n <- stars[1] - 1 + taxa <- getTaxaNames(substr(X[1:n], 1, start.seq - 1)) + ## need to remove the sequence names before getting the sequences: + X <- substr(X, start.seq, nchar(X)) + nl <- length(X) + ## find the length of the 1st sequence: + tmp <- getNucleotide(X[seq(1, nl, n + 1)]) + s <- length(tmp) + obj <- matrix("", n, s) + obj[1, ] <- tmp + for (i in 2:n) + obj[i, ] <- getNucleotide(X[seq(i, nl, n + 1)]) + }) if (format != "fasta") { rownames(obj) <- taxa } else { - names(obj) <- taxa LENGTHS <- unique(unlist(lapply(obj, length))) allSameLength <- length(LENGTHS) == 1 if (is.logical(as.matrix)) { diff --git a/man/read.dna.Rd b/man/read.dna.Rd index 2863ad8..19bab40 100644 --- a/man/read.dna.Rd +++ b/man/read.dna.Rd @@ -1,10 +1,12 @@ \name{read.dna} \alias{read.dna} +\alias{read.FASTA} \title{Read DNA Sequences in a File} \usage{ read.dna(file, format = "interleaved", skip = 0, nlines = 0, comment.char = "#", as.character = FALSE, as.matrix = NULL) +read.FASTA(file) } \arguments{ \item{file}{a file name specified by either a variable of mode character, @@ -79,6 +81,8 @@ read.dna(file, format = "interleaved", skip = 0, a matrix or a list (if \code{format = "fasta"}) of DNA sequences stored in binary format, or of mode character (if \code{as.character = "TRUE"}). + + \code{read.FASTA} always returns a list. } \references{ Anonymous. FASTA format description. diff --git a/src/read_dna.c b/src/read_dna.c new file mode 100644 index 0000000..6d44c66 --- /dev/null +++ b/src/read_dna.c @@ -0,0 +1,141 @@ +/* read_dna.c 2012-12-27 */ + +/* Copyright 2008-2012 Emmanuel Paradis */ + +/* This file is part of the R-package `ape'. */ +/* See the file ../COPYING for licensing issues. */ + +#include +#include + +// The initial code defining and initialising the translation table: +// +// for (i = 0; i < 122; i++) tab_trans[i] = 0x00; +// +// tab_trans[65] = 0x88; /* A */ +// tab_trans[71] = 0x48; /* G */ +// tab_trans[67] = 0x28; /* C */ +// tab_trans[84] = 0x18; /* T */ +// tab_trans[82] = 0xc0; /* R */ +// tab_trans[77] = 0xa0; /* M */ +// tab_trans[87] = 0x90; /* W */ +// tab_trans[83] = 0x60; /* S */ +// tab_trans[75] = 0x50; /* K */ +// tab_trans[89] = 0x30; /* Y */ +// tab_trans[86] = 0xe0; /* V */ +// tab_trans[72] = 0xb0; /* H */ +// tab_trans[68] = 0xd0; /* D */ +// tab_trans[66] = 0x70; /* B */ +// tab_trans[78] = 0xf0; /* N */ +// +// tab_trans[97] = 0x88; /* a */ +// tab_trans[103] = 0x48; /* g */ +// tab_trans[99] = 0x28; /* c */ +// tab_trans[116] = 0x18; /* t */ +// tab_trans[114] = 0xc0; /* r */ +// tab_trans[109] = 0xa0; /* m */ +// tab_trans[119] = 0x90; /* w */ +// tab_trans[115] = 0x60; /* s */ +// tab_trans[107] = 0x50; /* k */ +// tab_trans[121] = 0x30; /* y */ +// tab_trans[118] = 0xe0; /* v */ +// tab_trans[104] = 0xb0; /* h */ +// tab_trans[100] = 0xd0; /* d */ +// tab_trans[98] = 0x70; /* b */ +// tab_trans[110] = 0xf0; /* n */ +// +// tab_trans[45] = 0x04; /* - */ +// tab_trans[63] = 0x02; /* ? */ + +static const unsigned char tab_trans[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0-9 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10-19 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 20-29 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 30-39 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, /* 40-49 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 50-59 */ + 0x00, 0x00, 0x00, 0x02, 0x00, 0x88, 0x70, 0x28, 0xd0, 0x00, /* 60-69 */ + 0x48, 0x00, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, 0xf0, 0x00, /* 70-79 */ + 0x00, 0x00, 0xc0, 0x60, 0x18, 0x00, 0xe0, 0x90, 0x00, 0x30, /* 80-89 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x70, 0x28, /* 90-99 */ + 0xd0, 0x00, 0x00, 0x48, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, /* 100-109 */ + 0xf0, 0x00, 0x00, 0x00, 0xc0, 0x60, 0x18, 0x00, 0xe0, 0x90, /* 110-119 */ + 0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 120-129 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 130-139 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 140-149 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 150-159 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 160-169 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 170-179 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 180-189 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 190-199 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 200-209 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 210-219 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 220-229 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 230-239 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 240-249 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; /* 250-255 */ + +static const unsigned char hook = 0x3e; +static const unsigned char lineFeed = 0x0a; +/* static const unsigned char space = 0x20; */ + +SEXP rawStreamToDNAbin(SEXP x) +{ + int N, i, j, k, n; + unsigned char *xr, *rseq, *buffer, tmp; + SEXP obj, nms, seq; + + PROTECT(x = coerceVector(x, RAWSXP)); + N = LENGTH(x); + xr = RAW(x); + +/* do a 1st pass to find the number of sequences */ + + n = 0; + j = 0; /* use j as a flag */ + for (i = 0; i < N; i++) { + if (j && xr[i] == lineFeed) { + n++; + j = 0; + } else if (xr[i] == hook) j = 1; + } + + PROTECT(obj = allocVector(VECSXP, n)); + PROTECT(nms = allocVector(STRSXP, n)); + +/* Refine the way the size of the buffer is set? */ + buffer = (unsigned char *)R_alloc(N, sizeof(unsigned char *)); + + i = j = 0; + while (i < N) { + /* 1st read the label... */ + if (xr[i] == hook) { + i++; + k = 0; + while (xr[i] != lineFeed) buffer[k++] = xr[i++]; + buffer[k] = '\0'; + SET_STRING_ELT(nms, j, mkChar(buffer)); + } + /* ... then read the sequence */ + n = 0; + while (xr[i] != hook && i < N) { + tmp = tab_trans[xr[i++]]; +/* If we are sure that the FASTA file is correct (ie, the sequence on + a single line and only the IUAPC code (plus '-' and '?') is used, + then the following check would not be needed; additionally the size + of tab_trans could be restriced to 0-121. This check has the + advantage that all invalid characters are simply ignored without + causing error. */ + if (tmp) buffer[n++] = tmp; + } + PROTECT(seq = allocVector(RAWSXP, n)); + rseq = RAW(seq); + for (k = 0; k < n; k++) rseq[k] = buffer[k]; + SET_VECTOR_ELT(obj, j, seq); + UNPROTECT(1); + j++; + } + setAttrib(obj, R_NamesSymbol, nms); + UNPROTECT(3); + return obj; +} -- 2.39.2 From a0436318d70829a2d16134be7ca1d6d454613a20 Mon Sep 17 00:00:00 2001 From: paradis Date: Fri, 4 Jan 2013 12:48:57 +0000 Subject: [PATCH 08/16] new chronos files and a bunch of various improvements git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@203 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 2 +- NEWS | 6 + R/DNA.R | 33 ++-- R/chronos.R | 476 ++++++++++++++++++++++++++++++++++++++++++++++ R/read.dna.R | 15 +- man/chronopl.Rd | 6 +- man/chronos.Rd | 136 +++++++++++++ man/plot.phylo.Rd | 6 +- man/read.dna.Rd | 28 +-- src/read_dna.c | 43 +++-- 10 files changed, 697 insertions(+), 54 deletions(-) create mode 100644 R/chronos.R create mode 100644 man/chronos.Rd diff --git a/DESCRIPTION b/DESCRIPTION index e84ea2e..8d17b0b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 3.0-7 -Date: 2012-12-27 +Date: 2013-01-03 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/NEWS b/NEWS index 57d1924..1f2cd22 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,8 @@ NEW FEATURES + The new function chronos .......... + o The new function 'where' searches patterns in DNA sequences. o pic() gains an option 'rescaled.tree = FALSE' to return the tree @@ -34,6 +36,10 @@ OTHER CHANGES o base.freq() is now faster with lists. + as.matrix.DNAbin() ............ should be faster.... now accepts vectors, + + read.dna() has a new for FASTA files............. faster C code. + CHANGES IN APE VERSION 3.0-6 diff --git a/R/DNA.R b/R/DNA.R index d1b3625..48e6054 100644 --- a/R/DNA.R +++ b/R/DNA.R @@ -1,8 +1,8 @@ -## DNA.R (2012-12-27) +## DNA.R (2013-01-04) ## Manipulations and Comparisons of DNA Sequences -## Copyright 2002-2012 Emmanuel Paradis +## Copyright 2002-2013 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -65,17 +65,21 @@ as.alignment <- function(x) as.matrix.DNAbin <- function(x, ...) { - if (is.list(x)) { - if (length(unique(unlist(lapply(x, length)))) != 1) - stop("DNA sequences in list not of the same length.") - nms <- names(x) - n <- length(x) - s <- length(x[[1]]) - x <- matrix(unlist(x), n, s, byrow = TRUE) - rownames(x) <- nms - class(x) <- "DNAbin" + if (is.matrix(x)) return(x) + if (is.vector(x)) { + dim(x) <- c(1, length(x)) + return(x) } - x + s <- unique(unlist(lapply(x, length))) + if (length(s) != 1) + stop("DNA sequences in list not of the same length.") + nms <- names(x) + n <- length(x) + y <- matrix(as.raw(0), n, s) + for (i in seq_len(n)) y[i, ] <- x[[i]] + rownames(y) <- nms + class(y) <- "DNAbin" + y } as.list.DNAbin <- function(x, ...) @@ -392,7 +396,10 @@ dist.dna <- function(x, model = "K80", variance = FALSE, gamma = FALSE, Ndist <- if (imod == 11) n*n else n*(n - 1)/2 var <- if (variance) double(Ndist) else 0 if (!gamma) gamma <- alpha <- 0 - else alpha <- gamma <- 1 + else { + alpha <- gamma + gamma <- 1 + } d <- .C("dist_dna", x, as.integer(n), as.integer(s), imod, double(Ndist), BF, as.integer(pairwise.deletion), as.integer(variance), var, as.integer(gamma), diff --git a/R/chronos.R b/R/chronos.R new file mode 100644 index 0000000..e651822 --- /dev/null +++ b/R/chronos.R @@ -0,0 +1,476 @@ +## chronos.R (2013-01-03) + +## Molecular Dating With Penalized and Maximum Likelihood + +## Copyright 2013 Emmanuel Paradis + +## This file is part of the R-package `ape'. +## See the file ../COPYING for licensing issues. + +.chronos.ctrl <- + list(tol = 1e-8, iter.max = 1e4, eval.max = 1e4, nb.rate.cat = 10, + dual.iter.max = 20) + +makeChronosCalib <- + function(phy, node = "root", age.min = 1, age.max = age.min, + interactive = FALSE, soft.bounds = FALSE) +{ + n <- Ntip(phy) + if (interactive) { + plot(phy) + cat("Click close to a node and enter the ages (right-click to exit)\n\n") + node <- integer() + age.min <- age.max <- numeric() + repeat { + ans <- identify(phy, quiet = TRUE) + if (is.null(ans)) break + NODE <- ans$nodes + nodelabels(node = NODE, col = "white", bg = "blue") + cat("constraints for node ", NODE, sep = "") + cat("\n youngest age: ") + AGE.MIN <- as.numeric(readLines(n = 1)) + cat(" oldest age (ENTER if not applicable): ") + AGE.MAX <- as.numeric(readLines(n = 1)) + node <- c(node, NODE) + age.min <- c(age.min, AGE.MIN) + age.max <- c(age.max, AGE.MAX) + } + s <- is.na(age.max) + if (any(s)) age.max[s] <- age.min[s] + } else { + if (identical(node, "root")) node <- n + 1L + } + + if (any(node <= n)) + stop("node numbers should be greater than the number of tips") + + diff.age <- which(age.max < age.min) + if (length(diff.age)) { + msg <- "'old age' less than 'young age' for node" + if (length(diff.age) > 1) msg <- paste(msg, "s", sep = "") + stop(paste(msg, paste(node[diff.age], collapse = ", "))) + } + + data.frame(node, age.min, age.max, soft.bounds = soft.bounds) +} + +chronos.control <- function(...) +{ + dots <- list(...) + x <- .chronos.ctrl + if (length(dots)) { + chk.nms <- names(dots) %in% names(x) + if (any(!chk.nms)) { + warning("some control parameter names do not match: they were ignored") + dots <- dots[chk.nms] + } + x[names(dots)] <- dots + } + x +} + +chronos <- + function(phy, lambda = 1, model = "correlated", quiet = FALSE, + calibration = makeChronosCalib(phy), + control = chronos.control()) +{ + model <- match.arg(tolower(model), c("correlated", "relaxed", "discrete")) + n <- Ntip(phy) + ROOT <- n + 1L + m <- phy$Nnode + el <- phy$edge.length + if (any(el < 0)) stop("some branch lengths are negative") + e1 <- phy$edge[, 1L] + e2 <- phy$edge[, 2L] + N <- length(e1) + TIPS <- 1:n + EDGES <- 1:N + + tol <- control$tol + + node <- calibration$node + age.min <- calibration$age.min + age.max <- calibration$age.max + + if (model == "correlated") { +### `basal' contains the indices of the basal edges +### (ie, linked to the root): + basal <- which(e1 == ROOT) + Nbasal <- length(basal) + +### 'ind1' contains the index of all nonbasal edges, and 'ind2' the +### index of the edges where these edges come from (ie, they contain +### pairs of contiguous edges), eg: + +### ___b___ ind1 ind2 +### | | || | +### ___a___| | b || a | +### | | c || a | +### |___c___ | || | + + ind1 <- EDGES[-basal] + ind2 <- match(e1[EDGES[-basal]], e2) + } + + age <- numeric(n + m) + +### This bit sets 'ini.time' and should result in no negative branch lengths + + if (!quiet) cat("\nSetting initial dates...\n") + seq.nod <- .Call("seq_root2tip", phy$edge, n, phy$Nnode, PACKAGE = "ape") + + ii <- 1L + repeat { + ini.time <- age + ini.time[ROOT:(n + m)] <- NA + + ini.time[node] <- + if (is.null(age.max)) age.min + else runif(length(node), age.min, age.max) # (age.min + age.max) / 2 + + ## if no age given for the root, find one approximately: + if (is.na(ini.time[ROOT])) + ini.time[ROOT] <- if (is.null(age.max)) 3 * max(age.min) else 3 * max(age.max) + + ISnotNA.ALL <- unlist(lapply(seq.nod, function(x) sum(!is.na(ini.time[x])))) + o <- order(ISnotNA.ALL, decreasing = TRUE) + + for (y in seq.nod[o]) { + ISNA <- is.na(ini.time[y]) + if (any(ISNA)) { + i <- 2L # we know the 1st value is not NA, so we start at the 2nd one + while (i <= length(y)) { + if (ISNA[i]) { # we stop at the next NA + j <- i + 1L + while (ISNA[j]) j <- j + 1L # look for the next non-NA + nb.val <- j - i + by <- (ini.time[y[i - 1L]] - ini.time[y[j]]) / (nb.val + 1) + ini.time[y[i:(j - 1L)]] <- ini.time[y[i - 1L]] - by * seq_len(nb.val) + i <- j + 1L + } else i <- i + 1L + } + } + } + if (all(ini.time[e1] - ini.time[e2] >= 0)) break + ii <- ii + 1L + if (ii > 1000) + stop("cannot find reasonable starting dates after 1000 tries: +maybe you need to adjust the calibration dates") + } +### 'ini.time' set + + #ini.time[ROOT:(n+m)] <- branching.times(chr.dis) + ## ini.time[ROOT:(n+m)] <- ini.time[ROOT:(n+m)] + rnorm(m, 0, 5) + #print(ini.time) + + +### Setting 'ini.rate' + ini.rate <- el/(ini.time[e1] - ini.time[e2]) + + if (model == "discrete") { + Nb.rates <- control$nb.rate.cat + minmax <- range(ini.rate) + if (Nb.rates == 1) { + ini.rate <- sum(minmax)/2 + } else { + inc <- diff(minmax)/Nb.rates + ini.rate <- seq(minmax[1] + inc/2, minmax[2] - inc/2, inc) + ini.freq <- rep(1/Nb.rates, Nb.rates - 1) + lower.freq <- rep(0, Nb.rates - 1) + upper.freq <- rep(1, Nb.rates - 1) + } + } else Nb.rates <- N +## 'ini.rate' set + +### Setting bounds for the node ages + + ## `unknown.ages' will contain the index of the nodes of unknown age: + unknown.ages <- 1:m + n + + ## initialize vectors for all nodes: + lower.age <- rep(tol, m) + upper.age <- rep(1/tol, m) + + lower.age[node - n] <- age.min + upper.age[node - n] <- age.max + + ## find nodes known within an interval: + ii <- which(age.min != age.max) + ## drop them from 'node' since they will be estimated: + if (length(ii)) { + node <- node[-ii] + if (length(node)) + age[node] <- age.min[-ii] # update 'age' + } else age[node] <- age.min + + ## finally adjust the 3 vectors: + if (length(node)) { + unknown.ages <- unknown.ages[n - node] # 'n - node' is simplification for '-(node - n)' + lower.age <- lower.age[n - node] + upper.age <- upper.age[n - node] + } +### Bounds for the node ages set + + ## 'known.ages' contains the index of all nodes + ## (internal and terminal) of known age: + known.ages <- c(TIPS, node) + + ## the bounds for the rates: + lower.rate <- rep(tol, Nb.rates) + upper.rate <- rep(100 - tol, Nb.rates) # needs to be adjusted to higher values? + +### Gradient + degree_node <- tabulate(phy$edge) + eta_i <- degree_node[e1] + eta_i[e2 <= n] <- 1L + ## eta_i[i] is the number of contiguous branches for branch 'i' + + ## use of a list of indices is slightly faster than an incidence matrix + ## and takes much less memory (60 Kb vs. 8 Mb for n = 500) + X <- vector("list", N) + for (i in EDGES) { + j <- integer() + if (e1[i] != ROOT) j <- c(j, which(e2 == e1[i])) + if (e2[i] >= n) j <- c(j, which(e1 == e2[i])) + X[[i]] <- j + } + ## X is a list whose i-th element gives the indices of the branches + ## that are contiguous to branch 'i' + + ## D_ki and A_ki are defined in the SI of the paper + D_ki <- match(unknown.ages, e2) + A_ki <- lapply(unknown.ages, function(x) which(x == e1)) + + gradient.poisson <- function(rate, node.time) { + age[unknown.ages] <- node.time + real.edge.length <- age[e1] - age[e2] + #if (any(real.edge.length < 0)) + # return(numeric(N + length(unknown.ages))) + ## gradient for the rates: + gr <- el/rate - real.edge.length + + ## gradient for the dates: + tmp <- el/real.edge.length - rate + gr.dates <- sapply(A_ki, function(x) sum(tmp[x])) - tmp[D_ki] + + c(gr, gr.dates) + } + + ## gradient of the penalized lik (must be multiplied by -1 before calling nlminb) + gradient <- + switch(model, + "correlated" = + function(rate, node.time) { + gr <- gradient.poisson(rate, node.time) + #if (all(gr == 0)) return(gr) + + ## contribution of the penalty for the rates: + gr[RATE] <- gr[RATE] - lambda * 2 * (eta_i * rate - sapply(X, function(x) sum(rate[x]))) + ## the contribution of the root variance term: + if (Nbasal == 2) { # the simpler formulae if there's a basal dichotomy + i <- basal[1] + j <- basal[2] + gr[i] <- gr[i] - lambda * (rate[i] - rate[j]) + gr[j] <- gr[j] - lambda * (rate[j] - rate[i]) + } else { # the general case + for (i in 1:Nbasal) + j <- basal[i] + gr[j] <- gr[j] - + lambda*2*(rate[j]*(1 - 1/Nbasal) - sum(rate[basal[-i]])/Nbasal)/(Nbasal - 1) + } + gr + }, + "relaxed" = + function(rate, node.time) { + gr <- gradient.poisson(rate, node.time) + #if (all(gr == 0)) return(gr) + + ## contribution of the penalty for the rates: + mean.rate <- mean(rate) + ## rank(rate)/Nb.rates is the same than ecdf(rate)(rate) but faster + gr[RATE] <- gr[RATE] + lambda*2*dgamma(rate, mean.rate)*(rank(rate)/Nb.rates - pgamma(rate, mean.rate)) + gr + }, + "discrete" = NULL) + + log.lik.poisson <- function(rate, node.time) { + age[unknown.ages] <- node.time + real.edge.length <- age[e1] - age[e2] + if (isTRUE(any(real.edge.length < 0))) return(-1e100) + B <- rate * real.edge.length + sum(el * log(B) - B - lfactorial(el)) + } + +### penalized log-likelihood + penal.loglik <- + switch(model, + "correlated" = + function(rate, node.time) { + loglik <- log.lik.poisson(rate, node.time) + if (!is.finite(loglik)) return(-1e100) + loglik - lambda * (sum((rate[ind1] - rate[ind2])^2) + + var(rate[basal])) + }, + "relaxed" = + function(rate, node.time) { + loglik <- log.lik.poisson(rate, node.time) + if (!is.finite(loglik)) return(-1e100) + mu <- mean(rate) + ## loglik - lambda * sum((1:N/N - pbeta(sort(rate), mu/(1 + mu), 1))^2) # avec loi beta + ## loglik - lambda * sum((1:N/N - pcauchy(sort(rate)))^2) # avec loi Cauchy + loglik - lambda * sum((1:N/N - pgamma(sort(rate), mean(rate)))^2) # avec loi Gamma + }, + "discrete" = + if (Nb.rates == 1) + function(rate, node.time) log.lik.poisson(rate, node.time) + else function(rate, node.time, freq) { + if (isTRUE(sum(freq) > 1)) return(-1e100) + rate.freq <- sum(c(freq, 1 - sum(freq)) * rate) + log.lik.poisson(rate.freq, node.time) + }) + + opt.ctrl <- list(eval.max = control$eval.max, iter.max = control$iter.max) + + ## the following capitalized vectors give the indices of + ## the parameters once they are concatenated in 'p' + RATE <- 1:Nb.rates + AGE <- Nb.rates + 1:length(unknown.ages) + + if (model == "discrete") { + if (Nb.rates == 1) { + start.para <- c(ini.rate, ini.time[unknown.ages]) + f <- function(p) -penal.loglik(p[RATE], p[AGE]) + g <- NULL + LOW <- c(lower.rate, lower.age) + UP <- c(upper.rate, upper.age) + } else { + FREQ <- length(RATE) + length(AGE) + 1:(Nb.rates - 1) + start.para <- c(ini.rate, ini.time[unknown.ages], ini.freq) + f <- function(p) -penal.loglik(p[RATE], p[AGE], p[FREQ]) + g <- NULL + LOW <- c(lower.rate, lower.age, lower.freq) + UP <- c(upper.rate, upper.age, upper.freq) + } + } else { + start.para <- c(ini.rate, ini.time[unknown.ages]) + f <- function(p) -penal.loglik(p[RATE], p[AGE]) + g <- function(p) -gradient(p[RATE], p[AGE]) + LOW <- c(lower.rate, lower.age) + UP <- c(upper.rate, upper.age) + } + + k <- length(LOW) # number of free parameters + + if (!quiet) cat("Fitting in progress... get a first set of estimates\n") + + out <- nlminb(start.para, f, g, + control = opt.ctrl, lower = LOW, upper = UP) + + if (model == "discrete") { + if (Nb.rates == 1) { + f.rates <- function(p) -penal.loglik(p, current.ages) + f.ages <- function(p) -penal.loglik(current.rates, p) + } else { + f.rates <- function(p) -penal.loglik(p, current.ages, current.freqs) + f.ages <- function(p) -penal.loglik(current.rates, p, current.freqs) + f.freqs <- function(p) -penal.loglik(current.rates, current.ages, p) + g.freqs <- NULL + } + g.rates <- NULL + g.ages <- NULL + } else { + f.rates <- function(p) -penal.loglik(p, current.ages) + g.rates <- function(p) -gradient(p, current.ages)[RATE] + f.ages <- function(p) -penal.loglik(current.rates, p) + g.ages <- function(p) -gradient(current.rates, p)[AGE] + } + + current.ploglik <- -out$objective + current.rates <- out$par[RATE] + current.ages <- out$par[AGE] + if (model == "discrete" && Nb.rates > 1) current.freqs <- out$par[FREQ] + + dual.iter.max <- control$dual.iter.max + i <- 0L + + if (!quiet) cat(" Penalised log-lik =", current.ploglik, "\n") + + repeat { + if (dual.iter.max < 1) break + if (!quiet) cat("Optimising rates...") + out.rates <- nlminb(current.rates, f.rates, g.rates,# h.rates, + control = list(eval.max = 1000, iter.max = 1000, + step.min = 1e-8, step.max = .1), + lower = lower.rate, upper = upper.rate) + new.rates <- out.rates$par + if (-out.rates$objective > current.ploglik) + current.rates <- new.rates + + if (model == "discrete" && Nb.rates > 1) { + if (!quiet) cat(" frequencies...") + out.freqs <- nlminb(current.freqs, f.freqs, + control = list(eval.max = 1000, iter.max = 1000, + step.min = .001, step.max = .5), + lower = lower.freq, upper = upper.freq) + new.freqs <- out.freqs$par + } + + if (!quiet) cat(" dates...") + out.ages <- nlminb(current.ages, f.ages, g.ages,# h.ages, + control = list(eval.max = 1000, iter.max = 1000, + step.min = .001, step.max = 100), + lower = lower.age, upper = upper.age) + new.ploglik <- -out.ages$objective + + if (!quiet) cat("", current.ploglik, "\n") + + if (new.ploglik - current.ploglik > 1e-6 && i <= dual.iter.max) { + current.ploglik <- new.ploglik + current.rates <- new.rates + current.ages <- out.ages$par + if (model == "discrete" && Nb.rates > 1) current.freqs <- new.freqs + out <- out.ages + i <- i + 1L + } else break + } + + if (!quiet) cat("\nDone.\n") + +# browser() + + if (model == "discrete") { + rate.freq <- + if (Nb.rates == 1) current.rates + else mean(c(current.freqs, 1 - sum(current.freqs)) * current.rates) + logLik <- log.lik.poisson(rate.freq, current.ages) + PHIIC <- list(logLik = logLik, k = k, PHIIC = - 2 * logLik + 2 * k) + } else { + logLik <- log.lik.poisson(current.rates, current.ages) + PHI <- switch(model, + "correlated" = (current.rates[ind1] - current.rates[ind2])^2 + var(current.rates[basal]), + "relaxed" = (1:N/N - pgamma(sort(current.rates), mean(current.rates)))^2) # avec loi Gamma + PHIIC <- list(logLik = logLik, k = k, lambda = lambda, + PHIIC = - 2 * logLik + 2 * k + lambda * svd(PHI)$d) + } + + attr(phy, "call") <- match.call() + attr(phy, "ploglik") <- -out$objective + attr(phy, "rates") <- current.rates #out$par[EDGES] + if (model == "discrete" && Nb.rates > 1) + attr(phy, "frequencies") <- current.freqs + attr(phy, "message") <- out$message + attr(phy, "PHIIC") <- PHIIC + age[unknown.ages] <- current.ages #out$par[-EDGES] + phy$edge.length <- age[e1] - age[e2] + class(phy) <- c("chronos", class(phy)) + phy +} + +print.chronos <- function(x, ...) +{ + cat("\n Chronogram\n\n") + cat("Call: ") + print(attr(x, "call")) + cat("\n") + NextMethod("print") +} diff --git a/R/read.dna.R b/R/read.dna.R index 9e46cba..4b2ddd8 100644 --- a/R/read.dna.R +++ b/R/read.dna.R @@ -1,4 +1,4 @@ -## read.dna.R (2012-12-27) +## read.dna.R (2013-01-04) ## Read DNA Sequences in a File @@ -47,7 +47,6 @@ read.dna <- function(file, format = "interleaved", skip = 0, } else { X <- scan(file = file, what = "", sep = "\n", quiet = TRUE, skip = skip, nlines = nlines, comment.char = comment.char) - if (format %in% formats[1:2]) { ## need to remove the possible leading spaces and/or tabs in the first line fl <- gsub("^[[:blank:]]+", "", X[1]) @@ -106,9 +105,10 @@ read.dna <- function(file, format = "interleaved", skip = 0, for (i in 2:n) obj[i, ] <- getNucleotide(X[seq(i, nl, n + 1)]) }) - + } if (format != "fasta") { rownames(obj) <- taxa + if (!as.character) obj <- as.DNAbin(obj) } else { LENGTHS <- unique(unlist(lapply(obj, length))) allSameLength <- length(LENGTHS) == 1 @@ -119,10 +119,15 @@ read.dna <- function(file, format = "interleaved", skip = 0, as.matrix <- allSameLength } if (as.matrix) { - obj <- matrix(unlist(obj), ncol = LENGTHS, byrow = TRUE) + taxa <- names(obj) + n <- length(obj) + y <- matrix(as.raw(0), n, LENGTHS) + for (i in seq_len(n)) y[i, ] <- obj[[i]] + obj <- y rownames(obj) <- taxa + class(obj) <- "DNAbin" } + if (as.character) obj <- as.character(obj) } - if (!as.character) obj <- as.DNAbin(obj) obj } diff --git a/man/chronopl.Rd b/man/chronopl.Rd index 13f183f..339a284 100644 --- a/man/chronopl.Rd +++ b/man/chronopl.Rd @@ -101,6 +101,10 @@ chronopl(phy, lambda, age.min = 1, age.max = NULL, \item{D2}{the influence of each observation on overall date estimates (if \code{CV = TRUE}).} } +\note{ + The new function \code{\link{chronos}} replaces the present one which + is no more maintained. +} \references{ Sanderson, M. J. (2002) Estimating absolute rates of molecular evolution and divergence times: a penalized likelihood @@ -109,6 +113,6 @@ chronopl(phy, lambda, age.min = 1, age.max = NULL, } \author{Emmanuel Paradis} \seealso{ - \code{\link{chronoMPL}} + \code{\link{chronos}}, \code{\link{chronoMPL}} } \keyword{models} diff --git a/man/chronos.Rd b/man/chronos.Rd new file mode 100644 index 0000000..22c15f1 --- /dev/null +++ b/man/chronos.Rd @@ -0,0 +1,136 @@ +\name{chronos} +\alias{chronos} +\alias{makeChronosCalib} +\alias{chronos.control} +\alias{print.chronos} +\title{Molecular Dating by Penalised Likelihood and Maximum Likelihood} +\description{ + \code{chronos} is the main function fitting a chronogram to a + phylogenetic tree whose branch lengths are in number of substitution + per sites. + + \code{makeChronosCalib} is a tool to prepare data frames with the + calibration points of the phylogenetic tree. + + \code{chronos.control} creates a list of parameters to be passed + to \code{chronos}. +} +\usage{ +chronos(phy, lambda = 1, model = "correlated", quiet = FALSE, + calibration = makeChronosCalib(phy), + control = chronos.control()) +\method{print}{chronos}(x, ...) +makeChronosCalib(phy, node = "root", age.min = 1, + age.max = age.min, interactive = FALSE, soft.bounds = FALSE) +chronos.control(...) +} +\arguments{ + \item{phy}{an object of class \code{"phylo"}.} + \item{lambda}{value of the smoothing parameter.} + \item{model}{a character string specifying the model of substitution + rate variation among branches. The possible choices are: + ``correlated'', ``relaxed'', ``discrete'', or an unambiguous + abbreviation of these.} + \item{quiet}{a logical value; by default the calculation progress are + displayed.} + \item{calibration}{a data frame (see details).} + \item{control}{a list of parameters controlling the optimisation + procedure (see details).} + \item{x}{an object of class \code{c("chronos", "phylo")}.} + \item{node}{a vector of integers giving the node numbers for which a + calibration point is given. The default is a short-cut for the + root.} + \item{age.min, age.max}{vectors of numerical values giving the minimum + and maximum ages of the nodes specified in \code{node}.} + \item{interactive}{a logical value. If \code{TRUE}, then \code{phy} is + plotted and the user is asked to click close to a node and enter the + ages on the keyboard.} + \item{soft.bounds}{(currently unused)} + \item{\dots}{in the case of \code{chronos.control}: one of the five + parameters controlling optimisation (unused in the case of + \code{print.chronos}).} +} +\details{ + \code{chronos} replaces \code{chronopl} but with a different interface + and some extensions (see References). + + The known dates (argument \code{calibration}) must be given in a data + frame with the following column names: node, age.min, age.max, and + soft.bounds (the last one is yet unused). For each row, these are, + respectively: the number of the node in the ``phylo'' coding standard, + the minimum age for this node, the maximum age, and a logical value + specifying whether the bounds are soft. If age.min = age.max, this + means that the age is exactly known. This data frame can be built with + \code{makeChronosCalib} which returns by default a data frame with a + single row giving age = 1 for the root. The data frame can be built + interactively by clicking on the plotted tree. + + The argument \code{control} allows one to change some parameters of + the optimisation procedure. This must be a list with names. The + available options with their default values are: + + \itemize{ + \item{tol = 1e-8: }{tolerance for the estimation of the substitution + rates.} + \item{iter.max = 1e4: }{the maximum number of iterations at each + optimization step.} + \item{eval.max = 1e4: }{the maximum number of function evaluations at + each optimization step.} + \item{nb.rate.cat = 10: }{the number of rate categories if \code{model + = "discrete"} (set this parameter to 1 to fit a strict clock + model).} + \item{dual.iter.max = 20: }{the maximum number of alternative + iterations between rates and dates.} + } + + The command \code{chronos.control()} returns a list with the default + values of these parameters. They may be modified by passing them to + this function, or directly in the list. +} +\value{ + \code{chronos} returns an object of class \code{c("chronos", + "phylo")}. There is a print method for it. There are additional + attributes which can be visualised with \code{str} or extracted with + \code{attr}. + + \code{makeChronosCalib} returns a data frame. + + \code{chronos.control} returns a list. +} +\references{ + Kim, J. and Sanderson, M. J. (2008) Penalized likelihood phylogenetic + inference: bridging the parsimony-likelihood gap. \emph{Systematic + Biology}, \bold{57}, 665--674. + + Paradis, E. (2012) Molecular dating of phylogenies by likelihood + methods: a comparison of models and a new information + criterion. \emph{Manuscript}. + + Sanderson, M. J. (2002) Estimating absolute rates of molecular + evolution and divergence times: a penalized likelihood + approach. \emph{Molecular Biology and Evolution}, \bold{19}, + 101--109. +} +\author{Emmanuel Paradis} +\seealso{ + \code{\link{chronoMPL}} +} +\examples{ +tr <- rtree(10) +### the default is the correlated rate model: +chr <- chronos(tr) +### strict clock model: +ctrl <- chronos.control(nb.rate.cat = 1) +chr.clock <- chronos(tr, model = "discrete", control = ctrl) +### How different are the rates? +attr(chr, "rates") +attr(chr.clock, "rates") +\dontrun{ +cal <- makeChronosCalib(tr, interactive = TRUE) +cal +### if you made mistakes, you can edit the data frame with: +### fix(cal) +chr <- chronos(tr, calibration = cal) +} +} +\keyword{models} diff --git a/man/plot.phylo.Rd b/man/plot.phylo.Rd index 6d08946..3712669 100644 --- a/man/plot.phylo.Rd +++ b/man/plot.phylo.Rd @@ -97,12 +97,12 @@ \item{rotate.tree}{for "fan", "unrooted", or "radial" trees: the rotation of the whole tree in degrees (negative values are accepted).} - \item{layout}{the number of trees to be plotted simultaneously.} - \item{\dots}{further arguments to be passed to \code{plot} or to - \code{plot.phylo}.} \item{open.angle}{if \code{type = "f"}, the angle in degrees left blank. Use a non-zero value if you want to call \code{\link{axisPhylo}} after the tree is plotted.} + \item{layout}{the number of trees to be plotted simultaneously.} + \item{\dots}{further arguments to be passed to \code{plot} or to + \code{plot.phylo}.} } \description{ These functions plot phylogenetic trees on the current graphical diff --git a/man/read.dna.Rd b/man/read.dna.Rd index 19bab40..4cf664e 100644 --- a/man/read.dna.Rd +++ b/man/read.dna.Rd @@ -2,6 +2,13 @@ \alias{read.dna} \alias{read.FASTA} \title{Read DNA Sequences in a File} +\description{ + These functions read DNA sequences in a file, and returns a matrix or a + list of DNA sequences with the names of the taxa read in the file as + rownames or names, respectively. By default, the sequences are stored + in binary format, otherwise (if \code{as.character = "TRUE"}) in lower + case. +} \usage{ read.dna(file, format = "interleaved", skip = 0, nlines = 0, comment.char = "#", @@ -16,11 +23,11 @@ read.FASTA(file) \code{"sequential"}, \code{"clustal"}, or \code{"fasta"}, or any unambiguous abbreviation of these.} \item{skip}{the number of lines of the input file to skip before - beginning to read data.} + beginning to read data (ignored for FASTA files; see below).} \item{nlines}{the number of lines to be read (by default the file is - read untill its end).} + read untill its end; ignored for FASTA files)).} \item{comment.char}{a single character, the remaining of the line - after this character is ignored.} + after this character is ignored (ignored for FASTA files).} \item{as.character}{a logical controlling whether to return the sequences as an object of class \code{"DNAbin"} (the default).} \item{as.matrix}{(used if \code{format = "fasta"}) one of the three @@ -30,15 +37,8 @@ read.FASTA(file) are of different lengths; (iii) \code{FALSE}: always returns the sequences in a list.} } -\description{ - This function reads DNA sequences in a file, and returns a matrix or a - list of DNA sequences with the names of the taxa read in the file as - rownames or names, respectively. By default, the sequences are stored - in binary format, otherwise (if \code{as.character = "TRUE"}) in lower - case. -} \details{ - This function follows the interleaved and sequential formats defined + \code{read.dna} follows the interleaved and sequential formats defined in PHYLIP (Felsenstein, 1993) but with the original feature than there is no restriction on the lengths of the taxa names. For these two formats, the first line of the file must contain the dimensions of the @@ -72,9 +72,9 @@ read.FASTA(file) \item{FASTA:}{This looks like the sequential format but the taxa names (or rather a description of the sequence) are on separate lines - beginning with a `greater than' character ``>'' (there may be + beginning with a `greater than' character `>' (there may be leading spaces before this character). These lines are taken as taxa - names after removing the ``>'' and the possible leading and trailing + names after removing the `>' and the possible leading and trailing spaces. All the data in the file before the first sequence is ignored.} }} \value{ @@ -82,7 +82,7 @@ read.FASTA(file) stored in binary format, or of mode character (if \code{as.character = "TRUE"}). - \code{read.FASTA} always returns a list. + \code{read.FASTA} always returns a list of class \code{"DNAbin"}. } \references{ Anonymous. FASTA format description. diff --git a/src/read_dna.c b/src/read_dna.c index 6d44c66..01a4db3 100644 --- a/src/read_dna.c +++ b/src/read_dna.c @@ -1,6 +1,6 @@ -/* read_dna.c 2012-12-27 */ +/* read_dna.c 2013-01-04 */ -/* Copyright 2008-2012 Emmanuel Paradis */ +/* Copyright 2013 Emmanuel Paradis */ /* This file is part of the R-package `ape'. */ /* See the file ../COPYING for licensing issues. */ @@ -81,7 +81,7 @@ static const unsigned char lineFeed = 0x0a; SEXP rawStreamToDNAbin(SEXP x) { - int N, i, j, k, n; + int N, i, j, k, n, startOfSeq; unsigned char *xr, *rseq, *buffer, tmp; SEXP obj, nms, seq; @@ -89,15 +89,25 @@ SEXP rawStreamToDNAbin(SEXP x) N = LENGTH(x); xr = RAW(x); -/* do a 1st pass to find the number of sequences */ +/* do a 1st pass to find the number of sequences - n = 0; - j = 0; /* use j as a flag */ - for (i = 0; i < N; i++) { + this code should be robust to '>' present inside + a label or in the header text before the sequences */ + + n = j = 0; /* use j as a flag */ + if (xr[0] == hook) { + j = 1; + startOfSeq = 0; + } + i = 1; + for (i = 1; i < N; i++) { if (j && xr[i] == lineFeed) { n++; j = 0; - } else if (xr[i] == hook) j = 1; + } else if (xr[i] == hook) { + if (!n) startOfSeq = i; + j = 1; + } } PROTECT(obj = allocVector(VECSXP, n)); @@ -106,16 +116,15 @@ SEXP rawStreamToDNAbin(SEXP x) /* Refine the way the size of the buffer is set? */ buffer = (unsigned char *)R_alloc(N, sizeof(unsigned char *)); - i = j = 0; + i = startOfSeq; + j = 0; /* gives the index of the sequence */ while (i < N) { /* 1st read the label... */ - if (xr[i] == hook) { - i++; - k = 0; - while (xr[i] != lineFeed) buffer[k++] = xr[i++]; - buffer[k] = '\0'; - SET_STRING_ELT(nms, j, mkChar(buffer)); - } + i++; + k = 0; + while (xr[i] != lineFeed) buffer[k++] = xr[i++]; + buffer[k] = '\0'; + SET_STRING_ELT(nms, j, mkChar(buffer)); /* ... then read the sequence */ n = 0; while (xr[i] != hook && i < N) { @@ -125,7 +134,7 @@ SEXP rawStreamToDNAbin(SEXP x) then the following check would not be needed; additionally the size of tab_trans could be restriced to 0-121. This check has the advantage that all invalid characters are simply ignored without - causing error. */ + causing error -- except if '>' occurs in the middle of a sequence. */ if (tmp) buffer[n++] = tmp; } PROTECT(seq = allocVector(RAWSXP, n)); -- 2.39.2 From f5c4abe6ac31486e821d82788bf66b5db2be51d1 Mon Sep 17 00:00:00 2001 From: paradis Date: Wed, 16 Jan 2013 02:07:27 +0000 Subject: [PATCH 09/16] some updates for ape 3.0-7 git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@204 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 2 +- NEWS | 14 +++++++++---- R/Cheverud.R | 2 +- R/collapse.singles.R | 47 ++++++++++++++++++++------------------------ R/plot.phylo.R | 20 +++++++++---------- 5 files changed, 42 insertions(+), 43 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8d17b0b..8245658 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 3.0-7 -Date: 2013-01-03 +Date: 2013-01-16 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/NEWS b/NEWS index 1f2cd22..6c87184 100644 --- a/NEWS +++ b/NEWS @@ -3,7 +3,11 @@ NEW FEATURES - The new function chronos .......... + o The new function chronos estimates chronograms by penalised + likelihood and maximum likelihood with a completely reworked + code and interface. There is a new function makeChronosCalib to + set the calibration points easily. chronos() will eventually + replace chronopl(). o The new function 'where' searches patterns in DNA sequences. @@ -36,9 +40,11 @@ OTHER CHANGES o base.freq() is now faster with lists. - as.matrix.DNAbin() ............ should be faster.... now accepts vectors, + o as.matrix.DNAbin() should be faster and more efficient with + lists; it now accepts vectors. - read.dna() has a new for FASTA files............. faster C code. + o collapse.singles() is faster thanks to Klaus Schliep. This will + speed-up drop.tip() significantly. @@ -480,7 +486,7 @@ NEW FEATURES o base.freq() gains an option 'all' to count all the possible bases including the ambiguous ones (defaults to FALSE). - o read.nexus() now writes tree names in the NEXUS file if given a + o write.nexus() now writes tree names in the NEXUS file if given a list of trees with names. diff --git a/R/Cheverud.R b/R/Cheverud.R index 6b70a12..a2073d0 100644 --- a/R/Cheverud.R +++ b/R/Cheverud.R @@ -35,7 +35,7 @@ compar.cheverud <- function(y, W, tolerance=1e-6, gold.tol=1e-4) } sorted[ii] <- Re(sorted[ii]) # Remove imaginary part } - sorted <- as.real(sorted) + sorted <- as.double(sorted) Distinct <- numeric(0) Distinct[1] <- -Inf diff --git a/R/collapse.singles.R b/R/collapse.singles.R index bbde0a1..e4d2bfe 100644 --- a/R/collapse.singles.R +++ b/R/collapse.singles.R @@ -1,14 +1,15 @@ -## collapse.singles.R (2010-07-23) +## collapse.singles.R (2013-01-16) ## Collapse "Single" Nodes -## Copyright 2006 Ben Bolker +## Copyright 2006 Ben Bolker, 2013 Klaus Schliep ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. collapse.singles <- function(tree) { + tree <- reorder(tree) # added by Klaus elen <- tree$edge.length xmat <- tree$edge ## added by Elizabeth Purdom (2008-06-19): @@ -16,32 +17,26 @@ collapse.singles <- function(tree) nnode <- tree$Nnode ntip <- length(tree$tip.label) ## end - singles <- NA - while (length(singles) > 0) { - ## changed by EP to make it slightly more efficient: - ## tx <- table(xmat[xmat < 0]) - ## singles <- as.numeric(names(tx)[tx < 3]) - tx <- tabulate(xmat[, 1]) - singles <- which(tx == 1) - ## END - if (length(singles) > 0) { - i <- singles[1] - prev.node <- which(xmat[, 2] == i) - next.node <- which(xmat[, 1] == i) - xmat[prev.node, 2] <- xmat[next.node, 2] - xmat <- xmat[xmat[, 1] != i, ] # drop - ## changed by EP for the new coding of "phylo" (2006-10-05): - ## xmat[xmat < i] <- xmat[xmat < i] + 1 ## adjust indices - xmat[xmat > i] <- xmat[xmat > i] - 1L ## adjust indices # changed '1' by '1L' (2010-07-23) - ## END - elen[prev.node] <- elen[prev.node] + elen[next.node] - ## added by Elizabeth Purdom (2008-06-19): - if (!is.null(node.lab)) node.lab <- node.lab[-c(i - ntip)] - nnode <- nnode - 1L - ## end - elen <- elen[-next.node] + ## Added by Klaus (2013-01-16): + tx <- tabulate(xmat[, 1]) + singles <- which(tx == 1) + if (length(singles) > 0) { + prev.nodes <- match(singles, xmat[,2]) + next.nodes <- match(singles, xmat[,1]) + for(j in length(singles):1) { + i <- singles[j] + xmat[prev.nodes[j], 2] <- xmat[next.nodes[j], 2] + elen[prev.nodes[j]] <- elen[prev.nodes[j]] + elen[next.nodes[j]] } + xmat <- xmat[-next.nodes,] + elen <- elen[-next.nodes] + if (!is.null(node.lab)) node.lab <- node.lab[-c(singles - ntip)] + nnode = nnode - as.integer(length(singles)) + tmp = integer(max(xmat)) + tmp[sort(unique(as.vector(xmat)))] = as.integer(c(1:(ntip+nnode))) + xmat[] = tmp[xmat] } + # End tree$edge <- xmat tree$edge.length <- elen ## added by Elizabeth Purdom (2008-06-19): diff --git a/R/plot.phylo.R b/R/plot.phylo.R index 96d9aa4..b9309a0 100644 --- a/R/plot.phylo.R +++ b/R/plot.phylo.R @@ -1,8 +1,8 @@ -## plot.phylo.R (2012-12-19) +## plot.phylo.R (2013-01-11) ## Plot Phylogenies -## Copyright 2002-2012 Emmanuel Paradis +## Copyright 2002-2013 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -650,12 +650,10 @@ node.height.clado <- function(phy) plot.multiPhylo <- function(x, layout = 1, ...) { - if (layout > 1) - layout(matrix(1:layout, ceiling(sqrt(layout)), byrow = TRUE)) - else layout(matrix(1)) - if (!par("ask")) { - par(ask = TRUE) - on.exit(par(ask = FALSE)) + layout(matrix(1:layout, ceiling(sqrt(layout)), byrow = TRUE)) + if (!devAskNewPage() && interactive()) { + devAskNewPage(TRUE) + on.exit(devAskNewPage(FALSE)) } for (i in 1:length(x)) plot(x[[i]], ...) } @@ -721,9 +719,9 @@ kronoviz <- function(x, layout = length(x), horiz = TRUE, ...) h <- 1 } layout(matrix(1:layout, nrow), widths = w, heights = h) - if (layout > Ntree && !par("ask")) { - par(ask = TRUE) - on.exit(par(ask = FALSE)) + if (layout < Ntree && !devAskNewPage() && interactive()) { + devAskNewPage(TRUE) + on.exit(devAskNewPage(FALSE)) } if (horiz) { for (i in 1:Ntree) -- 2.39.2 From dc2dda7eb763b4e140f1e5adb7fa8bfaa2661f6d Mon Sep 17 00:00:00 2001 From: paradis Date: Mon, 21 Jan 2013 10:54:42 +0000 Subject: [PATCH 10/16] more corrections for ape 3.0-7 git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@205 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 2 +- NEWS | 3 --- R/collapse.singles.R | 47 ++++++++++++++++++++++++-------------------- R/read.dna.R | 1 + src/read_dna.c | 4 ++-- 5 files changed, 30 insertions(+), 27 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8245658..bed8b3d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 3.0-7 -Date: 2013-01-16 +Date: 2013-01-19 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/NEWS b/NEWS index 6c87184..6ca4ae5 100644 --- a/NEWS +++ b/NEWS @@ -43,9 +43,6 @@ OTHER CHANGES o as.matrix.DNAbin() should be faster and more efficient with lists; it now accepts vectors. - o collapse.singles() is faster thanks to Klaus Schliep. This will - speed-up drop.tip() significantly. - CHANGES IN APE VERSION 3.0-6 diff --git a/R/collapse.singles.R b/R/collapse.singles.R index e4d2bfe..bbde0a1 100644 --- a/R/collapse.singles.R +++ b/R/collapse.singles.R @@ -1,15 +1,14 @@ -## collapse.singles.R (2013-01-16) +## collapse.singles.R (2010-07-23) ## Collapse "Single" Nodes -## Copyright 2006 Ben Bolker, 2013 Klaus Schliep +## Copyright 2006 Ben Bolker ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. collapse.singles <- function(tree) { - tree <- reorder(tree) # added by Klaus elen <- tree$edge.length xmat <- tree$edge ## added by Elizabeth Purdom (2008-06-19): @@ -17,26 +16,32 @@ collapse.singles <- function(tree) nnode <- tree$Nnode ntip <- length(tree$tip.label) ## end - ## Added by Klaus (2013-01-16): - tx <- tabulate(xmat[, 1]) - singles <- which(tx == 1) - if (length(singles) > 0) { - prev.nodes <- match(singles, xmat[,2]) - next.nodes <- match(singles, xmat[,1]) - for(j in length(singles):1) { - i <- singles[j] - xmat[prev.nodes[j], 2] <- xmat[next.nodes[j], 2] - elen[prev.nodes[j]] <- elen[prev.nodes[j]] + elen[next.nodes[j]] + singles <- NA + while (length(singles) > 0) { + ## changed by EP to make it slightly more efficient: + ## tx <- table(xmat[xmat < 0]) + ## singles <- as.numeric(names(tx)[tx < 3]) + tx <- tabulate(xmat[, 1]) + singles <- which(tx == 1) + ## END + if (length(singles) > 0) { + i <- singles[1] + prev.node <- which(xmat[, 2] == i) + next.node <- which(xmat[, 1] == i) + xmat[prev.node, 2] <- xmat[next.node, 2] + xmat <- xmat[xmat[, 1] != i, ] # drop + ## changed by EP for the new coding of "phylo" (2006-10-05): + ## xmat[xmat < i] <- xmat[xmat < i] + 1 ## adjust indices + xmat[xmat > i] <- xmat[xmat > i] - 1L ## adjust indices # changed '1' by '1L' (2010-07-23) + ## END + elen[prev.node] <- elen[prev.node] + elen[next.node] + ## added by Elizabeth Purdom (2008-06-19): + if (!is.null(node.lab)) node.lab <- node.lab[-c(i - ntip)] + nnode <- nnode - 1L + ## end + elen <- elen[-next.node] } - xmat <- xmat[-next.nodes,] - elen <- elen[-next.nodes] - if (!is.null(node.lab)) node.lab <- node.lab[-c(singles - ntip)] - nnode = nnode - as.integer(length(singles)) - tmp = integer(max(xmat)) - tmp[sort(unique(as.vector(xmat)))] = as.integer(c(1:(ntip+nnode))) - xmat[] = tmp[xmat] } - # End tree$edge <- xmat tree$edge.length <- elen ## added by Elizabeth Purdom (2008-06-19): diff --git a/R/read.dna.R b/R/read.dna.R index 4b2ddd8..f898f3b 100644 --- a/R/read.dna.R +++ b/R/read.dna.R @@ -16,6 +16,7 @@ read.FASTA <- function(file) x <- x[-icr] } res <- .Call("rawStreamToDNAbin", x, PACKAGE = "ape") + names(res) <- sub("^ +", "", names(res)) # to permit phylosim class(res) <- "DNAbin" res } diff --git a/src/read_dna.c b/src/read_dna.c index 01a4db3..d9e52c4 100644 --- a/src/read_dna.c +++ b/src/read_dna.c @@ -1,4 +1,4 @@ -/* read_dna.c 2013-01-04 */ +/* read_dna.c 2013-01-19 */ /* Copyright 2013 Emmanuel Paradis */ @@ -55,7 +55,7 @@ static const unsigned char tab_trans[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, /* 40-49 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 50-59 */ 0x00, 0x00, 0x00, 0x02, 0x00, 0x88, 0x70, 0x28, 0xd0, 0x00, /* 60-69 */ - 0x48, 0x00, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, 0xf0, 0x00, /* 70-79 */ + 0x00, 0x48, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, 0xf0, 0x00, /* 70-79 */ 0x00, 0x00, 0xc0, 0x60, 0x18, 0x00, 0xe0, 0x90, 0x00, 0x30, /* 80-89 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x70, 0x28, /* 90-99 */ 0xd0, 0x00, 0x00, 0x48, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, /* 100-109 */ -- 2.39.2 From 0d2caf41f13f7a3e8a313274cf819147940eac69 Mon Sep 17 00:00:00 2001 From: paradis Date: Thu, 31 Jan 2013 11:58:31 +0000 Subject: [PATCH 11/16] some modifs for ape 3.0-8 git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@206 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 8 ++++---- NAMESPACE | 1 + NEWS | 15 +++++++++++++++ R/ace.R | 13 ++++++++----- R/read.dna.R | 10 ++++------ man/ace.Rd | 15 ++++++++++++++- man/read.dna.Rd | 6 +++--- 7 files changed, 49 insertions(+), 19 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index bed8b3d..925db38 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,12 @@ Package: ape -Version: 3.0-7 -Date: 2013-01-19 +Version: 3.0-8 +Date: 2013-01-31 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis Depends: R (>= 2.6.0) -Suggests: gee -Imports: gee, nlme, lattice +Suggests: gee, expm +Imports: gee, nlme, lattice, expm ZipData: no Description: ape provides functions for reading, writing, plotting, and manipulating phylogenetic trees, analyses of comparative data in a phylogenetic framework, ancestral character analyses, analyses of diversification and macroevolution, computing distances from allelic and nucleotide data, reading and writing nucleotide sequences, and several tools such as Mantel's test, minimum spanning tree, generalized skyline plots, graphical exploration of phylogenetic data (alex, trex, kronoviz), estimation of absolute evolutionary rates and clock-like trees using mean path lengths and penalized likelihood. Phylogeny estimation can be done with the NJ, BIONJ, ME, MVR, SDM, and triangle methods, and several methods handling incomplete distance matrices (NJ*, BIONJ*, MVR*, and the corresponding triangle method). Some functions call external applications (PhyML, Clustal, T-Coffee, Muscle) whose results are returned into R. License: GPL (>= 2) diff --git a/NAMESPACE b/NAMESPACE index 6951d42..08f0899 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ exportPattern(".+") import(gee, nlme) importFrom(lattice, xyplot, panel.lines, panel.points) importFrom(stats, as.hclust, cophenetic, reorder) +importFrom(expm, expm) S3method(print, phylo) S3method(plot, phylo) diff --git a/NEWS b/NEWS index 6ca4ae5..5f1d758 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,18 @@ + CHANGES IN APE VERSION 3.0-8 + + +NEW FEATURES + + o ace() gains a new option 'use.expm' to use expm() from the + package of the same name in place of matexpo(). + + +BUG FIXES + + o read.dna(, "fasta") may add '\r' in labels: this is fixed. + + + CHANGES IN APE VERSION 3.0-7 diff --git a/R/ace.R b/R/ace.R index 55e32c3..2b9230e 100644 --- a/R/ace.R +++ b/R/ace.R @@ -1,8 +1,8 @@ -## ace.R (2012-06-25) +## ace.R (2013-01-31) ## Ancestral Character Estimation -## Copyright 2005-2012 Emmanuel Paradis and Ben Bolker +## Copyright 2005-2013 Emmanuel Paradis and Ben Bolker ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -21,7 +21,8 @@ ace <- function(x, phy, type = "continuous", method = "ML", CI = TRUE, model = if (type == "continuous") "BM" else "ER", - scaled = TRUE, kappa = 1, corStruct = NULL, ip = 0.1) + scaled = TRUE, kappa = 1, corStruct = NULL, ip = 0.1, + use.expm = FALSE) { if (!inherits(phy, "phylo")) stop('object "phy" is not of class "phylo"') @@ -190,6 +191,8 @@ ace <- function(x, phy, type = "continuous", method = "ML", CI = TRUE, liks[cbind(TIPS, x)] <- 1 phy <- reorder(phy, "pruningwise") + E <- if (use.expm) expm::expm else ape::matexpo + Q <- matrix(0, nl, nl) dev <- function(p, output.liks = FALSE) { if (any(is.nan(p)) || any(is.infinite(p))) return(1e50) @@ -202,8 +205,8 @@ ace <- function(x, phy, type = "continuous", method = "ML", CI = TRUE, anc <- phy$edge[i, 1] des1 <- phy$edge[i, 2] des2 <- phy$edge[j, 2] - v.l <- matexpo(Q * phy$edge.length[i]) %*% liks[des1, ] - v.r <- matexpo(Q * phy$edge.length[j]) %*% liks[des2, ] + v.l <- E(Q * phy$edge.length[i]) %*% liks[des1, ] + v.r <- E(Q * phy$edge.length[j]) %*% liks[des2, ] v <- v.l * v.r comp[anc] <- sum(v) liks[anc, ] <- v/comp[anc] diff --git a/R/read.dna.R b/R/read.dna.R index f898f3b..fb8b54b 100644 --- a/R/read.dna.R +++ b/R/read.dna.R @@ -1,8 +1,8 @@ -## read.dna.R (2013-01-04) +## read.dna.R (2013-01-31) ## Read DNA Sequences in a File -## Copyright 2003-2012 Emmanuel Paradis +## Copyright 2003-2013 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -11,10 +11,8 @@ read.FASTA <- function(file) { sz <- file.info(file)$size x <- readBin(file, "raw", sz) - if (Sys.info()[1] == "Windows") { - icr <- which(x == as.raw(0x0d)) # CR - x <- x[-icr] - } + icr <- which(x == as.raw(0x0d)) # CR + x <- x[-icr] res <- .Call("rawStreamToDNAbin", x, PACKAGE = "ape") names(res) <- sub("^ +", "", names(res)) # to permit phylosim class(res) <- "DNAbin" diff --git a/man/ace.Rd b/man/ace.Rd index 15bd387..101aebf 100644 --- a/man/ace.Rd +++ b/man/ace.Rd @@ -25,7 +25,8 @@ \usage{ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, model = if (type == "continuous") "BM" else "ER", - scaled = TRUE, kappa = 1, corStruct = NULL, ip = 0.1) + scaled = TRUE, kappa = 1, corStruct = NULL, ip = 0.1, + use.expm = FALSE) \method{print}{ace}(x, digits = 4, ...) \method{logLik}{ace}(object, ...) \method{deviance}{ace}(object, ...) @@ -56,6 +57,10 @@ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, structure to be used (this also gives the assumed model).} \item{ip}{the initial value(s) used for the ML estimation procedure when \code{type == "discrete"} (possibly recycled).} + \item{use.expm}{a logical specifying whether to use the package + \pkg{expm} to compute the matrix exponential (relevant only if + \code{type = "d"}). The default is to use the function + \code{matexpo} from \pkg{ape} (see details).} \item{digits}{the number of digits to be printed.} \item{object}{an object of class \code{"ace"}.} \item{k}{a numeric value giving the penalty per estimated parameter; @@ -108,6 +113,14 @@ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, \code{"SYM"} is a symmetrical model (e.g., \code{matrix(c(0, 1, 2, 1, 0, 3, 2, 3, 0), 3)}). If a short-cut is used, the number of states is determined from the data. + + With discrete characters it is necessary to compute the exponential of + the rate matrix. By default (and the only possible choice until + \pkg{ape} 3.0-7) the function \code{\link{matexpo}} in \pkg{ape} is + used. If \code{use.expm = TRUE}, the function + \code{\link[expm]{expm}}, in the package of the same name, is + used. \code{matexpo} is faster but quite inaccurate for large and/or + asymmetric matrices. In case of doubt, use the latter. } \value{ an object of class \code{"ace"} with the following elements: diff --git a/man/read.dna.Rd b/man/read.dna.Rd index 4cf664e..ca0a5c0 100644 --- a/man/read.dna.Rd +++ b/man/read.dna.Rd @@ -129,11 +129,11 @@ cat("CLUSTAL (ape) multiple sequence alignment", "", file = "exdna.txt", sep = "\n") ex.dna3 <- read.dna("exdna.txt", format = "clustal") ### ... and in FASTA format -cat("> No305", +cat(">No305", "NTTCGAAAAACACACCCACTACTAAAANTTATCAGTCACT", -"> No304", +">No304", "ATTCGAAAAACACACCCACTACTAAAAATTATCAACCACT", -"> No306", +">No306", "ATTCGAAAAACACACCCACTACTAAAAATTATCAATCACT", file = "exdna.txt", sep = "\n") ex.dna4 <- read.dna("exdna.txt", format = "fasta") -- 2.39.2 From 5090ae602c33e1cd3bbdbd2429a43daa8188b8c3 Mon Sep 17 00:00:00 2001 From: paradis Date: Sat, 9 Feb 2013 12:43:50 +0000 Subject: [PATCH 12/16] fix bug in prop.clades git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@207 6e262413-ae40-0410-9e79-b911bd7a66b7 --- NEWS | 4 ++++ R/dist.topo.R | 13 +++++++++++-- man/boot.phylo.Rd | 4 ++-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 5f1d758..59c6fed 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,10 @@ BUG FIXES o read.dna(, "fasta") may add '\r' in labels: this is fixed. + o prop.clades() returned wrong numbers when the tip labels of + 'phy' are not in the same order than the list of trees (thanks + to Rupert Collins for the report). + CHANGES IN APE VERSION 3.0-7 diff --git a/R/dist.topo.R b/R/dist.topo.R index 230f009..fa7edf8 100644 --- a/R/dist.topo.R +++ b/R/dist.topo.R @@ -1,9 +1,9 @@ -## dist.topo.R (2012-12-12) +## dist.topo.R (2013-02-09) ## Topological Distances, Tree Bipartitions, ## Consensus Trees, and Bootstrapping Phylogenies -## Copyright 2005-2012 Emmanuel Paradis +## Copyright 2005-2013 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -87,6 +87,7 @@ dist.topo <- function(x, y, method = "PH85") y$tip.label <- NULL y } + x <- unclass(x) # another killer improvement by Tucson's hackathon (1/2/2013) x <- lapply(x, relabel) attr(x, "TipLabel") <- ref class(x) <- "multiPhylo" @@ -165,6 +166,14 @@ prop.clades <- function(phy, ..., part = NULL, rooted = FALSE) part <- prop.part(obj, check.labels = TRUE) } + ## until ape 3.0-7 it was assumed implicitly that the labels in phy + ## are in the same order than in 'part' (bug report by Rupert Collins) + if (!identical(phy$tip.label, attr(part, "labels"))) { + i <- match(phy$tip.label, attr(part, "labels")) + j <- match(seq_len(Ntip(phy)), phy$edge[, 2]) + phy$edge[j, 2] <- i + phy$tip.label <- attr(part, "labels") + } bp <- prop.part(phy) if (!rooted) { bp <- postprocess.prop.part(bp) diff --git a/man/boot.phylo.Rd b/man/boot.phylo.Rd index 8ac8668..2e16665 100644 --- a/man/boot.phylo.Rd +++ b/man/boot.phylo.Rd @@ -133,11 +133,11 @@ tr <- f(woodmouse) for (i in 1:5) print(boot.phylo(tr, woodmouse, f, quiet = TRUE)) ### How many partitions in 100 random trees of 10 labels?... -TR <- replicate(100, rtree(10), FALSE) +TR <- rmtree(100, 10) pp10 <- prop.part(TR) length(pp10) ### ... and in 100 random trees of 20 labels? -TR <- replicate(100, rtree(20), FALSE) +TR <- rmtree(100, 20) pp20 <- prop.part(TR) length(pp20) plot(pp10, pch = "x", col = 2) -- 2.39.2 From dfe58641d0e6fe53612710cd92401306273609f4 Mon Sep 17 00:00:00 2001 From: paradis Date: Wed, 13 Feb 2013 06:52:04 +0000 Subject: [PATCH 13/16] fix from Pierre Legendre git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@208 6e262413-ae40-0410-9e79-b911bd7a66b7 --- R/CDAM.global.R => CADM.global.R | 0 R/CDAM.post.R => CADM.post.R | 2 +- NEWS | 5 +++++ man/chronos.Rd | 4 ++-- 4 files changed, 8 insertions(+), 3 deletions(-) rename R/CDAM.global.R => CADM.global.R (100%) rename R/CDAM.post.R => CADM.post.R (99%) diff --git a/R/CDAM.global.R b/CADM.global.R similarity index 100% rename from R/CDAM.global.R rename to CADM.global.R diff --git a/R/CDAM.post.R b/CADM.post.R similarity index 99% rename from R/CDAM.post.R rename to CADM.post.R index 00b3d76..8530cd3 100644 --- a/R/CDAM.post.R +++ b/CADM.post.R @@ -257,7 +257,7 @@ } } Mantel.prob <- as.matrix(as.dist(Mantel.prob/(nperm+1))) - diag(Mantel.prob) <- 1 + diag(Mantel.prob) <- NA # Corrected 08feb13 } }) diff --git a/NEWS b/NEWS index 59c6fed..9bc3f6c 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,11 @@ BUG FIXES 'phy' are not in the same order than the list of trees (thanks to Rupert Collins for the report). + o CADM.post() displayed "1" on the diagonal of the matrix of + Mantel p-values. It now displays "NA" on the diagonal, + indicating that no test of significance is computed between a + distance matrix and itself. + CHANGES IN APE VERSION 3.0-7 diff --git a/man/chronos.Rd b/man/chronos.Rd index 22c15f1..d85f5f7 100644 --- a/man/chronos.Rd +++ b/man/chronos.Rd @@ -102,9 +102,9 @@ chronos.control(...) inference: bridging the parsimony-likelihood gap. \emph{Systematic Biology}, \bold{57}, 665--674. - Paradis, E. (2012) Molecular dating of phylogenies by likelihood + Paradis, E. (2013) Molecular dating of phylogenies by likelihood methods: a comparison of models and a new information - criterion. \emph{Manuscript}. + criterion. \emph{Molecular Phylogenetics and Evolution}, in press. Sanderson, M. J. (2002) Estimating absolute rates of molecular evolution and divergence times: a penalized likelihood -- 2.39.2 From 0eea4e45fea5cbcd63d5b257578579d52970d68c Mon Sep 17 00:00:00 2001 From: paradis Date: Wed, 13 Feb 2013 06:53:44 +0000 Subject: [PATCH 14/16] corrects previous error when moving files git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@209 6e262413-ae40-0410-9e79-b911bd7a66b7 --- CADM.global.R => R/CADM.global.R | 0 CADM.post.R => R/CADM.post.R | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename CADM.global.R => R/CADM.global.R (100%) rename CADM.post.R => R/CADM.post.R (100%) diff --git a/CADM.global.R b/R/CADM.global.R similarity index 100% rename from CADM.global.R rename to R/CADM.global.R diff --git a/CADM.post.R b/R/CADM.post.R similarity index 100% rename from CADM.post.R rename to R/CADM.post.R -- 2.39.2 From f0ba00490fd11146cab4a4039b0055217d1b376f Mon Sep 17 00:00:00 2001 From: paradis Date: Mon, 18 Mar 2013 10:29:37 +0000 Subject: [PATCH 15/16] some changes done before the HD failure of my laptop... I presume git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@210 6e262413-ae40-0410-9e79-b911bd7a66b7 --- NEWS | 6 ++++++ R/pic.R | 18 ++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index 9bc3f6c..adf3082 100644 --- a/NEWS +++ b/NEWS @@ -21,6 +21,12 @@ BUG FIXES distance matrix and itself. +OTHER CHANGES + + o The files CDAM.global.R and CDAM.post.R have been renamed + CADM.global.R and CADM.post.R + + CHANGES IN APE VERSION 3.0-7 diff --git a/R/pic.R b/R/pic.R index bc54067..2735f8f 100644 --- a/R/pic.R +++ b/R/pic.R @@ -1,26 +1,24 @@ -## pic.R (2012-11-20) +## pic.R (2013-02-18) ## Phylogenetically Independent Contrasts -## Copyright 2002-2012 Emmanuel Paradis +## Copyright 2002-2013 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. pic <- function(x, phy, scaled = TRUE, var.contrasts = FALSE, rescaled.tree = FALSE) { - if (!inherits(phy, "phylo")) - stop("object 'phy' is not of class \"phylo\"") - if (is.null(phy$edge.length)) - stop("your tree has no branch lengths") + if (!inherits(phy, "phylo")) stop("object 'phy' is not of class \"phylo\"") + if (is.null(phy$edge.length)) stop("your tree has no branch lengths") nb.tip <- length(phy$tip.label) nb.node <- phy$Nnode if (nb.node != nb.tip - 1) - stop("'phy' is not rooted and fully dichotomous") + stop("'phy' is not rooted and fully dichotomous") if (length(x) != nb.tip) - stop("length of phenotypic and of phylogenetic data do not match") + stop("length of phenotypic and of phylogenetic data do not match") if (any(is.na(x))) - stop("missing data in 'x': you may consider removing the species with missing data from your tree with the function 'drop.tip'.") + stop("missing data in 'x': you may consider removing the species with missing data from your tree with the function 'drop.tip'.") phy <- reorder(phy, "postorder") phenotype <- numeric(nb.tip + nb.node) @@ -65,7 +63,7 @@ pic.ortho <- function(x, phy, var.contrasts = FALSE, intra = FALSE) { n <- length(x) m <- n - 1L # number of nodes - phy <- reorder(phy, "pruningwise") + phy <- reorder(phy, "postorder") xx <- unlist(lapply(x, mean)) # 'x' in Felsenstein's paper xx <- c(xx, numeric(m)) delta.v <- numeric(n + m) -- 2.39.2 From 646d8be3497d656d95331e2c7f6bef5d2ff86b2c Mon Sep 17 00:00:00 2001 From: paradis Date: Mon, 18 Mar 2013 10:51:36 +0000 Subject: [PATCH 16/16] modified ace( method = ...) git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@211 6e262413-ae40-0410-9e79-b911bd7a66b7 --- R/ace.R | 12 +++++++----- man/ace.Rd | 44 ++++++++++++++++++++++---------------------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/R/ace.R b/R/ace.R index 2b9230e..b751c69 100644 --- a/R/ace.R +++ b/R/ace.R @@ -1,4 +1,4 @@ -## ace.R (2013-01-31) +## ace.R (2013-03-18) ## Ancestral Character Estimation @@ -19,10 +19,12 @@ se } -ace <- function(x, phy, type = "continuous", method = "ML", CI = TRUE, - model = if (type == "continuous") "BM" else "ER", - scaled = TRUE, kappa = 1, corStruct = NULL, ip = 0.1, - use.expm = FALSE) +ace <- + function(x, phy, type = "continuous", + method = if (type == "continuous") "REML" else "ML", + CI = TRUE, model = if (type == "continuous") "BM" else "ER", + scaled = TRUE, kappa = 1, corStruct = NULL, ip = 0.1, + use.expm = FALSE) { if (!inherits(phy, "phylo")) stop('object "phy" is not of class "phylo"') diff --git a/man/ace.Rd b/man/ace.Rd index 101aebf..ee66b0d 100644 --- a/man/ace.Rd +++ b/man/ace.Rd @@ -23,7 +23,8 @@ largest. } \usage{ -ace(x, phy, type = "continuous", method = "ML", CI = TRUE, +ace(x, phy, type = "continuous", method = if (type == "continuous") + "REML" else "ML", CI = TRUE, model = if (type == "continuous") "BM" else "ER", scaled = TRUE, kappa = 1, corStruct = NULL, ip = 0.1, use.expm = FALSE) @@ -71,31 +72,30 @@ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, \details{ If \code{type = "continuous"}, the default model is Brownian motion where characters evolve randomly following a random walk. This model - can be fitted by maximum likelihood (the default, Schluter et - al. 1997), least squares (\code{method = "pic"}, Felsenstein 1985), or - generalized least squares (\code{method = "GLS"}, Martins and Hansen - 1997, Cunningham et al. 1998). In the latter case, the specification - of \code{phy} and \code{model} are actually ignored: it is instead - given through a correlation structure with the option - \code{corStruct}. + can be fitted by residual maximum likelihood (the default), maximum + likelihood (Schluter et al. 1997), least squares (\code{method = + "pic"}, Felsenstein 1985), or generalized least squares (\code{method + = "GLS"}, Martins and Hansen 1997, Cunningham et al. 1998). In the + last case, the specification of \code{phy} and \code{model} are + actually ignored: it is instead given through a correlation structure + with the option \code{corStruct}. - In the default setting (\code{method = "ML"} and \code{model = "BM"}) - the maximum likelihood estimation is done simultaneously on the - ancestral values and the variance of the Brownian motion process; - these estimates are then used to compute the confidence intervals in - the standard way. The REML method first estimates the ancestral value - at the root (aka, the phylogenetic mean), then the variance of the - Brownian motion process is estimated by optimizing the residual - log-likelihood. The ancestral values are finally inferred from the - likelihood function giving these two parameters. If \code{method = - "pic"} or \code{"GLS"}, the confidence intervals are computed using - the expected variances under the model, so they depend only on the - tree. + In the setting \code{method = "ML"} and \code{model = "BM"} (this used + to be the default until ape 3.0-7) the maximum likelihood estimation + is done simultaneously on the ancestral values and the variance of the + Brownian motion process; these estimates are then used to compute the + confidence intervals in the standard way. The REML method first + estimates the ancestral value at the root (aka, the phylogenetic + mean), then the variance of the Brownian motion process is estimated + by optimizing the residual log-likelihood. The ancestral values are + finally inferred from the likelihood function giving these two + parameters. If \code{method = "pic"} or \code{"GLS"}, the confidence + intervals are computed using the expected variances under the model, + so they depend only on the tree. It could be shown that, with a continous character, REML results in unbiased estimates of the variance of the Brownian motion process - while ML gives a downward bias. Therefore the former is recommanded, - even though it is not the default. + while ML gives a downward bias. Therefore the former is recommanded. For discrete characters (\code{type = "discrete"}), only maximum likelihood estimation is available (Pagel 1994). The model is -- 2.39.2