From ce466967e4824b3f6aec6f75798da08fe8c10666 Mon Sep 17 00:00:00 2001 From: paradis Date: Mon, 18 Jul 2011 04:05:06 +0000 Subject: [PATCH] new method = "REML" in ace() with continuous traits git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@166 6e262413-ae40-0410-9e79-b911bd7a66b7 --- DESCRIPTION | 2 +- NEWS | 4 +++ R/ace.R | 93 +++++++++++++++++++++++++++++++++-------------------- man/ace.Rd | 25 +++++++++----- 4 files changed, 81 insertions(+), 43 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2089fab..de29863 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 2.7-3 -Date: 2011-07-15 +Date: 2011-07-18 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/NEWS b/NEWS index cfd28ab..92a114a 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,10 @@ NEW FEATURES "two-sided" by default. Previously, this test was one-tailed with no possibility to change. + o ace() can now do REML estimation with continuous characters, + giving better estimates of the variance of the Brownian motion + process. + BUG FIXES diff --git a/R/ace.R b/R/ace.R index b67549f..b51b58e 100644 --- a/R/ace.R +++ b/R/ace.R @@ -1,8 +1,8 @@ -## ace.R (2010-12-08) +## ace.R (2011-07-18) ## Ancestral Character Estimation -## Copyright 2005-2010 Emmanuel Paradis and Ben Bolker +## Copyright 2005-2011 Emmanuel Paradis and Ben Bolker ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -19,9 +19,9 @@ ace <- function(x, phy, type = "continuous", method = "ML", CI = TRUE, nb.tip <- length(phy$tip.label) nb.node <- phy$Nnode if (nb.node != nb.tip - 1) - stop('"phy" is not rooted AND fully dichotomous.') + stop('"phy" is not rooted AND fully dichotomous.') if (length(x) != nb.tip) - stop("length of phenotypic and of phylogenetic data do not match.") + stop("length of phenotypic and of phylogenetic data do not match.") if (!is.null(names(x))) { if(all(names(x) %in% phy$tip.label)) x <- x[phy$tip.label] @@ -30,9 +30,42 @@ ace <- function(x, phy, type = "continuous", method = "ML", CI = TRUE, obj <- list() if (kappa != 1) phy$edge.length <- phy$edge.length^kappa if (type == "continuous") { - if (method == "pic") { + switch(method, "REML" = { + minusLogLik <- function(sig2) { + if (sig2 < 0) return(1e100) + V <- sig2 * vcv(phy) + ## next three lines borrowed from dmvnorm() in 'mvtnorm' + distval <- mahalanobis(x, center = mu, cov = V) + logdet <- sum(log(eigen(V, symmetric = TRUE, only.values = TRUE)$values)) + (nb.tip * log(2 * pi) + logdet + distval)/2 + } + mu <- rep(ace(x, phy, method="pic")$ace[1], nb.tip) + out <- nlm(minusLogLik, 1, hessian = TRUE) + sigma2 <- out$estimate + se_sgi2 <- sqrt(1/out$hessian) + tip <- phy$edge[, 2] <= nb.tip + minus.REML.BM <- function(p) { + x1 <- p[phy$edge[, 1] - nb.tip] + x2 <- numeric(length(x1)) + x2[tip] <- x[phy$edge[tip, 2]] + x2[!tip] <- p[phy$edge[!tip, 2] - nb.tip] + -(-sum((x1 - x2)^2/phy$edge.length)/(2 * sigma2) - + nb.node * log(sigma2)) + } + out <- nlm(function(p) minus.REML.BM(p), + p = rep(mu[1], nb.node), hessian = TRUE) + obj$resloglik <- -out$minimum + obj$ace <- out$estimate + names(obj$ace) <- (nb.tip + 1):(nb.tip + nb.node) + obj$sigma2 <- c(sigma2, se_sgi2) + if (CI) { + se <- sqrt(diag(solve(out$hessian))) + tmp <- se * qt(0.025, nb.node) + obj$CI95 <- cbind(obj$ace + tmp, obj$ace - tmp) + } + }, "pic" = { if (model != "BM") - stop('the "pic" method can be used only with model = "BM".') + stop('the "pic" method can be used only with model = "BM".') ## See pic.R for some annotations. phy <- reorder(phy, "pruningwise") phenotype <- numeric(nb.tip + nb.node) @@ -48,13 +81,10 @@ ace <- function(x, phy, type = "continuous", method = "ML", CI = TRUE, names(obj$ace) <- (nb.tip + 1):(nb.tip + nb.node) if (CI) { se <- sqrt(ans[[8]]) - CI95 <- matrix(NA, nb.node, 2) - CI95[, 1] <- obj$ace + se * qnorm(0.025) - CI95[, 2] <- obj$ace - se * qnorm(0.025) - obj$CI95 <- CI95 + tmp <- se * qnorm(0.025) + obj$CI95 <- cbind(obj$ace + tmp, obj$ace - tmp) } - } - if (method == "ML") { + }, "ML" = { if (model == "BM") { tip <- phy$edge[, 2] <= nb.tip dev.BM <- function(p) { @@ -73,24 +103,20 @@ ace <- function(x, phy, type = "continuous", method = "ML", CI = TRUE, names(obj$ace) <- (nb.tip + 1):(nb.tip + nb.node) se <- sqrt(diag(solve(out$hessian))) obj$sigma2 <- c(out$estimate[1], se[1]) - se <- se[-1] if (CI) { - CI95 <- matrix(NA, nb.node, 2) - CI95[, 1] <- obj$ace + se * qt(0.025, nb.node) - CI95[, 2] <- obj$ace - se * qt(0.025, nb.node) - obj$CI95 <- CI95 + tmp <- se[-1] * qt(0.025, nb.node) + obj$CI95 <- cbind(obj$ace + tmp, obj$ace - tmp) } } - } - if (method == "GLS") { + }, "GLS" = { if (is.null(corStruct)) - stop('you must give a correlation structure if method = "GLS".') + stop('you must give a correlation structure if method = "GLS".') if (class(corStruct)[1] == "corMartins") - M <- corStruct[1] * dist.nodes(phy) + M <- corStruct[1] * dist.nodes(phy) if (class(corStruct)[1] == "corGrafen") - phy <- compute.brlen(attr(corStruct, "tree"), - method = "Grafen", - power = exp(corStruct[1])) + phy <- compute.brlen(attr(corStruct, "tree"), + method = "Grafen", + power = exp(corStruct[1])) if (class(corStruct)[1] %in% c("corBrownian", "corGrafen")) { dis <- dist.nodes(attr(corStruct, "tree")) MRCA <- mrca(attr(corStruct, "tree"), full = TRUE) @@ -107,16 +133,14 @@ ace <- function(x, phy, type = "continuous", method = "ML", CI = TRUE, obj$ace <- drop(varAY %*% invV %*% (x - GM) + GM) names(obj$ace) <- (nb.tip + 1):(nb.tip + nb.node) if (CI) { - CI95 <- matrix(NA, nb.node, 2) se <- sqrt((varA - varAY %*% invV %*% t(varAY))[cbind(1:nb.node, 1:nb.node)]) - CI95[, 1] <- obj$ace + se * qnorm(0.025) - CI95[, 2] <- obj$ace - se * qnorm(0.025) - obj$CI95 <- CI95 + tmp <- se * qnorm(0.025) + obj$CI95 <- cbind(obj$ace + tmp, obj$ace - tmp) } - } + }) } else { # type == "discrete" if (method != "ML") - stop("only ML estimation is possible for discrete characters.") + stop("only ML estimation is possible for discrete characters.") if (!is.factor(x)) x <- factor(x) nl <- nlevels(x) lvls <- levels(x) @@ -137,10 +161,9 @@ ace <- function(x, phy, type = "continuous", method = "ML", CI = TRUE, } } else { if (ncol(model) != nrow(model)) - stop("the matrix given as `model' is not square") + stop("the matrix given as 'model' is not square") if (ncol(model) != nl) - stop("the matrix `model' must have as many rows -as the number of categories in `x'") + stop("the matrix 'model' must have as many rows as the number of categories in 'x'") rate <- model np <- max(rate) } @@ -239,10 +262,12 @@ print.ace <- function(x, digits = 4, ...) cat("\n") if (!is.null(x$loglik)) cat(" Log-likelihood:", x$loglik, "\n\n") + if (!is.null(x$resloglik)) + cat(" Residual log-likelihood:", x$resloglik, "\n\n") ratemat <- x$index.matrix if (is.null(ratemat)) { # to be improved class(x) <- NULL - x$loglik <- x$call <- NULL + x$resloglik <- x$loglik <- x$call <- NULL print(x) } else { dimnames(ratemat)[1:2] <- dimnames(x$lik.anc)[2] diff --git a/man/ace.Rd b/man/ace.Rd index 126a216..1fc5d01 100644 --- a/man/ace.Rd +++ b/man/ace.Rd @@ -23,8 +23,8 @@ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, \item{type}{the variable type; either \code{"continuous"} or \code{"discrete"} (or an abbreviation of these).} \item{method}{a character specifying the method used for - estimation. Three choices are possible: \code{"ML"}, \code{"pic"}, - or \code{"GLS"}.} + estimation. Four choices are possible: \code{"ML"}, \code{"REML"}, + \code{"pic"}, or \code{"GLS"}.} \item{CI}{a logical specifying whether to return the 95\% confidence intervals of the ancestral state estimates (for continuous characters) or the likelihood of the different states (for discrete @@ -75,13 +75,22 @@ ace(x, phy, type = "continuous", method = "ML", CI = TRUE, \code{corStruct}. In the default setting (i.e., \code{method = "ML"} and \code{model = - "BM"}) the maximum likelihood estimation is done simultaneously on - the ancestral values and the variance of the Brownian motion process; + "BM"}) the maximum likelihood estimation is done simultaneously on the + ancestral values and the variance of the Brownian motion process; these estimates are then used to compute the confidence intervals in - the standard way (see the package \pkg{geiger} for a different - implementation). If \code{method = "pic"} or \code{"GLS"}, the - confidence intervals are computed using the expected variances under - the model, so they depend only on the tree. + the standard way. The REML method first estimates the ancestral value + at the root (aka, the phylogenetic mean), then the variance of the + Brownian motion process is estimated by optimizing the residual + log-likelihood. The ancestral values are finally inferred from the + likelihood function giving these two parameters. If \code{method = + "pic"} or \code{"GLS"}, the confidence intervals are computed using + the expected variances under the model, so they depend only on the + tree. + + It could be shown that, with a continous character, REML results in + unbiased estimates of the variance of the Brownian motion process + while ML gives a downward bias. Therefore, the former is recommanded + over the latter, even though it is not the default. For discrete characters (\code{type = "discrete"}), only maximum likelihood estimation is available (Pagel 1994). The model is -- 2.39.2