From: paradis Date: Wed, 21 Jul 2010 08:43:53 +0000 (+0000) Subject: improvements to compar.gee + bug fix in extract.clade X-Git-Url: https://git.donarmstrong.com/?p=ape.git;a=commitdiff_plain;h=188a8123d039d442dacb35ae678729375e22d239 improvements to compar.gee + bug fix in extract.clade git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@127 6e262413-ae40-0410-9e79-b911bd7a66b7 --- diff --git a/ChangeLog b/ChangeLog index df3b32b..f92c51c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -12,6 +12,16 @@ NEW FEATURES control the aspect of thermometers: horiz = TRUE, width = NULL, and height = NULL. + o compar.gee() has been improved with the new option 'corStruct' as an + alternative to 'phy' to specify the correlation structure, and + calculation of the QIC (Pan 2001, Biometrics). The display of the + results have also been improved. + + +BUG FIXES + + o extract.clade() sometimes shuffled the tip labels. + CHANGES IN APE VERSION 2.5-3 diff --git a/R/compar.gee.R b/R/compar.gee.R index 488379c..caead07 100644 --- a/R/compar.gee.R +++ b/R/compar.gee.R @@ -1,31 +1,51 @@ -## compar.gee.R (2008-02-21) +## compar.gee.R (2010-07-20) ## Comparative Analysis with GEEs -## Copyright 2002-2008 Emmanuel Paradis +## Copyright 2002-2010 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. -compar.gee <- function(formula, data = NULL, family = "gaussian", phy, - scale.fix = FALSE, scale.value = 1) +compar.gee <- + function(formula, data = NULL, family = gaussian, phy, + corStruct, scale.fix = FALSE, scale.value = 1) { require(gee, quietly = TRUE) - if (is.null(data)) data <- parent.frame() else { - if(!any(is.na(match(rownames(data), phy$tip.label)))) - data <- data[phy$tip.label, ] - else warning("the rownames of the data.frame and the tip labels of the tree -do not match: the former were ignored in the analysis.") + + if (!missing(corStruct)) { + if (!missing(phy)) + warning("the phylogeny was ignored because you gave a 'corStruct' object") + R <- vcv(corStruct, corr = TRUE) + } else { + R <- vcv(phy, corr = TRUE) + } + + if (is.null(data)) data <- parent.frame() + else { + nmsR <- rownames(R) + if (!identical(rownames(data), nmsR)) { + if (!any(is.na(match(rownames(data), nmsR)))) + data <- data[nmsR, ] + else { + msg <- if (missing(corStruct)) + "the tip labels of the tree" else "those of the correlation structure" + msg <- paste("the rownames of the data.frame and", msg, + "do not match: the former were ignored in the analysis") + warning(msg) + } + } } + effect.assign <- attr(model.matrix(formula, data = data), "assign") + for (i in all.vars(formula)) { if (any(is.na(eval(parse(text = i), envir = data)))) - stop("the present method cannot (yet) be used directly with missing data: you may consider removing the species with missing data from your tree with the function 'drop.tip'.") + stop("the present method cannot be used with missing data: you may consider removing the species with missing data from your tree with the function 'drop.tip'.") } - if (is.null(phy$edge.length)) stop("the tree has no branch lengths.") - R <- vcv.phylo(phy, cor = TRUE) + id <- rep(1, dim(R)[1]) - geemod <- do.call("gee", list(formula, id, data = data, family = family, R = R, + geemod <<- geemod <- do.call("gee", list(formula, id, data = data, family = family, R = R, corstr = "fixed", scale.fix = scale.fix, scale.value = scale.value)) W <- geemod$naive.variance @@ -34,10 +54,30 @@ do not match: the former were ignored in the analysis.") if (fname == "binomial") W <- summary(glm(formula, family = quasibinomial, data = data))$cov.scaled N <- geemod$nobs - dfP <- sum(phy$edge.length)*N / sum(diag(vcv.phylo(phy))) - obj <- list(call = geemod$call, + ## + ## maybe need to refine below in case of non-Brownian corStruct + if (!missing(corStruct)) phy <- attr(corStruct, "tree") + dfP <- sum(phy$edge.length)*N / sum(diag(R)) + ## + + ## compute QIC: + Y <- geemod$y + MU <- geemod$fitted.values + Qlik <- switch(fname, + "gaussian" = -sum((Y - MU)^2)/2, + "binomial" = sum(Y*log(MU/(1 - MU)) + log(1 - MU)), + "poisson" = sum(Y*log(MU) - MU), + "Gamma" = sum(Y/MU + log(MU)), + "inverse.gaussian" = sum(-Y/(2*MU^2) + 1/MU)) + Ai <- do.call("gee", list(formula, id, data = data, family = family, + corstr = "independence", scale.fix = scale.fix, + scale.value = scale.value))$naive.variance + QIC <- -2*Qlik + 2*sum(diag(solve(Ai) %*% W)) + + obj <- list(call = match.call(), effect.assign = effect.assign, nobs = N, + QIC = QIC, coefficients = geemod$coefficients, residuals = geemod$residuals, family = geemod$family$family, @@ -66,13 +106,13 @@ print.compar.gee <- function(x, ...) } else coef[, 4] <- 2 * (1 - pt(abs(coef[, 3]), df)) residu <- quantile(as.vector(x$residuals)) names(residu) <- c("Min", "1Q", "Median", "3Q", "Max") - cat("\nCall:\n") - cat(" formula: ") - print(x$call$formula) - cat("\nNumber of observations: ", x$nobs, "\n") - cat("\nModel:\n") - cat(" Link: ", x$link, "\n") + cat("Call: ") + print(x$call) + cat("Number of observations: ", x$nobs, "\n") + cat("Model:\n") + cat(" Link:", x$link, "\n") cat(" Variance to Mean Relation:", x$family, "\n") + cat("\nQIC:", x$QIC, "\n") cat("\nSummary of Residuals:\n") print(residu) if (any(nas)) @@ -107,7 +147,7 @@ drop1.compar.gee <- function(object, scope, quiet = FALSE, ...) warning("there is at least one interaction term in your model: you should be careful when interpreting the significance of the main effects.") class(ans) <- "anova" - attr(ans, "heading") <- c("Single term deletions\n\nModel:\n", - as.character(as.expression(fm))) + attr(ans, "heading") <- paste("Single term deletions\n\n Model:", + as.character(as.expression(fm)), "\n") ans } diff --git a/R/drop.tip.R b/R/drop.tip.R index e59b59b..c5eb07c 100644 --- a/R/drop.tip.R +++ b/R/drop.tip.R @@ -1,4 +1,4 @@ -## drop.tip.R (2010-02-11) +## drop.tip.R (2010-07-21) ## Remove Tips in a Phylogenetic Tree @@ -54,7 +54,7 @@ extract.clade <- function(phy, node, root.edge = 0, interactive = FALSE) if (wbl) phy$edge.length <- phy$edge.length[keep] TIPS <- phy$edge[, 2] <= Ntip tip <- phy$edge[TIPS, 2] - phy$tip.label <- phy$tip.label[tip] + phy$tip.label <- phy$tip.label[sort(tip)] # <- added sort to avoid shuffling of tip labels (2010-07-21) ## keep the ordering so no need to reorder tip.label: phy$edge[TIPS, 2] <- order(tip) if (!is.null(phy$node.label)) diff --git a/man/compar.gee.Rd b/man/compar.gee.Rd index eec81f1..170b8e0 100644 --- a/man/compar.gee.Rd +++ b/man/compar.gee.Rd @@ -3,8 +3,15 @@ \alias{print.compar.gee} \alias{drop1.compar.gee} \title{Comparative Analysis with GEEs} +\description{ + \code{compar.gee} performs the comparative analysis using generalized + estimating equations as described by Paradis and Claude (2002). + + \code{drop1} tests single effects of a fitted model output from + \code{compar.gee}. +} \usage{ -compar.gee(formula, data = NULL, family = "gaussian", phy, +compar.gee(formula, data = NULL, family = "gaussian", phy, corStruct, scale.fix = FALSE, scale.value = 1) \method{drop1}{compar.gee}(object, scope, quiet = FALSE, ...) } @@ -13,11 +20,13 @@ compar.gee(formula, data = NULL, family = "gaussian", phy, \item{data}{the name of the data frame where the variables in \code{formula} are to be found; by default, the variables are looked for in the global environment.} - \item{family}{a character string specifying the distribution assumed - for the response; by default a Gaussian distribution (with link - identity) is assumed (see \code{?family} for details on specifying - the distribution, and on changing the link function).} - \item{phy}{an object of class \code{"phylo"}.} + \item{family}{a function specifying the distribution assumed for the + response; by default a Gaussian distribution (with link identity) is + assumed (see \code{?family} for details on specifying the + distribution, and on changing the link function).} + \item{phy}{an object of class \code{"phylo"} (ignored if + \code{corStruct} is used).} + \item{corStruct}{a (phylogenetic) correlation structure.} \item{scale.fix}{logical, indicates whether the scale parameter should be fixed (TRUE) or estimated (FALSE, the default).} \item{scale.value}{if \code{scale.fix = TRUE}, gives the value for the @@ -29,13 +38,6 @@ compar.gee(formula, data = NULL, family = "gaussian", phy, about eventual ``marginality principle violation''.} \item{\dots}{further arguments to be passed to \code{drop1}.} } -\description{ - \code{compar.gee} performs the comparative analysis using generalized - estimating equations as described by Paradis and Claude (2002). - - \code{drop1} tests single effects of a fitted model output from - \code{compar.gee}. -} \details{ If a data frame is specified for the argument \code{data}, then its rownames are matched to the tip labels of \code{phy}. The user must be @@ -48,6 +50,14 @@ compar.gee(formula, data = NULL, family = "gaussian", phy, If \code{data = NULL}, then it is assumed that the variables are in the same order than the tip labels of \code{phy}. } +\note{ + The calculation of the phylogenetic degrees of freedom is likely to be + approximative for non-Brownian correlation structures (this will be + refined soon). + + The calculation of the quasilikelihood information criterion (QIC) + needs to be tested. +} \value{ \code{compar.gee} returns an object of class \code{"compar.gee"} with the following components: @@ -55,6 +65,8 @@ compar.gee(formula, data = NULL, family = "gaussian", phy, \item{effect.assign}{a vector of integers assigning the coefficients to the effects (used by \code{drop1}).} \item{nobs}{the number of observations.} + \item{QIC}{the quasilikelihood information criterion as defined by Pan + (2001).} \item{coefficients}{the estimated coefficients (or regression parameters).} \item{residuals}{the regression residuals.} \item{family}{a character string, the distribution assumed for the response.} @@ -67,6 +79,9 @@ compar.gee(formula, data = NULL, family = "gaussian", phy, \code{drop1} returns an object of class \code{"\link[stats]{anova}"}. } \references{ + Pan, W. (2001) Akaike's information criterion in generalized + estimating equations. \emph{Biometrics}, \bold{57}, 120--125. + Paradis, E. and Claude J. (2002) Analysis of comparative data using generalized estimating equations. \emph{Journal of theoretical Biology}, \bold{218}, 175--185.