R/mlphylo.R

   1 ## mlphylo.R (2008-01-03)
   2
   3 ##   Estimating Phylogenies by Maximum Likelihood
   4
   5 ## Copyright 2006-2008 Emmanuel Paradis
   6
   7 ## This file is part of the R-package `ape'.
   8 ## See the file ../COPYING for licensing issues.
   9
  10 logLik.phylo <- function(object, ...) attr(object, "loglik")
  11
  12 deviance.phylo <- function(object, ...) -2*attr(object, "loglik")
  13
  14 AIC.phylo <- function(object, ..., k = 2)
  15 {
  16     np <- length(object$edge.length) +
  17         length(attr(object, "rates")) +
  18             length(attr(object, "alpha")) +
  19                 length(attr(object, "invar")) +
  20                     length(attr(object, "xi"))
  21     if (!attr(object, "model") %in% c("JC69", "F81"))
  22         np <- np + 3
  23     -2*attr(object, "loglik") + k*np
  24 }
  25
  26 .subst.model <- structure(c(0, 1, 0, 1, 1, 1, 2, 5),
  27    names = c("JC69", "K80", "F81", "F84",
  28    "HKY85", "T92", "TN93", "GTR"))
  29
  30 mlphylo <-
  31     function(x, phy, model = DNAmodel(), search.tree = FALSE,
  32              quiet = FALSE, value = NULL, fixed = FALSE)
  33 {
  34     ## not yet generic....
  35     if (class(x) != "DNAbin") stop("DNA sequences not in binary format")
  36     if (!is.binary.tree(phy))
  37         stop("the initial tree must be dichotomous.")
  38     if (!quiet && is.rooted(phy)) {
  39         warning("the initial tree is rooted: it will be unrooted.")
  40         phy <- unroot(phy)
  41     }
  42     if (is.null(phy$edge.length))
  43       stop("the initial tree must have branch lengths.")
  44     if (any(phy$edge.length > 1))
  45       stop("some branch lengths are greater than one.")
  46     phy <- reorder(phy, "pruningwise")
  47     if (!quiet) cat("Preparing the sequences...\n")
  48     BF <- if (model$model %in% 1:2) rep(0.25, 4) else base.freq(x)
  49     if (is.list(x)) x <- as.matrix(x)
  50     if (is.null(rownames(x)))
  51         stop("DNA sequences have no names") # safe...
  52     if (!all(names(x) %in% phy$tip.label))
  53         stop("the names of the DNA sequences and the tip labels
  54 of the tree do not match") # safe here also
  55     x <- x[phy$tip.label, ]
  56     Y <- prepareDNA(x, model)
  57     S <- length(Y$weight)
  58     npart <- dim(Y$partition)[2] # the number of overall partitions
  59     ## in case of negative branch lengths:
  60     phy$edge.length <- abs(phy$edge.length)
  61     nb.tip <- length(phy$tip.label)
  62     para <- if (Y$npara) rep(1, Y$npara) else 0
  63     alpha <- if (Y$nalpha) rep(.5, Y$nalpha) else 0
  64     invar <- if (Y$ninvar) rep(0.5, Y$ninvar) else 0
  65
  66     if (!is.null(value)) {
  67         if (para && !is.null(value$rates))
  68             para <- value$rates[1:Y$npara]
  69         if (alpha && !is.null(value$alpha))
  70             alpha <- value$alpha[1:Y$nalpha]
  71         if (invar && !is.null(value$invar))
  72             invar <- value$invar[1:Y$ninvar]
  73     }
  74
  75     loglik <- 0
  76     if (!quiet) cat("Fitting in progress... ")
  77     res <- .C("mlphylo_DNAmodel", as.integer(nb.tip), as.integer(S),
  78               as.raw(Y$SEQ), as.double(Y$ANC), as.double(Y$w),
  79               as.integer(phy$edge[, 1]), as.integer(phy$edge[, 2]),
  80               as.double(phy$edge.length), as.integer(npart),
  81               as.integer(Y$partition), as.integer(Y$model),
  82               as.double(Y$xi), as.double(para), as.integer(Y$npara),
  83               as.double(alpha), as.integer(Y$nalpha),
  84               as.integer(Y$ncat), as.double(invar), as.integer(Y$ninvar),
  85               as.double(BF), as.integer(search.tree), as.integer(fixed),
  86               as.double(loglik), NAOK = TRUE, PACKAGE = "ape")
  87     if (!quiet) cat("DONE!\n")
  88     phy$edge.length = res[[8]]
  89     attr(phy, "loglik") <- res[[23]]
  90     attr(phy, "npart") <- npart
  91     attr(phy, "model") <- names(Y$npara)
  92     if (para) attr(phy, "rates") <- res[[13]]
  93     if (alpha) attr(phy, "alpha") <- res[[15]]
  94     if (invar) attr(phy, "invar") <- res[[18]]
  95     if (npart > 1) attr(phy, "xi") <- res[[12]]
  96     phy
  97 }
  98
  99 DNAmodel <- function(model = "K80", partition = 1,
 100          ncat.isv = 1, invar = FALSE,
 101          equal.isv = TRUE, equal.invar = 1)
 102 {
 103     if (ncat.isv > 10)
 104         stop("number of categories for inter-site variation cannot exceed 10")
 105     structure(list(model = model, partition = partition,
 106                    ncat.isv = ncat.isv, invar = invar,
 107                    equal.isv = equal.isv, equal.invar = equal.invar),
 108               class = "DNAmodel")
 109 }
 110
 111 prepareDNA <- function(X, DNAmodel)
 112 {
 113     L <- dim(X)[2] # already converted as a matrix in mlphylo()
 114
 115     npart <- length(unique(DNAmodel$partition))
 116
 117     ## find which substitution model:
 118     mo <- which(names(.subst.model) == DNAmodel$model)
 119     npara <- .subst.model[mo] # keeps the 'names'
 120
 121     ## inter-sites variation:
 122     nalpha <- as.numeric(DNAmodel$ncat.isv > 1)
 123     if (!DNAmodel$equal.isv) nalpha <- npart * nalpha
 124
 125     ## proportion of invariants:
 126     ninvar <- as.numeric(DNAmodel$invar)
 127     if (!DNAmodel$equal.invar) ninvar <- npart * ninvar
 128
 129     SEQ <- weight <- part <- NULL
 130
 131     ## For each partition...
 132     for (i in 1:npart) {
 133         ## extracts the sites in this partition:
 134         M <- X[, DNAmodel$partition == i, drop = FALSE]
 135         ## convert each column as a character string:
 136         M <- apply(M, 2, rawToChar)
 137         ## get their frequencies:
 138         w <- table(M)
 139         ## convert back to raw the unique(M):
 140         M <- sapply(dimnames(w)[[1]], charToRaw)
 141         ## remove useless attributes:
 142         colnames(M) <- dimnames(w) <- NULL
 143         w <- unclass(w)
 144         ## bind everything:
 145         SEQ <- cbind(SEQ, M)
 146         weight <- c(weight, w)
 147         part <- c(part, length(w)) # the length of each partition
 148     }
 149
 150     class(SEQ) <- "DNAbin"
 151     ANC <- array(1, c(nrow(SEQ) - 2, ncol(SEQ), 4))
 152
 153     ## 'partition' gives the start and end of each partition:
 154     partition <- matrix(1, 2, npart)
 155     partition[2, ] <- cumsum(part)
 156     if (npart > 1) {
 157         partition[1, 2:npart] <- partition[2, 1:(npart - 1)] + 1
 158         partition[2, npart] <- length(weight)
 159         xi <- rep(1, npart - 1)
 160     } else xi <- 0
 161     list(SEQ = SEQ, ANC = ANC, weight = weight, partition = partition,
 162          model = mo, xi = xi, npara = npara, nalpha = nalpha,
 163          ncat = DNAmodel$ncat.isv, ninvar = ninvar)
 164 }