## mlphylo.R (2008-07-15)

##   Estimating Phylogenies by Maximum Likelihood

## Copyright 2006-2008 Emmanuel Paradis

## This file is part of the R-package `ape'.
## See the file ../COPYING for licensing issues.

logLik.phylo <- function(object, ...) attr(object, "loglik")

deviance.phylo <- function(object, ...) -2*attr(object, "loglik")

AIC.phylo <- function(object, ..., k = 2)
{
    np <- length(object$edge.length) +
        length(attr(object, "rates")) +
            length(attr(object, "alpha")) +
                length(attr(object, "invar")) +
                    length(attr(object, "xi"))
    if (!attr(object, "model") %in% c("JC69", "F81"))
        np <- np + 3
    -2*attr(object, "loglik") + k*np
}

.subst.model <- structure(c(0, 1, 0, 1, 1, 1, 2, 5),
   names = c("JC69", "K80", "F81", "F84",
   "HKY85", "T92", "TN93", "GTR"))

mlphylo <-
    function(x, phy, model = DNAmodel(), search.tree = FALSE,
             quiet = FALSE, value = NULL, fixed = FALSE)
{
    ## not yet generic....
    if (class(x) != "DNAbin") stop("DNA sequences not in binary format")
    if (!is.binary.tree(phy))
        stop("the initial tree must be dichotomous.")
    if (!quiet && is.rooted(phy)) {
        warning("the initial tree is rooted: it will be unrooted.")
        phy <- unroot(phy)
    }
    if (is.null(phy$edge.length))
      stop("the initial tree must have branch lengths.")
    if (any(phy$edge.length > 1))
      stop("some branch lengths are greater than one.")
    phy <- reorder(phy, "pruningwise")
    if (!quiet) cat("Preparing the sequences...\n")
    if (is.list(x)) x <- as.matrix(x)
    if (is.null(rownames(x)))
        stop("DNA sequences have no names") # safe...
    if (!all(names(x) %in% phy$tip.label))
        stop("the names of the DNA sequences and the tip labels
of the tree do not match") # safe here also
    x <- x[phy$tip.label, ]
    Y <- prepareDNA(x, model)
    BF <- if (Y$model %in% 1:2) rep(0.25, 4) else base.freq(x)
    S <- length(Y$weight)
    npart <- dim(Y$partition)[2] # the number of overall partitions
    ## in case of negative branch lengths:
    phy$edge.length <- abs(phy$edge.length)
    nb.tip <- length(phy$tip.label)
    para <- if (Y$npara) rep(1, Y$npara) else 0
    alpha <- if (Y$nalpha) rep(.5, Y$nalpha) else 0
    invar <- if (Y$ninvar) rep(0.5, Y$ninvar) else 0

    if (!is.null(value)) {
        if (para && !is.null(value$rates))
            para <- value$rates[1:Y$npara]
        if (alpha && !is.null(value$alpha))
            alpha <- value$alpha[1:Y$nalpha]
        if (invar && !is.null(value$invar))
            invar <- value$invar[1:Y$ninvar]
    }

    loglik <- 0
    if (!quiet) cat("Fitting in progress... ")
    res <- .C("mlphylo_DNAmodel", as.integer(nb.tip), as.integer(S),
              as.raw(Y$SEQ), as.double(Y$ANC), as.double(Y$w),
              as.integer(phy$edge[, 1]), as.integer(phy$edge[, 2]),
              as.double(phy$edge.length), as.integer(npart),
              as.integer(Y$partition), as.integer(Y$model),
              as.double(Y$xi), as.double(para), as.integer(Y$npara),
              as.double(alpha), as.integer(Y$nalpha),
              as.integer(Y$ncat), as.double(invar), as.integer(Y$ninvar),
              as.double(BF), as.integer(search.tree), as.integer(fixed),
              as.double(loglik), NAOK = TRUE, PACKAGE = "ape")
    if (!quiet) cat("DONE!\n")
    phy$edge.length = res[[8]]
    attr(phy, "loglik") <- res[[23]]
    attr(phy, "npart") <- npart
    attr(phy, "model") <- names(Y$npara)
    if (para) attr(phy, "rates") <- res[[13]]
    if (alpha) attr(phy, "alpha") <- res[[15]]
    if (invar) attr(phy, "invar") <- res[[18]]
    if (npart > 1) attr(phy, "xi") <- res[[12]]
    phy
}

DNAmodel <- function(model = "K80", partition = 1,
         ncat.isv = 1, invar = FALSE,
         equal.isv = TRUE, equal.invar = 1)
{
    if (ncat.isv > 10)
        stop("number of categories for inter-site variation cannot exceed 10")
    structure(list(model = model, partition = partition,
                   ncat.isv = ncat.isv, invar = invar,
                   equal.isv = equal.isv, equal.invar = equal.invar),
              class = "DNAmodel")
}

prepareDNA <- function(X, DNAmodel)
{
    L <- dim(X)[2] # already converted as a matrix in mlphylo()

    npart <- length(unique(DNAmodel$partition))

    ## find which substitution model:
    mo <- which(names(.subst.model) == DNAmodel$model)
    npara <- .subst.model[mo] # keeps the 'names'

    ## inter-sites variation:
    nalpha <- as.numeric(DNAmodel$ncat.isv > 1)
    if (!DNAmodel$equal.isv) nalpha <- npart * nalpha

    ## proportion of invariants:
    ninvar <- as.numeric(DNAmodel$invar)
    if (!DNAmodel$equal.invar) ninvar <- npart * ninvar

    SEQ <- weight <- part <- NULL

    ## For each partition...
    for (i in 1:npart) {
        ## extracts the sites in this partition:
        M <- X[, DNAmodel$partition == i, drop = FALSE]
        ## convert each column as a character string:
        M <- apply(M, 2, rawToChar)
        ## get their frequencies:
        w <- table(M)
        ## convert back to raw the unique(M):
        M <- sapply(dimnames(w)[[1]], charToRaw)
        ## remove useless attributes:
        colnames(M) <- dimnames(w) <- NULL
        w <- unclass(w)
        ## bind everything:
        SEQ <- cbind(SEQ, M)
        weight <- c(weight, w)
        part <- c(part, length(w)) # the length of each partition
    }

    class(SEQ) <- "DNAbin"
    ANC <- array(0, c(nrow(SEQ) - 2, ncol(SEQ), 4))

    ## 'partition' gives the start and end of each partition:
    partition <- matrix(1, 2, npart)
    partition[2, ] <- cumsum(part)
    if (npart > 1) {
        partition[1, 2:npart] <- partition[2, 1:(npart - 1)] + 1
        partition[2, npart] <- length(weight)
        xi <- rep(1, npart - 1)
    } else xi <- 0
    list(SEQ = SEQ, ANC = ANC, weight = weight, partition = partition,
         model = mo, xi = xi, npara = npara, nalpha = nalpha,
         ncat = DNAmodel$ncat.isv, ninvar = ninvar)
}