## mlphylo.R (2008-07-15) ## Estimating Phylogenies by Maximum Likelihood ## Copyright 2006-2008 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. logLik.phylo <- function(object, ...) attr(object, "loglik") deviance.phylo <- function(object, ...) -2*attr(object, "loglik") AIC.phylo <- function(object, ..., k = 2) { np <- length(object$edge.length) + length(attr(object, "rates")) + length(attr(object, "alpha")) + length(attr(object, "invar")) + length(attr(object, "xi")) if (!attr(object, "model") %in% c("JC69", "F81")) np <- np + 3 -2*attr(object, "loglik") + k*np } .subst.model <- structure(c(0, 1, 0, 1, 1, 1, 2, 5), names = c("JC69", "K80", "F81", "F84", "HKY85", "T92", "TN93", "GTR")) mlphylo <- function(x, phy, model = DNAmodel(), search.tree = FALSE, quiet = FALSE, value = NULL, fixed = FALSE) { ## not yet generic.... if (class(x) != "DNAbin") stop("DNA sequences not in binary format") if (!is.binary.tree(phy)) stop("the initial tree must be dichotomous.") if (!quiet && is.rooted(phy)) { warning("the initial tree is rooted: it will be unrooted.") phy <- unroot(phy) } if (is.null(phy$edge.length)) stop("the initial tree must have branch lengths.") if (any(phy$edge.length > 1)) stop("some branch lengths are greater than one.") phy <- reorder(phy, "pruningwise") if (!quiet) cat("Preparing the sequences...\n") if (is.list(x)) x <- as.matrix(x) if (is.null(rownames(x))) stop("DNA sequences have no names") # safe... if (!all(names(x) %in% phy$tip.label)) stop("the names of the DNA sequences and the tip labels of the tree do not match") # safe here also x <- x[phy$tip.label, ] Y <- prepareDNA(x, model) BF <- if (Y$model %in% 1:2) rep(0.25, 4) else base.freq(x) S <- length(Y$weight) npart <- dim(Y$partition)[2] # the number of overall partitions ## in case of negative branch lengths: phy$edge.length <- abs(phy$edge.length) nb.tip <- length(phy$tip.label) para <- if (Y$npara) rep(1, Y$npara) else 0 alpha <- if (Y$nalpha) rep(.5, Y$nalpha) else 0 invar <- if (Y$ninvar) rep(0.5, Y$ninvar) else 0 if (!is.null(value)) { if (para && !is.null(value$rates)) para <- value$rates[1:Y$npara] if (alpha && !is.null(value$alpha)) alpha <- value$alpha[1:Y$nalpha] if (invar && !is.null(value$invar)) invar <- value$invar[1:Y$ninvar] } loglik <- 0 if (!quiet) cat("Fitting in progress... ") res <- .C("mlphylo_DNAmodel", as.integer(nb.tip), as.integer(S), as.raw(Y$SEQ), as.double(Y$ANC), as.double(Y$w), as.integer(phy$edge[, 1]), as.integer(phy$edge[, 2]), as.double(phy$edge.length), as.integer(npart), as.integer(Y$partition), as.integer(Y$model), as.double(Y$xi), as.double(para), as.integer(Y$npara), as.double(alpha), as.integer(Y$nalpha), as.integer(Y$ncat), as.double(invar), as.integer(Y$ninvar), as.double(BF), as.integer(search.tree), as.integer(fixed), as.double(loglik), NAOK = TRUE, PACKAGE = "ape") if (!quiet) cat("DONE!\n") phy$edge.length = res[[8]] attr(phy, "loglik") <- res[[23]] attr(phy, "npart") <- npart attr(phy, "model") <- names(Y$npara) if (para) attr(phy, "rates") <- res[[13]] if (alpha) attr(phy, "alpha") <- res[[15]] if (invar) attr(phy, "invar") <- res[[18]] if (npart > 1) attr(phy, "xi") <- res[[12]] phy } DNAmodel <- function(model = "K80", partition = 1, ncat.isv = 1, invar = FALSE, equal.isv = TRUE, equal.invar = 1) { if (ncat.isv > 10) stop("number of categories for inter-site variation cannot exceed 10") structure(list(model = model, partition = partition, ncat.isv = ncat.isv, invar = invar, equal.isv = equal.isv, equal.invar = equal.invar), class = "DNAmodel") } prepareDNA <- function(X, DNAmodel) { L <- dim(X)[2] # already converted as a matrix in mlphylo() npart <- length(unique(DNAmodel$partition)) ## find which substitution model: mo <- which(names(.subst.model) == DNAmodel$model) npara <- .subst.model[mo] # keeps the 'names' ## inter-sites variation: nalpha <- as.numeric(DNAmodel$ncat.isv > 1) if (!DNAmodel$equal.isv) nalpha <- npart * nalpha ## proportion of invariants: ninvar <- as.numeric(DNAmodel$invar) if (!DNAmodel$equal.invar) ninvar <- npart * ninvar SEQ <- weight <- part <- NULL ## For each partition... for (i in 1:npart) { ## extracts the sites in this partition: M <- X[, DNAmodel$partition == i, drop = FALSE] ## convert each column as a character string: M <- apply(M, 2, rawToChar) ## get their frequencies: w <- table(M) ## convert back to raw the unique(M): M <- sapply(dimnames(w)[[1]], charToRaw) ## remove useless attributes: colnames(M) <- dimnames(w) <- NULL w <- unclass(w) ## bind everything: SEQ <- cbind(SEQ, M) weight <- c(weight, w) part <- c(part, length(w)) # the length of each partition } class(SEQ) <- "DNAbin" ANC <- array(0, c(nrow(SEQ) - 2, ncol(SEQ), 4)) ## 'partition' gives the start and end of each partition: partition <- matrix(1, 2, npart) partition[2, ] <- cumsum(part) if (npart > 1) { partition[1, 2:npart] <- partition[2, 1:(npart - 1)] + 1 partition[2, npart] <- length(weight) xi <- rep(1, npart - 1) } else xi <- 0 list(SEQ = SEQ, ANC = ANC, weight = weight, partition = partition, model = mo, xi = xi, npara = npara, nalpha = nalpha, ncat = DNAmodel$ncat.isv, ninvar = ninvar) }