From: paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>
Date: Fri, 18 Jul 2008 11:39:37 +0000 (+0000)
Subject: new dist.gene() + corrected mlphylo()
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=f5788af1ae347b00a14d94d12b50b3804d63e9bf;p=ape.git

new dist.gene() + corrected mlphylo()

git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@44 6e262413-ae40-0410-9e79-b911bd7a66b7
---

diff --git a/Changes b/Changes
index a99d5d5..cfea531 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,13 @@
+		CHANGES IN APE VERSION 2.2-2
+
+
+NEW FEATURES
+
+    o dist.gene() has been substantially improved and gains an option
+      'pairwise.deletion'.
+
+
+
 		CHANGES IN APE VERSION 2.2-1
 
 
diff --git a/DESCRIPTION b/DESCRIPTION
index c40a448..96c7945 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: ape
-Version: 2.2-1
-Date: 2008-07-11
+Version: 2.2-2
+Date: 2008-07-18
 Title: Analyses of Phylogenetics and Evolution
 Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong,
   Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel,
diff --git a/R/dist.gene.R b/R/dist.gene.R
index b0c432e..ea17b9b 100644
--- a/R/dist.gene.R
+++ b/R/dist.gene.R
@@ -1,47 +1,54 @@
-## dist.gene.R (2002-08-28)
+## dist.gene.R (2008-07-18)
 
 ##   Pairwise Distances from Genetic Data
 
-## Copyright 2002 Emmanuel Paradis
+## Copyright 2002-2008 Emmanuel Paradis
 
 ## This file is part of the R-package `ape'.
 ## See the file ../COPYING for licensing issues.
 
-dist.gene.pairwise <- function(x, variance = FALSE)
+dist.gene <-
+    function(x, method = "pairwise", pairwise.deletion = FALSE,
+             variance = FALSE)
 {
-    if (is.data.frame(x)) x <- as.matrix(x)
-    L <- ncol(x)
-    n <- nrow(x)
-    D <- matrix(NA, n, n)
-    diag(D) <- 0
+    if (!is.data.frame(x) || !is.matrix(x))
+        stop("'x' should be a matrix or a data.frame")
+    method <- match.arg(method, c("pairwise", "percentage"))
+
+    if (!pairwise.deletion) {
+        ## delete the columns with at least one NA:
+        del <- apply(x, 2, function(xx) any(is.na(xx)))
+        x <- x[, -del]
+    }
+    n <- dim(x)
+    L <- n[2]
+    n <- n[1]
+    D <- double(n*(n - 1)/2)
+    if (pairwise.deletion) L <- D
+    k <- 1
     for (i in 1:(n - 1)) {
         for (j in (i + 1):n) {
-            D[i, j] <- D[j, i] <- L - sum(x[i, ] == x[j, ])
+            y <- x[i, ] != x[j, ]
+            if (pairwise.deletion) L[k] <- sum(!is.na(y))
+            D[k] <-  sum(y, na.rm = TRUE)
+            k <- k + 1
         }
     }
-    if (!is.null(rownames(x))) rownames(D) <- colnames(D) <- rownames(x)
-    if (variance) {
-        var.D <- D * (L - D) / L
-        return(list(distances = D, variance = var.D))
-    }
-    else return(D)
-}
+    ## L is either a single integer value if pairwise.deletion = FALSE,
+    ## or a vector of integers if pairwise.deletion = TRUE
+
+    if (method == "percentage") D <- D/L
+
+    attr(D, "Size") <- n
+    attr(D, "Labels") <-  dimnames(x)[[1]]
+    attr(D, "Diag") <- attr(d, "Upper") <- FALSE
+    attr(D, "call") <- match.call()
+    attr(D, "method") <- method
+    class(D) <- "dist"
 
-dist.gene.percentage <- function(x, variance = FALSE)
-{
-    L <- ncol(x)
-    D <- dist.gene.pairwise(x) / L
     if (variance) {
-        var.D <- D * (1 - D) / L
-        return(list(pairwise.distances = D, variance = var.D))
+        y <- if (method == "pairwise") L else 1
+        attr(D, "variance") <- D*(y - D)/L
     }
-    else return(D)
-}
-
-dist.gene <- function(x, method = "pairwise", variance = FALSE)
-{
-    if (method == "pairwise")
-      return(dist.gene.pairwise(x, variance = variance))
-    if (method == "percentage")
-      return(dist.gene.percentage(x, variance = variance))
+    D
 }
diff --git a/R/mlphylo.R b/R/mlphylo.R
index a67379b..5f4b6ec 100644
--- a/R/mlphylo.R
+++ b/R/mlphylo.R
@@ -1,4 +1,4 @@
-## mlphylo.R (2008-06-18)
+## mlphylo.R (2008-07-15)
 
 ##   Estimating Phylogenies by Maximum Likelihood
 
@@ -74,7 +74,7 @@ of the tree do not match") # safe here also
 
     loglik <- 0
     if (!quiet) cat("Fitting in progress... ")
-    res <<- res <- .C("mlphylo_DNAmodel", as.integer(nb.tip), as.integer(S),
+    res <- .C("mlphylo_DNAmodel", as.integer(nb.tip), as.integer(S),
               as.raw(Y$SEQ), as.double(Y$ANC), as.double(Y$w),
               as.integer(phy$edge[, 1]), as.integer(phy$edge[, 2]),
               as.double(phy$edge.length), as.integer(npart),
@@ -83,7 +83,7 @@ of the tree do not match") # safe here also
               as.double(alpha), as.integer(Y$nalpha),
               as.integer(Y$ncat), as.double(invar), as.integer(Y$ninvar),
               as.double(BF), as.integer(search.tree), as.integer(fixed),
-              as.double(loglik), NAOK = TRUE, PACKAGE = "mlphylo")
+              as.double(loglik), NAOK = TRUE, PACKAGE = "ape")
     if (!quiet) cat("DONE!\n")
     phy$edge.length = res[[8]]
     attr(phy, "loglik") <- res[[23]]
diff --git a/man/dist.gene.Rd b/man/dist.gene.Rd
index 730cdba..45ac67a 100644
--- a/man/dist.gene.Rd
+++ b/man/dist.gene.Rd
@@ -1,28 +1,28 @@
 \name{dist.gene}
 \alias{dist.gene}
-\alias{dist.gene.pairwise}
-\alias{dist.gene.percentage}
 \title{Pairwise Distances from Genetic Data}
 \usage{
-dist.gene(x, method = "pairwise", variance = FALSE)
-dist.gene.pairwise(x, variance = FALSE)
-dist.gene.percentage(x, variance = FALSE)
+dist.gene(x, method = "pairwise", pairwise.deletion = FALSE,
+          variance = FALSE)
 }
 \arguments{
   \item{x}{a matrix or a data frame.}
   \item{method}{a character string specifying the method used to compute
-    the distances; only two choices are available: \code{"pairwise"},
-    and \code{"percentage"}.}
+    the distances; two choices are available: \code{"pairwise"} and
+    \code{"percentage"}, or any unambiguous abbreviation of these.}
+  \item{pairwise.deletion}{a logical indicating whether to delete the
+    columns with missing data on a pairwise basis. The default is to
+    delete the columns with at least one missing observation.}
   \item{variance}{a logical, indicates whether the variance of the
-    distances should be returned (default to FALSE).}
+    distances should be returned (default to \code{FALSE}).}
 }
 \description{
-  These functions compute a matrix of distances between pairs of
+  This function computes a matrix of distances between pairs of
   individuals from a matrix or a data frame of genetic data.
 }
 \details{
   This function is meant to be very general and accepts different kinds
-  of data (alleles, haplotypes, DNA sequences, and so on). The rows of
+  of data (alleles, haplotypes, SNP, DNA sequences, \dots). The rows of
   the data matrix represent the individuals, and the columns the loci.
 
   In the case of the pairwise method, the distance \eqn{d} between two
@@ -36,16 +36,17 @@ dist.gene.percentage(x, variance = FALSE)
   For more elaborate distances with DNA sequences, see the function
   \code{dist.dna}.
 }
+\note{
+  Missing data (\code{NA}) are coded and treated in R's usual way.
+}
 \value{
-  either a numeric matrix with possibly the names of the individuals (as
-  given by the rownames of the argument \code{x}) as colnames and rownames
-  (if \code{variance = FALSE}, the default), or a list of two matrices names
-  \code{distances} and \code{variance}, respectively (if \code{variance =
-    TRUE}).
+  an object of class \link[stats]{"dist"}. If \code{variance = TRUE} an
+  attribute called \code{"variance"} is given to the returned object.
 }
 \author{Emmanuel Paradis \email{Emmanuel.Paradis@mpl.ird.fr}}
 \seealso{
-  \code{\link{dist.dna}}, \code{\link{cophenetic.phylo}}
+  \code{\link{dist.dna}}, \code{\link{cophenetic.phylo}},
+  \code{\link[stats]{dist}}
 }
 \keyword{manip}