new dist.gene() + corrected mlphylo()

author paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>

Fri, 18 Jul 2008 11:39:37 +0000 (11:39 +0000)

committer paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>

Fri, 18 Jul 2008 11:39:37 +0000 (11:39 +0000)
author paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>
Fri, 18 Jul 2008 11:39:37 +0000 (11:39 +0000)
committer paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>
Fri, 18 Jul 2008 11:39:37 +0000 (11:39 +0000)
diff --git a/Changes b/Changes

index a99d5d528217a6585821ef5424a188ad11c4f68c..cfea531b4ca2f06db23fe32d276382022a4c896a 100644 (file)
--- a/Changes
+++ b/Changes
@@ -1,3 +1,13 @@
+               CHANGES IN APE VERSION 2.2-2
+
+
+NEW FEATURES
+
+    o dist.gene() has been substantially improved and gains an option
+      'pairwise.deletion'.
+
+
+
                 CHANGES IN APE VERSION 2.2-1
  
  
diff --git a/DESCRIPTION b/DESCRIPTION

index c40a448d269df9e2a02580b6531edfbd70f69e16..96c7945a0b8c287b4190bc3c13435d6405f30185 100644 (file)
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
  Package: ape
-Version: 2.2-1
-Date: 2008-07-11
+Version: 2.2-2
+Date: 2008-07-18
  Title: Analyses of Phylogenetics and Evolution
  Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong,
    Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel,
diff --git a/R/dist.gene.R b/R/dist.gene.R

index b0c432eef9fb1cbb8a91958dd646eeded6536ff4..ea17b9b577a37c3cc617d552d9e5bfa5d8eb3465 100644 (file)
--- a/R/dist.gene.R
+++ b/R/dist.gene.R
@@ -1,47 +1,54 @@
-## dist.gene.R (2002-08-28)
+## dist.gene.R (2008-07-18)
  
  ##   Pairwise Distances from Genetic Data
  
-## Copyright 2002 Emmanuel Paradis
+## Copyright 2002-2008 Emmanuel Paradis
  
  ## This file is part of the R-package `ape'.
  ## See the file ../COPYING for licensing issues.
  
-dist.gene.pairwise <- function(x, variance = FALSE)
+dist.gene <-
+    function(x, method = "pairwise", pairwise.deletion = FALSE,
+             variance = FALSE)
  {
-    if (is.data.frame(x)) x <- as.matrix(x)
-    L <- ncol(x)
-    n <- nrow(x)
-    D <- matrix(NA, n, n)
-    diag(D) <- 0
+    if (!is.data.frame(x) || !is.matrix(x))
+        stop("'x' should be a matrix or a data.frame")
+    method <- match.arg(method, c("pairwise", "percentage"))
+
+    if (!pairwise.deletion) {
+        ## delete the columns with at least one NA:
+        del <- apply(x, 2, function(xx) any(is.na(xx)))
+        x <- x[, -del]
+    }
+    n <- dim(x)
+    L <- n[2]
+    n <- n[1]
+    D <- double(n*(n - 1)/2)
+    if (pairwise.deletion) L <- D
+    k <- 1
      for (i in 1:(n - 1)) {
          for (j in (i + 1):n) {
-            D[i, j] <- D[j, i] <- L - sum(x[i, ] == x[j, ])
+            y <- x[i, ] != x[j, ]
+            if (pairwise.deletion) L[k] <- sum(!is.na(y))
+            D[k] <-  sum(y, na.rm = TRUE)
+            k <- k + 1
          }
      }
-    if (!is.null(rownames(x))) rownames(D) <- colnames(D) <- rownames(x)
-    if (variance) {
-        var.D <- D * (L - D) / L
-        return(list(distances = D, variance = var.D))
-    }
-    else return(D)
-}
+    ## L is either a single integer value if pairwise.deletion = FALSE,
+    ## or a vector of integers if pairwise.deletion = TRUE
+
+    if (method == "percentage") D <- D/L
+
+    attr(D, "Size") <- n
+    attr(D, "Labels") <-  dimnames(x)[[1]]
+    attr(D, "Diag") <- attr(d, "Upper") <- FALSE
+    attr(D, "call") <- match.call()
+    attr(D, "method") <- method
+    class(D) <- "dist"
  
-dist.gene.percentage <- function(x, variance = FALSE)
-{
-    L <- ncol(x)
-    D <- dist.gene.pairwise(x) / L
      if (variance) {
-        var.D <- D * (1 - D) / L
-        return(list(pairwise.distances = D, variance = var.D))
+        y <- if (method == "pairwise") L else 1
+        attr(D, "variance") <- D*(y - D)/L
      }
-    else return(D)
-}
-
-dist.gene <- function(x, method = "pairwise", variance = FALSE)
-{
-    if (method == "pairwise")
-      return(dist.gene.pairwise(x, variance = variance))
-    if (method == "percentage")
-      return(dist.gene.percentage(x, variance = variance))
+    D
  }
diff --git a/R/mlphylo.R b/R/mlphylo.R

index a67379be89b55869ab88c9aab43803263e1f5ea2..5f4b6ecd56cee8758c28c0b06ace72cc07a8ccbd 100644 (file)
--- a/R/mlphylo.R
+++ b/R/mlphylo.R
@@ -1,4 +1,4 @@
-## mlphylo.R (2008-06-18)
+## mlphylo.R (2008-07-15)
  
  ##   Estimating Phylogenies by Maximum Likelihood
  
@@ -74,7 +74,7 @@ of the tree do not match") # safe here also
  
      loglik <- 0
      if (!quiet) cat("Fitting in progress... ")
-    res <<- res <- .C("mlphylo_DNAmodel", as.integer(nb.tip), as.integer(S),
+    res <- .C("mlphylo_DNAmodel", as.integer(nb.tip), as.integer(S),
                as.raw(Y$SEQ), as.double(Y$ANC), as.double(Y$w),
                as.integer(phy$edge[, 1]), as.integer(phy$edge[, 2]),
                as.double(phy$edge.length), as.integer(npart),
@@ -83,7 +83,7 @@ of the tree do not match") # safe here also
                as.double(alpha), as.integer(Y$nalpha),
                as.integer(Y$ncat), as.double(invar), as.integer(Y$ninvar),
                as.double(BF), as.integer(search.tree), as.integer(fixed),
-              as.double(loglik), NAOK = TRUE, PACKAGE = "mlphylo")
+              as.double(loglik), NAOK = TRUE, PACKAGE = "ape")
      if (!quiet) cat("DONE!\n")
      phy$edge.length = res[[8]]
      attr(phy, "loglik") <- res[[23]]
diff --git a/man/dist.gene.Rd b/man/dist.gene.Rd

index 730cdbacd5ab67464b87ec01217fffe01e4e7d33..45ac67ae4cf74bb3b8aa7874757bd18f06590aeb 100644 (file)
--- a/man/dist.gene.Rd
+++ b/man/dist.gene.Rd
@@ -1,28 +1,28 @@
  \name{dist.gene}
  \alias{dist.gene}
-\alias{dist.gene.pairwise}
-\alias{dist.gene.percentage}
  \title{Pairwise Distances from Genetic Data}
  \usage{
-dist.gene(x, method = "pairwise", variance = FALSE)
-dist.gene.pairwise(x, variance = FALSE)
-dist.gene.percentage(x, variance = FALSE)
+dist.gene(x, method = "pairwise", pairwise.deletion = FALSE,
+          variance = FALSE)
  }
  \arguments{
    \item{x}{a matrix or a data frame.}
    \item{method}{a character string specifying the method used to compute
-    the distances; only two choices are available: \code{"pairwise"},
-    and \code{"percentage"}.}
+    the distances; two choices are available: \code{"pairwise"} and
+    \code{"percentage"}, or any unambiguous abbreviation of these.}
+  \item{pairwise.deletion}{a logical indicating whether to delete the
+    columns with missing data on a pairwise basis. The default is to
+    delete the columns with at least one missing observation.}
    \item{variance}{a logical, indicates whether the variance of the
-    distances should be returned (default to FALSE).}
+    distances should be returned (default to \code{FALSE}).}
  }
  \description{
-  These functions compute a matrix of distances between pairs of
+  This function computes a matrix of distances between pairs of
    individuals from a matrix or a data frame of genetic data.
  }
  \details{
    This function is meant to be very general and accepts different kinds
-  of data (alleles, haplotypes, DNA sequences, and so on). The rows of
+  of data (alleles, haplotypes, SNP, DNA sequences, \dots). The rows of
    the data matrix represent the individuals, and the columns the loci.
  
    In the case of the pairwise method, the distance \eqn{d} between two
@@ -36,16 +36,17 @@ dist.gene.percentage(x, variance = FALSE)
    For more elaborate distances with DNA sequences, see the function
    \code{dist.dna}.
  }
+\note{
+  Missing data (\code{NA}) are coded and treated in R's usual way.
+}
  \value{
-  either a numeric matrix with possibly the names of the individuals (as
-  given by the rownames of the argument \code{x}) as colnames and rownames
-  (if \code{variance = FALSE}, the default), or a list of two matrices names
-  \code{distances} and \code{variance}, respectively (if \code{variance =
-    TRUE}).
+  an object of class \link[stats]{"dist"}. If \code{variance = TRUE} an
+  attribute called \code{"variance"} is given to the returned object.
  }
  \author{Emmanuel Paradis \email{Emmanuel.Paradis@mpl.ird.fr}}
  \seealso{
-  \code{\link{dist.dna}}, \code{\link{cophenetic.phylo}}
+  \code{\link{dist.dna}}, \code{\link{cophenetic.phylo}},
+  \code{\link[stats]{dist}}
  }
  \keyword{manip}
author	paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>
	Fri, 18 Jul 2008 11:39:37 +0000 (11:39 +0000)
committer	paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>
	Fri, 18 Jul 2008 11:39:37 +0000 (11:39 +0000)
Changes		patch \| blob \| history
DESCRIPTION		patch \| blob \| history
R/dist.gene.R		patch \| blob \| history
R/mlphylo.R		patch \| blob \| history
man/dist.gene.Rd		patch \| blob \| history