From 100e0d086372a87bc2506873bdf86d3fcdda9922 Mon Sep 17 00:00:00 2001 From: paradis Date: Tue, 14 Jul 2009 04:40:09 +0000 Subject: [PATCH] fixing nj() with many 0 distances git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@82 6e262413-ae40-0410-9e79-b911bd7a66b7 --- ChangeLog | 2 + DESCRIPTION | 2 +- R/nj.R | 16 ++- src/nj.c | 292 ++++++++++++++++++++++++---------------------------- 4 files changed, 146 insertions(+), 166 deletions(-) diff --git a/ChangeLog b/ChangeLog index dbe6fe0..e14ba74 100644 --- a/ChangeLog +++ b/ChangeLog @@ -16,6 +16,8 @@ BUG FIXES o vcv.phylo() did not work correctly when the tree is in "pruningwise" order. + o nj() did not handle correctly distance matrices with many 0's. + CHANGES IN APE VERSION 2.3-1 diff --git a/DESCRIPTION b/DESCRIPTION index 0ff3acf..8668f1f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 2.3-2 -Date: 2009-07-06 +Date: 2009-07-09 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Gangolf Jobb, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis diff --git a/R/nj.R b/R/nj.R index c99bce3..6f60d8e 100644 --- a/R/nj.R +++ b/R/nj.R @@ -1,8 +1,8 @@ -## nj.R (2006-09-15) +## nj.R (2009-07-10) ## Neighbor-Joining Tree Estimation -## Copyright 2004-2006 Emmanuel Paradis +## Copyright 2004-2009 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -13,13 +13,11 @@ nj <- function(X) N <- attr(X, "Size") labels <- attr(X, "Labels") if (is.null(labels)) labels <- as.character(1:N) - edge1 <- edge2 <- integer(2*N - 3) - edge.length <- numeric(2*N - 3) - ans <- .C("nj", as.double(X), as.integer(N), as.integer(edge1), - as.integer(edge2), as.double(edge.length), PACKAGE = "ape") - obj <- list(edge = cbind(ans[[3]], ans[[4]]), - edge.length = ans[[5]], tip.label = labels) - obj$Nnode <- N - 2 + ans <- .C("nj", as.double(X), as.integer(N), integer(2*N - 3), + integer(2*N - 3), double(2*N - 3), + DUP = FALSE, NAOK = TRUE, PACKAGE = "apex") + obj <- list(edge = cbind(ans[[3]], ans[[4]]), edge.length = ans[[5]], + tip.label = labels, Nnode = N - 2L) class(obj) <- "phylo" reorder(obj) } diff --git a/src/nj.c b/src/nj.c index 84e1a73..af53abb 100644 --- a/src/nj.c +++ b/src/nj.c @@ -1,6 +1,6 @@ -/* nj.c 2006-11-13 */ +/* nj.c 2009-07-09 */ -/* Copyright 2006 Emmanuel Paradis +/* Copyright 2006-2009 Emmanuel Paradis /* This file is part of the R-package `ape'. */ /* See the file ../COPYING for licensing issues. */ @@ -11,30 +11,51 @@ int give_index(int i, int j, int n) { - if (i > j) return(DINDEX(j, i)); - else return(DINDEX(i, j)); + if (i > j) return(DINDEX(j, i)); + else return(DINDEX(i, j)); } double sum_dist_to_i(int n, double *D, int i) /* returns the sum of all distances D_ij between i and j - with j between 1, and n and j != i */ + with j = 1...n and j != i */ { - double sum; - int j; - - sum = 0; - - if (i != 1) { - for (j = 1; j < i; j++) - sum += D[DINDEX(j, i)]; - } +/* we use the fact that the distances are arranged sequentially + in the lower triangle, e.g. with n = 6 the 15 distances are + stored as (the C indices are indicated): + + i + 1 2 3 4 5 + + 2 0 + 3 1 5 +j 4 2 6 9 + 5 3 7 10 12 + 6 4 8 11 13 14 + + so that we sum the values of the ith column-1st loop-and those of + (i - 1)th row (labelled 'i')-2nd loop */ + + double sum = 0; + int j, start, end; + + if (i < n) { + /* the expression below CANNOT be factorized + because of the integer operations (it took + me a while to find out...) */ + start = n*(i - 1) - i*(i - 1)/2; + end = start + n - i; + for (j = start; j < end; j++) sum += D[j]; + } - if (i != n) { - for (j = i + 1; j <= n; j++) - sum += D[DINDEX(i, j)]; - } + if (i > 1) { + start = i - 2; + for (j = 1; j <= i - 1; j++) { + sum += D[start]; + start += n - j - 1; + } + } - return(sum); + return(sum); } #define GET_I_AND_J \ @@ -49,162 +70,121 @@ double sum_dist_to_i(int n, double *D, int i) if (i >= smallest + 1) break; \ } \ /* Finding the second OTU is easier! */ \ - OTU2 = smallest + 1 + OTU1 - n*(OTU1 - 1) + OTU1*(OTU1 - 1)/2; + OTU2 = smallest + 1 + OTU1 - n*(OTU1 - 1) + OTU1*(OTU1 - 1)/2 #define SET_CLADE \ /* give the node and tip numbers to edge */ \ edge2[k] = otu_label[OTU1 - 1]; \ edge2[k + 1] = otu_label[OTU2 - 1]; \ - edge1[k] = edge1[k + 1] = cur_nod; + edge1[k] = edge1[k + 1] = cur_nod void nj(double *D, int *N, int *edge1, int *edge2, double *edge_length) { - double SUMD, Sdist, *S, Ndist, *new_dist, A, B, *DI, d_i, x, y; - int n, i, j, k, ij, smallest, OTU1, OTU2, cur_nod, o_l, *otu_label; - - S = &Sdist; - new_dist = &Ndist; - otu_label = &o_l; - DI = &d_i; - - n = *N; - cur_nod = 2*n - 2; - - S = (double*)R_alloc(n*(n - 1)/2, sizeof(double)); - new_dist = (double*)R_alloc(n*(n - 1)/2, sizeof(double)); - otu_label = (int*)R_alloc(n, sizeof(int)); - DI = (double*)R_alloc(n - 2, sizeof(double)); - - for (i = 0; i < n; i++) otu_label[i] = i + 1; - k = 0; - - /* First, look if there are distances equal to 0. */ - /* Since there may be multichotomies, we loop - through the OTUs instead of the distances. */ - - OTU1 = 1; - while (OTU1 < n) { - OTU2 = OTU1 + 1; - while (OTU2 <= n) { - if (D[DINDEX(OTU1, OTU2)] == 0) { - SET_CLADE - edge_length[k] = edge_length[k + 1] = 0.; - k = k + 2; + double SUMD, *S, Sdist, Ndist, *new_dist, A, B, *DI, d_i, x, y; + int n, i, j, k, ij, smallest, OTU1, OTU2, cur_nod, o_l, *otu_label; - /* update */ - /* We remove the second tip label: */ - if (OTU2 < n) { - for (i = OTU2; i < n; i++) - otu_label[i - 1] = otu_label[i]; - } + S = &Sdist; + new_dist = &Ndist; + otu_label = &o_l; + DI = &d_i; + + n = *N; + cur_nod = 2*n - 2; + + S = (double*)R_alloc(n*(n - 1)/2, sizeof(double)); + new_dist = (double*)R_alloc(n*(n - 1)/2, sizeof(double)); + otu_label = (int*)R_alloc(n, sizeof(int)); + DI = (double*)R_alloc(n - 2, sizeof(double)); + + for (i = 0; i < n; i++) otu_label[i] = i + 1; + k = 0; + + while (n > 3) { + + SUMD = 0; + for (i = 0; i < n*(n - 1)/2; i++) SUMD += D[i]; ij = 0; for (i = 1; i < n; i++) { - if (i == OTU2) continue; - for (j = i + 1; j <= n; j++) { - if (j == OTU2) continue; - new_dist[ij] = D[DINDEX(i, j)]; + for (j = i + 1; j <= n; j++) { + A = sum_dist_to_i(n, D, i) - D[ij]; + B = sum_dist_to_i(n, D, j) - D[ij]; + S[ij] = (A + B)/(2*n - 4) + 0.5*D[ij] + + (SUMD - A - B - D[ij])/(n - 2); + ij++; + } + } + + /* find the 'C' index of the smallest value of S */ + smallest = 0; + for (i = 1; i < n*(n - 1)/2; i++) + if (S[smallest] > S[i]) smallest = i; + + GET_I_AND_J; + SET_CLADE; + + /* get the distances between all OTUs but the 2 selected ones + and the latter: + a) get the sum for both + b) compute the distances for the new OTU */ + A = B = ij = 0; + for (i = 1; i <= n; i++) { + if (i == OTU1 || i == OTU2) continue; + x = D[give_index(i, OTU1, n)]; /* dist between OTU1 and i */ + y = D[give_index(i, OTU2, n)]; /* dist between OTU2 and i */ + new_dist[ij] = (x + y)/2; + A += x; + B += y; ij++; - } } + /* compute the branch lengths */ + A /= n - 2; + B /= n - 2; + edge_length[k] = (D[smallest] + A - B)/2; + edge_length[k + 1] = (D[smallest] + B - A)/2; + DI[cur_nod - *N - 1] = D[smallest]; + + /* update before the next loop */ + if (OTU1 > OTU2) { /* make sure that OTU1 < OTU2 */ + i = OTU1; + OTU1 = OTU2; + OTU2 = i; + } + if (OTU1 != 1) + for (i = OTU1 - 1; i > 0; i--) otu_label[i] = otu_label[i - 1]; + if (OTU2 != n) + for (i = OTU2; i <= n; i++) otu_label[i - 1] = otu_label[i]; + otu_label[0] = cur_nod; + + for (i = 1; i < n; i++) { + if (i == OTU1 || i == OTU2) continue; + for (j = i + 1; j <= n; j++) { + if (j == OTU1 || j == OTU2) continue; + new_dist[ij] = D[DINDEX(i, j)]; + ij++; + } + } + n--; for (i = 0; i < n*(n - 1)/2; i++) D[i] = new_dist[i]; - otu_label[OTU1 - 1] = cur_nod; - /* to avoid adjusting the internal branch at the end: */ - DI[cur_nod - *N - 1] = 0; cur_nod--; - } else OTU2++; + k = k + 2; } - OTU1++; - } - - while (n > 3) { - - SUMD = 0; - for (i = 0; i < n*(n - 1)/2; i++) SUMD += D[i]; - - for (i = 1; i < n; i++) { - for (j = i + 1; j <= n; j++) { - /* we know that i < j, so: */ - ij = DINDEX(i, j); - A = sum_dist_to_i(n, D, i) - D[ij]; - B = sum_dist_to_i(n, D, j) - D[ij]; - S[ij] = (A + B)/(2*n - 4) + 0.5*D[ij] + (SUMD - A - B - D[ij])/(n - 2); - } + + for (i = 0; i < 3; i++) { + edge1[*N*2 - 4 - i] = cur_nod; + edge2[*N*2 - 4 - i] = otu_label[i]; } - /* find the 'C' index of the smallest value of S */ - smallest = 0; - for (i = 1; i < n*(n - 1)/2; i++) - if (S[smallest] > S[i]) smallest = i; - - GET_I_AND_J - - SET_CLADE - - /* get the distances between all OTUs but the 2 selected ones - and the latter: - a) get the sum for both - b) compute the distances for the new OTU */ - A = B = ij = 0; - for (i = 1; i <= n; i++) { - if (i == OTU1 || i == OTU2) continue; - x = D[give_index(i, OTU1, n)]; /* distance between OTU1 and i */ - y = D[give_index(i, OTU2, n)]; /* distance between OTU2 and i */ - new_dist[ij] = (x + y)/2; - A += x; - B += y; - ij++; - } - /* compute the branch lengths */ - A /= n - 2; - B /= n - 2; - edge_length[k] = (D[smallest] + A - B)/2; - edge_length[k + 1] = (D[smallest] + B - A)/2; - DI[cur_nod - *N - 1] = D[smallest]; - - /* update before the next loop */ - if (OTU1 > OTU2) { /* make sure that OTU1 < OTU2 */ - i = OTU1; - OTU1 = OTU2; - OTU2 = i; - } - if (OTU1 != 1) - for (i = OTU1 - 1; i > 0; i--) otu_label[i] = otu_label[i - 1]; - if (OTU2 != n) - for (i = OTU2; i <= n; i++) otu_label[i - 1] = otu_label[i]; - otu_label[0] = cur_nod; - - for (i = 1; i < n; i++) { - if (i == OTU1 || i == OTU2) continue; - for (j = i + 1; j <= n; j++) { - if (j == OTU1 || j == OTU2) continue; - new_dist[ij] = D[DINDEX(i, j)]; - ij++; - } - } - - n--; - for (i = 0; i < n*(n - 1)/2; i++) D[i] = new_dist[i]; - - cur_nod--; - k = k + 2; - } - - for (i = 0; i < 3; i++) { - edge1[*N*2 - 4 - i] = cur_nod; - edge2[*N*2 - 4 - i] = otu_label[i]; - } - - edge_length[*N*2 - 4] = (D[0] + D[1] - D[2])/2; - edge_length[*N*2 - 5] = (D[0] + D[2] - D[1])/2; - edge_length[*N*2 - 6] = (D[2] + D[1] - D[0])/2; - - for (i = 0; i < *N*2 - 3; i++) { - if (edge2[i] <= *N) continue; - /* In case there are zero branch lengths (see above): */ - if (DI[edge2[i] - *N - 1] == 0) continue; - edge_length[i] -= DI[edge2[i] - *N - 1]/2; - } + edge_length[*N*2 - 4] = (D[0] + D[1] - D[2])/2; + edge_length[*N*2 - 5] = (D[0] + D[2] - D[1])/2; + edge_length[*N*2 - 6] = (D[2] + D[1] - D[0])/2; + + for (i = 0; i < *N*2 - 3; i++) { + if (edge2[i] <= *N) continue; + /* In case there are zero branch lengths: */ + if (DI[edge2[i] - *N - 1] == 0) continue; + edge_length[i] -= DI[edge2[i] - *N - 1]/2; + } } -- 2.39.2