7 *****************************************************************************
9 * Copyright (c) 2004, Luke Sheneman
10 * All rights reserved.
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
16 * + Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * + Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in
20 * the documentation and/or other materials provided with the
22 * + The names of its contributors may not be used to endorse or promote
23 * products derived from this software without specific prior
26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
38 *****************************************************************************
40 * An implementation of the Relaxed Neighbor-Joining algorithm
41 * of Evans, J., Sheneman, L., and Foster, J.
47 * sheneman@cs.uidaho.edu
62 #include "distclearcut.h"
74 * The entry point to the program.
77 int clearcut_main(int argc, char *argv[]) {
79 DMAT *dmat; /* The working distance matrix */
80 DMAT *dmat_backup = NULL;/* A backup distance matrix */
81 NJ_TREE *tree; /* The phylogenetic tree */
82 NJ_ARGS *nj_args; /* Structure for holding command-line arguments */
85 /* some variables for tracking time */
87 unsigned long long startUs, endUs;
90 /* check and parse supplied command-line arguments */
91 nj_args = NJ_handle_args(argc, argv);
94 fprintf(stderr, "Clearcut: Error processing command-line arguments.\n");
98 /* for verbose reporting, print the random number seed to stdout */
99 if(nj_args->verbose_flag) {
100 printf("PRNG SEED: %d\n", nj_args->seed);
103 /* Initialize Mersenne Twister PRNG */
104 init_genrand(nj_args->seed);
107 switch(nj_args->input_mode) {
109 /* If the input type is a distance matrix */
110 case NJ_INPUT_MODE_DISTANCE:
112 /* parse the distance matrix */
113 dmat = NJ_parse_distance_matrix(nj_args);
120 /* If the input type is a multiple sequence alignment */
121 case NJ_INPUT_MODE_ALIGNED_SEQUENCES:
123 /* build a distance matrix from a multiple sequence alignment */
124 dmat = NJ_build_distance_matrix(nj_args);
126 fprintf(stderr, "Clearcut: Failed to build distance matrix from alignment.\n");
134 fprintf(stderr, "Clearcut: Could not determine how to process input\n");
139 * Output the computed distance matrix,
140 * if the user specified one.
142 if(nj_args->matrixout) {
143 NJ_output_matrix(nj_args, dmat);
147 * If we are going to generate multiple trees from
148 * the same distance matrix, we need to make a backup
149 * of the original distance matrix.
151 if(nj_args->ntrees > 1) {
152 dmat_backup = NJ_dup_dmat(dmat);
155 /* process n trees */
156 for(i=0;i<nj_args->ntrees;i++) {
159 * If the user has specified matrix shuffling, we need
160 * to randomize the distance matrix
162 if(nj_args->shuffle) {
163 NJ_shuffle_distance_matrix(dmat);
166 /* RECORD THE PRECISE TIME OF THE START OF THE NEIGHBOR-JOINING */
167 gettimeofday(&tv, NULL);
168 startUs = ((unsigned long long) tv.tv_sec * 1000000ULL)
169 + ((unsigned long long) tv.tv_usec);
173 * Invoke either the Relaxed Neighbor-Joining algorithm (default)
174 * or the "traditional" Neighbor-Joining algorithm
176 if(nj_args->neighbor) {
177 tree = NJ_neighbor_joining(nj_args, dmat);
179 tree = NJ_relaxed_nj(nj_args, dmat);
183 fprintf(stderr, "Clearcut: Failed to construct tree.\n");
187 /* RECORD THE PRECISE TIME OF THE END OF THE NEIGHBOR-JOINING */
188 gettimeofday(&tv, NULL);
189 endUs = ((unsigned long long) tv.tv_sec * 1000000ULL)
190 + ((unsigned long long) tv.tv_usec);
192 /* print the time taken to perform the neighbor join */
193 if(nj_args->verbose_flag) {
194 if(nj_args->neighbor) {
195 fprintf(stderr, "NJ tree built in %llu.%06llu secs\n",
196 (endUs - startUs) / 1000000ULL,
197 (endUs - startUs) % 1000000ULL);
199 fprintf(stderr, "RNJ tree built in %llu.%06llu secs\n",
200 (endUs - startUs) / 1000000ULL,
201 (endUs - startUs) % 1000000ULL);
205 /* Output the neighbor joining tree here */
206 NJ_output_tree(nj_args, tree, dmat, i);
208 NJ_free_tree(tree); /* Free the tree */
209 NJ_free_dmat(dmat); /* Free the working distance matrix */
212 * If we need to do another iteration, lets re-initialize
213 * our working distance matrix.
215 if(nj_args->ntrees > 1 && i<(nj_args->ntrees-1) ) {
216 dmat = NJ_dup_dmat(dmat_backup);
220 /* Free the backup distance matrix */
221 if(nj_args->ntrees > 1) {
222 NJ_free_dmat(dmat_backup);
225 /* If verbosity, describe where the tree output is */
226 if(nj_args->verbose_flag) {
227 if(nj_args->neighbor) {
228 printf("NJ tree(s) in %s\n", nj_args->outfilename);
230 printf("Relaxed NJ tree(s) in %s\n", nj_args->outfilename);
242 * NJ_find_hmin() - Find minimum transformed values along horizontal
247 * dmat -- The distance matrix
248 * a -- The index of the specific taxon in the distance matrix
252 * <float> -- The value of the selected minimum
253 * min -- Used to transport the index of the minima out
254 * of the function (by reference)
255 * hmincount -- Return the number of minima along the horizontal
262 * A fast, inline function to find the smallest transformed value
263 * along the "horizontal" portion of an entry in a distance matrix.
265 * Distance matrices are stored internally as continguously-allocated
266 * upper-diagonal structures. With the exception of the taxa at
267 * row 0 of this upper-diagonal matrix, all taxa have both a horizontal
268 * and vertical component in the distance matrix. This function
269 * scans the horizonal portion of the entry in the distance matrix
270 * for the specified taxon and finds the minimum transformed value
271 * along that horizontal component.
273 * Since multiple minima can exist along the horizontal portion
274 * of the entry, I consider all minima and break ties
275 * stochastically to help avoid systematic bias.
277 * Just searching along the horizontal portion of a row is very fast
278 * since the data is stored linearly and contiguously in memory and
279 * cache locality is exploited in the distance matrix representation.
281 * Look at nj.h for more information on how the distance matrix
287 NJ_find_hmin(DMAT *dmat,
290 long int *hmincount) {
292 long int i; /* index variable for looping */
293 int size; /* current size of distance matrix */
294 int mindex = 0; /* holds the current index to the chosen minimum */
295 float curval; /* used to hold current transformed values */
296 float hmin; /* the value of the transformed minimum */
298 float *ptr, *r2, *val; /* pointers used to reduce dereferencing in inner loop */
300 /* values used for stochastic selection among multiple minima */
304 /* initialize the min to something large */
305 hmin = (float)HUGE_VAL;
307 /* setup some pointers to limit dereferencing later */
312 /* initialize values associated with minima tie breaking */
317 ptr = &(val[NJ_MAP(a, a+1, size)]); /* at the start of the horiz. part */
318 for(i=a+1;i<size;i++) {
320 curval = *(ptr++) - (r2[a] + r2[i]); /* compute transformed distance */
322 if(NJ_FLT_EQ(curval, hmin)) { /* approx. equal */
326 p = 1.0/(float)smallcnt;
329 /* select this minimum in a way which is proportional to
330 the number of minima found along the row so far */
335 } else if (curval < hmin) {
343 /* save off the the minimum index to be returned via reference */
346 /* save off the number of minima */
347 *hmincount = smallcnt;
349 /* return the value of the smallest tranformed distance */
361 * NJ_find_vmin() - Find minimum transformed distance along vertical
366 * dmat -- The distance matrix
367 * a -- The index of the specific taxon in the distance matrix
372 * <float> -- The value of the selected minimum
373 * min -- Used to transport the index of the minima out
374 * of the function (by reference)
375 * vmincount -- The number of minima along the vertical
376 * return by reference.
381 * A fast, inline function to find the smallest transformed value
382 * along the "vertical" portion of an entry in a distance matrix.
384 * Distance matrices are stored internally as continguously-allocated
385 * upper-diagonal matrices. With the exception of the taxa at
386 * row 0 of this upper-diagonal matrix, all taxa have both a horizontal
387 * and vertical component in the distance matrix. This function
388 * scans the vertical portion of the entry in the distance matrix
389 * for the specified taxon and finds the minimum transformed value
390 * along that vertical component.
392 * Since multiple minima can exist along the vertical portion
393 * of the entry, I consider all minima and break ties
394 * stochastically to help avoid systematic bias.
396 * Due to cache locality reasons, searching along the vertical
397 * component is going to be considerably slower than searching
398 * along the horizontal.
400 * Look at nj.h for more information on how the distance matrix
406 NJ_find_vmin(DMAT *dmat,
409 long int *vmincount) {
411 long int i; /* index variable used for looping */
412 long int size; /* track the size of the matrix */
413 long int mindex = 0;/* track the index to the minimum */
414 float curval; /* track value of current transformed distance */
415 float vmin; /* the index to the smallest "vertical" minimum */
417 /* pointers which are used to reduce pointer dereferencing in inner loop */
418 float *ptr, *r2, *val;
420 /* values used in stochastically breaking ties */
424 /* initialize the vertical min to something really big */
425 vmin = (float)HUGE_VAL;
427 /* save off some values to limit dereferencing later */
435 /* start on the first row and work down */
436 ptr = &(val[NJ_MAP(0, a, size)]);
439 curval = *ptr - (r2[i] + r2[a]); /* compute transformed distance */
441 if(NJ_FLT_EQ(curval, vmin)) { /* approx. equal */
445 p = 1.0/(float)smallcnt;
448 /* break ties stochastically to avoid systematic bias */
453 } else if (curval < vmin) {
460 /* increment our working pointer to the next row down */
464 /* pass back the index to the minimum found so far (by reference) */
467 /* pass back the number of minima along the vertical */
468 *vmincount = smallcnt;
470 /* return the value of the smallest transformed distance */
478 * NJ_permute() - Generate random permutation using the provably unbiased
479 * Fisher-Yates Shuffle.
483 * perm -- A pointer to the array of long ints which will be filled.
484 * size -- the length of the permutation vector
495 * Return a permuted list of numbers from 0 through size.
496 * This is accomplished by initializing the permutation
497 * as an ordered list of integers and then iterating
498 * through and swapping two values selected according to the
499 * Fisher-Yates method.
501 * This unbiased method for random permutation generation is
504 * Donald E. Knuth, The Art of Computer Programming,
505 * Addison-Wesley, Volumes 1, 2, and 3, 3rd edition, 1998
510 NJ_permute(long int *perm,
513 long int i; /* index used for looping */
514 long int swap; /* we swap values to generate permutation */
515 long int tmp; /* used for swapping values */
518 /* check to see if vector of long ints is valid */
520 fprintf(stderr, "Clearcut: NULL permutation pointer in NJ_permute()\n");
524 /* init permutation as an ordered list of integers */
525 for(i=0;i<size;i++) {
530 * Iterate across the array from i = 0 to size -1, swapping ith element
531 * with a randomly chosen element from a changing range of possible values
533 for(i=0;i<size;i++) {
535 /* choose which element we will swap with */
536 swap = i + NJ_genrand_int31_top(size-i);
538 /* swap elements here */
541 perm[swap] = perm[i];
554 * NJ_compute_r() - Compute post-join changes to r-vector. In this
555 * case, we decrement all of the accumulated distances
556 * in the r-vector for the two nodes that were
557 * recently joined (i.e. x, y)
561 * dmat -- The distance matrix
562 * a -- The index of one of the taxa that were joined
563 * b -- The index of the other taxa that was joined
572 * This vector of floats is used as a summary of overall distances from
573 * each entry in the distance matrix to every other entry. These values
574 * are then used when computing the transformed distances from which
575 * decisions concerning joining are made.
577 * For speed, we don't recompute r from scratch. Instead, we decrement
578 * all entries in r by the appropriate amount. That is, r[i] -= dist(i, a)
579 * and r[i] -= dist(i, b).
581 * As a speed optimization, I process the rows altogether for cache locality
582 * purposes, and then process columns.
584 * The processing of the scaled r matrix (r2) is handled on-the-fly elsewhere.
589 NJ_compute_r(DMAT *dmat,
593 long int i; /* a variable used in indexing */
594 float *ptrx, *ptry; /* pointers into the distance matrix */
596 /* some variables to limit pointer dereferencing in loop */
600 /* to limit pointer dereferencing */
606 * Loop through the rows and decrement the stored r values
607 * by the distances stored in the rows and columns of the distance
608 * matrix which are being removed post-join.
610 * We do the rows altogether in order to benefit from cache locality.
612 ptrx = &(val[NJ_MAP(a, a+1, size)]);
613 ptry = &(val[NJ_MAP(b, b+1, size)]);
615 for(i=a+1;i<size;i++) {
625 /* Similar to the above loop, we now do the columns */
626 ptrx = &(val[NJ_MAP(0, a, size)]);
627 ptry = &(val[NJ_MAP(0, b, size)]);
648 * NJ_check_additivity() - Check to make sure that addivity preserved by join
653 * dmat -- distance matrix
654 * a -- index into dmat for one of the rows to be joined
655 * b -- index into dmat for another row to be joined
659 * int 1 if join adheres to additivity constraint
660 * 0 if join does breaks additivity
665 * Here we perform the check to make sure that by joining a and b we do not
666 * also break consistency (i.e. preserves additivity) with the distances between
667 * the taxa in the new clade and other nodes in the tree. This is done quite
668 * efficiently by looking up the untransformed distance between node b and
669 * some other "target" taxa in the distance matrix (which is not a nor b) and
670 * comparing that distance to the distance computed by finding the distance
671 * from node a to the proposed internal node "x" which joins (a,b).
673 * If dist(x,b) + dist (b, target) == dist(b, target) then additivity is
674 * preserved, otherwise, additivity is not preserved. If we are in
675 * additivity mode, this join should be rejected.
680 NJ_check_additivity(DMAT *dmat,
684 float a2clade, b2clade;
689 /* determine target taxon here */
690 if(b == dmat->size-1) {
691 /* if we can't do a row here, lets do a column */
706 /* distance between a and the root of clade (a,b) */
708 ( (dmat->val[NJ_MAP(a, b, dmat->size)]) +
709 (dmat->r2[a] - dmat->r2[b]) ) / 2.0;
711 /* distance between b and the root of clade (a,b) */
713 ( (dmat->val[NJ_MAP(a, b, dmat->size)]) +
714 (dmat->r2[b] - dmat->r2[a]) ) / 2.0;
716 /* distance between the clade (a,b) and the target taxon */
719 /* compute the distance from the clade root to the target */
721 ( (dmat->val[NJ_MAP(a, target, dmat->size)] - a2clade) +
722 (dmat->val[NJ_MAP(b, target, dmat->size)] - b2clade) ) / 2.0;
725 * Check to see that distance from clade root to target + distance from
726 * b to clade root are equal to the distance from b to the target
728 if(NJ_FLT_EQ(dmat->val[NJ_MAP(b, target, dmat->size)],
729 (clade_dist + b2clade))) {
730 return(1); /* join is legitimate */
732 return(0); /* join is illigitimate */
737 /* compute the distance from the clade root to the target */
739 ( (dmat->val[NJ_MAP(target, a, dmat->size)] - a2clade) +
740 (dmat->val[NJ_MAP(target, b, dmat->size)] - b2clade) ) / 2.0;
743 * Check to see that distance from clade root to target + distance from
744 * b to clade root are equal to the distance from b to the target
746 if(NJ_FLT_EQ(dmat->val[NJ_MAP(target, b, dmat->size)],
747 (clade_dist + b2clade))) {
748 return(1); /* join is legitimate */
750 return(0); /* join is illegitimate */
762 * NJ_check() - Check to see if two taxa can be joined
767 * nj_args -- Pointer to the data structure holding command-line args
768 * dmat -- distance matrix
769 * a -- index into dmat for one of the rows to be joined
770 * b -- index into dmat for another row to be joined
771 * min -- the minimum value found
772 * additivity -- a flag (0 = not additive mode, 1 = additive mode)
776 * int 1 if join is okay
777 * 0 if join is not okay
782 * This function ultimately takes two rows and makes sure that the
783 * intersection of those two rows, which has a transformed distance of
784 * "min", is actually the smallest (or equal to the smallest)
785 * transformed distance for both rows (a, b). If so, it returns
786 * 1, else it returns 0.
788 * Basically, we want to join two rows only if the minimum
789 * transformed distance on either row is at the intersection of
795 NJ_check(NJ_ARGS *nj_args,
804 float *ptr, *val, *r2;
807 /* some aliases for speed and readability reasons */
813 /* now determine if joining a, b will result in broken distances */
815 if(!NJ_check_additivity(dmat, a, b)) {
820 /* scan the horizontal of row b, punt if anything < min */
821 ptr = &(val[NJ_MAP(b, b+1, size)]);
822 for(i=b+1;i<size;i++) {
823 if( NJ_FLT_LT( (*ptr - (r2[b] + r2[i])), min) ) {
829 /* scan the vertical component of row a, punt if anything < min */
830 if(nj_args->norandom) { /* if we are doing random joins, we checked this */
833 if( NJ_FLT_LT( (*ptr - (r2[i] + r2[a])), min) ) {
840 /* scan the vertical component of row b, punt if anything < min */
843 if( NJ_FLT_LT( (*ptr - (r2[i] + r2[b])), min) && i!=a) {
859 * NJ_collapse() - Collapse the distance matrix by removing
860 * rows a and b from the distance matrix and
861 * replacing them with a single new row which
862 * represents the internal node joining a and b
867 * dmat -- A pointer to the distance matrix
868 * vertex -- A pointer to the vertex vector (vector of tree nodes)
869 * which is used in constructing the tree
870 * a -- An index to a row in the distance matrix from which we
871 * joined. This row will be collapsed.
872 * b -- An index to a row in the distance matrix from which we
873 * joined. This row will be collapsed.
883 * This function collapses the distance matrix in a way which optimizes
884 * cache locality and ultimately gives us a speed improvement due to
885 * cache. At this point, we've decided to join rows a and b from
886 * the distance matrix. We will remove rows a and b from the distance
887 * matrix and replace them with a new row which represents the internal
888 * node which joins rows a and b together.
890 * We always keep the matrix as compact as possible in order to
891 * get good performance from our cache in subsequent operations. Cache
892 * is the key to good performance here.
897 * 1) Fill the "a" row with the new distances of the internal node
898 * joining a and b to all other rows.
899 * 2) Copy row 0 into what was row b
900 * 3) Increment the pointer to the start of the distance matrix
902 * 4) Decrement the size of the matrix by one row.
903 * 5) Do roughly the same thing to the r vector in order to
904 * keep it in sync with the distance matrix.
905 * 6) Compute the scaled r vector (r2) based on the updated
908 * This keeps the distance matrix as compact as possible in memory, and
909 * is a relatively fast operation.
911 * This function requires that a < b
916 NJ_collapse(DMAT *dmat,
922 long int i; /* index used for looping */
923 long int size; /* size of dmat --> reduce pointer dereferencing */
924 float a2clade; /* distance from a to the new node that joins a and b */
925 float b2clade; /* distance from b to the new node that joins a and b */
926 float cval; /* stores distance information during loop */
927 float *vptr; /* pointer to elements in first row of dist matrix */
928 float *ptra; /* pointer to elements in row a of distance matrix */
929 float *ptrb; /* pointer to elements in row b of distance matrix */
931 float *val, *r, *r2; /* simply used to limit pointer dereferencing */
934 /* We must assume that a < b */
936 fprintf(stderr, "Clearcut: (a<b) constraint check failed in NJ_collapse()\n");
940 /* some shortcuts to help limit dereferencing */
946 /* compute the distance from the clade components (a, b) to the new node */
948 ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[a] - dmat->r2[b]) ) / 2.0;
950 ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[b] - dmat->r2[a]) ) / 2.0;
953 r[a] = 0.0; /* we are removing row a, so clear dist. in r */
956 * Fill the horizontal part of the "a" row and finish computing r and r2
957 * we handle the horizontal component first to maximize cache locality
959 ptra = &(val[NJ_MAP(a, a+1, size)]); /* start ptra at the horiz. of a */
960 ptrb = &(val[NJ_MAP(a+1, b, size)]); /* start ptrb at comparable place */
961 for(i=a+1;i<size;i++) {
964 * Compute distance from new internal node to others in
965 * the distance matrix.
968 ( (*ptra - a2clade) +
969 (*ptrb - b2clade) ) / 2.0;
971 /* incr. row b pointer differently depending on where i is in loop */
973 ptrb += size-i-1; /* traverse vertically by incrementing by row */
975 ptrb++; /* traverse horiz. by incrementing by column */
978 /* assign the newly computed distance and increment a ptr by a column */
981 /* accumulate the distance onto the r vector */
985 /* scale r2 on the fly here */
986 r2[i] = r[i]/(float)(size-3);
989 /* fill the vertical part of the "a" column and finish computing r and r2 */
990 ptra = val + a; /* start at the top of the columb for "a" */
991 ptrb = val + b; /* start at the top of the columb for "b" */
995 * Compute distance from new internal node to others in
996 * the distance matrix.
999 ( (*ptra - a2clade) +
1000 (*ptrb - b2clade) ) / 2.0;
1002 /* assign the newly computed distance and increment a ptr by a column */
1005 /* accumulate the distance onto the r vector */
1009 /* scale r2 on the fly here */
1010 r2[i] = r[i]/(float)(size-3);
1012 /* here, always increment by an entire row */
1018 /* scale r2 on the fly here */
1019 r2[a] = r[a]/(float)(size-3);
1024 * Copy row 0 into row b. Again, the code is structured into two
1025 * loops to maximize cache locality for writes along the horizontal
1026 * component of row b.
1034 vptr++; /* skip over the diagonal */
1035 ptrb = &(val[NJ_MAP(b, b+1, size)]);
1036 for(i=b+1;i<size;i++) {
1037 *(ptrb++) = *(vptr++);
1041 * Collapse r here by copying contents of r[0] into r[b] and
1042 * incrementing pointer to the beginning of r by one row
1049 * Collapse r2 here by copying contents of r2[0] into r2[b] and
1050 * incrementing pointer to the beginning of r2 by one row
1055 /* increment dmat pointer to next row */
1058 /* decrement the total size of the distance matrix by one row */
1073 * NJ_neighbor_joining() - Perform a traditional Neighbor-Joining
1078 * nj_args -- A pointer to a structure containing the command-line arguments
1079 * dmat -- A pointer to the distance matrix
1083 * NJ_TREE * -- A pointer to the Neighbor-Joining tree.
1088 * This function performs a traditional Neighbor-Joining operation in which
1089 * the distance matrix is exhaustively searched for the global minimum
1090 * transformed distance. The two nodes which intersect at the global
1091 * minimum transformed distance are then joined and the distance
1092 * matrix is collapsed. This process continues until there are only
1093 * two nodes left, at which point those nodes are joined.
1097 NJ_neighbor_joining(NJ_ARGS *nj_args,
1101 NJ_TREE *tree = NULL;
1102 NJ_VERTEX *vertex = NULL;
1108 /* initialize the r and r2 vectors */
1111 /* allocate and initialize our vertex vector used for tree construction */
1112 vertex = NJ_init_vertex(dmat);
1114 fprintf(stderr, "Clearcut: Could not initialize vertex in NJ_neighbor_joining()\n");
1118 /* we iterate until the working distance matrix has only 2 entries */
1119 while(vertex->nactive > 2) {
1122 * Find the global minimum transformed distance from the distance matrix
1124 min = NJ_min_transform(dmat, &a, &b);
1127 * Build the tree by removing nodes a and b from the vertex array
1128 * and inserting a new internal node which joins a and b. Collapse
1129 * the vertex array similarly to how the distance matrix and r and r2
1132 NJ_decompose(dmat, vertex, a, b, 0);
1134 /* decrement the r and r2 vectors by the distances corresponding to a, b */
1135 NJ_compute_r(dmat, a, b);
1137 /* compact the distance matrix and the r and r2 vectors */
1138 NJ_collapse(dmat, vertex, a, b);
1141 /* Properly join the last two nodes on the vertex list */
1142 tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST);
1144 /* return the computed tree to the calling function */
1156 * NJ_relaxed_nj() - Construct a tree using the Relaxed Neighbor-Joining
1160 * nj_args -- A pointer to a data structure containing the command-line args
1161 * dmat -- A pointer to the distance matrix
1166 * NJ_TREE * -- A pointer to a Relaxed Neighbor-Joining tree
1171 * This function implements the Relaxed Neighbor-Joining algorithm of
1172 * Evans, J., Sheneman, L., and Foster, J.
1174 * Relaxed Neighbor-Joining works by choosing a local minimum transformed
1175 * distance when determining when to join two nodes. (Traditional
1176 * Neighbor-Joining chooses a global minimum transformed distance).
1178 * The algorithm shares the property with traditional NJ that if the
1179 * input distances are additive (self-consistent), then the algorithm
1180 * will manage to construct the true tree consistent with the additive
1181 * distances. Additivity state is tracked and every proposed join is checked
1182 * to make sure it maintains additivity constraints. If no
1183 * additivity-preserving join is possible in a single pass, then the distance
1184 * matrix is non-additive, and additivity checking is abandoned.
1186 * The algorithm will either attempt joins randomly, or it will perform joins
1187 * in a particular order. The default behavior is to perform joins randomly,
1188 * but this can be switched off with a command-line switch.
1190 * For randomized joins, all attempts are made to alleviate systematic bias
1191 * for the choice of rows to joins. All tie breaking is done in a way which
1192 * is virtually free of bias.
1194 * To perform randomized joins, a random permutation is constructed which
1195 * specifies the order in which to attempt joins. I iterate through the
1196 * random permutation, and for each row in the random permutation, I find
1197 * the minimum transformed distance for that row. If there are multiple
1198 * minima, I break ties evenly. For the row which intersects our
1199 * randomly chosen row at the chosen minimum, if we are are still in
1200 * additivity mode, I check to see if joining the two rows will break
1201 * our additivity constraints. If not, I check to see if there exists
1202 * a transformed distance which is smaller than the minimum found on the
1203 * original row. If there is, then we proceed through the random permutation
1204 * trying additional rows in the random order specified in the permutation.
1205 * If there is no smaller minimum transformed distance on either of the
1206 * two rows, then we join them, collapse the distance matrix, and compute
1207 * a new random permutation.
1209 * If the entire random permutation is traversed and no joins are possible
1210 * due to additivity constraints, then the distance matrix is not
1211 * additive, and additivity constraint-checking is disabled.
1215 NJ_relaxed_nj(NJ_ARGS *nj_args,
1221 long int a, b, t, bh, bv, i;
1222 float hmin, vmin, hvmin;
1225 int additivity_mode;
1226 long int hmincount, vmincount;
1227 long int *permutation = NULL;
1231 /* initialize the r and r2 vectors */
1234 additivity_mode = 1;
1236 /* allocate the permutation vector, if we are in randomize mode */
1237 if(!nj_args->norandom) {
1238 permutation = (long int *)calloc(dmat->size, sizeof(long int));
1240 fprintf(stderr, "Clearcut: Memory allocation error in NJ_relaxed_nj()\n");
1245 /* allocate and initialize our vertex vector used for tree construction */
1246 vertex = NJ_init_vertex(dmat);
1248 /* loop until there are only 2 nodes left to join */
1249 while(vertex->nactive > 2) {
1251 switch(nj_args->norandom) {
1253 /* RANDOMIZED JOINS */
1258 NJ_permute(permutation, dmat->size-1);
1259 for(i=0;i<dmat->size-1 && (vertex->nactive>2) ;i++) {
1263 /* find min trans dist along horiz. of row a */
1264 hmin = NJ_find_hmin(dmat, a, &bh, &hmincount);
1266 /* find min trans dist along vert. of row a */
1267 vmin = NJ_find_vmin(dmat, a, &bv, &vmincount);
1274 if(NJ_FLT_EQ(hmin, vmin)) {
1277 * The minima along the vertical and horizontal are
1278 * the same. Compute the proportion of minima along
1279 * the horizonal (p) and the proportion of minima
1280 * along the vertical (q).
1282 * If the same minima exist along the horizonal and
1283 * vertical, we break the tie in a way which is
1284 * non-biased. That is, we break the tie based on the
1285 * proportion of horiz. minima versus vertical minima.
1288 p = (float)hmincount / ((float)hmincount + (float)vmincount);
1290 x = genrand_real2();
1299 } else if(NJ_FLT_LT(hmin, vmin) ) {
1307 if(NJ_check(nj_args, dmat, a, b, hvmin, additivity_mode)) {
1309 /* swap a and b, if necessary, to make sure a < b */
1318 /* join taxa from rows a and b */
1319 NJ_decompose(dmat, vertex, a, b, 0);
1321 /* collapse matrix */
1322 NJ_compute_r(dmat, a, b);
1323 NJ_collapse(dmat, vertex, a, b);
1325 NJ_permute(permutation, dmat->size-1);
1329 /* turn off additivity if go through an entire cycle without joining */
1331 additivity_mode = 0;
1338 /* DETERMINISTIC JOINS */
1343 for(a=0;a<dmat->size-1 && (vertex->nactive > 2) ;) {
1345 /* find the min along the horizontal of row a */
1346 hmin = NJ_find_hmin(dmat, a, &b, &hmincount);
1348 if(NJ_check(nj_args, dmat, a, b, hmin, additivity_mode)) {
1352 /* join taxa from rows a and b */
1353 NJ_decompose(dmat, vertex, a, b, 0);
1355 /* collapse matrix */
1356 NJ_compute_r(dmat, a, b);
1357 NJ_collapse(dmat, vertex, a, b);
1368 /* turn off additivity if go through an entire cycle without joining */
1370 additivity_mode = 0;
1378 /* Join the last two nodes on the vertex list */
1379 tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST);
1381 if(nj_args->verbose_flag) {
1382 if(additivity_mode) {
1383 printf("Tree is additive\n");
1385 printf("Tree is not additive\n");
1390 NJ_free_vertex(vertex);
1393 if(!nj_args->norandom && permutation) {
1406 * NJ_print_distance_matrix() -
1408 * Print a distance matrix
1412 NJ_print_distance_matrix(DMAT *dmat) {
1416 printf("ntaxa: %ld\n", dmat->ntaxa);
1417 printf(" size: %ld\n", dmat->size);
1419 for(i=0;i<dmat->size;i++) {
1421 for(j=0;j<dmat->size;j++) {
1423 printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)]);
1430 if(dmat->r && dmat->r2) {
1431 printf("\t\t%0.4f", dmat->r[i]);
1432 printf("\t%0.4f", dmat->r2[i]);
1438 for(j=0;j<dmat->size;j++) {
1440 printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)] - (dmat->r2[i] + dmat->r2[j]));
1462 * NJ_output_tree() -
1464 * A wrapper for the function that really prints the tree,
1465 * basically to get a newline in there conveniently. :-)
1467 * Print n trees, as specified in command-args
1468 * using "count" variable from 0 to (n-1)
1472 NJ_output_tree(NJ_ARGS *nj_args,
1479 if(nj_args->stdout_flag) {
1484 fp = fopen(nj_args->outfilename, "w"); /* open for writing */
1486 fp = fopen(nj_args->outfilename, "a"); /* open for appending */
1490 fprintf(stderr, "Clearcut: Failed to open outfile %s\n", nj_args->outfilename);
1495 NJ_output_tree2(fp, nj_args, tree, tree, dmat);
1498 if(!nj_args->stdout_flag) {
1510 * NJ_output_tree2() -
1515 NJ_output_tree2(FILE *fp,
1525 if(tree->taxa_index != NJ_INTERNAL_NODE) {
1527 if(nj_args->expblen) {
1528 fprintf(fp, "%s:%e",
1529 dmat->taxaname[tree->taxa_index],
1532 fprintf(fp, "%s:%f",
1533 dmat->taxaname[tree->taxa_index],
1540 if(tree->left && tree->right) {
1544 NJ_output_tree2(fp, nj_args, tree->left, root, dmat);
1547 if(tree->left && tree->right) {
1551 NJ_output_tree2(fp, nj_args, tree->right, root, dmat);
1554 if(tree != root->left) {
1555 if(tree->left && tree->right) {
1557 if(nj_args->expblen) {
1558 fprintf(fp, "):%e", tree->dist);
1560 fprintf(fp, "):%f", tree->dist);
1583 * This function computes the r column in our matrix
1587 NJ_init_r(DMAT *dmat) {
1589 long int i, j, size;
1591 float *r, *r2, *val;
1600 size2 = (float)(size-2);
1603 for(i=0;i<size1;i++) {
1605 for(j=i+1;j<size;j++) {
1628 * NJ_init_vertex() -
1630 * Construct a vertex, which we will use to construct our tree
1631 * in a true bottom-up approach. The vertex construct is
1632 * basically the center node in the initial star topology.
1636 NJ_init_vertex(DMAT *dmat) {
1641 /* allocate the vertex here */
1642 vertex = (NJ_VERTEX *)calloc(1, sizeof(NJ_VERTEX));
1644 /* allocate the nodes in the vertex */
1645 vertex->nodes = (NJ_TREE **)calloc(dmat->ntaxa, sizeof(NJ_TREE *));
1646 vertex->nodes_handle = vertex->nodes;
1648 /* initialize our size and active variables */
1649 vertex->nactive = dmat->ntaxa;
1650 vertex->size = dmat->ntaxa;
1652 /* initialize the nodes themselves */
1653 for(i=0;i<dmat->ntaxa;i++) {
1655 vertex->nodes[i] = (NJ_TREE *)calloc(1, sizeof(NJ_TREE));
1657 vertex->nodes[i]->left = NULL;
1658 vertex->nodes[i]->right = NULL;
1660 vertex->nodes[i]->taxa_index = i;
1673 * This function decomposes the star by creating new internal nodes
1674 * and joining two existing tree nodes to it
1678 NJ_decompose(DMAT *dmat,
1685 float x2clade, y2clade;
1687 /* compute the distance from the clade components to the new node */
1690 (dmat->val[NJ_MAP(x, y, dmat->size)]);
1693 (dmat->val[NJ_MAP(x, y, dmat->size)])/2 +
1694 ((dmat->r2[x] - dmat->r2[y])/2);
1697 vertex->nodes[x]->dist = x2clade;
1701 (dmat->val[NJ_MAP(x, y, dmat->size)]);
1704 (dmat->val[NJ_MAP(x, y, dmat->size)])/2 +
1705 ((dmat->r2[y] - dmat->r2[x])/2);
1708 vertex->nodes[y]->dist = y2clade;
1710 /* allocate new node to connect two sub-clades */
1711 new_node = (NJ_TREE *)calloc(1, sizeof(NJ_TREE));
1713 new_node->left = vertex->nodes[x];
1714 new_node->right = vertex->nodes[y];
1715 new_node->taxa_index = NJ_INTERNAL_NODE; /* this is not a terminal node, no taxa index */
1721 vertex->nodes[x] = new_node;
1722 vertex->nodes[y] = vertex->nodes[0];
1724 vertex->nodes = &(vertex->nodes[1]);
1734 * NJ_print_vertex() -
1736 * For debugging, print the contents of the vertex
1740 NJ_print_vertex(NJ_VERTEX *vertex) {
1744 printf("Number of active nodes: %ld\n", vertex->nactive);
1746 for(i=0;i<vertex->nactive;i++) {
1747 printf("%ld ", vertex->nodes[i]->taxa_index);
1767 NJ_print_r(DMAT *dmat) {
1772 for(i=0;i<dmat->size;i++) {
1773 printf("r[%ld] = %0.2f\n", i, dmat->r[i]);
1785 * NJ_print_taxanames() -
1787 * Print taxa names here
1791 NJ_print_taxanames(DMAT *dmat) {
1795 printf("Number of taxa: %ld\n", dmat->ntaxa);
1797 for(i=0;i<dmat->ntaxa;i++) {
1798 printf("%ld) %s\n", i, dmat->taxaname[i]);
1810 * NJ_shuffle_distance_matrix() -
1812 * Randomize a distance matrix here
1816 NJ_shuffle_distance_matrix(DMAT *dmat) {
1819 long int *perm = NULL;
1820 char **tmp_taxaname = NULL;
1821 float *tmp_val = NULL;
1825 /* alloc the random permutation and a new matrix to hold the shuffled vals */
1826 perm = (long int *)calloc(dmat->size, sizeof(long int));
1827 tmp_taxaname = (char **)calloc(dmat->size, sizeof(char *));
1828 tmp_val = (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float));
1829 if(!tmp_taxaname || !perm || !tmp_val) {
1830 fprintf(stderr, "Clearcut: Memory allocation error in NJ_shuffle_distance_matrix()\n");
1834 /* compute a permutation which will describe how to shuffle the matrix */
1835 NJ_permute(perm, dmat->size);
1837 for(i=0;i<dmat->size;i++) {
1838 for(j=i+1;j<dmat->size;j++) {
1840 if(perm[j] < perm[i]) {
1841 tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[j], perm[i], dmat->size)];
1843 tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[i], perm[j], dmat->size)];
1848 tmp_taxaname[i] = dmat->taxaname[perm[i]];
1851 /* free our random permutation */
1856 /* free the old value matrix */
1861 /* re-assign the value matrix pointers */
1862 dmat->val = tmp_val;
1863 dmat->valhandle = dmat->val;
1866 * Free our old taxaname with its particular ordering
1867 * and re-assign to the new.
1869 if(dmat->taxaname) {
1870 free(dmat->taxaname);
1872 dmat->taxaname = tmp_taxaname;
1882 * Free a given NJ tree
1885 NJ_free_tree(NJ_TREE *node) {
1892 NJ_free_tree(node->left);
1896 NJ_free_tree(node->right);
1913 * NJ_print_permutation()
1915 * Print a permutation
1919 NJ_print_permutation(long int *perm,
1924 for(i=0;i<size-1;i++) {
1925 printf("%ld,", perm[i]);
1927 printf("%ld\n", perm[size-1]);
1938 * Duplicate a distance matrix
1942 NJ_dup_dmat(DMAT *src) {
1947 /* allocate the resulting distance matrix */
1948 dest = (DMAT *)calloc(1, sizeof(DMAT));
1950 fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
1954 dest->ntaxa = src->ntaxa;
1955 dest->size = src->size;
1957 /* allocate space for array of pointers to taxanames */
1958 dest->taxaname = (char **)calloc(dest->ntaxa, sizeof(char *));
1959 if(!dest->taxaname) {
1960 fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
1964 /* allocate space for the taxanames themselves */
1965 for(i=0;i<src->ntaxa;i++) {
1966 dest->taxaname[i] = (char *)calloc(strlen(src->taxaname[i])+1, sizeof(char));
1967 if(!dest->taxaname[i]) {
1968 fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
1973 /* allocate space for the distance values */
1974 dest->val = (float *)calloc(NJ_NCELLS(src->ntaxa), sizeof(float));
1976 fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
1980 /* allocate space for the r and r2 vectors */
1981 dest->r = (float *)calloc(src->ntaxa, sizeof(float));
1982 dest->r2 = (float *)calloc(src->ntaxa, sizeof(float));
1985 for(i=0;i<src->ntaxa;i++) {
1986 strcpy(dest->taxaname[i], src->taxaname[i]);
1990 memcpy(dest->val, src->valhandle, NJ_NCELLS(src->ntaxa)*sizeof(float));
1993 memcpy(dest->r, src->rhandle, src->ntaxa*sizeof(float));
1994 memcpy(dest->r2, src->r2handle, src->ntaxa*sizeof(float));
1996 /* track some memory addresses */
1997 dest->valhandle = dest->val;
1998 dest->rhandle = dest->r;
1999 dest->r2handle = dest->r2;
2005 /* free what we may have allocated */
2018 NJ_free_dmat(DMAT *dmat) {
2024 if(dmat->taxaname) {
2026 for(i=0;i<dmat->ntaxa;i++) {
2027 if(dmat->taxaname[i]) {
2028 free(dmat->taxaname[i]);
2032 free(dmat->taxaname);
2035 if(dmat->valhandle) {
2036 free(dmat->valhandle);
2040 free(dmat->rhandle);
2043 if(dmat->r2handle) {
2044 free(dmat->r2handle);
2058 * NJ_free_vertex() -
2060 * Free the vertex data structure
2064 NJ_free_vertex(NJ_VERTEX *vertex) {
2067 if(vertex->nodes_handle) {
2068 free(vertex->nodes_handle);
2086 * NJ_min_transform() - Find the smallest transformed value to identify
2087 * which nodes to join.
2091 * dmat -- The distance matrix
2095 * <float> -- The minimimum transformed distance
2096 * ret_i -- The row of the smallest transformed distance (by reference)
2097 * ret_j -- The col of the smallest transformed distance (by reference)
2103 * Used only with traditional Neighbor-Joining, this function checks the entire
2104 * working distance matrix and identifies the smallest transformed distance.
2105 * This requires traversing the entire diagonal matrix, which is itself a
2110 NJ_min_transform(DMAT *dmat,
2114 long int i, j; /* indices used for looping */
2115 long int tmp_i = 0;/* to limit pointer dereferencing */
2116 long int tmp_j = 0;/* to limit pointer dereferencing */
2117 float smallest; /* track the smallest trans. dist */
2118 float curval; /* the current trans. dist in loop */
2120 float *ptr; /* pointer into distance matrix */
2121 float *r2; /* pointer to r2 matrix for computing transformed dists */
2123 smallest = (float)HUGE_VAL;
2125 /* track these here to limit pointer dereferencing in inner loop */
2130 for(i=0;i<dmat->size;i++) {
2131 ptr++; /* skip diagonal */
2132 for(j=i+1;j<dmat->size;j++) { /* for every column */
2134 /* find transformed distance in matrix at i, j */
2135 curval = *(ptr++) - (r2[i] + r2[j]);
2137 /* if the transformed distanance is less than the known minimum */
2138 if(curval < smallest) {
2147 /* pass back (by reference) the coords of the min. transformed distance */
2151 return(smallest); /* return the min transformed distance */