6 *****************************************************************************
8 * Copyright (c) 2004, Luke Sheneman
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
15 * + Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * + Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in
19 * the documentation and/or other materials provided with the
21 * + The names of its contributors may not be used to endorse or promote
22 * products derived from this software without specific prior
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
37 *****************************************************************************
39 * An implementation of the Relaxed Neighbor-Joining algorithm
40 * of Evans, J., Sheneman, L., and Foster, J.
46 * sheneman@cs.uidaho.edu
74 * The entry point to the program.
81 DMAT *dmat; /* The working distance matrix */
82 DMAT *dmat_backup = NULL;/* A backup distance matrix */
83 NJ_TREE *tree; /* The phylogenetic tree */
84 NJ_ARGS *nj_args; /* Structure for holding command-line arguments */
87 /* some variables for tracking time */
89 unsigned long long startUs, endUs;
92 /* check and parse supplied command-line arguments */
93 nj_args = NJ_handle_args(argc, argv);
95 fprintf(stderr, "Clearcut: Error processing command-line arguments.\n");
99 /* for verbose reporting, print the random number seed to stdout */
100 if(nj_args->verbose_flag) {
101 printf("PRNG SEED: %d\n", nj_args->seed);
104 /* Initialize Mersenne Twister PRNG */
105 init_genrand(nj_args->seed);
108 switch(nj_args->input_mode) {
110 /* If the input type is a distance matrix */
111 case NJ_INPUT_MODE_DISTANCE:
113 /* parse the distance matrix */
114 dmat = NJ_parse_distance_matrix(nj_args);
121 /* If the input type is a multiple sequence alignment */
122 case NJ_INPUT_MODE_ALIGNED_SEQUENCES:
124 /* build a distance matrix from a multiple sequence alignment */
125 dmat = NJ_build_distance_matrix(nj_args);
127 fprintf(stderr, "Clearcut: Failed to build distance matrix from alignment.\n");
135 fprintf(stderr, "Clearcut: Could not determine how to process input\n");
140 * Output the computed distance matrix,
141 * if the user specified one.
143 if(nj_args->matrixout) {
144 NJ_output_matrix(nj_args, dmat);
148 * If we are going to generate multiple trees from
149 * the same distance matrix, we need to make a backup
150 * of the original distance matrix.
152 if(nj_args->ntrees > 1) {
153 dmat_backup = NJ_dup_dmat(dmat);
156 /* process n trees */
157 for(i=0;i<nj_args->ntrees;i++) {
160 * If the user has specified matrix shuffling, we need
161 * to randomize the distance matrix
163 if(nj_args->shuffle) {
164 NJ_shuffle_distance_matrix(dmat);
167 /* RECORD THE PRECISE TIME OF THE START OF THE NEIGHBOR-JOINING */
168 gettimeofday(&tv, NULL);
169 startUs = ((unsigned long long) tv.tv_sec * 1000000ULL)
170 + ((unsigned long long) tv.tv_usec);
174 * Invoke either the Relaxed Neighbor-Joining algorithm (default)
175 * or the "traditional" Neighbor-Joining algorithm
177 if(nj_args->neighbor) {
178 tree = NJ_neighbor_joining(nj_args, dmat);
180 tree = NJ_relaxed_nj(nj_args, dmat);
184 fprintf(stderr, "Clearcut: Failed to construct tree.\n");
188 /* RECORD THE PRECISE TIME OF THE END OF THE NEIGHBOR-JOINING */
189 gettimeofday(&tv, NULL);
190 endUs = ((unsigned long long) tv.tv_sec * 1000000ULL)
191 + ((unsigned long long) tv.tv_usec);
193 /* print the time taken to perform the neighbor join */
194 if(nj_args->verbose_flag) {
195 if(nj_args->neighbor) {
196 fprintf(stderr, "NJ tree built in %llu.%06llu secs\n",
197 (endUs - startUs) / 1000000ULL,
198 (endUs - startUs) % 1000000ULL);
200 fprintf(stderr, "RNJ tree built in %llu.%06llu secs\n",
201 (endUs - startUs) / 1000000ULL,
202 (endUs - startUs) % 1000000ULL);
206 /* Output the neighbor joining tree here */
207 NJ_output_tree(nj_args, tree, dmat, i);
209 NJ_free_tree(tree); /* Free the tree */
210 NJ_free_dmat(dmat); /* Free the working distance matrix */
213 * If we need to do another iteration, lets re-initialize
214 * our working distance matrix.
216 if(nj_args->ntrees > 1 && i<(nj_args->ntrees-1) ) {
217 dmat = NJ_dup_dmat(dmat_backup);
221 /* Free the backup distance matrix */
222 if(nj_args->ntrees > 1) {
223 NJ_free_dmat(dmat_backup);
226 /* If verbosity, describe where the tree output is */
227 if(nj_args->verbose_flag) {
228 if(nj_args->neighbor) {
229 printf("NJ tree(s) in %s\n", nj_args->outfilename);
231 printf("Relaxed NJ tree(s) in %s\n", nj_args->outfilename);
243 * NJ_find_hmin() - Find minimum transformed values along horizontal
248 * dmat -- The distance matrix
249 * a -- The index of the specific taxon in the distance matrix
253 * <float> -- The value of the selected minimum
254 * min -- Used to transport the index of the minima out
255 * of the function (by reference)
256 * hmincount -- Return the number of minima along the horizontal
263 * A fast, inline function to find the smallest transformed value
264 * along the "horizontal" portion of an entry in a distance matrix.
266 * Distance matrices are stored internally as continguously-allocated
267 * upper-diagonal structures. With the exception of the taxa at
268 * row 0 of this upper-diagonal matrix, all taxa have both a horizontal
269 * and vertical component in the distance matrix. This function
270 * scans the horizonal portion of the entry in the distance matrix
271 * for the specified taxon and finds the minimum transformed value
272 * along that horizontal component.
274 * Since multiple minima can exist along the horizontal portion
275 * of the entry, I consider all minima and break ties
276 * stochastically to help avoid systematic bias.
278 * Just searching along the horizontal portion of a row is very fast
279 * since the data is stored linearly and contiguously in memory and
280 * cache locality is exploited in the distance matrix representation.
282 * Look at nj.h for more information on how the distance matrix
288 NJ_find_hmin(DMAT *dmat,
291 long int *hmincount) {
293 long int i; /* index variable for looping */
294 int size; /* current size of distance matrix */
295 int mindex = 0; /* holds the current index to the chosen minimum */
296 float curval; /* used to hold current transformed values */
297 float hmin; /* the value of the transformed minimum */
299 float *ptr, *r2, *val; /* pointers used to reduce dereferencing in inner loop */
301 /* values used for stochastic selection among multiple minima */
305 /* initialize the min to something large */
306 hmin = (float)HUGE_VAL;
308 /* setup some pointers to limit dereferencing later */
313 /* initialize values associated with minima tie breaking */
318 ptr = &(val[NJ_MAP(a, a+1, size)]); /* at the start of the horiz. part */
319 for(i=a+1;i<size;i++) {
321 curval = *(ptr++) - (r2[a] + r2[i]); /* compute transformed distance */
323 if(NJ_FLT_EQ(curval, hmin)) { /* approx. equal */
327 p = 1.0/(float)smallcnt;
330 /* select this minimum in a way which is proportional to
331 the number of minima found along the row so far */
336 } else if (curval < hmin) {
344 /* save off the the minimum index to be returned via reference */
347 /* save off the number of minima */
348 *hmincount = smallcnt;
350 /* return the value of the smallest tranformed distance */
362 * NJ_find_vmin() - Find minimum transformed distance along vertical
367 * dmat -- The distance matrix
368 * a -- The index of the specific taxon in the distance matrix
373 * <float> -- The value of the selected minimum
374 * min -- Used to transport the index of the minima out
375 * of the function (by reference)
376 * vmincount -- The number of minima along the vertical
377 * return by reference.
382 * A fast, inline function to find the smallest transformed value
383 * along the "vertical" portion of an entry in a distance matrix.
385 * Distance matrices are stored internally as continguously-allocated
386 * upper-diagonal matrices. With the exception of the taxa at
387 * row 0 of this upper-diagonal matrix, all taxa have both a horizontal
388 * and vertical component in the distance matrix. This function
389 * scans the vertical portion of the entry in the distance matrix
390 * for the specified taxon and finds the minimum transformed value
391 * along that vertical component.
393 * Since multiple minima can exist along the vertical portion
394 * of the entry, I consider all minima and break ties
395 * stochastically to help avoid systematic bias.
397 * Due to cache locality reasons, searching along the vertical
398 * component is going to be considerably slower than searching
399 * along the horizontal.
401 * Look at nj.h for more information on how the distance matrix
407 NJ_find_vmin(DMAT *dmat,
410 long int *vmincount) {
412 long int i; /* index variable used for looping */
413 long int size; /* track the size of the matrix */
414 long int mindex = 0;/* track the index to the minimum */
415 float curval; /* track value of current transformed distance */
416 float vmin; /* the index to the smallest "vertical" minimum */
418 /* pointers which are used to reduce pointer dereferencing in inner loop */
419 float *ptr, *r2, *val;
421 /* values used in stochastically breaking ties */
425 /* initialize the vertical min to something really big */
426 vmin = (float)HUGE_VAL;
428 /* save off some values to limit dereferencing later */
436 /* start on the first row and work down */
437 ptr = &(val[NJ_MAP(0, a, size)]);
440 curval = *ptr - (r2[i] + r2[a]); /* compute transformed distance */
442 if(NJ_FLT_EQ(curval, vmin)) { /* approx. equal */
446 p = 1.0/(float)smallcnt;
449 /* break ties stochastically to avoid systematic bias */
454 } else if (curval < vmin) {
461 /* increment our working pointer to the next row down */
465 /* pass back the index to the minimum found so far (by reference) */
468 /* pass back the number of minima along the vertical */
469 *vmincount = smallcnt;
471 /* return the value of the smallest transformed distance */
479 * NJ_permute() - Generate random permutation using the provably unbiased
480 * Fisher-Yates Shuffle.
484 * perm -- A pointer to the array of long ints which will be filled.
485 * size -- the length of the permutation vector
496 * Return a permuted list of numbers from 0 through size.
497 * This is accomplished by initializing the permutation
498 * as an ordered list of integers and then iterating
499 * through and swapping two values selected according to the
500 * Fisher-Yates method.
502 * This unbiased method for random permutation generation is
505 * Donald E. Knuth, The Art of Computer Programming,
506 * Addison-Wesley, Volumes 1, 2, and 3, 3rd edition, 1998
511 NJ_permute(long int *perm,
514 long int i; /* index used for looping */
515 long int swap; /* we swap values to generate permutation */
516 long int tmp; /* used for swapping values */
519 /* check to see if vector of long ints is valid */
521 fprintf(stderr, "Clearcut: NULL permutation pointer in NJ_permute()\n");
525 /* init permutation as an ordered list of integers */
526 for(i=0;i<size;i++) {
531 * Iterate across the array from i = 0 to size -1, swapping ith element
532 * with a randomly chosen element from a changing range of possible values
534 for(i=0;i<size;i++) {
536 /* choose which element we will swap with */
537 swap = i + NJ_genrand_int31_top(size-i);
539 /* swap elements here */
542 perm[swap] = perm[i];
555 * NJ_compute_r() - Compute post-join changes to r-vector. In this
556 * case, we decrement all of the accumulated distances
557 * in the r-vector for the two nodes that were
558 * recently joined (i.e. x, y)
562 * dmat -- The distance matrix
563 * a -- The index of one of the taxa that were joined
564 * b -- The index of the other taxa that was joined
573 * This vector of floats is used as a summary of overall distances from
574 * each entry in the distance matrix to every other entry. These values
575 * are then used when computing the transformed distances from which
576 * decisions concerning joining are made.
578 * For speed, we don't recompute r from scratch. Instead, we decrement
579 * all entries in r by the appropriate amount. That is, r[i] -= dist(i, a)
580 * and r[i] -= dist(i, b).
582 * As a speed optimization, I process the rows altogether for cache locality
583 * purposes, and then process columns.
585 * The processing of the scaled r matrix (r2) is handled on-the-fly elsewhere.
590 NJ_compute_r(DMAT *dmat,
594 long int i; /* a variable used in indexing */
595 float *ptrx, *ptry; /* pointers into the distance matrix */
597 /* some variables to limit pointer dereferencing in loop */
601 /* to limit pointer dereferencing */
607 * Loop through the rows and decrement the stored r values
608 * by the distances stored in the rows and columns of the distance
609 * matrix which are being removed post-join.
611 * We do the rows altogether in order to benefit from cache locality.
613 ptrx = &(val[NJ_MAP(a, a+1, size)]);
614 ptry = &(val[NJ_MAP(b, b+1, size)]);
616 for(i=a+1;i<size;i++) {
626 /* Similar to the above loop, we now do the columns */
627 ptrx = &(val[NJ_MAP(0, a, size)]);
628 ptry = &(val[NJ_MAP(0, b, size)]);
649 * NJ_check_additivity() - Check to make sure that addivity preserved by join
654 * dmat -- distance matrix
655 * a -- index into dmat for one of the rows to be joined
656 * b -- index into dmat for another row to be joined
660 * int 1 if join adheres to additivity constraint
661 * 0 if join does breaks additivity
666 * Here we perform the check to make sure that by joining a and b we do not
667 * also break consistency (i.e. preserves additivity) with the distances between
668 * the taxa in the new clade and other nodes in the tree. This is done quite
669 * efficiently by looking up the untransformed distance between node b and
670 * some other "target" taxa in the distance matrix (which is not a nor b) and
671 * comparing that distance to the distance computed by finding the distance
672 * from node a to the proposed internal node "x" which joins (a,b).
674 * If dist(x,b) + dist (b, target) == dist(b, target) then additivity is
675 * preserved, otherwise, additivity is not preserved. If we are in
676 * additivity mode, this join should be rejected.
681 NJ_check_additivity(DMAT *dmat,
685 float a2clade, b2clade;
690 /* determine target taxon here */
691 if(b == dmat->size-1) {
692 /* if we can't do a row here, lets do a column */
707 /* distance between a and the root of clade (a,b) */
709 ( (dmat->val[NJ_MAP(a, b, dmat->size)]) +
710 (dmat->r2[a] - dmat->r2[b]) ) / 2.0;
712 /* distance between b and the root of clade (a,b) */
714 ( (dmat->val[NJ_MAP(a, b, dmat->size)]) +
715 (dmat->r2[b] - dmat->r2[a]) ) / 2.0;
717 /* distance between the clade (a,b) and the target taxon */
720 /* compute the distance from the clade root to the target */
722 ( (dmat->val[NJ_MAP(a, target, dmat->size)] - a2clade) +
723 (dmat->val[NJ_MAP(b, target, dmat->size)] - b2clade) ) / 2.0;
726 * Check to see that distance from clade root to target + distance from
727 * b to clade root are equal to the distance from b to the target
729 if(NJ_FLT_EQ(dmat->val[NJ_MAP(b, target, dmat->size)],
730 (clade_dist + b2clade))) {
731 return(1); /* join is legitimate */
733 return(0); /* join is illigitimate */
738 /* compute the distance from the clade root to the target */
740 ( (dmat->val[NJ_MAP(target, a, dmat->size)] - a2clade) +
741 (dmat->val[NJ_MAP(target, b, dmat->size)] - b2clade) ) / 2.0;
744 * Check to see that distance from clade root to target + distance from
745 * b to clade root are equal to the distance from b to the target
747 if(NJ_FLT_EQ(dmat->val[NJ_MAP(target, b, dmat->size)],
748 (clade_dist + b2clade))) {
749 return(1); /* join is legitimate */
751 return(0); /* join is illegitimate */
763 * NJ_check() - Check to see if two taxa can be joined
768 * nj_args -- Pointer to the data structure holding command-line args
769 * dmat -- distance matrix
770 * a -- index into dmat for one of the rows to be joined
771 * b -- index into dmat for another row to be joined
772 * min -- the minimum value found
773 * additivity -- a flag (0 = not additive mode, 1 = additive mode)
777 * int 1 if join is okay
778 * 0 if join is not okay
783 * This function ultimately takes two rows and makes sure that the
784 * intersection of those two rows, which has a transformed distance of
785 * "min", is actually the smallest (or equal to the smallest)
786 * transformed distance for both rows (a, b). If so, it returns
787 * 1, else it returns 0.
789 * Basically, we want to join two rows only if the minimum
790 * transformed distance on either row is at the intersection of
796 NJ_check(NJ_ARGS *nj_args,
805 float *ptr, *val, *r2;
808 /* some aliases for speed and readability reasons */
814 /* now determine if joining a, b will result in broken distances */
816 if(!NJ_check_additivity(dmat, a, b)) {
821 /* scan the horizontal of row b, punt if anything < min */
822 ptr = &(val[NJ_MAP(b, b+1, size)]);
823 for(i=b+1;i<size;i++) {
824 if( NJ_FLT_LT( (*ptr - (r2[b] + r2[i])), min) ) {
830 /* scan the vertical component of row a, punt if anything < min */
831 if(nj_args->norandom) { /* if we are doing random joins, we checked this */
834 if( NJ_FLT_LT( (*ptr - (r2[i] + r2[a])), min) ) {
841 /* scan the vertical component of row b, punt if anything < min */
844 if( NJ_FLT_LT( (*ptr - (r2[i] + r2[b])), min) && i!=a) {
860 * NJ_collapse() - Collapse the distance matrix by removing
861 * rows a and b from the distance matrix and
862 * replacing them with a single new row which
863 * represents the internal node joining a and b
868 * dmat -- A pointer to the distance matrix
869 * vertex -- A pointer to the vertex vector (vector of tree nodes)
870 * which is used in constructing the tree
871 * a -- An index to a row in the distance matrix from which we
872 * joined. This row will be collapsed.
873 * b -- An index to a row in the distance matrix from which we
874 * joined. This row will be collapsed.
884 * This function collapses the distance matrix in a way which optimizes
885 * cache locality and ultimately gives us a speed improvement due to
886 * cache. At this point, we've decided to join rows a and b from
887 * the distance matrix. We will remove rows a and b from the distance
888 * matrix and replace them with a new row which represents the internal
889 * node which joins rows a and b together.
891 * We always keep the matrix as compact as possible in order to
892 * get good performance from our cache in subsequent operations. Cache
893 * is the key to good performance here.
898 * 1) Fill the "a" row with the new distances of the internal node
899 * joining a and b to all other rows.
900 * 2) Copy row 0 into what was row b
901 * 3) Increment the pointer to the start of the distance matrix
903 * 4) Decrement the size of the matrix by one row.
904 * 5) Do roughly the same thing to the r vector in order to
905 * keep it in sync with the distance matrix.
906 * 6) Compute the scaled r vector (r2) based on the updated
909 * This keeps the distance matrix as compact as possible in memory, and
910 * is a relatively fast operation.
912 * This function requires that a < b
917 NJ_collapse(DMAT *dmat,
923 long int i; /* index used for looping */
924 long int size; /* size of dmat --> reduce pointer dereferencing */
925 float a2clade; /* distance from a to the new node that joins a and b */
926 float b2clade; /* distance from b to the new node that joins a and b */
927 float cval; /* stores distance information during loop */
928 float *vptr; /* pointer to elements in first row of dist matrix */
929 float *ptra; /* pointer to elements in row a of distance matrix */
930 float *ptrb; /* pointer to elements in row b of distance matrix */
932 float *val, *r, *r2; /* simply used to limit pointer dereferencing */
935 /* We must assume that a < b */
937 fprintf(stderr, "Clearcut: (a<b) constraint check failed in NJ_collapse()\n");
941 /* some shortcuts to help limit dereferencing */
947 /* compute the distance from the clade components (a, b) to the new node */
949 ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[a] - dmat->r2[b]) ) / 2.0;
951 ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[b] - dmat->r2[a]) ) / 2.0;
954 r[a] = 0.0; /* we are removing row a, so clear dist. in r */
957 * Fill the horizontal part of the "a" row and finish computing r and r2
958 * we handle the horizontal component first to maximize cache locality
960 ptra = &(val[NJ_MAP(a, a+1, size)]); /* start ptra at the horiz. of a */
961 ptrb = &(val[NJ_MAP(a+1, b, size)]); /* start ptrb at comparable place */
962 for(i=a+1;i<size;i++) {
965 * Compute distance from new internal node to others in
966 * the distance matrix.
969 ( (*ptra - a2clade) +
970 (*ptrb - b2clade) ) / 2.0;
972 /* incr. row b pointer differently depending on where i is in loop */
974 ptrb += size-i-1; /* traverse vertically by incrementing by row */
976 ptrb++; /* traverse horiz. by incrementing by column */
979 /* assign the newly computed distance and increment a ptr by a column */
982 /* accumulate the distance onto the r vector */
986 /* scale r2 on the fly here */
987 r2[i] = r[i]/(float)(size-3);
990 /* fill the vertical part of the "a" column and finish computing r and r2 */
991 ptra = val + a; /* start at the top of the columb for "a" */
992 ptrb = val + b; /* start at the top of the columb for "b" */
996 * Compute distance from new internal node to others in
997 * the distance matrix.
1000 ( (*ptra - a2clade) +
1001 (*ptrb - b2clade) ) / 2.0;
1003 /* assign the newly computed distance and increment a ptr by a column */
1006 /* accumulate the distance onto the r vector */
1010 /* scale r2 on the fly here */
1011 r2[i] = r[i]/(float)(size-3);
1013 /* here, always increment by an entire row */
1019 /* scale r2 on the fly here */
1020 r2[a] = r[a]/(float)(size-3);
1025 * Copy row 0 into row b. Again, the code is structured into two
1026 * loops to maximize cache locality for writes along the horizontal
1027 * component of row b.
1035 vptr++; /* skip over the diagonal */
1036 ptrb = &(val[NJ_MAP(b, b+1, size)]);
1037 for(i=b+1;i<size;i++) {
1038 *(ptrb++) = *(vptr++);
1042 * Collapse r here by copying contents of r[0] into r[b] and
1043 * incrementing pointer to the beginning of r by one row
1050 * Collapse r2 here by copying contents of r2[0] into r2[b] and
1051 * incrementing pointer to the beginning of r2 by one row
1056 /* increment dmat pointer to next row */
1059 /* decrement the total size of the distance matrix by one row */
1074 * NJ_neighbor_joining() - Perform a traditional Neighbor-Joining
1079 * nj_args -- A pointer to a structure containing the command-line arguments
1080 * dmat -- A pointer to the distance matrix
1084 * NJ_TREE * -- A pointer to the Neighbor-Joining tree.
1089 * This function performs a traditional Neighbor-Joining operation in which
1090 * the distance matrix is exhaustively searched for the global minimum
1091 * transformed distance. The two nodes which intersect at the global
1092 * minimum transformed distance are then joined and the distance
1093 * matrix is collapsed. This process continues until there are only
1094 * two nodes left, at which point those nodes are joined.
1098 NJ_neighbor_joining(NJ_ARGS *nj_args,
1102 NJ_TREE *tree = NULL;
1103 NJ_VERTEX *vertex = NULL;
1109 /* initialize the r and r2 vectors */
1112 /* allocate and initialize our vertex vector used for tree construction */
1113 vertex = NJ_init_vertex(dmat);
1115 fprintf(stderr, "Clearcut: Could not initialize vertex in NJ_neighbor_joining()\n");
1119 /* we iterate until the working distance matrix has only 2 entries */
1120 while(vertex->nactive > 2) {
1123 * Find the global minimum transformed distance from the distance matrix
1125 min = NJ_min_transform(dmat, &a, &b);
1128 * Build the tree by removing nodes a and b from the vertex array
1129 * and inserting a new internal node which joins a and b. Collapse
1130 * the vertex array similarly to how the distance matrix and r and r2
1133 NJ_decompose(dmat, vertex, a, b, 0);
1135 /* decrement the r and r2 vectors by the distances corresponding to a, b */
1136 NJ_compute_r(dmat, a, b);
1138 /* compact the distance matrix and the r and r2 vectors */
1139 NJ_collapse(dmat, vertex, a, b);
1142 /* Properly join the last two nodes on the vertex list */
1143 tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST);
1145 /* return the computed tree to the calling function */
1157 * NJ_relaxed_nj() - Construct a tree using the Relaxed Neighbor-Joining
1161 * nj_args -- A pointer to a data structure containing the command-line args
1162 * dmat -- A pointer to the distance matrix
1167 * NJ_TREE * -- A pointer to a Relaxed Neighbor-Joining tree
1172 * This function implements the Relaxed Neighbor-Joining algorithm of
1173 * Evans, J., Sheneman, L., and Foster, J.
1175 * Relaxed Neighbor-Joining works by choosing a local minimum transformed
1176 * distance when determining when to join two nodes. (Traditional
1177 * Neighbor-Joining chooses a global minimum transformed distance).
1179 * The algorithm shares the property with traditional NJ that if the
1180 * input distances are additive (self-consistent), then the algorithm
1181 * will manage to construct the true tree consistent with the additive
1182 * distances. Additivity state is tracked and every proposed join is checked
1183 * to make sure it maintains additivity constraints. If no
1184 * additivity-preserving join is possible in a single pass, then the distance
1185 * matrix is non-additive, and additivity checking is abandoned.
1187 * The algorithm will either attempt joins randomly, or it will perform joins
1188 * in a particular order. The default behavior is to perform joins randomly,
1189 * but this can be switched off with a command-line switch.
1191 * For randomized joins, all attempts are made to alleviate systematic bias
1192 * for the choice of rows to joins. All tie breaking is done in a way which
1193 * is virtually free of bias.
1195 * To perform randomized joins, a random permutation is constructed which
1196 * specifies the order in which to attempt joins. I iterate through the
1197 * random permutation, and for each row in the random permutation, I find
1198 * the minimum transformed distance for that row. If there are multiple
1199 * minima, I break ties evenly. For the row which intersects our
1200 * randomly chosen row at the chosen minimum, if we are are still in
1201 * additivity mode, I check to see if joining the two rows will break
1202 * our additivity constraints. If not, I check to see if there exists
1203 * a transformed distance which is smaller than the minimum found on the
1204 * original row. If there is, then we proceed through the random permutation
1205 * trying additional rows in the random order specified in the permutation.
1206 * If there is no smaller minimum transformed distance on either of the
1207 * two rows, then we join them, collapse the distance matrix, and compute
1208 * a new random permutation.
1210 * If the entire random permutation is traversed and no joins are possible
1211 * due to additivity constraints, then the distance matrix is not
1212 * additive, and additivity constraint-checking is disabled.
1216 NJ_relaxed_nj(NJ_ARGS *nj_args,
1222 long int a, b, t, bh, bv, i;
1223 float hmin, vmin, hvmin;
1226 int additivity_mode;
1227 long int hmincount, vmincount;
1228 long int *permutation = NULL;
1232 /* initialize the r and r2 vectors */
1235 additivity_mode = 1;
1237 /* allocate the permutation vector, if we are in randomize mode */
1238 if(!nj_args->norandom) {
1239 permutation = (long int *)calloc(dmat->size, sizeof(long int));
1241 fprintf(stderr, "Clearcut: Memory allocation error in NJ_relaxed_nj()\n");
1246 /* allocate and initialize our vertex vector used for tree construction */
1247 vertex = NJ_init_vertex(dmat);
1249 /* loop until there are only 2 nodes left to join */
1250 while(vertex->nactive > 2) {
1252 switch(nj_args->norandom) {
1254 /* RANDOMIZED JOINS */
1259 NJ_permute(permutation, dmat->size-1);
1260 for(i=0;i<dmat->size-1 && (vertex->nactive>2) ;i++) {
1264 /* find min trans dist along horiz. of row a */
1265 hmin = NJ_find_hmin(dmat, a, &bh, &hmincount);
1267 /* find min trans dist along vert. of row a */
1268 vmin = NJ_find_vmin(dmat, a, &bv, &vmincount);
1275 if(NJ_FLT_EQ(hmin, vmin)) {
1278 * The minima along the vertical and horizontal are
1279 * the same. Compute the proportion of minima along
1280 * the horizonal (p) and the proportion of minima
1281 * along the vertical (q).
1283 * If the same minima exist along the horizonal and
1284 * vertical, we break the tie in a way which is
1285 * non-biased. That is, we break the tie based on the
1286 * proportion of horiz. minima versus vertical minima.
1289 p = (float)hmincount / ((float)hmincount + (float)vmincount);
1291 x = genrand_real2();
1300 } else if(NJ_FLT_LT(hmin, vmin) ) {
1308 if(NJ_check(nj_args, dmat, a, b, hvmin, additivity_mode)) {
1310 /* swap a and b, if necessary, to make sure a < b */
1319 /* join taxa from rows a and b */
1320 NJ_decompose(dmat, vertex, a, b, 0);
1322 /* collapse matrix */
1323 NJ_compute_r(dmat, a, b);
1324 NJ_collapse(dmat, vertex, a, b);
1326 NJ_permute(permutation, dmat->size-1);
1330 /* turn off additivity if go through an entire cycle without joining */
1332 additivity_mode = 0;
1339 /* DETERMINISTIC JOINS */
1344 for(a=0;a<dmat->size-1 && (vertex->nactive > 2) ;) {
1346 /* find the min along the horizontal of row a */
1347 hmin = NJ_find_hmin(dmat, a, &b, &hmincount);
1349 if(NJ_check(nj_args, dmat, a, b, hmin, additivity_mode)) {
1353 /* join taxa from rows a and b */
1354 NJ_decompose(dmat, vertex, a, b, 0);
1356 /* collapse matrix */
1357 NJ_compute_r(dmat, a, b);
1358 NJ_collapse(dmat, vertex, a, b);
1369 /* turn off additivity if go through an entire cycle without joining */
1371 additivity_mode = 0;
1379 /* Join the last two nodes on the vertex list */
1380 tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST);
1382 if(nj_args->verbose_flag) {
1383 if(additivity_mode) {
1384 printf("Tree is additive\n");
1386 printf("Tree is not additive\n");
1391 NJ_free_vertex(vertex);
1394 if(!nj_args->norandom && permutation) {
1407 * NJ_print_distance_matrix() -
1409 * Print a distance matrix
1413 NJ_print_distance_matrix(DMAT *dmat) {
1417 printf("ntaxa: %ld\n", dmat->ntaxa);
1418 printf(" size: %ld\n", dmat->size);
1420 for(i=0;i<dmat->size;i++) {
1422 for(j=0;j<dmat->size;j++) {
1424 printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)]);
1431 if(dmat->r && dmat->r2) {
1432 printf("\t\t%0.4f", dmat->r[i]);
1433 printf("\t%0.4f", dmat->r2[i]);
1439 for(j=0;j<dmat->size;j++) {
1441 printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)] - (dmat->r2[i] + dmat->r2[j]));
1463 * NJ_output_tree() -
1465 * A wrapper for the function that really prints the tree,
1466 * basically to get a newline in there conveniently. :-)
1468 * Print n trees, as specified in command-args
1469 * using "count" variable from 0 to (n-1)
1473 NJ_output_tree(NJ_ARGS *nj_args,
1480 if(nj_args->stdout_flag) {
1485 fp = fopen(nj_args->outfilename, "w"); /* open for writing */
1487 fp = fopen(nj_args->outfilename, "a"); /* open for appending */
1491 fprintf(stderr, "Clearcut: Failed to open outfile %s\n", nj_args->outfilename);
1496 NJ_output_tree2(fp, nj_args, tree, tree, dmat);
1499 if(!nj_args->stdout_flag) {
1511 * NJ_output_tree2() -
1516 NJ_output_tree2(FILE *fp,
1526 if(tree->taxa_index != NJ_INTERNAL_NODE) {
1528 if(nj_args->expblen) {
1529 fprintf(fp, "%s:%e",
1530 dmat->taxaname[tree->taxa_index],
1533 fprintf(fp, "%s:%f",
1534 dmat->taxaname[tree->taxa_index],
1541 if(tree->left && tree->right) {
1545 NJ_output_tree2(fp, nj_args, tree->left, root, dmat);
1548 if(tree->left && tree->right) {
1552 NJ_output_tree2(fp, nj_args, tree->right, root, dmat);
1555 if(tree != root->left) {
1556 if(tree->left && tree->right) {
1558 if(nj_args->expblen) {
1559 fprintf(fp, "):%e", tree->dist);
1561 fprintf(fp, "):%f", tree->dist);
1584 * This function computes the r column in our matrix
1588 NJ_init_r(DMAT *dmat) {
1590 long int i, j, size;
1592 float *r, *r2, *val;
1601 size2 = (float)(size-2);
1604 for(i=0;i<size1;i++) {
1606 for(j=i+1;j<size;j++) {
1629 * NJ_init_vertex() -
1631 * Construct a vertex, which we will use to construct our tree
1632 * in a true bottom-up approach. The vertex construct is
1633 * basically the center node in the initial star topology.
1637 NJ_init_vertex(DMAT *dmat) {
1642 /* allocate the vertex here */
1643 vertex = (NJ_VERTEX *)calloc(1, sizeof(NJ_VERTEX));
1645 /* allocate the nodes in the vertex */
1646 vertex->nodes = (NJ_TREE **)calloc(dmat->ntaxa, sizeof(NJ_TREE *));
1647 vertex->nodes_handle = vertex->nodes;
1649 /* initialize our size and active variables */
1650 vertex->nactive = dmat->ntaxa;
1651 vertex->size = dmat->ntaxa;
1653 /* initialize the nodes themselves */
1654 for(i=0;i<dmat->ntaxa;i++) {
1656 vertex->nodes[i] = (NJ_TREE *)calloc(1, sizeof(NJ_TREE));
1658 vertex->nodes[i]->left = NULL;
1659 vertex->nodes[i]->right = NULL;
1661 vertex->nodes[i]->taxa_index = i;
1674 * This function decomposes the star by creating new internal nodes
1675 * and joining two existing tree nodes to it
1679 NJ_decompose(DMAT *dmat,
1686 float x2clade, y2clade;
1688 /* compute the distance from the clade components to the new node */
1691 (dmat->val[NJ_MAP(x, y, dmat->size)]);
1694 (dmat->val[NJ_MAP(x, y, dmat->size)])/2 +
1695 ((dmat->r2[x] - dmat->r2[y])/2);
1698 vertex->nodes[x]->dist = x2clade;
1702 (dmat->val[NJ_MAP(x, y, dmat->size)]);
1705 (dmat->val[NJ_MAP(x, y, dmat->size)])/2 +
1706 ((dmat->r2[y] - dmat->r2[x])/2);
1709 vertex->nodes[y]->dist = y2clade;
1711 /* allocate new node to connect two sub-clades */
1712 new_node = (NJ_TREE *)calloc(1, sizeof(NJ_TREE));
1714 new_node->left = vertex->nodes[x];
1715 new_node->right = vertex->nodes[y];
1716 new_node->taxa_index = NJ_INTERNAL_NODE; /* this is not a terminal node, no taxa index */
1722 vertex->nodes[x] = new_node;
1723 vertex->nodes[y] = vertex->nodes[0];
1725 vertex->nodes = &(vertex->nodes[1]);
1735 * NJ_print_vertex() -
1737 * For debugging, print the contents of the vertex
1741 NJ_print_vertex(NJ_VERTEX *vertex) {
1745 printf("Number of active nodes: %ld\n", vertex->nactive);
1747 for(i=0;i<vertex->nactive;i++) {
1748 printf("%ld ", vertex->nodes[i]->taxa_index);
1768 NJ_print_r(DMAT *dmat) {
1773 for(i=0;i<dmat->size;i++) {
1774 printf("r[%ld] = %0.2f\n", i, dmat->r[i]);
1786 * NJ_print_taxanames() -
1788 * Print taxa names here
1792 NJ_print_taxanames(DMAT *dmat) {
1796 printf("Number of taxa: %ld\n", dmat->ntaxa);
1798 for(i=0;i<dmat->ntaxa;i++) {
1799 printf("%ld) %s\n", i, dmat->taxaname[i]);
1811 * NJ_shuffle_distance_matrix() -
1813 * Randomize a distance matrix here
1817 NJ_shuffle_distance_matrix(DMAT *dmat) {
1820 long int *perm = NULL;
1821 char **tmp_taxaname = NULL;
1822 float *tmp_val = NULL;
1826 /* alloc the random permutation and a new matrix to hold the shuffled vals */
1827 perm = (long int *)calloc(dmat->size, sizeof(long int));
1828 tmp_taxaname = (char **)calloc(dmat->size, sizeof(char *));
1829 tmp_val = (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float));
1830 if(!tmp_taxaname || !perm || !tmp_val) {
1831 fprintf(stderr, "Clearcut: Memory allocation error in NJ_shuffle_distance_matrix()\n");
1835 /* compute a permutation which will describe how to shuffle the matrix */
1836 NJ_permute(perm, dmat->size);
1838 for(i=0;i<dmat->size;i++) {
1839 for(j=i+1;j<dmat->size;j++) {
1841 if(perm[j] < perm[i]) {
1842 tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[j], perm[i], dmat->size)];
1844 tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[i], perm[j], dmat->size)];
1849 tmp_taxaname[i] = dmat->taxaname[perm[i]];
1852 /* free our random permutation */
1857 /* free the old value matrix */
1862 /* re-assign the value matrix pointers */
1863 dmat->val = tmp_val;
1864 dmat->valhandle = dmat->val;
1867 * Free our old taxaname with its particular ordering
1868 * and re-assign to the new.
1870 if(dmat->taxaname) {
1871 free(dmat->taxaname);
1873 dmat->taxaname = tmp_taxaname;
1883 * Free a given NJ tree
1886 NJ_free_tree(NJ_TREE *node) {
1893 NJ_free_tree(node->left);
1897 NJ_free_tree(node->right);
1914 * NJ_print_permutation()
1916 * Print a permutation
1920 NJ_print_permutation(long int *perm,
1925 for(i=0;i<size-1;i++) {
1926 printf("%ld,", perm[i]);
1928 printf("%ld\n", perm[size-1]);
1939 * Duplicate a distance matrix
1943 NJ_dup_dmat(DMAT *src) {
1948 /* allocate the resulting distance matrix */
1949 dest = (DMAT *)calloc(1, sizeof(DMAT));
1951 fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
1955 dest->ntaxa = src->ntaxa;
1956 dest->size = src->size;
1958 /* allocate space for array of pointers to taxanames */
1959 dest->taxaname = (char **)calloc(dest->ntaxa, sizeof(char *));
1960 if(!dest->taxaname) {
1961 fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
1965 /* allocate space for the taxanames themselves */
1966 for(i=0;i<src->ntaxa;i++) {
1967 dest->taxaname[i] = (char *)calloc(strlen(src->taxaname[i])+1, sizeof(char));
1968 if(!dest->taxaname[i]) {
1969 fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
1974 /* allocate space for the distance values */
1975 dest->val = (float *)calloc(NJ_NCELLS(src->ntaxa), sizeof(float));
1977 fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
1981 /* allocate space for the r and r2 vectors */
1982 dest->r = (float *)calloc(src->ntaxa, sizeof(float));
1983 dest->r2 = (float *)calloc(src->ntaxa, sizeof(float));
1986 for(i=0;i<src->ntaxa;i++) {
1987 strcpy(dest->taxaname[i], src->taxaname[i]);
1991 memcpy(dest->val, src->valhandle, NJ_NCELLS(src->ntaxa)*sizeof(float));
1994 memcpy(dest->r, src->rhandle, src->ntaxa*sizeof(float));
1995 memcpy(dest->r2, src->r2handle, src->ntaxa*sizeof(float));
1997 /* track some memory addresses */
1998 dest->valhandle = dest->val;
1999 dest->rhandle = dest->r;
2000 dest->r2handle = dest->r2;
2006 /* free what we may have allocated */
2019 NJ_free_dmat(DMAT *dmat) {
2025 if(dmat->taxaname) {
2027 for(i=0;i<dmat->ntaxa;i++) {
2028 if(dmat->taxaname[i]) {
2029 free(dmat->taxaname[i]);
2033 free(dmat->taxaname);
2036 if(dmat->valhandle) {
2037 free(dmat->valhandle);
2041 free(dmat->rhandle);
2044 if(dmat->r2handle) {
2045 free(dmat->r2handle);
2059 * NJ_free_vertex() -
2061 * Free the vertex data structure
2065 NJ_free_vertex(NJ_VERTEX *vertex) {
2068 if(vertex->nodes_handle) {
2069 free(vertex->nodes_handle);
2087 * NJ_min_transform() - Find the smallest transformed value to identify
2088 * which nodes to join.
2092 * dmat -- The distance matrix
2096 * <float> -- The minimimum transformed distance
2097 * ret_i -- The row of the smallest transformed distance (by reference)
2098 * ret_j -- The col of the smallest transformed distance (by reference)
2104 * Used only with traditional Neighbor-Joining, this function checks the entire
2105 * working distance matrix and identifies the smallest transformed distance.
2106 * This requires traversing the entire diagonal matrix, which is itself a
2111 NJ_min_transform(DMAT *dmat,
2115 long int i, j; /* indices used for looping */
2116 long int tmp_i = 0;/* to limit pointer dereferencing */
2117 long int tmp_j = 0;/* to limit pointer dereferencing */
2118 float smallest; /* track the smallest trans. dist */
2119 float curval; /* the current trans. dist in loop */
2121 float *ptr; /* pointer into distance matrix */
2122 float *r2; /* pointer to r2 matrix for computing transformed dists */
2124 smallest = (float)HUGE_VAL;
2126 /* track these here to limit pointer dereferencing in inner loop */
2131 for(i=0;i<dmat->size;i++) {
2132 ptr++; /* skip diagonal */
2133 for(j=i+1;j<dmat->size;j++) { /* for every column */
2135 /* find transformed distance in matrix at i, j */
2136 curval = *(ptr++) - (r2[i] + r2[j]);
2138 /* if the transformed distanance is less than the known minimum */
2139 if(curval < smallest) {
2148 /* pass back (by reference) the coords of the min. transformed distance */
2152 return(smallest); /* return the min transformed distance */