X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=clearcut.cpp;fp=clearcut.cpp;h=524683819e3adca6ffd8ffc96afdab27be8d3f39;hb=0caf3fbabaa3ece404f8ce77f4c883dc5b1bf1dc;hp=0000000000000000000000000000000000000000;hpb=1b73ff67c83892a025e597dabd9df6fe7b58206a;p=mothur.git diff --git a/clearcut.cpp b/clearcut.cpp new file mode 100644 index 0000000..5246838 --- /dev/null +++ b/clearcut.cpp @@ -0,0 +1,2159 @@ + +/* + * clearcut.c + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * An implementation of the Relaxed Neighbor-Joining algorithm + * of Evans, J., Sheneman, L., and Foster, J. + * + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "distclearcut.h" +#include "dmat.h" +#include "fasta.h" +#include "cmdargs.h" +#include "common.h" +#include "clearcut.h" +#include "prng.h" + + +/* + * main() - + * + * The entry point to the program. + * + */ +int clearcut_main(int argc, char *argv[]) { + + DMAT *dmat; /* The working distance matrix */ + DMAT *dmat_backup = NULL;/* A backup distance matrix */ + NJ_TREE *tree; /* The phylogenetic tree */ + NJ_ARGS *nj_args; /* Structure for holding command-line arguments */ + long int i; + + /* some variables for tracking time */ + struct timeval tv; + unsigned long long startUs, endUs; + + + /* check and parse supplied command-line arguments */ + nj_args = NJ_handle_args(argc, argv); + + if(!nj_args) { + fprintf(stderr, "Clearcut: Error processing command-line arguments.\n"); + exit(-1); + } + + /* for verbose reporting, print the random number seed to stdout */ + if(nj_args->verbose_flag) { + printf("PRNG SEED: %d\n", nj_args->seed); + } + + /* Initialize Mersenne Twister PRNG */ + init_genrand(nj_args->seed); + + + switch(nj_args->input_mode) { + + /* If the input type is a distance matrix */ + case NJ_INPUT_MODE_DISTANCE: + + /* parse the distance matrix */ + dmat = NJ_parse_distance_matrix(nj_args); + if(!dmat) { + exit(-1); + } + + break; + + /* If the input type is a multiple sequence alignment */ + case NJ_INPUT_MODE_ALIGNED_SEQUENCES: + + /* build a distance matrix from a multiple sequence alignment */ + dmat = NJ_build_distance_matrix(nj_args); + if(!dmat) { + fprintf(stderr, "Clearcut: Failed to build distance matrix from alignment.\n"); + exit(-1); + } + + break; + + default: + + fprintf(stderr, "Clearcut: Could not determine how to process input\n"); + exit(-1); + } + + /* + * Output the computed distance matrix, + * if the user specified one. + */ + if(nj_args->matrixout) { + NJ_output_matrix(nj_args, dmat); + } + + /* + * If we are going to generate multiple trees from + * the same distance matrix, we need to make a backup + * of the original distance matrix. + */ + if(nj_args->ntrees > 1) { + dmat_backup = NJ_dup_dmat(dmat); + } + + /* process n trees */ + for(i=0;intrees;i++) { + + /* + * If the user has specified matrix shuffling, we need + * to randomize the distance matrix + */ + if(nj_args->shuffle) { + NJ_shuffle_distance_matrix(dmat); + } + + /* RECORD THE PRECISE TIME OF THE START OF THE NEIGHBOR-JOINING */ + gettimeofday(&tv, NULL); + startUs = ((unsigned long long) tv.tv_sec * 1000000ULL) + + ((unsigned long long) tv.tv_usec); + + + /* + * Invoke either the Relaxed Neighbor-Joining algorithm (default) + * or the "traditional" Neighbor-Joining algorithm + */ + if(nj_args->neighbor) { + tree = NJ_neighbor_joining(nj_args, dmat); + } else { + tree = NJ_relaxed_nj(nj_args, dmat); + } + + if(!tree) { + fprintf(stderr, "Clearcut: Failed to construct tree.\n"); + exit(0); + } + + /* RECORD THE PRECISE TIME OF THE END OF THE NEIGHBOR-JOINING */ + gettimeofday(&tv, NULL); + endUs = ((unsigned long long) tv.tv_sec * 1000000ULL) + + ((unsigned long long) tv.tv_usec); + + /* print the time taken to perform the neighbor join */ + if(nj_args->verbose_flag) { + if(nj_args->neighbor) { + fprintf(stderr, "NJ tree built in %llu.%06llu secs\n", + (endUs - startUs) / 1000000ULL, + (endUs - startUs) % 1000000ULL); + } else { + fprintf(stderr, "RNJ tree built in %llu.%06llu secs\n", + (endUs - startUs) / 1000000ULL, + (endUs - startUs) % 1000000ULL); + } + } + + /* Output the neighbor joining tree here */ + NJ_output_tree(nj_args, tree, dmat, i); + + NJ_free_tree(tree); /* Free the tree */ + NJ_free_dmat(dmat); /* Free the working distance matrix */ + + /* + * If we need to do another iteration, lets re-initialize + * our working distance matrix. + */ + if(nj_args->ntrees > 1 && i<(nj_args->ntrees-1) ) { + dmat = NJ_dup_dmat(dmat_backup); + } + } + + /* Free the backup distance matrix */ + if(nj_args->ntrees > 1) { + NJ_free_dmat(dmat_backup); + } + + /* If verbosity, describe where the tree output is */ + if(nj_args->verbose_flag) { + if(nj_args->neighbor) { + printf("NJ tree(s) in %s\n", nj_args->outfilename); + } else { + printf("Relaxed NJ tree(s) in %s\n", nj_args->outfilename); + } + } + + return 0; +} + + + + + +/* + * NJ_find_hmin() - Find minimum transformed values along horizontal + * + * + * INPUTS: + * ------- + * dmat -- The distance matrix + * a -- The index of the specific taxon in the distance matrix + * + * RETURNS: + * -------- + * -- The value of the selected minimum + * min -- Used to transport the index of the minima out + * of the function (by reference) + * hmincount -- Return the number of minima along the horizontal + * (by reference) + * + * + * DESCRIPTION: + * ------------ + * + * A fast, inline function to find the smallest transformed value + * along the "horizontal" portion of an entry in a distance matrix. + * + * Distance matrices are stored internally as continguously-allocated + * upper-diagonal structures. With the exception of the taxa at + * row 0 of this upper-diagonal matrix, all taxa have both a horizontal + * and vertical component in the distance matrix. This function + * scans the horizonal portion of the entry in the distance matrix + * for the specified taxon and finds the minimum transformed value + * along that horizontal component. + * + * Since multiple minima can exist along the horizontal portion + * of the entry, I consider all minima and break ties + * stochastically to help avoid systematic bias. + * + * Just searching along the horizontal portion of a row is very fast + * since the data is stored linearly and contiguously in memory and + * cache locality is exploited in the distance matrix representation. + * + * Look at nj.h for more information on how the distance matrix + * is architected. + * + */ +static inline +float +NJ_find_hmin(DMAT *dmat, + long int a, + long int *min, + long int *hmincount) { + + long int i; /* index variable for looping */ + int size; /* current size of distance matrix */ + int mindex = 0; /* holds the current index to the chosen minimum */ + float curval; /* used to hold current transformed values */ + float hmin; /* the value of the transformed minimum */ + + float *ptr, *r2, *val; /* pointers used to reduce dereferencing in inner loop */ + + /* values used for stochastic selection among multiple minima */ + float p, x; + long int smallcnt; + + /* initialize the min to something large */ + hmin = (float)HUGE_VAL; + + /* setup some pointers to limit dereferencing later */ + r2 = dmat->r2; + val = dmat->val; + size = dmat->size; + + /* initialize values associated with minima tie breaking */ + p = 1.0; + smallcnt = 0; + + + ptr = &(val[NJ_MAP(a, a+1, size)]); /* at the start of the horiz. part */ + for(i=a+1;i -- The value of the selected minimum + * min -- Used to transport the index of the minima out + * of the function (by reference) + * vmincount -- The number of minima along the vertical + * return by reference. + * + * DESCRIPTION: + * ------------ + * + * A fast, inline function to find the smallest transformed value + * along the "vertical" portion of an entry in a distance matrix. + * + * Distance matrices are stored internally as continguously-allocated + * upper-diagonal matrices. With the exception of the taxa at + * row 0 of this upper-diagonal matrix, all taxa have both a horizontal + * and vertical component in the distance matrix. This function + * scans the vertical portion of the entry in the distance matrix + * for the specified taxon and finds the minimum transformed value + * along that vertical component. + * + * Since multiple minima can exist along the vertical portion + * of the entry, I consider all minima and break ties + * stochastically to help avoid systematic bias. + * + * Due to cache locality reasons, searching along the vertical + * component is going to be considerably slower than searching + * along the horizontal. + * + * Look at nj.h for more information on how the distance matrix + * is architected. + * + */ +static inline +float +NJ_find_vmin(DMAT *dmat, + long int a, + long int *min, + long int *vmincount) { + + long int i; /* index variable used for looping */ + long int size; /* track the size of the matrix */ + long int mindex = 0;/* track the index to the minimum */ + float curval; /* track value of current transformed distance */ + float vmin; /* the index to the smallest "vertical" minimum */ + + /* pointers which are used to reduce pointer dereferencing in inner loop */ + float *ptr, *r2, *val; + + /* values used in stochastically breaking ties */ + float p, x; + long int smallcnt; + + /* initialize the vertical min to something really big */ + vmin = (float)HUGE_VAL; + + /* save off some values to limit dereferencing later */ + r2 = dmat->r2; + val = dmat->val; + size = dmat->size; + + p = 1.0; + smallcnt = 0; + + /* start on the first row and work down */ + ptr = &(val[NJ_MAP(0, a, size)]); + for(i=0;isize; + val = dmat->val; + r = dmat->r+a+1; + + /* + * Loop through the rows and decrement the stored r values + * by the distances stored in the rows and columns of the distance + * matrix which are being removed post-join. + * + * We do the rows altogether in order to benefit from cache locality. + */ + ptrx = &(val[NJ_MAP(a, a+1, size)]); + ptry = &(val[NJ_MAP(b, b+1, size)]); + + for(i=a+1;ib) { + *r -= *(ptry++); + } + + r++; + } + + /* Similar to the above loop, we now do the columns */ + ptrx = &(val[NJ_MAP(0, a, size)]); + ptry = &(val[NJ_MAP(0, b, size)]); + r = dmat->r; + for(i=0;isize-1) { + /* if we can't do a row here, lets do a column */ + if(a==0) { + if(b==1) { + target = 2; + } else { + target = 1; + } + } else { + target = 0; + } + } else { + target = b+1; + } + + + /* distance between a and the root of clade (a,b) */ + a2clade = + ( (dmat->val[NJ_MAP(a, b, dmat->size)]) + + (dmat->r2[a] - dmat->r2[b]) ) / 2.0; + + /* distance between b and the root of clade (a,b) */ + b2clade = + ( (dmat->val[NJ_MAP(a, b, dmat->size)]) + + (dmat->r2[b] - dmat->r2[a]) ) / 2.0; + + /* distance between the clade (a,b) and the target taxon */ + if(bval[NJ_MAP(a, target, dmat->size)] - a2clade) + + (dmat->val[NJ_MAP(b, target, dmat->size)] - b2clade) ) / 2.0; + + /* + * Check to see that distance from clade root to target + distance from + * b to clade root are equal to the distance from b to the target + */ + if(NJ_FLT_EQ(dmat->val[NJ_MAP(b, target, dmat->size)], + (clade_dist + b2clade))) { + return(1); /* join is legitimate */ + } else { + return(0); /* join is illigitimate */ + } + + } else { + + /* compute the distance from the clade root to the target */ + clade_dist = + ( (dmat->val[NJ_MAP(target, a, dmat->size)] - a2clade) + + (dmat->val[NJ_MAP(target, b, dmat->size)] - b2clade) ) / 2.0; + + /* + * Check to see that distance from clade root to target + distance from + * b to clade root are equal to the distance from b to the target + */ + if(NJ_FLT_EQ(dmat->val[NJ_MAP(target, b, dmat->size)], + (clade_dist + b2clade))) { + return(1); /* join is legitimate */ + } else { + return(0); /* join is illegitimate */ + } + } +} + + + + + + + +/* + * NJ_check() - Check to see if two taxa can be joined + + * + * INPUTS: + * ------- + * nj_args -- Pointer to the data structure holding command-line args + * dmat -- distance matrix + * a -- index into dmat for one of the rows to be joined + * b -- index into dmat for another row to be joined + * min -- the minimum value found + * additivity -- a flag (0 = not additive mode, 1 = additive mode) + * + * OUTPUTS: + * -------- + * int 1 if join is okay + * 0 if join is not okay + * + * DESCRIPTION: + * ------------ + * + * This function ultimately takes two rows and makes sure that the + * intersection of those two rows, which has a transformed distance of + * "min", is actually the smallest (or equal to the smallest) + * transformed distance for both rows (a, b). If so, it returns + * 1, else it returns 0. + * + * Basically, we want to join two rows only if the minimum + * transformed distance on either row is at the intersection of + * those two rows. + * + */ +static inline +int +NJ_check(NJ_ARGS *nj_args, + DMAT *dmat, + long int a, + long int b, + float min, + int additivity) { + + + long int i, size; + float *ptr, *val, *r2; + + + /* some aliases for speed and readability reasons */ + val = dmat->val; + r2 = dmat->r2; + size = dmat->size; + + + /* now determine if joining a, b will result in broken distances */ + if(additivity) { + if(!NJ_check_additivity(dmat, a, b)) { + return(0); + } + } + + /* scan the horizontal of row b, punt if anything < min */ + ptr = &(val[NJ_MAP(b, b+1, size)]); + for(i=b+1;inorandom) { /* if we are doing random joins, we checked this */ + ptr = val + a; + for(i=0;i reduce pointer dereferencing */ + float a2clade; /* distance from a to the new node that joins a and b */ + float b2clade; /* distance from b to the new node that joins a and b */ + float cval; /* stores distance information during loop */ + float *vptr; /* pointer to elements in first row of dist matrix */ + float *ptra; /* pointer to elements in row a of distance matrix */ + float *ptrb; /* pointer to elements in row b of distance matrix */ + + float *val, *r, *r2; /* simply used to limit pointer dereferencing */ + + + /* We must assume that a < b */ + if(a >= b) { + fprintf(stderr, "Clearcut: (aval; + r = dmat->r; + r2 = dmat->r2; + size = dmat->size; + + /* compute the distance from the clade components (a, b) to the new node */ + a2clade = + ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[a] - dmat->r2[b]) ) / 2.0; + b2clade = + ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[b] - dmat->r2[a]) ) / 2.0; + + + r[a] = 0.0; /* we are removing row a, so clear dist. in r */ + + /* + * Fill the horizontal part of the "a" row and finish computing r and r2 + * we handle the horizontal component first to maximize cache locality + */ + ptra = &(val[NJ_MAP(a, a+1, size)]); /* start ptra at the horiz. of a */ + ptrb = &(val[NJ_MAP(a+1, b, size)]); /* start ptrb at comparable place */ + for(i=a+1;ir = r+1; + + + /* + * Collapse r2 here by copying contents of r2[0] into r2[b] and + * incrementing pointer to the beginning of r2 by one row + */ + r2[b] = r2[0]; + dmat->r2 = r2+1; + + /* increment dmat pointer to next row */ + dmat->val += size; + + /* decrement the total size of the distance matrix by one row */ + dmat->size--; + + return; +} + + + + + + + + + +/* + * NJ_neighbor_joining() - Perform a traditional Neighbor-Joining + * + * + * INPUTS: + * ------- + * nj_args -- A pointer to a structure containing the command-line arguments + * dmat -- A pointer to the distance matrix + * + * RETURNS: + * -------- + * NJ_TREE * -- A pointer to the Neighbor-Joining tree. + * + * DESCRIPTION: + * ------------ + * + * This function performs a traditional Neighbor-Joining operation in which + * the distance matrix is exhaustively searched for the global minimum + * transformed distance. The two nodes which intersect at the global + * minimum transformed distance are then joined and the distance + * matrix is collapsed. This process continues until there are only + * two nodes left, at which point those nodes are joined. + * + */ +NJ_TREE * +NJ_neighbor_joining(NJ_ARGS *nj_args, + DMAT *dmat) { + + + NJ_TREE *tree = NULL; + NJ_VERTEX *vertex = NULL; + + long int a, b; + float min; + + + /* initialize the r and r2 vectors */ + NJ_init_r(dmat); + + /* allocate and initialize our vertex vector used for tree construction */ + vertex = NJ_init_vertex(dmat); + if(!vertex) { + fprintf(stderr, "Clearcut: Could not initialize vertex in NJ_neighbor_joining()\n"); + return(NULL); + } + + /* we iterate until the working distance matrix has only 2 entries */ + while(vertex->nactive > 2) { + + /* + * Find the global minimum transformed distance from the distance matrix + */ + min = NJ_min_transform(dmat, &a, &b); + + /* + * Build the tree by removing nodes a and b from the vertex array + * and inserting a new internal node which joins a and b. Collapse + * the vertex array similarly to how the distance matrix and r and r2 + * are compacted. + */ + NJ_decompose(dmat, vertex, a, b, 0); + + /* decrement the r and r2 vectors by the distances corresponding to a, b */ + NJ_compute_r(dmat, a, b); + + /* compact the distance matrix and the r and r2 vectors */ + NJ_collapse(dmat, vertex, a, b); + } + + /* Properly join the last two nodes on the vertex list */ + tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST); + + /* return the computed tree to the calling function */ + return(tree); +} + + + + + + + + +/* + * NJ_relaxed_nj() - Construct a tree using the Relaxed Neighbor-Joining + * + * INPUTS: + * ------- + * nj_args -- A pointer to a data structure containing the command-line args + * dmat -- A pointer to the distance matrix + * + * RETURNS: + * -------- + * + * NJ_TREE * -- A pointer to a Relaxed Neighbor-Joining tree + * + * DESCRIPTION: + * ------------ + * + * This function implements the Relaxed Neighbor-Joining algorithm of + * Evans, J., Sheneman, L., and Foster, J. + * + * Relaxed Neighbor-Joining works by choosing a local minimum transformed + * distance when determining when to join two nodes. (Traditional + * Neighbor-Joining chooses a global minimum transformed distance). + * + * The algorithm shares the property with traditional NJ that if the + * input distances are additive (self-consistent), then the algorithm + * will manage to construct the true tree consistent with the additive + * distances. Additivity state is tracked and every proposed join is checked + * to make sure it maintains additivity constraints. If no + * additivity-preserving join is possible in a single pass, then the distance + * matrix is non-additive, and additivity checking is abandoned. + * + * The algorithm will either attempt joins randomly, or it will perform joins + * in a particular order. The default behavior is to perform joins randomly, + * but this can be switched off with a command-line switch. + * + * For randomized joins, all attempts are made to alleviate systematic bias + * for the choice of rows to joins. All tie breaking is done in a way which + * is virtually free of bias. + * + * To perform randomized joins, a random permutation is constructed which + * specifies the order in which to attempt joins. I iterate through the + * random permutation, and for each row in the random permutation, I find + * the minimum transformed distance for that row. If there are multiple + * minima, I break ties evenly. For the row which intersects our + * randomly chosen row at the chosen minimum, if we are are still in + * additivity mode, I check to see if joining the two rows will break + * our additivity constraints. If not, I check to see if there exists + * a transformed distance which is smaller than the minimum found on the + * original row. If there is, then we proceed through the random permutation + * trying additional rows in the random order specified in the permutation. + * If there is no smaller minimum transformed distance on either of the + * two rows, then we join them, collapse the distance matrix, and compute + * a new random permutation. + * + * If the entire random permutation is traversed and no joins are possible + * due to additivity constraints, then the distance matrix is not + * additive, and additivity constraint-checking is disabled. + * + */ +NJ_TREE * +NJ_relaxed_nj(NJ_ARGS *nj_args, + DMAT *dmat) { + + + NJ_TREE *tree; + NJ_VERTEX *vertex; + long int a, b, t, bh, bv, i; + float hmin, vmin, hvmin; + float p, q, x; + int join_flag; + int additivity_mode; + long int hmincount, vmincount; + long int *permutation = NULL; + + + + /* initialize the r and r2 vectors */ + NJ_init_r(dmat); + + additivity_mode = 1; + + /* allocate the permutation vector, if we are in randomize mode */ + if(!nj_args->norandom) { + permutation = (long int *)calloc(dmat->size, sizeof(long int)); + if(!permutation) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_relaxed_nj()\n"); + return(NULL); + } + } + + /* allocate and initialize our vertex vector used for tree construction */ + vertex = NJ_init_vertex(dmat); + + /* loop until there are only 2 nodes left to join */ + while(vertex->nactive > 2) { + + switch(nj_args->norandom) { + + /* RANDOMIZED JOINS */ + case 0: + + join_flag = 0; + + NJ_permute(permutation, dmat->size-1); + for(i=0;isize-1 && (vertex->nactive>2) ;i++) { + + a = permutation[i]; + + /* find min trans dist along horiz. of row a */ + hmin = NJ_find_hmin(dmat, a, &bh, &hmincount); + if(a) { + /* find min trans dist along vert. of row a */ + vmin = NJ_find_vmin(dmat, a, &bv, &vmincount); + } else { + vmin = hmin; + bv = bh; + vmincount = 0; + } + + if(NJ_FLT_EQ(hmin, vmin)) { + + /* + * The minima along the vertical and horizontal are + * the same. Compute the proportion of minima along + * the horizonal (p) and the proportion of minima + * along the vertical (q). + * + * If the same minima exist along the horizonal and + * vertical, we break the tie in a way which is + * non-biased. That is, we break the tie based on the + * proportion of horiz. minima versus vertical minima. + * + */ + p = (float)hmincount / ((float)hmincount + (float)vmincount); + q = 1.0 - p; + x = genrand_real2(); + + if(x < p) { + hvmin = hmin; + b = bh; + } else { + hvmin = vmin; + b = bv; + } + } else if(NJ_FLT_LT(hmin, vmin) ) { + hvmin = hmin; + b = bh; + } else { + hvmin = vmin; + b = bv; + } + + if(NJ_check(nj_args, dmat, a, b, hvmin, additivity_mode)) { + + /* swap a and b, if necessary, to make sure a < b */ + if(b < a) { + t = a; + a = b; + b = t; + } + + join_flag = 1; + + /* join taxa from rows a and b */ + NJ_decompose(dmat, vertex, a, b, 0); + + /* collapse matrix */ + NJ_compute_r(dmat, a, b); + NJ_collapse(dmat, vertex, a, b); + + NJ_permute(permutation, dmat->size-1); + } + } + + /* turn off additivity if go through an entire cycle without joining */ + if(!join_flag) { + additivity_mode = 0; + } + + break; + + + + /* DETERMINISTIC JOINS */ + case 1: + + join_flag = 0; + + for(a=0;asize-1 && (vertex->nactive > 2) ;) { + + /* find the min along the horizontal of row a */ + hmin = NJ_find_hmin(dmat, a, &b, &hmincount); + + if(NJ_check(nj_args, dmat, a, b, hmin, additivity_mode)) { + + join_flag = 1; + + /* join taxa from rows a and b */ + NJ_decompose(dmat, vertex, a, b, 0); + + /* collapse matrix */ + NJ_compute_r(dmat, a, b); + NJ_collapse(dmat, vertex, a, b); + + if(a) { + a--; + } + + } else { + a++; + } + } + + /* turn off additivity if go through an entire cycle without joining */ + if(!join_flag) { + additivity_mode = 0; + } + + break; + } + + } /* WHILE */ + + /* Join the last two nodes on the vertex list */ + tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST); + + if(nj_args->verbose_flag) { + if(additivity_mode) { + printf("Tree is additive\n"); + } else { + printf("Tree is not additive\n"); + } + } + + if(vertex) { + NJ_free_vertex(vertex); + } + + if(!nj_args->norandom && permutation) { + free(permutation); + } + + return(tree); +} + + + + + + +/* + * NJ_print_distance_matrix() - + * + * Print a distance matrix + * + */ +void +NJ_print_distance_matrix(DMAT *dmat) { + + long int i, j; + + printf("ntaxa: %ld\n", dmat->ntaxa); + printf(" size: %ld\n", dmat->size); + + for(i=0;isize;i++) { + + for(j=0;jsize;j++) { + if(j>i) { + printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)]); + } else { + printf(" -"); + } + } + + + if(dmat->r && dmat->r2) { + printf("\t\t%0.4f", dmat->r[i]); + printf("\t%0.4f", dmat->r2[i]); + + + + printf("\n"); + + for(j=0;jsize;j++) { + if(j>i) { + printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)] - (dmat->r2[i] + dmat->r2[j])); + } else { + printf(" "); + } + } + + printf("\n"); + } + } + + printf("\n\n"); + + return; +} + + + + + + + +/* + * NJ_output_tree() - + * + * A wrapper for the function that really prints the tree, + * basically to get a newline in there conveniently. :-) + * + * Print n trees, as specified in command-args + * using "count" variable from 0 to (n-1) + * + */ +void +NJ_output_tree(NJ_ARGS *nj_args, + NJ_TREE *tree, + DMAT *dmat, + long int count) { + + FILE *fp; + + if(nj_args->stdout_flag) { + fp = stdout; + } else { + + if(count == 0) { + fp = fopen(nj_args->outfilename, "w"); /* open for writing */ + } else { + fp = fopen(nj_args->outfilename, "a"); /* open for appending */ + } + + if(!fp) { + fprintf(stderr, "Clearcut: Failed to open outfile %s\n", nj_args->outfilename); + exit(-1); + } + } + + NJ_output_tree2(fp, nj_args, tree, tree, dmat); + fprintf(fp, ";\n"); + + if(!nj_args->stdout_flag) { + fclose(fp); + } + + return; +} + + + + + +/* + * NJ_output_tree2() - + * + * + */ +void +NJ_output_tree2(FILE *fp, + NJ_ARGS *nj_args, + NJ_TREE *tree, + NJ_TREE *root, + DMAT *dmat) { + + if(!tree) { + return; + } + + if(tree->taxa_index != NJ_INTERNAL_NODE) { + + if(nj_args->expblen) { + fprintf(fp, "%s:%e", + dmat->taxaname[tree->taxa_index], + tree->dist); + } else { + fprintf(fp, "%s:%f", + dmat->taxaname[tree->taxa_index], + tree->dist); + } + + } else { + + + if(tree->left && tree->right) { + fprintf(fp, "("); + } + if(tree->left) { + NJ_output_tree2(fp, nj_args, tree->left, root, dmat); + } + + if(tree->left && tree->right) { + fprintf(fp, ","); + } + if(tree->right) { + NJ_output_tree2(fp, nj_args, tree->right, root, dmat); + } + + if(tree != root->left) { + if(tree->left && tree->right) { + if(tree != root) { + if(nj_args->expblen) { + fprintf(fp, "):%e", tree->dist); + } else { + fprintf(fp, "):%f", tree->dist); + } + } else { + fprintf(fp, ")"); + } + } + } else { + fprintf(fp, ")"); + } + } + + return; +} + + + + + + + +/* + * NJ_init_r() + * + * This function computes the r column in our matrix + * + */ +void +NJ_init_r(DMAT *dmat) { + + long int i, j, size; + long int index; + float *r, *r2, *val; + long int size1; + float size2; + + r = dmat->r; + r2 = dmat->r2; + val = dmat->val; + size = dmat->size; + size1 = size-1; + size2 = (float)(size-2); + + index = 0; + for(i=0;inodes = (NJ_TREE **)calloc(dmat->ntaxa, sizeof(NJ_TREE *)); + vertex->nodes_handle = vertex->nodes; + + /* initialize our size and active variables */ + vertex->nactive = dmat->ntaxa; + vertex->size = dmat->ntaxa; + + /* initialize the nodes themselves */ + for(i=0;intaxa;i++) { + + vertex->nodes[i] = (NJ_TREE *)calloc(1, sizeof(NJ_TREE)); + + vertex->nodes[i]->left = NULL; + vertex->nodes[i]->right = NULL; + + vertex->nodes[i]->taxa_index = i; + } + + return(vertex); +} + + + + + +/* + * NJ_decompose() - + * + * This function decomposes the star by creating new internal nodes + * and joining two existing tree nodes to it + * + */ +NJ_TREE * +NJ_decompose(DMAT *dmat, + NJ_VERTEX *vertex, + long int x, + long int y, + int last_flag) { + + NJ_TREE *new_node; + float x2clade, y2clade; + + /* compute the distance from the clade components to the new node */ + if(last_flag) { + x2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)]); + } else { + x2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)])/2 + + ((dmat->r2[x] - dmat->r2[y])/2); + } + + vertex->nodes[x]->dist = x2clade; + + if(last_flag) { + y2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)]); + } else { + y2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)])/2 + + ((dmat->r2[y] - dmat->r2[x])/2); + } + + vertex->nodes[y]->dist = y2clade; + + /* allocate new node to connect two sub-clades */ + new_node = (NJ_TREE *)calloc(1, sizeof(NJ_TREE)); + + new_node->left = vertex->nodes[x]; + new_node->right = vertex->nodes[y]; + new_node->taxa_index = NJ_INTERNAL_NODE; /* this is not a terminal node, no taxa index */ + + if(last_flag) { + return(new_node); + } + + vertex->nodes[x] = new_node; + vertex->nodes[y] = vertex->nodes[0]; + + vertex->nodes = &(vertex->nodes[1]); + + vertex->nactive--; + + return(new_node); +} + + + +/* + * NJ_print_vertex() - + * + * For debugging, print the contents of the vertex + * + */ +void +NJ_print_vertex(NJ_VERTEX *vertex) { + + long int i; + + printf("Number of active nodes: %ld\n", vertex->nactive); + + for(i=0;inactive;i++) { + printf("%ld ", vertex->nodes[i]->taxa_index); + } + printf("\n"); + + return; +} + + + + + + + + + +/* + * NJ_print_r() - + * + */ +void +NJ_print_r(DMAT *dmat) { + + long int i; + + printf("\n"); + for(i=0;isize;i++) { + printf("r[%ld] = %0.2f\n", i, dmat->r[i]); + } + printf("\n"); + + return; +} + + + + + +/* + * NJ_print_taxanames() - + * + * Print taxa names here + * + */ +void +NJ_print_taxanames(DMAT *dmat) { + + long int i; + + printf("Number of taxa: %ld\n", dmat->ntaxa); + + for(i=0;intaxa;i++) { + printf("%ld) %s\n", i, dmat->taxaname[i]); + } + + printf("\n"); + + return; +} + + + + +/* + * NJ_shuffle_distance_matrix() - + * + * Randomize a distance matrix here + * + */ +void +NJ_shuffle_distance_matrix(DMAT *dmat) { + + + long int *perm = NULL; + char **tmp_taxaname = NULL; + float *tmp_val = NULL; + long int i, j; + + + /* alloc the random permutation and a new matrix to hold the shuffled vals */ + perm = (long int *)calloc(dmat->size, sizeof(long int)); + tmp_taxaname = (char **)calloc(dmat->size, sizeof(char *)); + tmp_val = (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float)); + if(!tmp_taxaname || !perm || !tmp_val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_shuffle_distance_matrix()\n"); + exit(-1); + } + + /* compute a permutation which will describe how to shuffle the matrix */ + NJ_permute(perm, dmat->size); + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + if(perm[j] < perm[i]) { + tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[j], perm[i], dmat->size)]; + } else { + tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[i], perm[j], dmat->size)]; + } + + } + + tmp_taxaname[i] = dmat->taxaname[perm[i]]; + } + + /* free our random permutation */ + if(perm) { + free(perm); + } + + /* free the old value matrix */ + if(dmat->val) { + free(dmat->val); + } + + /* re-assign the value matrix pointers */ + dmat->val = tmp_val; + dmat->valhandle = dmat->val; + + /* + * Free our old taxaname with its particular ordering + * and re-assign to the new. + */ + if(dmat->taxaname) { + free(dmat->taxaname); + } + dmat->taxaname = tmp_taxaname; + + return; +} + + + +/* + * NJ_free_tree() - + * + * Free a given NJ tree + */ +void +NJ_free_tree(NJ_TREE *node) { + + if(!node) { + return; + } + + if(node->left) { + NJ_free_tree(node->left); + } + + if(node->right) { + NJ_free_tree(node->right); + } + + free(node); + + return; +} + + + + + + + + + +/* + * NJ_print_permutation() + * + * Print a permutation + * + */ +void +NJ_print_permutation(long int *perm, + long int size) { + + long int i; + + for(i=0;intaxa = src->ntaxa; + dest->size = src->size; + + /* allocate space for array of pointers to taxanames */ + dest->taxaname = (char **)calloc(dest->ntaxa, sizeof(char *)); + if(!dest->taxaname) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); + goto XIT_BAD; + } + + /* allocate space for the taxanames themselves */ + for(i=0;intaxa;i++) { + dest->taxaname[i] = (char *)calloc(strlen(src->taxaname[i])+1, sizeof(char)); + if(!dest->taxaname[i]) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); + goto XIT_BAD; + } + } + + /* allocate space for the distance values */ + dest->val = (float *)calloc(NJ_NCELLS(src->ntaxa), sizeof(float)); + if(!dest->val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); + goto XIT_BAD; + } + + /* allocate space for the r and r2 vectors */ + dest->r = (float *)calloc(src->ntaxa, sizeof(float)); + dest->r2 = (float *)calloc(src->ntaxa, sizeof(float)); + + /* copy titles */ + for(i=0;intaxa;i++) { + strcpy(dest->taxaname[i], src->taxaname[i]); + } + + /* copy values */ + memcpy(dest->val, src->valhandle, NJ_NCELLS(src->ntaxa)*sizeof(float)); + + /* copy r and r2 */ + memcpy(dest->r, src->rhandle, src->ntaxa*sizeof(float)); + memcpy(dest->r2, src->r2handle, src->ntaxa*sizeof(float)); + + /* track some memory addresses */ + dest->valhandle = dest->val; + dest->rhandle = dest->r; + dest->r2handle = dest->r2; + + return(dest); + + XIT_BAD: + + /* free what we may have allocated */ + NJ_free_dmat(dest); + + return(NULL); +} + + + + +/* + * NJ_free_dmat() - + */ +void +NJ_free_dmat(DMAT *dmat) { + + long int i; + + if(dmat) { + + if(dmat->taxaname) { + + for(i=0;intaxa;i++) { + if(dmat->taxaname[i]) { + free(dmat->taxaname[i]); + } + } + + free(dmat->taxaname); + } + + if(dmat->valhandle) { + free(dmat->valhandle); + } + + if(dmat->rhandle) { + free(dmat->rhandle); + } + + if(dmat->r2handle) { + free(dmat->r2handle); + } + + free(dmat); + } + + return; +} + + + + + +/* + * NJ_free_vertex() - + * + * Free the vertex data structure + * + */ +void +NJ_free_vertex(NJ_VERTEX *vertex) { + + if(vertex) { + if(vertex->nodes_handle) { + free(vertex->nodes_handle); + } + free(vertex); + } + + return; +} + + + + + + + + + +/* + * + * NJ_min_transform() - Find the smallest transformed value to identify + * which nodes to join. + * + * INPUTS: + * ------- + * dmat -- The distance matrix + * + * RETURNS: + * -------- + * -- The minimimum transformed distance + * ret_i -- The row of the smallest transformed distance (by reference) + * ret_j -- The col of the smallest transformed distance (by reference) + * + * + * DESCRIPTION: + * ------------ + * + * Used only with traditional Neighbor-Joining, this function checks the entire + * working distance matrix and identifies the smallest transformed distance. + * This requires traversing the entire diagonal matrix, which is itself a + * O(N^2) operation. + * + */ +float +NJ_min_transform(DMAT *dmat, + long int *ret_i, + long int *ret_j) { + + long int i, j; /* indices used for looping */ + long int tmp_i = 0;/* to limit pointer dereferencing */ + long int tmp_j = 0;/* to limit pointer dereferencing */ + float smallest; /* track the smallest trans. dist */ + float curval; /* the current trans. dist in loop */ + + float *ptr; /* pointer into distance matrix */ + float *r2; /* pointer to r2 matrix for computing transformed dists */ + + smallest = (float)HUGE_VAL; + + /* track these here to limit pointer dereferencing in inner loop */ + ptr = dmat->val; + r2 = dmat->r2; + + /* for every row */ + for(i=0;isize;i++) { + ptr++; /* skip diagonal */ + for(j=i+1;jsize;j++) { /* for every column */ + + /* find transformed distance in matrix at i, j */ + curval = *(ptr++) - (r2[i] + r2[j]); + + /* if the transformed distanance is less than the known minimum */ + if(curval < smallest) { + + smallest = curval; + tmp_i = i; + tmp_j = j; + } + } + } + + /* pass back (by reference) the coords of the min. transformed distance */ + *ret_i = tmp_i; + *ret_j = tmp_j; + + return(smallest); /* return the min transformed distance */ +} + + + + + + +