+++ /dev/null
-
-/*
- * clearcut.c
- *
- * $Id$
- *
- *****************************************************************************
- *
- * Copyright (c) 2004, Luke Sheneman
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * + Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * + Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * + The names of its contributors may not be used to endorse or promote
- * products derived from this software without specific prior
- * written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *****************************************************************************
- *
- * An implementation of the Relaxed Neighbor-Joining algorithm
- * of Evans, J., Sheneman, L., and Foster, J.
- *
- *
- * AUTHOR:
- *
- * Luke Sheneman
- * sheneman@cs.uidaho.edu
- *
- */
-
-
-
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
-#include <stdlib.h>
-#include <math.h>
-#include <time.h>
-#include <sys/time.h>
-#include <float.h>
-
-#include "distclearcut.h"
-#include "dmat.h"
-#include "fasta.h"
-#include "cmdargs.h"
-#include "common.h"
-#include "clearcut.h"
-#include "prng.h"
-
-
-/*
- * main() -
- *
- * The entry point to the program.
- *
- */
-int clearcut_main(int argc, char *argv[]) {
-
- DMAT *dmat; /* The working distance matrix */
- DMAT *dmat_backup = NULL;/* A backup distance matrix */
- NJ_TREE *tree; /* The phylogenetic tree */
- NJ_ARGS *nj_args; /* Structure for holding command-line arguments */
- long int i;
-
- /* some variables for tracking time */
- struct timeval tv;
- unsigned long long startUs, endUs;
-
-
- /* check and parse supplied command-line arguments */
- nj_args = NJ_handle_args(argc, argv);
-
- if(!nj_args) {
- fprintf(stderr, "Clearcut: Error processing command-line arguments.\n");
- exit(-1);
- }
-
- /* for verbose reporting, print the random number seed to stdout */
- if(nj_args->verbose_flag) {
- printf("PRNG SEED: %d\n", nj_args->seed);
- }
-
- /* Initialize Mersenne Twister PRNG */
- init_genrand(nj_args->seed);
-
-
- switch(nj_args->input_mode) {
-
- /* If the input type is a distance matrix */
- case NJ_INPUT_MODE_DISTANCE:
-
- /* parse the distance matrix */
- dmat = NJ_parse_distance_matrix(nj_args);
- if(!dmat) {
- exit(-1);
- }
-
- break;
-
- /* If the input type is a multiple sequence alignment */
- case NJ_INPUT_MODE_ALIGNED_SEQUENCES:
-
- /* build a distance matrix from a multiple sequence alignment */
- dmat = NJ_build_distance_matrix(nj_args);
- if(!dmat) {
- fprintf(stderr, "Clearcut: Failed to build distance matrix from alignment.\n");
- exit(-1);
- }
-
- break;
-
- default:
-
- fprintf(stderr, "Clearcut: Could not determine how to process input\n");
- exit(-1);
- }
-
- /*
- * Output the computed distance matrix,
- * if the user specified one.
- */
- if(nj_args->matrixout) {
- NJ_output_matrix(nj_args, dmat);
- }
-
- /*
- * If we are going to generate multiple trees from
- * the same distance matrix, we need to make a backup
- * of the original distance matrix.
- */
- if(nj_args->ntrees > 1) {
- dmat_backup = NJ_dup_dmat(dmat);
- }
-
- /* process n trees */
- for(i=0;i<nj_args->ntrees;i++) {
-
- /*
- * If the user has specified matrix shuffling, we need
- * to randomize the distance matrix
- */
- if(nj_args->shuffle) {
- NJ_shuffle_distance_matrix(dmat);
- }
-
- /* RECORD THE PRECISE TIME OF THE START OF THE NEIGHBOR-JOINING */
- gettimeofday(&tv, NULL);
- startUs = ((unsigned long long) tv.tv_sec * 1000000ULL)
- + ((unsigned long long) tv.tv_usec);
-
-
- /*
- * Invoke either the Relaxed Neighbor-Joining algorithm (default)
- * or the "traditional" Neighbor-Joining algorithm
- */
- if(nj_args->neighbor) {
- tree = NJ_neighbor_joining(nj_args, dmat);
- } else {
- tree = NJ_relaxed_nj(nj_args, dmat);
- }
-
- if(!tree) {
- fprintf(stderr, "Clearcut: Failed to construct tree.\n");
- exit(0);
- }
-
- /* RECORD THE PRECISE TIME OF THE END OF THE NEIGHBOR-JOINING */
- gettimeofday(&tv, NULL);
- endUs = ((unsigned long long) tv.tv_sec * 1000000ULL)
- + ((unsigned long long) tv.tv_usec);
-
- /* print the time taken to perform the neighbor join */
- if(nj_args->verbose_flag) {
- if(nj_args->neighbor) {
- fprintf(stderr, "NJ tree built in %llu.%06llu secs\n",
- (endUs - startUs) / 1000000ULL,
- (endUs - startUs) % 1000000ULL);
- } else {
- fprintf(stderr, "RNJ tree built in %llu.%06llu secs\n",
- (endUs - startUs) / 1000000ULL,
- (endUs - startUs) % 1000000ULL);
- }
- }
-
- /* Output the neighbor joining tree here */
- NJ_output_tree(nj_args, tree, dmat, i);
-
- NJ_free_tree(tree); /* Free the tree */
- NJ_free_dmat(dmat); /* Free the working distance matrix */
-
- /*
- * If we need to do another iteration, lets re-initialize
- * our working distance matrix.
- */
- if(nj_args->ntrees > 1 && i<(nj_args->ntrees-1) ) {
- dmat = NJ_dup_dmat(dmat_backup);
- }
- }
-
- /* Free the backup distance matrix */
- if(nj_args->ntrees > 1) {
- NJ_free_dmat(dmat_backup);
- }
-
- /* If verbosity, describe where the tree output is */
- if(nj_args->verbose_flag) {
- if(nj_args->neighbor) {
- printf("NJ tree(s) in %s\n", nj_args->outfilename);
- } else {
- printf("Relaxed NJ tree(s) in %s\n", nj_args->outfilename);
- }
- }
-
- return 0;
-}
-
-
-
-
-
-/*
- * NJ_find_hmin() - Find minimum transformed values along horizontal
- *
- *
- * INPUTS:
- * -------
- * dmat -- The distance matrix
- * a -- The index of the specific taxon in the distance matrix
- *
- * RETURNS:
- * --------
- * <float> -- The value of the selected minimum
- * min -- Used to transport the index of the minima out
- * of the function (by reference)
- * hmincount -- Return the number of minima along the horizontal
- * (by reference)
- *
- *
- * DESCRIPTION:
- * ------------
- *
- * A fast, inline function to find the smallest transformed value
- * along the "horizontal" portion of an entry in a distance matrix.
- *
- * Distance matrices are stored internally as continguously-allocated
- * upper-diagonal structures. With the exception of the taxa at
- * row 0 of this upper-diagonal matrix, all taxa have both a horizontal
- * and vertical component in the distance matrix. This function
- * scans the horizonal portion of the entry in the distance matrix
- * for the specified taxon and finds the minimum transformed value
- * along that horizontal component.
- *
- * Since multiple minima can exist along the horizontal portion
- * of the entry, I consider all minima and break ties
- * stochastically to help avoid systematic bias.
- *
- * Just searching along the horizontal portion of a row is very fast
- * since the data is stored linearly and contiguously in memory and
- * cache locality is exploited in the distance matrix representation.
- *
- * Look at nj.h for more information on how the distance matrix
- * is architected.
- *
- */
-static inline
-float
-NJ_find_hmin(DMAT *dmat,
- long int a,
- long int *min,
- long int *hmincount) {
-
- long int i; /* index variable for looping */
- int size; /* current size of distance matrix */
- int mindex = 0; /* holds the current index to the chosen minimum */
- float curval; /* used to hold current transformed values */
- float hmin; /* the value of the transformed minimum */
-
- float *ptr, *r2, *val; /* pointers used to reduce dereferencing in inner loop */
-
- /* values used for stochastic selection among multiple minima */
- float p, x;
- long int smallcnt;
-
- /* initialize the min to something large */
- hmin = (float)HUGE_VAL;
-
- /* setup some pointers to limit dereferencing later */
- r2 = dmat->r2;
- val = dmat->val;
- size = dmat->size;
-
- /* initialize values associated with minima tie breaking */
- p = 1.0;
- smallcnt = 0;
-
-
- ptr = &(val[NJ_MAP(a, a+1, size)]); /* at the start of the horiz. part */
- for(i=a+1;i<size;i++) {
-
- curval = *(ptr++) - (r2[a] + r2[i]); /* compute transformed distance */
-
- if(NJ_FLT_EQ(curval, hmin)) { /* approx. equal */
-
- smallcnt++;
-
- p = 1.0/(float)smallcnt;
- x = genrand_real2();
-
- /* select this minimum in a way which is proportional to
- the number of minima found along the row so far */
- if( x < p ) {
- mindex = i;
- }
-
- } else if (curval < hmin) {
-
- smallcnt = 1;
- hmin = curval;
- mindex = i;
- }
- }
-
- /* save off the the minimum index to be returned via reference */
- *min = mindex;
-
- /* save off the number of minima */
- *hmincount = smallcnt;
-
- /* return the value of the smallest tranformed distance */
- return(hmin);
-}
-
-
-
-
-
-
-
-
-/*
- * NJ_find_vmin() - Find minimum transformed distance along vertical
- *
- *
- * INPUTS:
- * -------
- * dmat -- The distance matrix
- * a -- The index of the specific taxon in the distance matrix
- *
- *
- * RETURNS:
- * --------
- * <float> -- The value of the selected minimum
- * min -- Used to transport the index of the minima out
- * of the function (by reference)
- * vmincount -- The number of minima along the vertical
- * return by reference.
- *
- * DESCRIPTION:
- * ------------
- *
- * A fast, inline function to find the smallest transformed value
- * along the "vertical" portion of an entry in a distance matrix.
- *
- * Distance matrices are stored internally as continguously-allocated
- * upper-diagonal matrices. With the exception of the taxa at
- * row 0 of this upper-diagonal matrix, all taxa have both a horizontal
- * and vertical component in the distance matrix. This function
- * scans the vertical portion of the entry in the distance matrix
- * for the specified taxon and finds the minimum transformed value
- * along that vertical component.
- *
- * Since multiple minima can exist along the vertical portion
- * of the entry, I consider all minima and break ties
- * stochastically to help avoid systematic bias.
- *
- * Due to cache locality reasons, searching along the vertical
- * component is going to be considerably slower than searching
- * along the horizontal.
- *
- * Look at nj.h for more information on how the distance matrix
- * is architected.
- *
- */
-static inline
-float
-NJ_find_vmin(DMAT *dmat,
- long int a,
- long int *min,
- long int *vmincount) {
-
- long int i; /* index variable used for looping */
- long int size; /* track the size of the matrix */
- long int mindex = 0;/* track the index to the minimum */
- float curval; /* track value of current transformed distance */
- float vmin; /* the index to the smallest "vertical" minimum */
-
- /* pointers which are used to reduce pointer dereferencing in inner loop */
- float *ptr, *r2, *val;
-
- /* values used in stochastically breaking ties */
- float p, x;
- long int smallcnt;
-
- /* initialize the vertical min to something really big */
- vmin = (float)HUGE_VAL;
-
- /* save off some values to limit dereferencing later */
- r2 = dmat->r2;
- val = dmat->val;
- size = dmat->size;
-
- p = 1.0;
- smallcnt = 0;
-
- /* start on the first row and work down */
- ptr = &(val[NJ_MAP(0, a, size)]);
- for(i=0;i<a;i++) {
-
- curval = *ptr - (r2[i] + r2[a]); /* compute transformed distance */
-
- if(NJ_FLT_EQ(curval, vmin)) { /* approx. equal */
-
- smallcnt++;
-
- p = 1.0/(float)smallcnt;
- x = genrand_real2();
-
- /* break ties stochastically to avoid systematic bias */
- if( x < p ) {
- mindex = i;
- }
-
- } else if (curval < vmin) {
-
- smallcnt = 1;
- vmin = curval;
- mindex = i;
- }
-
- /* increment our working pointer to the next row down */
- ptr += size-i-1;
- }
-
- /* pass back the index to the minimum found so far (by reference) */
- *min = mindex;
-
- /* pass back the number of minima along the vertical */
- *vmincount = smallcnt;
-
- /* return the value of the smallest transformed distance */
- return(vmin);
-}
-
-
-
-
-/*
- * NJ_permute() - Generate random permutation using the provably unbiased
- * Fisher-Yates Shuffle.
- *
- * INPUTS:
- * -------
- * perm -- A pointer to the array of long ints which will be filled.
- * size -- the length of the permutation vector
- *
- *
- * OUTPUTS:
- * -------
- * NONE
- *
- *
- * DESCRIPTION:
- * ------------
- *
- * Return a permuted list of numbers from 0 through size.
- * This is accomplished by initializing the permutation
- * as an ordered list of integers and then iterating
- * through and swapping two values selected according to the
- * Fisher-Yates method.
- *
- * This unbiased method for random permutation generation is
- * discussed in:
- *
- * Donald E. Knuth, The Art of Computer Programming,
- * Addison-Wesley, Volumes 1, 2, and 3, 3rd edition, 1998
- *
- */
-static inline
-void
-NJ_permute(long int *perm,
- long int size) {
-
- long int i; /* index used for looping */
- long int swap; /* we swap values to generate permutation */
- long int tmp; /* used for swapping values */
-
-
- /* check to see if vector of long ints is valid */
- if(!perm) {
- fprintf(stderr, "Clearcut: NULL permutation pointer in NJ_permute()\n");
- exit(-1);
- }
-
- /* init permutation as an ordered list of integers */
- for(i=0;i<size;i++) {
- perm[i] = i;
- }
-
- /*
- * Iterate across the array from i = 0 to size -1, swapping ith element
- * with a randomly chosen element from a changing range of possible values
- */
- for(i=0;i<size;i++) {
-
- /* choose which element we will swap with */
- swap = i + NJ_genrand_int31_top(size-i);
-
- /* swap elements here */
- if(i != swap) {
- tmp = perm[swap];
- perm[swap] = perm[i];
- perm[i] = tmp;
- }
- }
-
- return;
-}
-
-
-
-
-
-/*
- * NJ_compute_r() - Compute post-join changes to r-vector. In this
- * case, we decrement all of the accumulated distances
- * in the r-vector for the two nodes that were
- * recently joined (i.e. x, y)
- *
- * INPUTS:
- * -------
- * dmat -- The distance matrix
- * a -- The index of one of the taxa that were joined
- * b -- The index of the other taxa that was joined
- *
- * RETURNS:
- * --------
- * NONE
- *
- * DESCRIPTION:
- * ------------
- *
- * This vector of floats is used as a summary of overall distances from
- * each entry in the distance matrix to every other entry. These values
- * are then used when computing the transformed distances from which
- * decisions concerning joining are made.
- *
- * For speed, we don't recompute r from scratch. Instead, we decrement
- * all entries in r by the appropriate amount. That is, r[i] -= dist(i, a)
- * and r[i] -= dist(i, b).
- *
- * As a speed optimization, I process the rows altogether for cache locality
- * purposes, and then process columns.
- *
- * The processing of the scaled r matrix (r2) is handled on-the-fly elsewhere.
- *
- */
-static inline
-void
-NJ_compute_r(DMAT *dmat,
- long int a,
- long int b) {
-
- long int i; /* a variable used in indexing */
- float *ptrx, *ptry; /* pointers into the distance matrix */
-
- /* some variables to limit pointer dereferencing in loop */
- long int size;
- float *r, *val;
-
- /* to limit pointer dereferencing */
- size = dmat->size;
- val = dmat->val;
- r = dmat->r+a+1;
-
- /*
- * Loop through the rows and decrement the stored r values
- * by the distances stored in the rows and columns of the distance
- * matrix which are being removed post-join.
- *
- * We do the rows altogether in order to benefit from cache locality.
- */
- ptrx = &(val[NJ_MAP(a, a+1, size)]);
- ptry = &(val[NJ_MAP(b, b+1, size)]);
-
- for(i=a+1;i<size;i++) {
- *r -= *(ptrx++);
-
- if(i>b) {
- *r -= *(ptry++);
- }
-
- r++;
- }
-
- /* Similar to the above loop, we now do the columns */
- ptrx = &(val[NJ_MAP(0, a, size)]);
- ptry = &(val[NJ_MAP(0, b, size)]);
- r = dmat->r;
- for(i=0;i<b;i++) {
- if(i<a) {
- *r -= *ptrx;
- ptrx += size-i-1;
- }
-
- *r -= *ptry;
- ptry += size-i-1;
- r++;
- }
-
- return;
-}
-
-
-
-
-
-/*
- * NJ_check_additivity() - Check to make sure that addivity preserved by join
- *
- *
- * INPUTS:
- * -------
- * dmat -- distance matrix
- * a -- index into dmat for one of the rows to be joined
- * b -- index into dmat for another row to be joined
- *
- * OUTPUTS:
- * --------
- * int 1 if join adheres to additivity constraint
- * 0 if join does breaks additivity
- *
- * DESCRIPTION:
- * ------------
- *
- * Here we perform the check to make sure that by joining a and b we do not
- * also break consistency (i.e. preserves additivity) with the distances between
- * the taxa in the new clade and other nodes in the tree. This is done quite
- * efficiently by looking up the untransformed distance between node b and
- * some other "target" taxa in the distance matrix (which is not a nor b) and
- * comparing that distance to the distance computed by finding the distance
- * from node a to the proposed internal node "x" which joins (a,b).
- *
- * If dist(x,b) + dist (b, target) == dist(b, target) then additivity is
- * preserved, otherwise, additivity is not preserved. If we are in
- * additivity mode, this join should be rejected.
- *
- */
-static inline
-int
-NJ_check_additivity(DMAT *dmat,
- long int a,
- long int b) {
-
- float a2clade, b2clade;
- float clade_dist;
- long int target;
-
-
- /* determine target taxon here */
- if(b == dmat->size-1) {
- /* if we can't do a row here, lets do a column */
- if(a==0) {
- if(b==1) {
- target = 2;
- } else {
- target = 1;
- }
- } else {
- target = 0;
- }
- } else {
- target = b+1;
- }
-
-
- /* distance between a and the root of clade (a,b) */
- a2clade =
- ( (dmat->val[NJ_MAP(a, b, dmat->size)]) +
- (dmat->r2[a] - dmat->r2[b]) ) / 2.0;
-
- /* distance between b and the root of clade (a,b) */
- b2clade =
- ( (dmat->val[NJ_MAP(a, b, dmat->size)]) +
- (dmat->r2[b] - dmat->r2[a]) ) / 2.0;
-
- /* distance between the clade (a,b) and the target taxon */
- if(b<target) {
-
- /* compute the distance from the clade root to the target */
- clade_dist =
- ( (dmat->val[NJ_MAP(a, target, dmat->size)] - a2clade) +
- (dmat->val[NJ_MAP(b, target, dmat->size)] - b2clade) ) / 2.0;
-
- /*
- * Check to see that distance from clade root to target + distance from
- * b to clade root are equal to the distance from b to the target
- */
- if(NJ_FLT_EQ(dmat->val[NJ_MAP(b, target, dmat->size)],
- (clade_dist + b2clade))) {
- return(1); /* join is legitimate */
- } else {
- return(0); /* join is illigitimate */
- }
-
- } else {
-
- /* compute the distance from the clade root to the target */
- clade_dist =
- ( (dmat->val[NJ_MAP(target, a, dmat->size)] - a2clade) +
- (dmat->val[NJ_MAP(target, b, dmat->size)] - b2clade) ) / 2.0;
-
- /*
- * Check to see that distance from clade root to target + distance from
- * b to clade root are equal to the distance from b to the target
- */
- if(NJ_FLT_EQ(dmat->val[NJ_MAP(target, b, dmat->size)],
- (clade_dist + b2clade))) {
- return(1); /* join is legitimate */
- } else {
- return(0); /* join is illegitimate */
- }
- }
-}
-
-
-
-
-
-
-
-/*
- * NJ_check() - Check to see if two taxa can be joined
-
- *
- * INPUTS:
- * -------
- * nj_args -- Pointer to the data structure holding command-line args
- * dmat -- distance matrix
- * a -- index into dmat for one of the rows to be joined
- * b -- index into dmat for another row to be joined
- * min -- the minimum value found
- * additivity -- a flag (0 = not additive mode, 1 = additive mode)
- *
- * OUTPUTS:
- * --------
- * int 1 if join is okay
- * 0 if join is not okay
- *
- * DESCRIPTION:
- * ------------
- *
- * This function ultimately takes two rows and makes sure that the
- * intersection of those two rows, which has a transformed distance of
- * "min", is actually the smallest (or equal to the smallest)
- * transformed distance for both rows (a, b). If so, it returns
- * 1, else it returns 0.
- *
- * Basically, we want to join two rows only if the minimum
- * transformed distance on either row is at the intersection of
- * those two rows.
- *
- */
-static inline
-int
-NJ_check(NJ_ARGS *nj_args,
- DMAT *dmat,
- long int a,
- long int b,
- float min,
- int additivity) {
-
-
- long int i, size;
- float *ptr, *val, *r2;
-
-
- /* some aliases for speed and readability reasons */
- val = dmat->val;
- r2 = dmat->r2;
- size = dmat->size;
-
-
- /* now determine if joining a, b will result in broken distances */
- if(additivity) {
- if(!NJ_check_additivity(dmat, a, b)) {
- return(0);
- }
- }
-
- /* scan the horizontal of row b, punt if anything < min */
- ptr = &(val[NJ_MAP(b, b+1, size)]);
- for(i=b+1;i<size;i++) {
- if( NJ_FLT_LT( (*ptr - (r2[b] + r2[i])), min) ) {
- return(0);
- }
- ptr++;
- }
-
- /* scan the vertical component of row a, punt if anything < min */
- if(nj_args->norandom) { /* if we are doing random joins, we checked this */
- ptr = val + a;
- for(i=0;i<a;i++) {
- if( NJ_FLT_LT( (*ptr - (r2[i] + r2[a])), min) ) {
- return(0);
- }
- ptr += size-i-1;
- }
- }
-
- /* scan the vertical component of row b, punt if anything < min */
- ptr = val + b;
- for(i=0;i<b;i++) {
- if( NJ_FLT_LT( (*ptr - (r2[i] + r2[b])), min) && i!=a) {
- return(0);
- }
- ptr += size-i-1;
- }
-
- return(1);
-}
-
-
-
-
-
-
-
-/*
- * NJ_collapse() - Collapse the distance matrix by removing
- * rows a and b from the distance matrix and
- * replacing them with a single new row which
- * represents the internal node joining a and b
- *
- *
- * INPUTS:
- * -------
- * dmat -- A pointer to the distance matrix
- * vertex -- A pointer to the vertex vector (vector of tree nodes)
- * which is used in constructing the tree
- * a -- An index to a row in the distance matrix from which we
- * joined. This row will be collapsed.
- * b -- An index to a row in the distance matrix from which we
- * joined. This row will be collapsed.
- *
- * RETURNS:
- * --------
- * NONE
- *
- *
- * DESCRIPTION:
- * ------------
- *
- * This function collapses the distance matrix in a way which optimizes
- * cache locality and ultimately gives us a speed improvement due to
- * cache. At this point, we've decided to join rows a and b from
- * the distance matrix. We will remove rows a and b from the distance
- * matrix and replace them with a new row which represents the internal
- * node which joins rows a and b together.
- *
- * We always keep the matrix as compact as possible in order to
- * get good performance from our cache in subsequent operations. Cache
- * is the key to good performance here.
- *
- * Key Steps:
- * ----------
- *
- * 1) Fill the "a" row with the new distances of the internal node
- * joining a and b to all other rows.
- * 2) Copy row 0 into what was row b
- * 3) Increment the pointer to the start of the distance matrix
- * by one row.
- * 4) Decrement the size of the matrix by one row.
- * 5) Do roughly the same thing to the r vector in order to
- * keep it in sync with the distance matrix.
- * 6) Compute the scaled r vector (r2) based on the updated
- * r vector
- *
- * This keeps the distance matrix as compact as possible in memory, and
- * is a relatively fast operation.
- *
- * This function requires that a < b
- *
- */
-static inline
-void
-NJ_collapse(DMAT *dmat,
- NJ_VERTEX *vertex,
- long int a,
- long int b) {
-
-
- long int i; /* index used for looping */
- long int size; /* size of dmat --> reduce pointer dereferencing */
- float a2clade; /* distance from a to the new node that joins a and b */
- float b2clade; /* distance from b to the new node that joins a and b */
- float cval; /* stores distance information during loop */
- float *vptr; /* pointer to elements in first row of dist matrix */
- float *ptra; /* pointer to elements in row a of distance matrix */
- float *ptrb; /* pointer to elements in row b of distance matrix */
-
- float *val, *r, *r2; /* simply used to limit pointer dereferencing */
-
-
- /* We must assume that a < b */
- if(a >= b) {
- fprintf(stderr, "Clearcut: (a<b) constraint check failed in NJ_collapse()\n");
- exit(0);
- }
-
- /* some shortcuts to help limit dereferencing */
- val = dmat->val;
- r = dmat->r;
- r2 = dmat->r2;
- size = dmat->size;
-
- /* compute the distance from the clade components (a, b) to the new node */
- a2clade =
- ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[a] - dmat->r2[b]) ) / 2.0;
- b2clade =
- ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[b] - dmat->r2[a]) ) / 2.0;
-
-
- r[a] = 0.0; /* we are removing row a, so clear dist. in r */
-
- /*
- * Fill the horizontal part of the "a" row and finish computing r and r2
- * we handle the horizontal component first to maximize cache locality
- */
- ptra = &(val[NJ_MAP(a, a+1, size)]); /* start ptra at the horiz. of a */
- ptrb = &(val[NJ_MAP(a+1, b, size)]); /* start ptrb at comparable place */
- for(i=a+1;i<size;i++) {
-
- /*
- * Compute distance from new internal node to others in
- * the distance matrix.
- */
- cval =
- ( (*ptra - a2clade) +
- (*ptrb - b2clade) ) / 2.0;
-
- /* incr. row b pointer differently depending on where i is in loop */
- if(i<b) {
- ptrb += size-i-1; /* traverse vertically by incrementing by row */
- } else {
- ptrb++; /* traverse horiz. by incrementing by column */
- }
-
- /* assign the newly computed distance and increment a ptr by a column */
- *(ptra++) = cval;
-
- /* accumulate the distance onto the r vector */
- r[a] += cval;
- r[i] += cval;
-
- /* scale r2 on the fly here */
- r2[i] = r[i]/(float)(size-3);
- }
-
- /* fill the vertical part of the "a" column and finish computing r and r2 */
- ptra = val + a; /* start at the top of the columb for "a" */
- ptrb = val + b; /* start at the top of the columb for "b" */
- for(i=0;i<a;i++) {
-
- /*
- * Compute distance from new internal node to others in
- * the distance matrix.
- */
- cval =
- ( (*ptra - a2clade) +
- (*ptrb - b2clade) ) / 2.0;
-
- /* assign the newly computed distance and increment a ptr by a column */
- *ptra = cval;
-
- /* accumulate the distance onto the r vector */
- r[a] += cval;
- r[i] += cval;
-
- /* scale r2 on the fly here */
- r2[i] = r[i]/(float)(size-3);
-
- /* here, always increment by an entire row */
- ptra += size-i-1;
- ptrb += size-i-1;
- }
-
-
- /* scale r2 on the fly here */
- r2[a] = r[a]/(float)(size-3);
-
-
-
- /*
- * Copy row 0 into row b. Again, the code is structured into two
- * loops to maximize cache locality for writes along the horizontal
- * component of row b.
- */
- vptr = val;
- ptrb = val + b;
- for(i=0;i<b;i++) {
- *ptrb = *(vptr++);
- ptrb += size-i-1;
- }
- vptr++; /* skip over the diagonal */
- ptrb = &(val[NJ_MAP(b, b+1, size)]);
- for(i=b+1;i<size;i++) {
- *(ptrb++) = *(vptr++);
- }
-
- /*
- * Collapse r here by copying contents of r[0] into r[b] and
- * incrementing pointer to the beginning of r by one row
- */
- r[b] = r[0];
- dmat->r = r+1;
-
-
- /*
- * Collapse r2 here by copying contents of r2[0] into r2[b] and
- * incrementing pointer to the beginning of r2 by one row
- */
- r2[b] = r2[0];
- dmat->r2 = r2+1;
-
- /* increment dmat pointer to next row */
- dmat->val += size;
-
- /* decrement the total size of the distance matrix by one row */
- dmat->size--;
-
- return;
-}
-
-
-
-
-
-
-
-
-
-/*
- * NJ_neighbor_joining() - Perform a traditional Neighbor-Joining
- *
- *
- * INPUTS:
- * -------
- * nj_args -- A pointer to a structure containing the command-line arguments
- * dmat -- A pointer to the distance matrix
- *
- * RETURNS:
- * --------
- * NJ_TREE * -- A pointer to the Neighbor-Joining tree.
- *
- * DESCRIPTION:
- * ------------
- *
- * This function performs a traditional Neighbor-Joining operation in which
- * the distance matrix is exhaustively searched for the global minimum
- * transformed distance. The two nodes which intersect at the global
- * minimum transformed distance are then joined and the distance
- * matrix is collapsed. This process continues until there are only
- * two nodes left, at which point those nodes are joined.
- *
- */
-NJ_TREE *
-NJ_neighbor_joining(NJ_ARGS *nj_args,
- DMAT *dmat) {
-
-
- NJ_TREE *tree = NULL;
- NJ_VERTEX *vertex = NULL;
-
- long int a, b;
- float min;
-
-
- /* initialize the r and r2 vectors */
- NJ_init_r(dmat);
-
- /* allocate and initialize our vertex vector used for tree construction */
- vertex = NJ_init_vertex(dmat);
- if(!vertex) {
- fprintf(stderr, "Clearcut: Could not initialize vertex in NJ_neighbor_joining()\n");
- return(NULL);
- }
-
- /* we iterate until the working distance matrix has only 2 entries */
- while(vertex->nactive > 2) {
-
- /*
- * Find the global minimum transformed distance from the distance matrix
- */
- min = NJ_min_transform(dmat, &a, &b);
-
- /*
- * Build the tree by removing nodes a and b from the vertex array
- * and inserting a new internal node which joins a and b. Collapse
- * the vertex array similarly to how the distance matrix and r and r2
- * are compacted.
- */
- NJ_decompose(dmat, vertex, a, b, 0);
-
- /* decrement the r and r2 vectors by the distances corresponding to a, b */
- NJ_compute_r(dmat, a, b);
-
- /* compact the distance matrix and the r and r2 vectors */
- NJ_collapse(dmat, vertex, a, b);
- }
-
- /* Properly join the last two nodes on the vertex list */
- tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST);
-
- /* return the computed tree to the calling function */
- return(tree);
-}
-
-
-
-
-
-
-
-
-/*
- * NJ_relaxed_nj() - Construct a tree using the Relaxed Neighbor-Joining
- *
- * INPUTS:
- * -------
- * nj_args -- A pointer to a data structure containing the command-line args
- * dmat -- A pointer to the distance matrix
- *
- * RETURNS:
- * --------
- *
- * NJ_TREE * -- A pointer to a Relaxed Neighbor-Joining tree
- *
- * DESCRIPTION:
- * ------------
- *
- * This function implements the Relaxed Neighbor-Joining algorithm of
- * Evans, J., Sheneman, L., and Foster, J.
- *
- * Relaxed Neighbor-Joining works by choosing a local minimum transformed
- * distance when determining when to join two nodes. (Traditional
- * Neighbor-Joining chooses a global minimum transformed distance).
- *
- * The algorithm shares the property with traditional NJ that if the
- * input distances are additive (self-consistent), then the algorithm
- * will manage to construct the true tree consistent with the additive
- * distances. Additivity state is tracked and every proposed join is checked
- * to make sure it maintains additivity constraints. If no
- * additivity-preserving join is possible in a single pass, then the distance
- * matrix is non-additive, and additivity checking is abandoned.
- *
- * The algorithm will either attempt joins randomly, or it will perform joins
- * in a particular order. The default behavior is to perform joins randomly,
- * but this can be switched off with a command-line switch.
- *
- * For randomized joins, all attempts are made to alleviate systematic bias
- * for the choice of rows to joins. All tie breaking is done in a way which
- * is virtually free of bias.
- *
- * To perform randomized joins, a random permutation is constructed which
- * specifies the order in which to attempt joins. I iterate through the
- * random permutation, and for each row in the random permutation, I find
- * the minimum transformed distance for that row. If there are multiple
- * minima, I break ties evenly. For the row which intersects our
- * randomly chosen row at the chosen minimum, if we are are still in
- * additivity mode, I check to see if joining the two rows will break
- * our additivity constraints. If not, I check to see if there exists
- * a transformed distance which is smaller than the minimum found on the
- * original row. If there is, then we proceed through the random permutation
- * trying additional rows in the random order specified in the permutation.
- * If there is no smaller minimum transformed distance on either of the
- * two rows, then we join them, collapse the distance matrix, and compute
- * a new random permutation.
- *
- * If the entire random permutation is traversed and no joins are possible
- * due to additivity constraints, then the distance matrix is not
- * additive, and additivity constraint-checking is disabled.
- *
- */
-NJ_TREE *
-NJ_relaxed_nj(NJ_ARGS *nj_args,
- DMAT *dmat) {
-
-
- NJ_TREE *tree;
- NJ_VERTEX *vertex;
- long int a, b, t, bh, bv, i;
- float hmin, vmin, hvmin;
- float p, q, x;
- int join_flag;
- int additivity_mode;
- long int hmincount, vmincount;
- long int *permutation = NULL;
-
-
-
- /* initialize the r and r2 vectors */
- NJ_init_r(dmat);
-
- additivity_mode = 1;
-
- /* allocate the permutation vector, if we are in randomize mode */
- if(!nj_args->norandom) {
- permutation = (long int *)calloc(dmat->size, sizeof(long int));
- if(!permutation) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_relaxed_nj()\n");
- return(NULL);
- }
- }
-
- /* allocate and initialize our vertex vector used for tree construction */
- vertex = NJ_init_vertex(dmat);
-
- /* loop until there are only 2 nodes left to join */
- while(vertex->nactive > 2) {
-
- switch(nj_args->norandom) {
-
- /* RANDOMIZED JOINS */
- case 0:
-
- join_flag = 0;
-
- NJ_permute(permutation, dmat->size-1);
- for(i=0;i<dmat->size-1 && (vertex->nactive>2) ;i++) {
-
- a = permutation[i];
-
- /* find min trans dist along horiz. of row a */
- hmin = NJ_find_hmin(dmat, a, &bh, &hmincount);
- if(a) {
- /* find min trans dist along vert. of row a */
- vmin = NJ_find_vmin(dmat, a, &bv, &vmincount);
- } else {
- vmin = hmin;
- bv = bh;
- vmincount = 0;
- }
-
- if(NJ_FLT_EQ(hmin, vmin)) {
-
- /*
- * The minima along the vertical and horizontal are
- * the same. Compute the proportion of minima along
- * the horizonal (p) and the proportion of minima
- * along the vertical (q).
- *
- * If the same minima exist along the horizonal and
- * vertical, we break the tie in a way which is
- * non-biased. That is, we break the tie based on the
- * proportion of horiz. minima versus vertical minima.
- *
- */
- p = (float)hmincount / ((float)hmincount + (float)vmincount);
- q = 1.0 - p;
- x = genrand_real2();
-
- if(x < p) {
- hvmin = hmin;
- b = bh;
- } else {
- hvmin = vmin;
- b = bv;
- }
- } else if(NJ_FLT_LT(hmin, vmin) ) {
- hvmin = hmin;
- b = bh;
- } else {
- hvmin = vmin;
- b = bv;
- }
-
- if(NJ_check(nj_args, dmat, a, b, hvmin, additivity_mode)) {
-
- /* swap a and b, if necessary, to make sure a < b */
- if(b < a) {
- t = a;
- a = b;
- b = t;
- }
-
- join_flag = 1;
-
- /* join taxa from rows a and b */
- NJ_decompose(dmat, vertex, a, b, 0);
-
- /* collapse matrix */
- NJ_compute_r(dmat, a, b);
- NJ_collapse(dmat, vertex, a, b);
-
- NJ_permute(permutation, dmat->size-1);
- }
- }
-
- /* turn off additivity if go through an entire cycle without joining */
- if(!join_flag) {
- additivity_mode = 0;
- }
-
- break;
-
-
-
- /* DETERMINISTIC JOINS */
- case 1:
-
- join_flag = 0;
-
- for(a=0;a<dmat->size-1 && (vertex->nactive > 2) ;) {
-
- /* find the min along the horizontal of row a */
- hmin = NJ_find_hmin(dmat, a, &b, &hmincount);
-
- if(NJ_check(nj_args, dmat, a, b, hmin, additivity_mode)) {
-
- join_flag = 1;
-
- /* join taxa from rows a and b */
- NJ_decompose(dmat, vertex, a, b, 0);
-
- /* collapse matrix */
- NJ_compute_r(dmat, a, b);
- NJ_collapse(dmat, vertex, a, b);
-
- if(a) {
- a--;
- }
-
- } else {
- a++;
- }
- }
-
- /* turn off additivity if go through an entire cycle without joining */
- if(!join_flag) {
- additivity_mode = 0;
- }
-
- break;
- }
-
- } /* WHILE */
-
- /* Join the last two nodes on the vertex list */
- tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST);
-
- if(nj_args->verbose_flag) {
- if(additivity_mode) {
- printf("Tree is additive\n");
- } else {
- printf("Tree is not additive\n");
- }
- }
-
- if(vertex) {
- NJ_free_vertex(vertex);
- }
-
- if(!nj_args->norandom && permutation) {
- free(permutation);
- }
-
- return(tree);
-}
-
-
-
-
-
-
-/*
- * NJ_print_distance_matrix() -
- *
- * Print a distance matrix
- *
- */
-void
-NJ_print_distance_matrix(DMAT *dmat) {
-
- long int i, j;
-
- printf("ntaxa: %ld\n", dmat->ntaxa);
- printf(" size: %ld\n", dmat->size);
-
- for(i=0;i<dmat->size;i++) {
-
- for(j=0;j<dmat->size;j++) {
- if(j>i) {
- printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)]);
- } else {
- printf(" -");
- }
- }
-
-
- if(dmat->r && dmat->r2) {
- printf("\t\t%0.4f", dmat->r[i]);
- printf("\t%0.4f", dmat->r2[i]);
-
-
-
- printf("\n");
-
- for(j=0;j<dmat->size;j++) {
- if(j>i) {
- printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)] - (dmat->r2[i] + dmat->r2[j]));
- } else {
- printf(" ");
- }
- }
-
- printf("\n");
- }
- }
-
- printf("\n\n");
-
- return;
-}
-
-
-
-
-
-
-
-/*
- * NJ_output_tree() -
- *
- * A wrapper for the function that really prints the tree,
- * basically to get a newline in there conveniently. :-)
- *
- * Print n trees, as specified in command-args
- * using "count" variable from 0 to (n-1)
- *
- */
-void
-NJ_output_tree(NJ_ARGS *nj_args,
- NJ_TREE *tree,
- DMAT *dmat,
- long int count) {
-
- FILE *fp;
-
- if(nj_args->stdout_flag) {
- fp = stdout;
- } else {
-
- if(count == 0) {
- fp = fopen(nj_args->outfilename, "w"); /* open for writing */
- } else {
- fp = fopen(nj_args->outfilename, "a"); /* open for appending */
- }
-
- if(!fp) {
- fprintf(stderr, "Clearcut: Failed to open outfile %s\n", nj_args->outfilename);
- exit(-1);
- }
- }
-
- NJ_output_tree2(fp, nj_args, tree, tree, dmat);
- fprintf(fp, ";\n");
-
- if(!nj_args->stdout_flag) {
- fclose(fp);
- }
-
- return;
-}
-
-
-
-
-
-/*
- * NJ_output_tree2() -
- *
- *
- */
-void
-NJ_output_tree2(FILE *fp,
- NJ_ARGS *nj_args,
- NJ_TREE *tree,
- NJ_TREE *root,
- DMAT *dmat) {
-
- if(!tree) {
- return;
- }
-
- if(tree->taxa_index != NJ_INTERNAL_NODE) {
-
- if(nj_args->expblen) {
- fprintf(fp, "%s:%e",
- dmat->taxaname[tree->taxa_index],
- tree->dist);
- } else {
- fprintf(fp, "%s:%f",
- dmat->taxaname[tree->taxa_index],
- tree->dist);
- }
-
- } else {
-
-
- if(tree->left && tree->right) {
- fprintf(fp, "(");
- }
- if(tree->left) {
- NJ_output_tree2(fp, nj_args, tree->left, root, dmat);
- }
-
- if(tree->left && tree->right) {
- fprintf(fp, ",");
- }
- if(tree->right) {
- NJ_output_tree2(fp, nj_args, tree->right, root, dmat);
- }
-
- if(tree != root->left) {
- if(tree->left && tree->right) {
- if(tree != root) {
- if(nj_args->expblen) {
- fprintf(fp, "):%e", tree->dist);
- } else {
- fprintf(fp, "):%f", tree->dist);
- }
- } else {
- fprintf(fp, ")");
- }
- }
- } else {
- fprintf(fp, ")");
- }
- }
-
- return;
-}
-
-
-
-
-
-
-
-/*
- * NJ_init_r()
- *
- * This function computes the r column in our matrix
- *
- */
-void
-NJ_init_r(DMAT *dmat) {
-
- long int i, j, size;
- long int index;
- float *r, *r2, *val;
- long int size1;
- float size2;
-
- r = dmat->r;
- r2 = dmat->r2;
- val = dmat->val;
- size = dmat->size;
- size1 = size-1;
- size2 = (float)(size-2);
-
- index = 0;
- for(i=0;i<size1;i++) {
- index++;
- for(j=i+1;j<size;j++) {
- r[i] += val[index];
- r[j] += val[index];
- index++;
- }
-
- r2[i] = r[i]/size2;
- }
-
- return;
-}
-
-
-
-
-
-
-
-
-
-
-
-/*
- * NJ_init_vertex() -
- *
- * Construct a vertex, which we will use to construct our tree
- * in a true bottom-up approach. The vertex construct is
- * basically the center node in the initial star topology.
- *
- */
-NJ_VERTEX *
-NJ_init_vertex(DMAT *dmat) {
-
- long int i;
- NJ_VERTEX *vertex;
-
- /* allocate the vertex here */
- vertex = (NJ_VERTEX *)calloc(1, sizeof(NJ_VERTEX));
-
- /* allocate the nodes in the vertex */
- vertex->nodes = (NJ_TREE **)calloc(dmat->ntaxa, sizeof(NJ_TREE *));
- vertex->nodes_handle = vertex->nodes;
-
- /* initialize our size and active variables */
- vertex->nactive = dmat->ntaxa;
- vertex->size = dmat->ntaxa;
-
- /* initialize the nodes themselves */
- for(i=0;i<dmat->ntaxa;i++) {
-
- vertex->nodes[i] = (NJ_TREE *)calloc(1, sizeof(NJ_TREE));
-
- vertex->nodes[i]->left = NULL;
- vertex->nodes[i]->right = NULL;
-
- vertex->nodes[i]->taxa_index = i;
- }
-
- return(vertex);
-}
-
-
-
-
-
-/*
- * NJ_decompose() -
- *
- * This function decomposes the star by creating new internal nodes
- * and joining two existing tree nodes to it
- *
- */
-NJ_TREE *
-NJ_decompose(DMAT *dmat,
- NJ_VERTEX *vertex,
- long int x,
- long int y,
- int last_flag) {
-
- NJ_TREE *new_node;
- float x2clade, y2clade;
-
- /* compute the distance from the clade components to the new node */
- if(last_flag) {
- x2clade =
- (dmat->val[NJ_MAP(x, y, dmat->size)]);
- } else {
- x2clade =
- (dmat->val[NJ_MAP(x, y, dmat->size)])/2 +
- ((dmat->r2[x] - dmat->r2[y])/2);
- }
-
- vertex->nodes[x]->dist = x2clade;
-
- if(last_flag) {
- y2clade =
- (dmat->val[NJ_MAP(x, y, dmat->size)]);
- } else {
- y2clade =
- (dmat->val[NJ_MAP(x, y, dmat->size)])/2 +
- ((dmat->r2[y] - dmat->r2[x])/2);
- }
-
- vertex->nodes[y]->dist = y2clade;
-
- /* allocate new node to connect two sub-clades */
- new_node = (NJ_TREE *)calloc(1, sizeof(NJ_TREE));
-
- new_node->left = vertex->nodes[x];
- new_node->right = vertex->nodes[y];
- new_node->taxa_index = NJ_INTERNAL_NODE; /* this is not a terminal node, no taxa index */
-
- if(last_flag) {
- return(new_node);
- }
-
- vertex->nodes[x] = new_node;
- vertex->nodes[y] = vertex->nodes[0];
-
- vertex->nodes = &(vertex->nodes[1]);
-
- vertex->nactive--;
-
- return(new_node);
-}
-
-
-
-/*
- * NJ_print_vertex() -
- *
- * For debugging, print the contents of the vertex
- *
- */
-void
-NJ_print_vertex(NJ_VERTEX *vertex) {
-
- long int i;
-
- printf("Number of active nodes: %ld\n", vertex->nactive);
-
- for(i=0;i<vertex->nactive;i++) {
- printf("%ld ", vertex->nodes[i]->taxa_index);
- }
- printf("\n");
-
- return;
-}
-
-
-
-
-
-
-
-
-
-/*
- * NJ_print_r() -
- *
- */
-void
-NJ_print_r(DMAT *dmat) {
-
- long int i;
-
- printf("\n");
- for(i=0;i<dmat->size;i++) {
- printf("r[%ld] = %0.2f\n", i, dmat->r[i]);
- }
- printf("\n");
-
- return;
-}
-
-
-
-
-
-/*
- * NJ_print_taxanames() -
- *
- * Print taxa names here
- *
- */
-void
-NJ_print_taxanames(DMAT *dmat) {
-
- long int i;
-
- printf("Number of taxa: %ld\n", dmat->ntaxa);
-
- for(i=0;i<dmat->ntaxa;i++) {
- printf("%ld) %s\n", i, dmat->taxaname[i]);
- }
-
- printf("\n");
-
- return;
-}
-
-
-
-
-/*
- * NJ_shuffle_distance_matrix() -
- *
- * Randomize a distance matrix here
- *
- */
-void
-NJ_shuffle_distance_matrix(DMAT *dmat) {
-
-
- long int *perm = NULL;
- char **tmp_taxaname = NULL;
- float *tmp_val = NULL;
- long int i, j;
-
-
- /* alloc the random permutation and a new matrix to hold the shuffled vals */
- perm = (long int *)calloc(dmat->size, sizeof(long int));
- tmp_taxaname = (char **)calloc(dmat->size, sizeof(char *));
- tmp_val = (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float));
- if(!tmp_taxaname || !perm || !tmp_val) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_shuffle_distance_matrix()\n");
- exit(-1);
- }
-
- /* compute a permutation which will describe how to shuffle the matrix */
- NJ_permute(perm, dmat->size);
-
- for(i=0;i<dmat->size;i++) {
- for(j=i+1;j<dmat->size;j++) {
-
- if(perm[j] < perm[i]) {
- tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[j], perm[i], dmat->size)];
- } else {
- tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[i], perm[j], dmat->size)];
- }
-
- }
-
- tmp_taxaname[i] = dmat->taxaname[perm[i]];
- }
-
- /* free our random permutation */
- if(perm) {
- free(perm);
- }
-
- /* free the old value matrix */
- if(dmat->val) {
- free(dmat->val);
- }
-
- /* re-assign the value matrix pointers */
- dmat->val = tmp_val;
- dmat->valhandle = dmat->val;
-
- /*
- * Free our old taxaname with its particular ordering
- * and re-assign to the new.
- */
- if(dmat->taxaname) {
- free(dmat->taxaname);
- }
- dmat->taxaname = tmp_taxaname;
-
- return;
-}
-
-
-
-/*
- * NJ_free_tree() -
- *
- * Free a given NJ tree
- */
-void
-NJ_free_tree(NJ_TREE *node) {
-
- if(!node) {
- return;
- }
-
- if(node->left) {
- NJ_free_tree(node->left);
- }
-
- if(node->right) {
- NJ_free_tree(node->right);
- }
-
- free(node);
-
- return;
-}
-
-
-
-
-
-
-
-
-
-/*
- * NJ_print_permutation()
- *
- * Print a permutation
- *
- */
-void
-NJ_print_permutation(long int *perm,
- long int size) {
-
- long int i;
-
- for(i=0;i<size-1;i++) {
- printf("%ld,", perm[i]);
- }
- printf("%ld\n", perm[size-1]);
-
- return;
-}
-
-
-
-
-/*
- * NJ_dup_dmat() -
- *
- * Duplicate a distance matrix
- *
- */
-DMAT *
-NJ_dup_dmat(DMAT *src) {
-
- long int i;
- DMAT *dest;
-
- /* allocate the resulting distance matrix */
- dest = (DMAT *)calloc(1, sizeof(DMAT));
- if(!dest) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
- goto XIT_BAD;
- }
-
- dest->ntaxa = src->ntaxa;
- dest->size = src->size;
-
- /* allocate space for array of pointers to taxanames */
- dest->taxaname = (char **)calloc(dest->ntaxa, sizeof(char *));
- if(!dest->taxaname) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
- goto XIT_BAD;
- }
-
- /* allocate space for the taxanames themselves */
- for(i=0;i<src->ntaxa;i++) {
- dest->taxaname[i] = (char *)calloc(strlen(src->taxaname[i])+1, sizeof(char));
- if(!dest->taxaname[i]) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
- goto XIT_BAD;
- }
- }
-
- /* allocate space for the distance values */
- dest->val = (float *)calloc(NJ_NCELLS(src->ntaxa), sizeof(float));
- if(!dest->val) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
- goto XIT_BAD;
- }
-
- /* allocate space for the r and r2 vectors */
- dest->r = (float *)calloc(src->ntaxa, sizeof(float));
- dest->r2 = (float *)calloc(src->ntaxa, sizeof(float));
-
- /* copy titles */
- for(i=0;i<src->ntaxa;i++) {
- strcpy(dest->taxaname[i], src->taxaname[i]);
- }
-
- /* copy values */
- memcpy(dest->val, src->valhandle, NJ_NCELLS(src->ntaxa)*sizeof(float));
-
- /* copy r and r2 */
- memcpy(dest->r, src->rhandle, src->ntaxa*sizeof(float));
- memcpy(dest->r2, src->r2handle, src->ntaxa*sizeof(float));
-
- /* track some memory addresses */
- dest->valhandle = dest->val;
- dest->rhandle = dest->r;
- dest->r2handle = dest->r2;
-
- return(dest);
-
- XIT_BAD:
-
- /* free what we may have allocated */
- NJ_free_dmat(dest);
-
- return(NULL);
-}
-
-
-
-
-/*
- * NJ_free_dmat() -
- */
-void
-NJ_free_dmat(DMAT *dmat) {
-
- long int i;
-
- if(dmat) {
-
- if(dmat->taxaname) {
-
- for(i=0;i<dmat->ntaxa;i++) {
- if(dmat->taxaname[i]) {
- free(dmat->taxaname[i]);
- }
- }
-
- free(dmat->taxaname);
- }
-
- if(dmat->valhandle) {
- free(dmat->valhandle);
- }
-
- if(dmat->rhandle) {
- free(dmat->rhandle);
- }
-
- if(dmat->r2handle) {
- free(dmat->r2handle);
- }
-
- free(dmat);
- }
-
- return;
-}
-
-
-
-
-
-/*
- * NJ_free_vertex() -
- *
- * Free the vertex data structure
- *
- */
-void
-NJ_free_vertex(NJ_VERTEX *vertex) {
-
- if(vertex) {
- if(vertex->nodes_handle) {
- free(vertex->nodes_handle);
- }
- free(vertex);
- }
-
- return;
-}
-
-
-
-
-
-
-
-
-
-/*
- *
- * NJ_min_transform() - Find the smallest transformed value to identify
- * which nodes to join.
- *
- * INPUTS:
- * -------
- * dmat -- The distance matrix
- *
- * RETURNS:
- * --------
- * <float> -- The minimimum transformed distance
- * ret_i -- The row of the smallest transformed distance (by reference)
- * ret_j -- The col of the smallest transformed distance (by reference)
- *
- *
- * DESCRIPTION:
- * ------------
- *
- * Used only with traditional Neighbor-Joining, this function checks the entire
- * working distance matrix and identifies the smallest transformed distance.
- * This requires traversing the entire diagonal matrix, which is itself a
- * O(N^2) operation.
- *
- */
-float
-NJ_min_transform(DMAT *dmat,
- long int *ret_i,
- long int *ret_j) {
-
- long int i, j; /* indices used for looping */
- long int tmp_i = 0;/* to limit pointer dereferencing */
- long int tmp_j = 0;/* to limit pointer dereferencing */
- float smallest; /* track the smallest trans. dist */
- float curval; /* the current trans. dist in loop */
-
- float *ptr; /* pointer into distance matrix */
- float *r2; /* pointer to r2 matrix for computing transformed dists */
-
- smallest = (float)HUGE_VAL;
-
- /* track these here to limit pointer dereferencing in inner loop */
- ptr = dmat->val;
- r2 = dmat->r2;
-
- /* for every row */
- for(i=0;i<dmat->size;i++) {
- ptr++; /* skip diagonal */
- for(j=i+1;j<dmat->size;j++) { /* for every column */
-
- /* find transformed distance in matrix at i, j */
- curval = *(ptr++) - (r2[i] + r2[j]);
-
- /* if the transformed distanance is less than the known minimum */
- if(curval < smallest) {
-
- smallest = curval;
- tmp_i = i;
- tmp_j = j;
- }
- }
- }
-
- /* pass back (by reference) the coords of the min. transformed distance */
- *ret_i = tmp_i;
- *ret_j = tmp_j;
-
- return(smallest); /* return the min transformed distance */
-}
-
-
-
-
-
-
-