A7DA217A113FECD400BF472F /* weighted.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = weighted.h; sourceTree = "<group>"; };
A7DA217B113FECD400BF472F /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = "<group>"; };
A7DA217C113FECD400BF472F /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = "<group>"; };
+ A7DF0AD3121EBB14004A03EA /* clearcut.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = clearcut.c; sourceTree = "<group>"; };
+ A7DF0AD4121EBB14004A03EA /* clearcut.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clearcut.h; sourceTree = "<group>"; };
+ A7DF0AD5121EBB14004A03EA /* cmdargs.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = cmdargs.c; sourceTree = "<group>"; };
+ A7DF0AD6121EBB14004A03EA /* cmdargs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cmdargs.h; sourceTree = "<group>"; };
+ A7DF0AD7121EBB14004A03EA /* common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = common.h; sourceTree = "<group>"; };
+ A7DF0AD8121EBB14004A03EA /* dayhoff.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dayhoff.h; sourceTree = "<group>"; };
+ A7DF0AD9121EBB14004A03EA /* distclearcut.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = distclearcut.c; sourceTree = "<group>"; };
+ A7DF0ADA121EBB14004A03EA /* distclearcut.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = distclearcut.h; sourceTree = "<group>"; };
+ A7DF0ADB121EBB14004A03EA /* dmat.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = dmat.c; sourceTree = "<group>"; };
+ A7DF0ADC121EBB14004A03EA /* dmat.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dmat.h; sourceTree = "<group>"; };
+ A7DF0ADD121EBB14004A03EA /* fasta.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fasta.c; sourceTree = "<group>"; };
+ A7DF0ADE121EBB14004A03EA /* fasta.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fasta.h; sourceTree = "<group>"; };
+ A7DF0ADF121EBB14004A03EA /* getopt_long.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = getopt_long.c; sourceTree = "<group>"; };
+ A7DF0AE0121EBB14004A03EA /* getopt_long.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getopt_long.h; sourceTree = "<group>"; };
+ A7DF0AE1121EBB14004A03EA /* prng.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = prng.c; sourceTree = "<group>"; };
+ A7DF0AE2121EBB14004A03EA /* prng.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = prng.h; sourceTree = "<group>"; };
A7E8338B115BBDAA00739EC4 /* parsesffcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parsesffcommand.cpp; sourceTree = "<group>"; };
A7E8338C115BBDAA00739EC4 /* parsesffcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsesffcommand.h; sourceTree = "<group>"; };
/* End PBXFileReference section */
A7CB593B11402EF90010EB83 /* calculators */,
A7CB594A11402FB40010EB83 /* chimeras */,
A7CB594511402F6E0010EB83 /* classifers */,
+ A7DF0AE5121EBC26004A03EA /* clearcutsource */,
A7CB593E11402F110010EB83 /* commands */,
A7CB594211402F430010EB83 /* containers */,
A7DA2023113FECD400BF472F /* collect.cpp */,
name = read;
sourceTree = "<group>";
};
+ A7DF0AE5121EBC26004A03EA /* clearcutsource */ = {
+ isa = PBXGroup;
+ children = (
+ A7DF0AD4121EBB14004A03EA /* clearcut.h */,
+ A7DF0AD3121EBB14004A03EA /* clearcut.c */,
+ A7DF0AD6121EBB14004A03EA /* cmdargs.h */,
+ A7DF0AD7121EBB14004A03EA /* common.h */,
+ A7DF0AD8121EBB14004A03EA /* dayhoff.h */,
+ A7DF0AD5121EBB14004A03EA /* cmdargs.c */,
+ A7DF0AD9121EBB14004A03EA /* distclearcut.c */,
+ A7DF0ADA121EBB14004A03EA /* distclearcut.h */,
+ A7DF0ADB121EBB14004A03EA /* dmat.c */,
+ A7DF0ADC121EBB14004A03EA /* dmat.h */,
+ A7DF0ADD121EBB14004A03EA /* fasta.c */,
+ A7DF0ADE121EBB14004A03EA /* fasta.h */,
+ A7DF0ADF121EBB14004A03EA /* getopt_long.c */,
+ A7DF0AE0121EBB14004A03EA /* getopt_long.h */,
+ A7DF0AE2121EBB14004A03EA /* prng.h */,
+ A7DF0AE1121EBB14004A03EA /* prng.c */,
+ );
+ name = clearcutsource;
+ sourceTree = "<group>";
+ };
/* End PBXGroup section */
/* Begin PBXLegacyTarget section */
--- /dev/null
+/*
+ * clearcut.c
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * An implementation of the Relaxed Neighbor-Joining algorithm
+ * of Evans, J., Sheneman, L., and Foster, J.
+ *
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+
+
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <sys/time.h>
+#include <float.h>
+
+#include "dist.h"
+#include "dmat.h"
+#include "fasta.h"
+#include "cmdargs.h"
+#include "common.h"
+#include "clearcut.h"
+#include "prng.h"
+
+
+
+/*
+ * main() -
+ *
+ * The entry point to the program.
+ *
+ */
+int
+main(int argc,
+ char *argv[]) {
+
+ DMAT *dmat; /* The working distance matrix */
+ DMAT *dmat_backup = NULL;/* A backup distance matrix */
+ NJ_TREE *tree; /* The phylogenetic tree */
+ NJ_ARGS *nj_args; /* Structure for holding command-line arguments */
+ long int i;
+
+ /* some variables for tracking time */
+ struct timeval tv;
+ unsigned long long startUs, endUs;
+
+
+ /* check and parse supplied command-line arguments */
+ nj_args = NJ_handle_args(argc, argv);
+ if(!nj_args) {
+ fprintf(stderr, "Clearcut: Error processing command-line arguments.\n");
+ exit(-1);
+ }
+
+ /* for verbose reporting, print the random number seed to stdout */
+ if(nj_args->verbose_flag) {
+ printf("PRNG SEED: %d\n", nj_args->seed);
+ }
+
+ /* Initialize Mersenne Twister PRNG */
+ init_genrand(nj_args->seed);
+
+
+ switch(nj_args->input_mode) {
+
+ /* If the input type is a distance matrix */
+ case NJ_INPUT_MODE_DISTANCE:
+
+ /* parse the distance matrix */
+ dmat = NJ_parse_distance_matrix(nj_args);
+ if(!dmat) {
+ exit(-1);
+ }
+
+ break;
+
+ /* If the input type is a multiple sequence alignment */
+ case NJ_INPUT_MODE_ALIGNED_SEQUENCES:
+
+ /* build a distance matrix from a multiple sequence alignment */
+ dmat = NJ_build_distance_matrix(nj_args);
+ if(!dmat) {
+ fprintf(stderr, "Clearcut: Failed to build distance matrix from alignment.\n");
+ exit(-1);
+ }
+
+ break;
+
+ default:
+
+ fprintf(stderr, "Clearcut: Could not determine how to process input\n");
+ exit(-1);
+ }
+
+ /*
+ * Output the computed distance matrix,
+ * if the user specified one.
+ */
+ if(nj_args->matrixout) {
+ NJ_output_matrix(nj_args, dmat);
+ }
+
+ /*
+ * If we are going to generate multiple trees from
+ * the same distance matrix, we need to make a backup
+ * of the original distance matrix.
+ */
+ if(nj_args->ntrees > 1) {
+ dmat_backup = NJ_dup_dmat(dmat);
+ }
+
+ /* process n trees */
+ for(i=0;i<nj_args->ntrees;i++) {
+
+ /*
+ * If the user has specified matrix shuffling, we need
+ * to randomize the distance matrix
+ */
+ if(nj_args->shuffle) {
+ NJ_shuffle_distance_matrix(dmat);
+ }
+
+ /* RECORD THE PRECISE TIME OF THE START OF THE NEIGHBOR-JOINING */
+ gettimeofday(&tv, NULL);
+ startUs = ((unsigned long long) tv.tv_sec * 1000000ULL)
+ + ((unsigned long long) tv.tv_usec);
+
+
+ /*
+ * Invoke either the Relaxed Neighbor-Joining algorithm (default)
+ * or the "traditional" Neighbor-Joining algorithm
+ */
+ if(nj_args->neighbor) {
+ tree = NJ_neighbor_joining(nj_args, dmat);
+ } else {
+ tree = NJ_relaxed_nj(nj_args, dmat);
+ }
+
+ if(!tree) {
+ fprintf(stderr, "Clearcut: Failed to construct tree.\n");
+ exit(0);
+ }
+
+ /* RECORD THE PRECISE TIME OF THE END OF THE NEIGHBOR-JOINING */
+ gettimeofday(&tv, NULL);
+ endUs = ((unsigned long long) tv.tv_sec * 1000000ULL)
+ + ((unsigned long long) tv.tv_usec);
+
+ /* print the time taken to perform the neighbor join */
+ if(nj_args->verbose_flag) {
+ if(nj_args->neighbor) {
+ fprintf(stderr, "NJ tree built in %llu.%06llu secs\n",
+ (endUs - startUs) / 1000000ULL,
+ (endUs - startUs) % 1000000ULL);
+ } else {
+ fprintf(stderr, "RNJ tree built in %llu.%06llu secs\n",
+ (endUs - startUs) / 1000000ULL,
+ (endUs - startUs) % 1000000ULL);
+ }
+ }
+
+ /* Output the neighbor joining tree here */
+ NJ_output_tree(nj_args, tree, dmat, i);
+
+ NJ_free_tree(tree); /* Free the tree */
+ NJ_free_dmat(dmat); /* Free the working distance matrix */
+
+ /*
+ * If we need to do another iteration, lets re-initialize
+ * our working distance matrix.
+ */
+ if(nj_args->ntrees > 1 && i<(nj_args->ntrees-1) ) {
+ dmat = NJ_dup_dmat(dmat_backup);
+ }
+ }
+
+ /* Free the backup distance matrix */
+ if(nj_args->ntrees > 1) {
+ NJ_free_dmat(dmat_backup);
+ }
+
+ /* If verbosity, describe where the tree output is */
+ if(nj_args->verbose_flag) {
+ if(nj_args->neighbor) {
+ printf("NJ tree(s) in %s\n", nj_args->outfilename);
+ } else {
+ printf("Relaxed NJ tree(s) in %s\n", nj_args->outfilename);
+ }
+ }
+
+ exit(0);
+}
+
+
+
+
+
+/*
+ * NJ_find_hmin() - Find minimum transformed values along horizontal
+ *
+ *
+ * INPUTS:
+ * -------
+ * dmat -- The distance matrix
+ * a -- The index of the specific taxon in the distance matrix
+ *
+ * RETURNS:
+ * --------
+ * <float> -- The value of the selected minimum
+ * min -- Used to transport the index of the minima out
+ * of the function (by reference)
+ * hmincount -- Return the number of minima along the horizontal
+ * (by reference)
+ *
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * A fast, inline function to find the smallest transformed value
+ * along the "horizontal" portion of an entry in a distance matrix.
+ *
+ * Distance matrices are stored internally as continguously-allocated
+ * upper-diagonal structures. With the exception of the taxa at
+ * row 0 of this upper-diagonal matrix, all taxa have both a horizontal
+ * and vertical component in the distance matrix. This function
+ * scans the horizonal portion of the entry in the distance matrix
+ * for the specified taxon and finds the minimum transformed value
+ * along that horizontal component.
+ *
+ * Since multiple minima can exist along the horizontal portion
+ * of the entry, I consider all minima and break ties
+ * stochastically to help avoid systematic bias.
+ *
+ * Just searching along the horizontal portion of a row is very fast
+ * since the data is stored linearly and contiguously in memory and
+ * cache locality is exploited in the distance matrix representation.
+ *
+ * Look at nj.h for more information on how the distance matrix
+ * is architected.
+ *
+ */
+static inline
+float
+NJ_find_hmin(DMAT *dmat,
+ long int a,
+ long int *min,
+ long int *hmincount) {
+
+ long int i; /* index variable for looping */
+ int size; /* current size of distance matrix */
+ int mindex = 0; /* holds the current index to the chosen minimum */
+ float curval; /* used to hold current transformed values */
+ float hmin; /* the value of the transformed minimum */
+
+ float *ptr, *r2, *val; /* pointers used to reduce dereferencing in inner loop */
+
+ /* values used for stochastic selection among multiple minima */
+ float p, x;
+ long int smallcnt;
+
+ /* initialize the min to something large */
+ hmin = (float)HUGE_VAL;
+
+ /* setup some pointers to limit dereferencing later */
+ r2 = dmat->r2;
+ val = dmat->val;
+ size = dmat->size;
+
+ /* initialize values associated with minima tie breaking */
+ p = 1.0;
+ smallcnt = 0;
+
+
+ ptr = &(val[NJ_MAP(a, a+1, size)]); /* at the start of the horiz. part */
+ for(i=a+1;i<size;i++) {
+
+ curval = *(ptr++) - (r2[a] + r2[i]); /* compute transformed distance */
+
+ if(NJ_FLT_EQ(curval, hmin)) { /* approx. equal */
+
+ smallcnt++;
+
+ p = 1.0/(float)smallcnt;
+ x = genrand_real2();
+
+ /* select this minimum in a way which is proportional to
+ the number of minima found along the row so far */
+ if( x < p ) {
+ mindex = i;
+ }
+
+ } else if (curval < hmin) {
+
+ smallcnt = 1;
+ hmin = curval;
+ mindex = i;
+ }
+ }
+
+ /* save off the the minimum index to be returned via reference */
+ *min = mindex;
+
+ /* save off the number of minima */
+ *hmincount = smallcnt;
+
+ /* return the value of the smallest tranformed distance */
+ return(hmin);
+}
+
+
+
+
+
+
+
+
+/*
+ * NJ_find_vmin() - Find minimum transformed distance along vertical
+ *
+ *
+ * INPUTS:
+ * -------
+ * dmat -- The distance matrix
+ * a -- The index of the specific taxon in the distance matrix
+ *
+ *
+ * RETURNS:
+ * --------
+ * <float> -- The value of the selected minimum
+ * min -- Used to transport the index of the minima out
+ * of the function (by reference)
+ * vmincount -- The number of minima along the vertical
+ * return by reference.
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * A fast, inline function to find the smallest transformed value
+ * along the "vertical" portion of an entry in a distance matrix.
+ *
+ * Distance matrices are stored internally as continguously-allocated
+ * upper-diagonal matrices. With the exception of the taxa at
+ * row 0 of this upper-diagonal matrix, all taxa have both a horizontal
+ * and vertical component in the distance matrix. This function
+ * scans the vertical portion of the entry in the distance matrix
+ * for the specified taxon and finds the minimum transformed value
+ * along that vertical component.
+ *
+ * Since multiple minima can exist along the vertical portion
+ * of the entry, I consider all minima and break ties
+ * stochastically to help avoid systematic bias.
+ *
+ * Due to cache locality reasons, searching along the vertical
+ * component is going to be considerably slower than searching
+ * along the horizontal.
+ *
+ * Look at nj.h for more information on how the distance matrix
+ * is architected.
+ *
+ */
+static inline
+float
+NJ_find_vmin(DMAT *dmat,
+ long int a,
+ long int *min,
+ long int *vmincount) {
+
+ long int i; /* index variable used for looping */
+ long int size; /* track the size of the matrix */
+ long int mindex = 0;/* track the index to the minimum */
+ float curval; /* track value of current transformed distance */
+ float vmin; /* the index to the smallest "vertical" minimum */
+
+ /* pointers which are used to reduce pointer dereferencing in inner loop */
+ float *ptr, *r2, *val;
+
+ /* values used in stochastically breaking ties */
+ float p, x;
+ long int smallcnt;
+
+ /* initialize the vertical min to something really big */
+ vmin = (float)HUGE_VAL;
+
+ /* save off some values to limit dereferencing later */
+ r2 = dmat->r2;
+ val = dmat->val;
+ size = dmat->size;
+
+ p = 1.0;
+ smallcnt = 0;
+
+ /* start on the first row and work down */
+ ptr = &(val[NJ_MAP(0, a, size)]);
+ for(i=0;i<a;i++) {
+
+ curval = *ptr - (r2[i] + r2[a]); /* compute transformed distance */
+
+ if(NJ_FLT_EQ(curval, vmin)) { /* approx. equal */
+
+ smallcnt++;
+
+ p = 1.0/(float)smallcnt;
+ x = genrand_real2();
+
+ /* break ties stochastically to avoid systematic bias */
+ if( x < p ) {
+ mindex = i;
+ }
+
+ } else if (curval < vmin) {
+
+ smallcnt = 1;
+ vmin = curval;
+ mindex = i;
+ }
+
+ /* increment our working pointer to the next row down */
+ ptr += size-i-1;
+ }
+
+ /* pass back the index to the minimum found so far (by reference) */
+ *min = mindex;
+
+ /* pass back the number of minima along the vertical */
+ *vmincount = smallcnt;
+
+ /* return the value of the smallest transformed distance */
+ return(vmin);
+}
+
+
+
+
+/*
+ * NJ_permute() - Generate random permutation using the provably unbiased
+ * Fisher-Yates Shuffle.
+ *
+ * INPUTS:
+ * -------
+ * perm -- A pointer to the array of long ints which will be filled.
+ * size -- the length of the permutation vector
+ *
+ *
+ * OUTPUTS:
+ * -------
+ * NONE
+ *
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * Return a permuted list of numbers from 0 through size.
+ * This is accomplished by initializing the permutation
+ * as an ordered list of integers and then iterating
+ * through and swapping two values selected according to the
+ * Fisher-Yates method.
+ *
+ * This unbiased method for random permutation generation is
+ * discussed in:
+ *
+ * Donald E. Knuth, The Art of Computer Programming,
+ * Addison-Wesley, Volumes 1, 2, and 3, 3rd edition, 1998
+ *
+ */
+static inline
+void
+NJ_permute(long int *perm,
+ long int size) {
+
+ long int i; /* index used for looping */
+ long int swap; /* we swap values to generate permutation */
+ long int tmp; /* used for swapping values */
+
+
+ /* check to see if vector of long ints is valid */
+ if(!perm) {
+ fprintf(stderr, "Clearcut: NULL permutation pointer in NJ_permute()\n");
+ exit(-1);
+ }
+
+ /* init permutation as an ordered list of integers */
+ for(i=0;i<size;i++) {
+ perm[i] = i;
+ }
+
+ /*
+ * Iterate across the array from i = 0 to size -1, swapping ith element
+ * with a randomly chosen element from a changing range of possible values
+ */
+ for(i=0;i<size;i++) {
+
+ /* choose which element we will swap with */
+ swap = i + NJ_genrand_int31_top(size-i);
+
+ /* swap elements here */
+ if(i != swap) {
+ tmp = perm[swap];
+ perm[swap] = perm[i];
+ perm[i] = tmp;
+ }
+ }
+
+ return;
+}
+
+
+
+
+
+/*
+ * NJ_compute_r() - Compute post-join changes to r-vector. In this
+ * case, we decrement all of the accumulated distances
+ * in the r-vector for the two nodes that were
+ * recently joined (i.e. x, y)
+ *
+ * INPUTS:
+ * -------
+ * dmat -- The distance matrix
+ * a -- The index of one of the taxa that were joined
+ * b -- The index of the other taxa that was joined
+ *
+ * RETURNS:
+ * --------
+ * NONE
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * This vector of floats is used as a summary of overall distances from
+ * each entry in the distance matrix to every other entry. These values
+ * are then used when computing the transformed distances from which
+ * decisions concerning joining are made.
+ *
+ * For speed, we don't recompute r from scratch. Instead, we decrement
+ * all entries in r by the appropriate amount. That is, r[i] -= dist(i, a)
+ * and r[i] -= dist(i, b).
+ *
+ * As a speed optimization, I process the rows altogether for cache locality
+ * purposes, and then process columns.
+ *
+ * The processing of the scaled r matrix (r2) is handled on-the-fly elsewhere.
+ *
+ */
+static inline
+void
+NJ_compute_r(DMAT *dmat,
+ long int a,
+ long int b) {
+
+ long int i; /* a variable used in indexing */
+ float *ptrx, *ptry; /* pointers into the distance matrix */
+
+ /* some variables to limit pointer dereferencing in loop */
+ long int size;
+ float *r, *val;
+
+ /* to limit pointer dereferencing */
+ size = dmat->size;
+ val = dmat->val;
+ r = dmat->r+a+1;
+
+ /*
+ * Loop through the rows and decrement the stored r values
+ * by the distances stored in the rows and columns of the distance
+ * matrix which are being removed post-join.
+ *
+ * We do the rows altogether in order to benefit from cache locality.
+ */
+ ptrx = &(val[NJ_MAP(a, a+1, size)]);
+ ptry = &(val[NJ_MAP(b, b+1, size)]);
+
+ for(i=a+1;i<size;i++) {
+ *r -= *(ptrx++);
+
+ if(i>b) {
+ *r -= *(ptry++);
+ }
+
+ r++;
+ }
+
+ /* Similar to the above loop, we now do the columns */
+ ptrx = &(val[NJ_MAP(0, a, size)]);
+ ptry = &(val[NJ_MAP(0, b, size)]);
+ r = dmat->r;
+ for(i=0;i<b;i++) {
+ if(i<a) {
+ *r -= *ptrx;
+ ptrx += size-i-1;
+ }
+
+ *r -= *ptry;
+ ptry += size-i-1;
+ r++;
+ }
+
+ return;
+}
+
+
+
+
+
+/*
+ * NJ_check_additivity() - Check to make sure that addivity preserved by join
+ *
+ *
+ * INPUTS:
+ * -------
+ * dmat -- distance matrix
+ * a -- index into dmat for one of the rows to be joined
+ * b -- index into dmat for another row to be joined
+ *
+ * OUTPUTS:
+ * --------
+ * int 1 if join adheres to additivity constraint
+ * 0 if join does breaks additivity
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * Here we perform the check to make sure that by joining a and b we do not
+ * also break consistency (i.e. preserves additivity) with the distances between
+ * the taxa in the new clade and other nodes in the tree. This is done quite
+ * efficiently by looking up the untransformed distance between node b and
+ * some other "target" taxa in the distance matrix (which is not a nor b) and
+ * comparing that distance to the distance computed by finding the distance
+ * from node a to the proposed internal node "x" which joins (a,b).
+ *
+ * If dist(x,b) + dist (b, target) == dist(b, target) then additivity is
+ * preserved, otherwise, additivity is not preserved. If we are in
+ * additivity mode, this join should be rejected.
+ *
+ */
+static inline
+int
+NJ_check_additivity(DMAT *dmat,
+ long int a,
+ long int b) {
+
+ float a2clade, b2clade;
+ float clade_dist;
+ long int target;
+
+
+ /* determine target taxon here */
+ if(b == dmat->size-1) {
+ /* if we can't do a row here, lets do a column */
+ if(a==0) {
+ if(b==1) {
+ target = 2;
+ } else {
+ target = 1;
+ }
+ } else {
+ target = 0;
+ }
+ } else {
+ target = b+1;
+ }
+
+
+ /* distance between a and the root of clade (a,b) */
+ a2clade =
+ ( (dmat->val[NJ_MAP(a, b, dmat->size)]) +
+ (dmat->r2[a] - dmat->r2[b]) ) / 2.0;
+
+ /* distance between b and the root of clade (a,b) */
+ b2clade =
+ ( (dmat->val[NJ_MAP(a, b, dmat->size)]) +
+ (dmat->r2[b] - dmat->r2[a]) ) / 2.0;
+
+ /* distance between the clade (a,b) and the target taxon */
+ if(b<target) {
+
+ /* compute the distance from the clade root to the target */
+ clade_dist =
+ ( (dmat->val[NJ_MAP(a, target, dmat->size)] - a2clade) +
+ (dmat->val[NJ_MAP(b, target, dmat->size)] - b2clade) ) / 2.0;
+
+ /*
+ * Check to see that distance from clade root to target + distance from
+ * b to clade root are equal to the distance from b to the target
+ */
+ if(NJ_FLT_EQ(dmat->val[NJ_MAP(b, target, dmat->size)],
+ (clade_dist + b2clade))) {
+ return(1); /* join is legitimate */
+ } else {
+ return(0); /* join is illigitimate */
+ }
+
+ } else {
+
+ /* compute the distance from the clade root to the target */
+ clade_dist =
+ ( (dmat->val[NJ_MAP(target, a, dmat->size)] - a2clade) +
+ (dmat->val[NJ_MAP(target, b, dmat->size)] - b2clade) ) / 2.0;
+
+ /*
+ * Check to see that distance from clade root to target + distance from
+ * b to clade root are equal to the distance from b to the target
+ */
+ if(NJ_FLT_EQ(dmat->val[NJ_MAP(target, b, dmat->size)],
+ (clade_dist + b2clade))) {
+ return(1); /* join is legitimate */
+ } else {
+ return(0); /* join is illegitimate */
+ }
+ }
+}
+
+
+
+
+
+
+
+/*
+ * NJ_check() - Check to see if two taxa can be joined
+
+ *
+ * INPUTS:
+ * -------
+ * nj_args -- Pointer to the data structure holding command-line args
+ * dmat -- distance matrix
+ * a -- index into dmat for one of the rows to be joined
+ * b -- index into dmat for another row to be joined
+ * min -- the minimum value found
+ * additivity -- a flag (0 = not additive mode, 1 = additive mode)
+ *
+ * OUTPUTS:
+ * --------
+ * int 1 if join is okay
+ * 0 if join is not okay
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * This function ultimately takes two rows and makes sure that the
+ * intersection of those two rows, which has a transformed distance of
+ * "min", is actually the smallest (or equal to the smallest)
+ * transformed distance for both rows (a, b). If so, it returns
+ * 1, else it returns 0.
+ *
+ * Basically, we want to join two rows only if the minimum
+ * transformed distance on either row is at the intersection of
+ * those two rows.
+ *
+ */
+static inline
+int
+NJ_check(NJ_ARGS *nj_args,
+ DMAT *dmat,
+ long int a,
+ long int b,
+ float min,
+ int additivity) {
+
+
+ long int i, size;
+ float *ptr, *val, *r2;
+
+
+ /* some aliases for speed and readability reasons */
+ val = dmat->val;
+ r2 = dmat->r2;
+ size = dmat->size;
+
+
+ /* now determine if joining a, b will result in broken distances */
+ if(additivity) {
+ if(!NJ_check_additivity(dmat, a, b)) {
+ return(0);
+ }
+ }
+
+ /* scan the horizontal of row b, punt if anything < min */
+ ptr = &(val[NJ_MAP(b, b+1, size)]);
+ for(i=b+1;i<size;i++) {
+ if( NJ_FLT_LT( (*ptr - (r2[b] + r2[i])), min) ) {
+ return(0);
+ }
+ ptr++;
+ }
+
+ /* scan the vertical component of row a, punt if anything < min */
+ if(nj_args->norandom) { /* if we are doing random joins, we checked this */
+ ptr = val + a;
+ for(i=0;i<a;i++) {
+ if( NJ_FLT_LT( (*ptr - (r2[i] + r2[a])), min) ) {
+ return(0);
+ }
+ ptr += size-i-1;
+ }
+ }
+
+ /* scan the vertical component of row b, punt if anything < min */
+ ptr = val + b;
+ for(i=0;i<b;i++) {
+ if( NJ_FLT_LT( (*ptr - (r2[i] + r2[b])), min) && i!=a) {
+ return(0);
+ }
+ ptr += size-i-1;
+ }
+
+ return(1);
+}
+
+
+
+
+
+
+
+/*
+ * NJ_collapse() - Collapse the distance matrix by removing
+ * rows a and b from the distance matrix and
+ * replacing them with a single new row which
+ * represents the internal node joining a and b
+ *
+ *
+ * INPUTS:
+ * -------
+ * dmat -- A pointer to the distance matrix
+ * vertex -- A pointer to the vertex vector (vector of tree nodes)
+ * which is used in constructing the tree
+ * a -- An index to a row in the distance matrix from which we
+ * joined. This row will be collapsed.
+ * b -- An index to a row in the distance matrix from which we
+ * joined. This row will be collapsed.
+ *
+ * RETURNS:
+ * --------
+ * NONE
+ *
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * This function collapses the distance matrix in a way which optimizes
+ * cache locality and ultimately gives us a speed improvement due to
+ * cache. At this point, we've decided to join rows a and b from
+ * the distance matrix. We will remove rows a and b from the distance
+ * matrix and replace them with a new row which represents the internal
+ * node which joins rows a and b together.
+ *
+ * We always keep the matrix as compact as possible in order to
+ * get good performance from our cache in subsequent operations. Cache
+ * is the key to good performance here.
+ *
+ * Key Steps:
+ * ----------
+ *
+ * 1) Fill the "a" row with the new distances of the internal node
+ * joining a and b to all other rows.
+ * 2) Copy row 0 into what was row b
+ * 3) Increment the pointer to the start of the distance matrix
+ * by one row.
+ * 4) Decrement the size of the matrix by one row.
+ * 5) Do roughly the same thing to the r vector in order to
+ * keep it in sync with the distance matrix.
+ * 6) Compute the scaled r vector (r2) based on the updated
+ * r vector
+ *
+ * This keeps the distance matrix as compact as possible in memory, and
+ * is a relatively fast operation.
+ *
+ * This function requires that a < b
+ *
+ */
+static inline
+void
+NJ_collapse(DMAT *dmat,
+ NJ_VERTEX *vertex,
+ long int a,
+ long int b) {
+
+
+ long int i; /* index used for looping */
+ long int size; /* size of dmat --> reduce pointer dereferencing */
+ float a2clade; /* distance from a to the new node that joins a and b */
+ float b2clade; /* distance from b to the new node that joins a and b */
+ float cval; /* stores distance information during loop */
+ float *vptr; /* pointer to elements in first row of dist matrix */
+ float *ptra; /* pointer to elements in row a of distance matrix */
+ float *ptrb; /* pointer to elements in row b of distance matrix */
+
+ float *val, *r, *r2; /* simply used to limit pointer dereferencing */
+
+
+ /* We must assume that a < b */
+ if(a >= b) {
+ fprintf(stderr, "Clearcut: (a<b) constraint check failed in NJ_collapse()\n");
+ exit(0);
+ }
+
+ /* some shortcuts to help limit dereferencing */
+ val = dmat->val;
+ r = dmat->r;
+ r2 = dmat->r2;
+ size = dmat->size;
+
+ /* compute the distance from the clade components (a, b) to the new node */
+ a2clade =
+ ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[a] - dmat->r2[b]) ) / 2.0;
+ b2clade =
+ ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[b] - dmat->r2[a]) ) / 2.0;
+
+
+ r[a] = 0.0; /* we are removing row a, so clear dist. in r */
+
+ /*
+ * Fill the horizontal part of the "a" row and finish computing r and r2
+ * we handle the horizontal component first to maximize cache locality
+ */
+ ptra = &(val[NJ_MAP(a, a+1, size)]); /* start ptra at the horiz. of a */
+ ptrb = &(val[NJ_MAP(a+1, b, size)]); /* start ptrb at comparable place */
+ for(i=a+1;i<size;i++) {
+
+ /*
+ * Compute distance from new internal node to others in
+ * the distance matrix.
+ */
+ cval =
+ ( (*ptra - a2clade) +
+ (*ptrb - b2clade) ) / 2.0;
+
+ /* incr. row b pointer differently depending on where i is in loop */
+ if(i<b) {
+ ptrb += size-i-1; /* traverse vertically by incrementing by row */
+ } else {
+ ptrb++; /* traverse horiz. by incrementing by column */
+ }
+
+ /* assign the newly computed distance and increment a ptr by a column */
+ *(ptra++) = cval;
+
+ /* accumulate the distance onto the r vector */
+ r[a] += cval;
+ r[i] += cval;
+
+ /* scale r2 on the fly here */
+ r2[i] = r[i]/(float)(size-3);
+ }
+
+ /* fill the vertical part of the "a" column and finish computing r and r2 */
+ ptra = val + a; /* start at the top of the columb for "a" */
+ ptrb = val + b; /* start at the top of the columb for "b" */
+ for(i=0;i<a;i++) {
+
+ /*
+ * Compute distance from new internal node to others in
+ * the distance matrix.
+ */
+ cval =
+ ( (*ptra - a2clade) +
+ (*ptrb - b2clade) ) / 2.0;
+
+ /* assign the newly computed distance and increment a ptr by a column */
+ *ptra = cval;
+
+ /* accumulate the distance onto the r vector */
+ r[a] += cval;
+ r[i] += cval;
+
+ /* scale r2 on the fly here */
+ r2[i] = r[i]/(float)(size-3);
+
+ /* here, always increment by an entire row */
+ ptra += size-i-1;
+ ptrb += size-i-1;
+ }
+
+
+ /* scale r2 on the fly here */
+ r2[a] = r[a]/(float)(size-3);
+
+
+
+ /*
+ * Copy row 0 into row b. Again, the code is structured into two
+ * loops to maximize cache locality for writes along the horizontal
+ * component of row b.
+ */
+ vptr = val;
+ ptrb = val + b;
+ for(i=0;i<b;i++) {
+ *ptrb = *(vptr++);
+ ptrb += size-i-1;
+ }
+ vptr++; /* skip over the diagonal */
+ ptrb = &(val[NJ_MAP(b, b+1, size)]);
+ for(i=b+1;i<size;i++) {
+ *(ptrb++) = *(vptr++);
+ }
+
+ /*
+ * Collapse r here by copying contents of r[0] into r[b] and
+ * incrementing pointer to the beginning of r by one row
+ */
+ r[b] = r[0];
+ dmat->r = r+1;
+
+
+ /*
+ * Collapse r2 here by copying contents of r2[0] into r2[b] and
+ * incrementing pointer to the beginning of r2 by one row
+ */
+ r2[b] = r2[0];
+ dmat->r2 = r2+1;
+
+ /* increment dmat pointer to next row */
+ dmat->val += size;
+
+ /* decrement the total size of the distance matrix by one row */
+ dmat->size--;
+
+ return;
+}
+
+
+
+
+
+
+
+
+
+/*
+ * NJ_neighbor_joining() - Perform a traditional Neighbor-Joining
+ *
+ *
+ * INPUTS:
+ * -------
+ * nj_args -- A pointer to a structure containing the command-line arguments
+ * dmat -- A pointer to the distance matrix
+ *
+ * RETURNS:
+ * --------
+ * NJ_TREE * -- A pointer to the Neighbor-Joining tree.
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * This function performs a traditional Neighbor-Joining operation in which
+ * the distance matrix is exhaustively searched for the global minimum
+ * transformed distance. The two nodes which intersect at the global
+ * minimum transformed distance are then joined and the distance
+ * matrix is collapsed. This process continues until there are only
+ * two nodes left, at which point those nodes are joined.
+ *
+ */
+NJ_TREE *
+NJ_neighbor_joining(NJ_ARGS *nj_args,
+ DMAT *dmat) {
+
+
+ NJ_TREE *tree = NULL;
+ NJ_VERTEX *vertex = NULL;
+
+ long int a, b;
+ float min;
+
+
+ /* initialize the r and r2 vectors */
+ NJ_init_r(dmat);
+
+ /* allocate and initialize our vertex vector used for tree construction */
+ vertex = NJ_init_vertex(dmat);
+ if(!vertex) {
+ fprintf(stderr, "Clearcut: Could not initialize vertex in NJ_neighbor_joining()\n");
+ return(NULL);
+ }
+
+ /* we iterate until the working distance matrix has only 2 entries */
+ while(vertex->nactive > 2) {
+
+ /*
+ * Find the global minimum transformed distance from the distance matrix
+ */
+ min = NJ_min_transform(dmat, &a, &b);
+
+ /*
+ * Build the tree by removing nodes a and b from the vertex array
+ * and inserting a new internal node which joins a and b. Collapse
+ * the vertex array similarly to how the distance matrix and r and r2
+ * are compacted.
+ */
+ NJ_decompose(dmat, vertex, a, b, 0);
+
+ /* decrement the r and r2 vectors by the distances corresponding to a, b */
+ NJ_compute_r(dmat, a, b);
+
+ /* compact the distance matrix and the r and r2 vectors */
+ NJ_collapse(dmat, vertex, a, b);
+ }
+
+ /* Properly join the last two nodes on the vertex list */
+ tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST);
+
+ /* return the computed tree to the calling function */
+ return(tree);
+}
+
+
+
+
+
+
+
+
+/*
+ * NJ_relaxed_nj() - Construct a tree using the Relaxed Neighbor-Joining
+ *
+ * INPUTS:
+ * -------
+ * nj_args -- A pointer to a data structure containing the command-line args
+ * dmat -- A pointer to the distance matrix
+ *
+ * RETURNS:
+ * --------
+ *
+ * NJ_TREE * -- A pointer to a Relaxed Neighbor-Joining tree
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * This function implements the Relaxed Neighbor-Joining algorithm of
+ * Evans, J., Sheneman, L., and Foster, J.
+ *
+ * Relaxed Neighbor-Joining works by choosing a local minimum transformed
+ * distance when determining when to join two nodes. (Traditional
+ * Neighbor-Joining chooses a global minimum transformed distance).
+ *
+ * The algorithm shares the property with traditional NJ that if the
+ * input distances are additive (self-consistent), then the algorithm
+ * will manage to construct the true tree consistent with the additive
+ * distances. Additivity state is tracked and every proposed join is checked
+ * to make sure it maintains additivity constraints. If no
+ * additivity-preserving join is possible in a single pass, then the distance
+ * matrix is non-additive, and additivity checking is abandoned.
+ *
+ * The algorithm will either attempt joins randomly, or it will perform joins
+ * in a particular order. The default behavior is to perform joins randomly,
+ * but this can be switched off with a command-line switch.
+ *
+ * For randomized joins, all attempts are made to alleviate systematic bias
+ * for the choice of rows to joins. All tie breaking is done in a way which
+ * is virtually free of bias.
+ *
+ * To perform randomized joins, a random permutation is constructed which
+ * specifies the order in which to attempt joins. I iterate through the
+ * random permutation, and for each row in the random permutation, I find
+ * the minimum transformed distance for that row. If there are multiple
+ * minima, I break ties evenly. For the row which intersects our
+ * randomly chosen row at the chosen minimum, if we are are still in
+ * additivity mode, I check to see if joining the two rows will break
+ * our additivity constraints. If not, I check to see if there exists
+ * a transformed distance which is smaller than the minimum found on the
+ * original row. If there is, then we proceed through the random permutation
+ * trying additional rows in the random order specified in the permutation.
+ * If there is no smaller minimum transformed distance on either of the
+ * two rows, then we join them, collapse the distance matrix, and compute
+ * a new random permutation.
+ *
+ * If the entire random permutation is traversed and no joins are possible
+ * due to additivity constraints, then the distance matrix is not
+ * additive, and additivity constraint-checking is disabled.
+ *
+ */
+NJ_TREE *
+NJ_relaxed_nj(NJ_ARGS *nj_args,
+ DMAT *dmat) {
+
+
+ NJ_TREE *tree;
+ NJ_VERTEX *vertex;
+ long int a, b, t, bh, bv, i;
+ float hmin, vmin, hvmin;
+ float p, q, x;
+ int join_flag;
+ int additivity_mode;
+ long int hmincount, vmincount;
+ long int *permutation = NULL;
+
+
+
+ /* initialize the r and r2 vectors */
+ NJ_init_r(dmat);
+
+ additivity_mode = 1;
+
+ /* allocate the permutation vector, if we are in randomize mode */
+ if(!nj_args->norandom) {
+ permutation = (long int *)calloc(dmat->size, sizeof(long int));
+ if(!permutation) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_relaxed_nj()\n");
+ return(NULL);
+ }
+ }
+
+ /* allocate and initialize our vertex vector used for tree construction */
+ vertex = NJ_init_vertex(dmat);
+
+ /* loop until there are only 2 nodes left to join */
+ while(vertex->nactive > 2) {
+
+ switch(nj_args->norandom) {
+
+ /* RANDOMIZED JOINS */
+ case 0:
+
+ join_flag = 0;
+
+ NJ_permute(permutation, dmat->size-1);
+ for(i=0;i<dmat->size-1 && (vertex->nactive>2) ;i++) {
+
+ a = permutation[i];
+
+ /* find min trans dist along horiz. of row a */
+ hmin = NJ_find_hmin(dmat, a, &bh, &hmincount);
+ if(a) {
+ /* find min trans dist along vert. of row a */
+ vmin = NJ_find_vmin(dmat, a, &bv, &vmincount);
+ } else {
+ vmin = hmin;
+ bv = bh;
+ vmincount = 0;
+ }
+
+ if(NJ_FLT_EQ(hmin, vmin)) {
+
+ /*
+ * The minima along the vertical and horizontal are
+ * the same. Compute the proportion of minima along
+ * the horizonal (p) and the proportion of minima
+ * along the vertical (q).
+ *
+ * If the same minima exist along the horizonal and
+ * vertical, we break the tie in a way which is
+ * non-biased. That is, we break the tie based on the
+ * proportion of horiz. minima versus vertical minima.
+ *
+ */
+ p = (float)hmincount / ((float)hmincount + (float)vmincount);
+ q = 1.0 - p;
+ x = genrand_real2();
+
+ if(x < p) {
+ hvmin = hmin;
+ b = bh;
+ } else {
+ hvmin = vmin;
+ b = bv;
+ }
+ } else if(NJ_FLT_LT(hmin, vmin) ) {
+ hvmin = hmin;
+ b = bh;
+ } else {
+ hvmin = vmin;
+ b = bv;
+ }
+
+ if(NJ_check(nj_args, dmat, a, b, hvmin, additivity_mode)) {
+
+ /* swap a and b, if necessary, to make sure a < b */
+ if(b < a) {
+ t = a;
+ a = b;
+ b = t;
+ }
+
+ join_flag = 1;
+
+ /* join taxa from rows a and b */
+ NJ_decompose(dmat, vertex, a, b, 0);
+
+ /* collapse matrix */
+ NJ_compute_r(dmat, a, b);
+ NJ_collapse(dmat, vertex, a, b);
+
+ NJ_permute(permutation, dmat->size-1);
+ }
+ }
+
+ /* turn off additivity if go through an entire cycle without joining */
+ if(!join_flag) {
+ additivity_mode = 0;
+ }
+
+ break;
+
+
+
+ /* DETERMINISTIC JOINS */
+ case 1:
+
+ join_flag = 0;
+
+ for(a=0;a<dmat->size-1 && (vertex->nactive > 2) ;) {
+
+ /* find the min along the horizontal of row a */
+ hmin = NJ_find_hmin(dmat, a, &b, &hmincount);
+
+ if(NJ_check(nj_args, dmat, a, b, hmin, additivity_mode)) {
+
+ join_flag = 1;
+
+ /* join taxa from rows a and b */
+ NJ_decompose(dmat, vertex, a, b, 0);
+
+ /* collapse matrix */
+ NJ_compute_r(dmat, a, b);
+ NJ_collapse(dmat, vertex, a, b);
+
+ if(a) {
+ a--;
+ }
+
+ } else {
+ a++;
+ }
+ }
+
+ /* turn off additivity if go through an entire cycle without joining */
+ if(!join_flag) {
+ additivity_mode = 0;
+ }
+
+ break;
+ }
+
+ } /* WHILE */
+
+ /* Join the last two nodes on the vertex list */
+ tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST);
+
+ if(nj_args->verbose_flag) {
+ if(additivity_mode) {
+ printf("Tree is additive\n");
+ } else {
+ printf("Tree is not additive\n");
+ }
+ }
+
+ if(vertex) {
+ NJ_free_vertex(vertex);
+ }
+
+ if(!nj_args->norandom && permutation) {
+ free(permutation);
+ }
+
+ return(tree);
+}
+
+
+
+
+
+
+/*
+ * NJ_print_distance_matrix() -
+ *
+ * Print a distance matrix
+ *
+ */
+void
+NJ_print_distance_matrix(DMAT *dmat) {
+
+ long int i, j;
+
+ printf("ntaxa: %ld\n", dmat->ntaxa);
+ printf(" size: %ld\n", dmat->size);
+
+ for(i=0;i<dmat->size;i++) {
+
+ for(j=0;j<dmat->size;j++) {
+ if(j>i) {
+ printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)]);
+ } else {
+ printf(" -");
+ }
+ }
+
+
+ if(dmat->r && dmat->r2) {
+ printf("\t\t%0.4f", dmat->r[i]);
+ printf("\t%0.4f", dmat->r2[i]);
+
+
+
+ printf("\n");
+
+ for(j=0;j<dmat->size;j++) {
+ if(j>i) {
+ printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)] - (dmat->r2[i] + dmat->r2[j]));
+ } else {
+ printf(" ");
+ }
+ }
+
+ printf("\n\n");
+ }
+ }
+
+ printf("\n");
+
+ return;
+}
+
+
+
+
+
+
+
+/*
+ * NJ_output_tree() -
+ *
+ * A wrapper for the function that really prints the tree,
+ * basically to get a newline in there conveniently. :-)
+ *
+ * Print n trees, as specified in command-args
+ * using "count" variable from 0 to (n-1)
+ *
+ */
+void
+NJ_output_tree(NJ_ARGS *nj_args,
+ NJ_TREE *tree,
+ DMAT *dmat,
+ long int count) {
+
+ FILE *fp;
+
+ if(nj_args->stdout_flag) {
+ fp = stdout;
+ } else {
+
+ if(count == 0) {
+ fp = fopen(nj_args->outfilename, "w"); /* open for writing */
+ } else {
+ fp = fopen(nj_args->outfilename, "a"); /* open for appending */
+ }
+
+ if(!fp) {
+ fprintf(stderr, "Clearcut: Failed to open outfile %s\n", nj_args->outfilename);
+ exit(-1);
+ }
+ }
+
+ NJ_output_tree2(fp, nj_args, tree, tree, dmat);
+ fprintf(fp, ";\n");
+
+ if(!nj_args->stdout_flag) {
+ fclose(fp);
+ }
+
+ return;
+}
+
+
+
+
+
+/*
+ * NJ_output_tree2() -
+ *
+ *
+ */
+void
+NJ_output_tree2(FILE *fp,
+ NJ_ARGS *nj_args,
+ NJ_TREE *tree,
+ NJ_TREE *root,
+ DMAT *dmat) {
+
+ if(!tree) {
+ return;
+ }
+
+ if(tree->taxa_index != NJ_INTERNAL_NODE) {
+
+ if(nj_args->expblen) {
+ fprintf(fp, "%s:%e",
+ dmat->taxaname[tree->taxa_index],
+ tree->dist);
+ } else {
+ fprintf(fp, "%s:%f",
+ dmat->taxaname[tree->taxa_index],
+ tree->dist);
+ }
+
+ } else {
+
+
+ if(tree->left && tree->right) {
+ fprintf(fp, "(");
+ }
+ if(tree->left) {
+ NJ_output_tree2(fp, nj_args, tree->left, root, dmat);
+ }
+
+ if(tree->left && tree->right) {
+ fprintf(fp, ",");
+ }
+ if(tree->right) {
+ NJ_output_tree2(fp, nj_args, tree->right, root, dmat);
+ }
+
+ if(tree != root->left) {
+ if(tree->left && tree->right) {
+ if(tree != root) {
+ if(nj_args->expblen) {
+ fprintf(fp, "):%e", tree->dist);
+ } else {
+ fprintf(fp, "):%f", tree->dist);
+ }
+ } else {
+ fprintf(fp, ")");
+ }
+ }
+ } else {
+ fprintf(fp, ")");
+ }
+ }
+
+ return;
+}
+
+
+
+
+
+
+
+/*
+ * NJ_init_r()
+ *
+ * This function computes the r column in our matrix
+ *
+ */
+void
+NJ_init_r(DMAT *dmat) {
+
+ long int i, j, size;
+ long int index;
+ float *r, *r2, *val;
+ long int size1;
+ float size2;
+
+ r = dmat->r;
+ r2 = dmat->r2;
+ val = dmat->val;
+ size = dmat->size;
+ size1 = size-1;
+ size2 = (float)(size-2);
+
+ index = 0;
+ for(i=0;i<size1;i++) {
+ index++;
+ for(j=i+1;j<size;j++) {
+ r[i] += val[index];
+ r[j] += val[index];
+ index++;
+ }
+
+ r2[i] = r[i]/size2;
+ }
+
+ return;
+}
+
+
+
+
+
+
+
+
+
+
+
+/*
+ * NJ_init_vertex() -
+ *
+ * Construct a vertex, which we will use to construct our tree
+ * in a true bottom-up approach. The vertex construct is
+ * basically the center node in the initial star topology.
+ *
+ */
+NJ_VERTEX *
+NJ_init_vertex(DMAT *dmat) {
+
+ long int i;
+ NJ_VERTEX *vertex;
+
+ /* allocate the vertex here */
+ vertex = (NJ_VERTEX *)calloc(1, sizeof(NJ_VERTEX));
+
+ /* allocate the nodes in the vertex */
+ vertex->nodes = (NJ_TREE **)calloc(dmat->ntaxa, sizeof(NJ_TREE *));
+ vertex->nodes_handle = vertex->nodes;
+
+ /* initialize our size and active variables */
+ vertex->nactive = dmat->ntaxa;
+ vertex->size = dmat->ntaxa;
+
+ /* initialize the nodes themselves */
+ for(i=0;i<dmat->ntaxa;i++) {
+
+ vertex->nodes[i] = (NJ_TREE *)calloc(1, sizeof(NJ_TREE));
+
+ vertex->nodes[i]->left = NULL;
+ vertex->nodes[i]->right = NULL;
+
+ vertex->nodes[i]->taxa_index = i;
+ }
+
+ return(vertex);
+}
+
+
+
+
+
+/*
+ * NJ_decompose() -
+ *
+ * This function decomposes the star by creating new internal nodes
+ * and joining two existing tree nodes to it
+ *
+ */
+NJ_TREE *
+NJ_decompose(DMAT *dmat,
+ NJ_VERTEX *vertex,
+ long int x,
+ long int y,
+ int last_flag) {
+
+ NJ_TREE *new_node;
+ float x2clade, y2clade;
+
+ /* compute the distance from the clade components to the new node */
+ if(last_flag) {
+ x2clade =
+ (dmat->val[NJ_MAP(x, y, dmat->size)]);
+ } else {
+ x2clade =
+ (dmat->val[NJ_MAP(x, y, dmat->size)])/2 +
+ ((dmat->r2[x] - dmat->r2[y])/2);
+ }
+
+ vertex->nodes[x]->dist = x2clade;
+
+ if(last_flag) {
+ y2clade =
+ (dmat->val[NJ_MAP(x, y, dmat->size)]);
+ } else {
+ y2clade =
+ (dmat->val[NJ_MAP(x, y, dmat->size)])/2 +
+ ((dmat->r2[y] - dmat->r2[x])/2);
+ }
+
+ vertex->nodes[y]->dist = y2clade;
+
+ /* allocate new node to connect two sub-clades */
+ new_node = (NJ_TREE *)calloc(1, sizeof(NJ_TREE));
+
+ new_node->left = vertex->nodes[x];
+ new_node->right = vertex->nodes[y];
+ new_node->taxa_index = NJ_INTERNAL_NODE; /* this is not a terminal node, no taxa index */
+
+ if(last_flag) {
+ return(new_node);
+ }
+
+ vertex->nodes[x] = new_node;
+ vertex->nodes[y] = vertex->nodes[0];
+
+ vertex->nodes = &(vertex->nodes[1]);
+
+ vertex->nactive--;
+
+ return(new_node);
+}
+
+
+
+/*
+ * NJ_print_vertex() -
+ *
+ * For debugging, print the contents of the vertex
+ *
+ */
+void
+NJ_print_vertex(NJ_VERTEX *vertex) {
+
+ long int i;
+
+ printf("Number of active nodes: %ld\n", vertex->nactive);
+
+ for(i=0;i<vertex->nactive;i++) {
+ printf("%ld ", vertex->nodes[i]->taxa_index);
+ }
+ printf("\n");
+
+ return;
+}
+
+
+
+
+
+
+
+
+
+/*
+ * NJ_print_r() -
+ *
+ */
+void
+NJ_print_r(DMAT *dmat) {
+
+ long int i;
+
+ printf("\n");
+ for(i=0;i<dmat->size;i++) {
+ printf("r[%ld] = %0.2f\n", i, dmat->r[i]);
+ }
+ printf("\n");
+
+ return;
+}
+
+
+
+
+
+/*
+ * NJ_print_taxanames() -
+ *
+ * Print taxa names here
+ *
+ */
+void
+NJ_print_taxanames(DMAT *dmat) {
+
+ long int i;
+
+ printf("Number of taxa: %ld\n", dmat->ntaxa);
+
+ for(i=0;i<dmat->ntaxa;i++) {
+ printf("%ld) %s\n", i, dmat->taxaname[i]);
+ }
+
+ printf("\n");
+
+ return;
+}
+
+
+
+
+/*
+ * NJ_shuffle_distance_matrix() -
+ *
+ * Randomize a distance matrix here
+ *
+ */
+void
+NJ_shuffle_distance_matrix(DMAT *dmat) {
+
+
+ long int *perm = NULL;
+ char **tmp_taxaname = NULL;
+ float *tmp_val = NULL;
+ long int i, j;
+
+
+ /* alloc the random permutation and a new matrix to hold the shuffled vals */
+ perm = (long int *)calloc(dmat->size, sizeof(long int));
+ tmp_taxaname = (char **)calloc(dmat->size, sizeof(char *));
+ tmp_val = (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float));
+ if(!tmp_taxaname || !perm || !tmp_val) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_shuffle_distance_matrix()\n");
+ exit(-1);
+ }
+
+ /* compute a permutation which will describe how to shuffle the matrix */
+ NJ_permute(perm, dmat->size);
+
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+
+ if(perm[j] < perm[i]) {
+ tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[j], perm[i], dmat->size)];
+ } else {
+ tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[i], perm[j], dmat->size)];
+ }
+
+ }
+
+ tmp_taxaname[i] = dmat->taxaname[perm[i]];
+ }
+
+ /* free our random permutation */
+ if(perm) {
+ free(perm);
+ }
+
+ /* free the old value matrix */
+ if(dmat->val) {
+ free(dmat->val);
+ }
+
+ /* re-assign the value matrix pointers */
+ dmat->val = tmp_val;
+ dmat->valhandle = dmat->val;
+
+ /*
+ * Free our old taxaname with its particular ordering
+ * and re-assign to the new.
+ */
+ if(dmat->taxaname) {
+ free(dmat->taxaname);
+ }
+ dmat->taxaname = tmp_taxaname;
+
+ return;
+}
+
+
+
+/*
+ * NJ_free_tree() -
+ *
+ * Free a given NJ tree
+ */
+void
+NJ_free_tree(NJ_TREE *node) {
+
+ if(!node) {
+ return;
+ }
+
+ if(node->left) {
+ NJ_free_tree(node->left);
+ }
+
+ if(node->right) {
+ NJ_free_tree(node->right);
+ }
+
+ free(node);
+
+ return;
+}
+
+
+
+
+
+
+
+
+
+/*
+ * NJ_print_permutation()
+ *
+ * Print a permutation
+ *
+ */
+void
+NJ_print_permutation(long int *perm,
+ long int size) {
+
+ long int i;
+
+ for(i=0;i<size-1;i++) {
+ printf("%ld,", perm[i]);
+ }
+ printf("%ld\n", perm[size-1]);
+
+ return;
+}
+
+
+
+
+/*
+ * NJ_dup_dmat() -
+ *
+ * Duplicate a distance matrix
+ *
+ */
+DMAT *
+NJ_dup_dmat(DMAT *src) {
+
+ long int i;
+ DMAT *dest;
+
+ /* allocate the resulting distance matrix */
+ dest = (DMAT *)calloc(1, sizeof(DMAT));
+ if(!dest) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
+ goto XIT_BAD;
+ }
+
+ dest->ntaxa = src->ntaxa;
+ dest->size = src->size;
+
+ /* allocate space for array of pointers to taxanames */
+ dest->taxaname = (char **)calloc(dest->ntaxa, sizeof(char *));
+ if(!dest->taxaname) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
+ goto XIT_BAD;
+ }
+
+ /* allocate space for the taxanames themselves */
+ for(i=0;i<src->ntaxa;i++) {
+ dest->taxaname[i] = (char *)calloc(strlen(src->taxaname[i])+1, sizeof(char));
+ if(!dest->taxaname[i]) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
+ goto XIT_BAD;
+ }
+ }
+
+ /* allocate space for the distance values */
+ dest->val = (float *)calloc(NJ_NCELLS(src->ntaxa), sizeof(float));
+ if(!dest->val) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n");
+ goto XIT_BAD;
+ }
+
+ /* allocate space for the r and r2 vectors */
+ dest->r = (float *)calloc(src->ntaxa, sizeof(float));
+ dest->r2 = (float *)calloc(src->ntaxa, sizeof(float));
+
+ /* copy titles */
+ for(i=0;i<src->ntaxa;i++) {
+ strcpy(dest->taxaname[i], src->taxaname[i]);
+ }
+
+ /* copy values */
+ memcpy(dest->val, src->valhandle, NJ_NCELLS(src->ntaxa)*sizeof(float));
+
+ /* copy r and r2 */
+ memcpy(dest->r, src->rhandle, src->ntaxa*sizeof(float));
+ memcpy(dest->r2, src->r2handle, src->ntaxa*sizeof(float));
+
+ /* track some memory addresses */
+ dest->valhandle = dest->val;
+ dest->rhandle = dest->r;
+ dest->r2handle = dest->r2;
+
+ return(dest);
+
+ XIT_BAD:
+
+ /* free what we may have allocated */
+ NJ_free_dmat(dest);
+
+ return(NULL);
+}
+
+
+
+
+/*
+ * NJ_free_dmat() -
+ */
+void
+NJ_free_dmat(DMAT *dmat) {
+
+ long int i;
+
+ if(dmat) {
+
+ if(dmat->taxaname) {
+
+ for(i=0;i<dmat->ntaxa;i++) {
+ if(dmat->taxaname[i]) {
+ free(dmat->taxaname[i]);
+ }
+ }
+
+ free(dmat->taxaname);
+ }
+
+ if(dmat->valhandle) {
+ free(dmat->valhandle);
+ }
+
+ if(dmat->rhandle) {
+ free(dmat->rhandle);
+ }
+
+ if(dmat->r2handle) {
+ free(dmat->r2handle);
+ }
+
+ free(dmat);
+ }
+
+ return;
+}
+
+
+
+
+
+/*
+ * NJ_free_vertex() -
+ *
+ * Free the vertex data structure
+ *
+ */
+void
+NJ_free_vertex(NJ_VERTEX *vertex) {
+
+ if(vertex) {
+ if(vertex->nodes_handle) {
+ free(vertex->nodes_handle);
+ }
+ free(vertex);
+ }
+
+ return;
+}
+
+
+
+
+
+
+
+
+
+/*
+ *
+ * NJ_min_transform() - Find the smallest transformed value to identify
+ * which nodes to join.
+ *
+ * INPUTS:
+ * -------
+ * dmat -- The distance matrix
+ *
+ * RETURNS:
+ * --------
+ * <float> -- The minimimum transformed distance
+ * ret_i -- The row of the smallest transformed distance (by reference)
+ * ret_j -- The col of the smallest transformed distance (by reference)
+ *
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * Used only with traditional Neighbor-Joining, this function checks the entire
+ * working distance matrix and identifies the smallest transformed distance.
+ * This requires traversing the entire diagonal matrix, which is itself a
+ * O(N^2) operation.
+ *
+ */
+float
+NJ_min_transform(DMAT *dmat,
+ long int *ret_i,
+ long int *ret_j) {
+
+ long int i, j; /* indices used for looping */
+ long int tmp_i = 0;/* to limit pointer dereferencing */
+ long int tmp_j = 0;/* to limit pointer dereferencing */
+ float smallest; /* track the smallest trans. dist */
+ float curval; /* the current trans. dist in loop */
+
+ float *ptr; /* pointer into distance matrix */
+ float *r2; /* pointer to r2 matrix for computing transformed dists */
+
+ smallest = (float)HUGE_VAL;
+
+ /* track these here to limit pointer dereferencing in inner loop */
+ ptr = dmat->val;
+ r2 = dmat->r2;
+
+ /* for every row */
+ for(i=0;i<dmat->size;i++) {
+ ptr++; /* skip diagonal */
+ for(j=i+1;j<dmat->size;j++) { /* for every column */
+
+ /* find transformed distance in matrix at i, j */
+ curval = *(ptr++) - (r2[i] + r2[j]);
+
+ /* if the transformed distanance is less than the known minimum */
+ if(curval < smallest) {
+
+ smallest = curval;
+ tmp_i = i;
+ tmp_j = j;
+ }
+ }
+ }
+
+ /* pass back (by reference) the coords of the min. transformed distance */
+ *ret_i = tmp_i;
+ *ret_j = tmp_j;
+
+ return(smallest); /* return the min transformed distance */
+}
+
+
+
+
+
--- /dev/null
+/*
+ * clearcut.h
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+
+#ifndef _INC_CLEARCUT_H_
+#define _INC_CLEARCUT_H_ 1
+
+#include "common.h"
+#include "cmdargs.h"
+
+
+#define NJ_VERSION "1.0.9"
+
+
+#define NJ_INTERNAL_NODE -1
+#define NJ_LAST 101
+
+#define NJ_INPUT_MODE_UNKNOWN 0
+#define NJ_INPUT_MODE_DISTANCE 100
+#define NJ_INPUT_MODE_UNALIGNED_SEQUENCES 101
+#define NJ_INPUT_MODE_ALIGNED_SEQUENCES 102
+
+#define NJ_MODEL_NONE 100
+#define NJ_MODEL_JUKES 101
+#define NJ_MODEL_KIMURA 102
+
+
+
+
+/*
+ * DMAT - Distance Matrix
+ *
+ * This is arguably the most important structure in the
+ * program. This is the distance matrix, and it is used
+ * by many functions throughout the application.
+ *
+ * The matrix is architected as a contiguously allocated
+ * upper-diagonal matrix of floats which include the
+ * diagonal.
+ *
+ * Example:
+ *
+ * 0 1 2 3 4 5
+ * 0 0.0 1.0 0.3 0.2 0.1 0.3
+ * 1 0.0 0.3 0.2 0.1 0.8
+ * 2 0.0 0.1 0.3 0.5
+ * 3 0.0 0.2 0.1
+ * 4 0.0 0.2
+ * 5 0.0
+ *
+ * The distance matrix shrinks with every join operation,
+ * so I track the original and working size of the matrix
+ * inside the matrix.
+ *
+ * One fast optimization to shrink the distance matrix
+ * involves incrementing the "val" pointer. Thus, in
+ * addition to tracking the pointer to the distances,
+ * I also track the original pointer to that I can
+ * free the memory associated with the working distance
+ * matrix.
+ *
+ * This also applies to the r and r2 vectors which are
+ * used to compute the transformed distances in the
+ * matrix.
+ *
+ */
+
+typedef struct _STRUCT_DMAT {
+
+ long int ntaxa; /* the original size of the distance matrix */
+ long int size; /* the current/effective size of the distance matrix */
+
+ char **taxaname; /* a pointer to an array of taxa name strings */
+
+ float *val; /* the distances */
+ float *valhandle; /* to track the orig. pointer to free memory */
+
+ float *r, *r2; /* r and r2 vectors (used to compute transformed dists) */
+ float *rhandle, *r2handle; /* track orig. pointers to free memory */
+
+} DMAT;
+
+
+
+/*
+ * NJ_TREE - The Tree Data Structure
+ *
+ *
+ * The tree is represented internally as a rooted
+ * binary tree. Each internal node has a left and a right child.
+ *
+ * Additionally, I track the distance between the current node
+ * and that node's parent (i.e. the branch length).
+ *
+ * Finally, I track the index of the taxa for leaf nodes.
+ *
+ */
+typedef struct _STRUCT_NJ_TREE {
+
+ struct _STRUCT_NJ_TREE *left; /* left child */
+ struct _STRUCT_NJ_TREE *right; /* right child */
+
+ float dist; /* branch length. i.e. dist from node to parent */
+
+ long int taxa_index; /* for terminal nodes, track the taxon index */
+
+} NJ_TREE;
+
+
+
+/*
+ * NJ_VERTEX
+ *
+ * This structure is used for building trees. It is a vector
+ * which, represents the center of the star when building the RNJ/NJ
+ * tree through star-decomposition.
+ *
+ * It contains a vector of tree (node) pointers. These pointers
+ * get joined together by a new internal node, and the new internal
+ * node is placed back into the vector of nodes (which is now smaller).
+ *
+ * To keep this vector in sync. with the shrinking matrix, parts of
+ * the vector are shuffled around, and so a pointer to the originally
+ * allocated vector is stored such that it can be freed from memory
+ * later.
+ *
+ * The original and working sizes of the vector are also tracked.
+ *
+ */
+typedef struct _STRUCT_NJ_VERTEX {
+
+ NJ_TREE **nodes;
+ NJ_TREE **nodes_handle; /* original memory handle for freeing */
+
+ long int nactive; /* number of active nodes in the list */
+ long int size; /* the total size of the vertex */
+
+} NJ_VERTEX;
+
+
+
+
+
+
+/* some function prototypes */
+
+/* core function for performing Relaxed Neighbor Joining */
+NJ_TREE *
+NJ_relaxed_nj(NJ_ARGS *nj_args, DMAT *dmat);
+
+/* function for performing traditional Neighbor-Joining */
+NJ_TREE *
+NJ_neighbor_joining(NJ_ARGS *nj_args, DMAT *dmat);
+
+/* print the distance matrix (for debugging) */
+void
+NJ_print_distance_matrix(DMAT *dmat);
+
+/* output the computed tree to stdout or to the specified file */
+void
+NJ_output_tree(NJ_ARGS *nj_args,
+ NJ_TREE *tree,
+ DMAT *dmat,
+ long int count);
+
+/* the recursive function for outputting trees */
+void
+NJ_output_tree2(FILE *fp,
+ NJ_ARGS *nj_args,
+ NJ_TREE *tree,
+ NJ_TREE *root,
+ DMAT *dmat);
+
+/* initialize vertex */
+NJ_VERTEX *
+NJ_init_vertex(DMAT *dmat);
+
+/* used to decompose the star topology and build the tree */
+NJ_TREE *
+NJ_decompose(DMAT *dmat,
+ NJ_VERTEX *vertex,
+ long int x,
+ long int y,
+ int last_flag);
+
+/* print the vertex vector (for debugging) */
+void
+NJ_print_vertex(NJ_VERTEX *vertex);
+
+/* print taxa names (for debugging) */
+void
+NJ_print_taxanames(DMAT *dmat);
+
+/* initialize r-vector prior to RNJ/NJ */
+void
+NJ_init_r(DMAT *dmat);
+
+/* print the r-vector (for debugging) */
+void
+NJ_print_r(DMAT *dmat);
+
+/* shuffle the distance matrix, usually after reading in input */
+void
+NJ_shuffle_distance_matrix(DMAT *dmat);
+
+/* free memory from the tree */
+void
+NJ_free_tree(NJ_TREE *node);
+
+/* print permutations (for debugging) */
+void
+NJ_print_permutation(long int *perm,
+ long int size);
+
+/* duplicate a distance matrix for multiple iterations */
+DMAT *
+NJ_dup_dmat(DMAT *src);
+
+/* free the distance matrix */
+void
+NJ_free_dmat(DMAT *dmat);
+
+/* free the vertex vector */
+void
+NJ_free_vertex(NJ_VERTEX *vertex);
+
+/* for computing the global minimum transformed distance in traditional NJ */
+float
+NJ_min_transform(DMAT *dmat,
+ long int *ret_i,
+ long int *ret_j);
+
+#endif /* _INC_CLEARCUT_H_ */
+
+
+
+
+
+
--- /dev/null
+/*
+ * cmdargs.c
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+
+#ifdef USE_GNU
+#include <getopt.h>
+#else
+#include "getopt_long.h"
+#endif /* USE_GNU*/
+
+
+#include "clearcut.h"
+#include "cmdargs.h"
+
+
+/*
+ * NJ_handle_args() -
+ *
+ */
+NJ_ARGS *
+NJ_handle_args(int argc,
+ char *argv[]) {
+
+ static NJ_ARGS nj_args;
+ int option_index, c;
+
+ struct option NJ_long_options[] = {
+
+ /* These options don't set a flag */
+ {"in", required_argument, NULL, 'i'},
+ {"out", required_argument, NULL, 'o'},
+ {"seed", required_argument, NULL, 's'},
+ {"matrixout", required_argument, NULL, 'm'},
+ {"ntrees", required_argument, NULL, 'n'},
+
+ /* These options set a flag */
+ {"verbose", no_argument, &(nj_args.verbose_flag), 1},
+ {"quiet", no_argument, &(nj_args.quiet_flag), 1},
+ {"distance", no_argument, &(nj_args.input_mode), NJ_INPUT_MODE_DISTANCE},
+ {"alignment", no_argument, &(nj_args.input_mode), NJ_INPUT_MODE_ALIGNED_SEQUENCES},
+ {"help", no_argument, &(nj_args.help), 1},
+ {"version", no_argument, &(nj_args.version), 1},
+ {"norandom", no_argument, &(nj_args.norandom), 1},
+ {"shuffle", no_argument, &(nj_args.shuffle), 1},
+ {"stdin", no_argument, &(nj_args.stdin_flag), 1},
+ {"stdout", no_argument, &(nj_args.stdout_flag), 1},
+ {"dna", no_argument, &(nj_args.dna_flag), 1},
+ {"DNA", no_argument, &(nj_args.dna_flag), 1},
+ {"protein", no_argument, &(nj_args.protein_flag), 1},
+ {"neighbor", no_argument, &(nj_args.neighbor), 1},
+ {"expblen", no_argument, &(nj_args.expblen), 1},
+ {"expdist", no_argument, &(nj_args.expdist), 1},
+
+ {"jukes", no_argument, &(nj_args.jukes_flag), 1},
+ {"kimura", no_argument, &(nj_args.kimura_flag), 1},
+
+ {0, 0, 0, 0}
+
+ };
+
+ /* initializes options to their default */
+ nj_args.infilename = NULL;
+ nj_args.outfilename = NULL;
+ nj_args.matrixout = NULL;
+ nj_args.seed = time(0);
+ nj_args.verbose_flag = 0;
+ nj_args.quiet_flag = 0;
+ nj_args.input_mode = NJ_INPUT_MODE_DISTANCE;
+ nj_args.help = 0;
+ nj_args.version = 0;
+ nj_args.norandom = 0;
+ nj_args.shuffle = 0;
+ nj_args.stdin_flag = 0;
+ nj_args.stdout_flag = 0;
+ nj_args.dna_flag = 0;
+ nj_args.protein_flag = 0;
+ nj_args.correction_model = NJ_MODEL_NONE;
+ nj_args.jukes_flag = 0;
+ nj_args.kimura_flag = 0;
+ nj_args.neighbor = 0;
+ nj_args.ntrees = 1;
+ nj_args.expblen = 0;
+ nj_args.expdist = 0;
+
+ while(1) {
+ c = getopt_long(argc,
+ argv,
+ "i:o:s:m:n:vqduahVSIOrDPjkNeE",
+ NJ_long_options,
+ &option_index);
+
+ if(c == -1) {
+ break;
+ }
+
+ switch(c) {
+
+ case 0:
+ if(NJ_long_options[option_index].flag) {
+ break;
+ }
+
+ printf("option %s", NJ_long_options[option_index].name);
+ if(optarg) {
+ printf(" with arg %s", optarg);
+ }
+ printf("\n");
+ break;
+
+ case 'i':
+ nj_args.infilename = optarg;
+ break;
+
+ case 'o':
+ nj_args.outfilename = optarg;
+ break;
+
+ case 's':
+ nj_args.seed = atoi(optarg);
+ break;
+
+ case 'm':
+ nj_args.matrixout = optarg;
+ break;
+
+ case 'n':
+ nj_args.ntrees = atoi(optarg);
+ break;
+
+ case 'v':
+ nj_args.verbose_flag = 1;
+ break;
+
+ case 'q':
+ nj_args.quiet_flag = 1;
+ break;
+
+ case 'd':
+ nj_args.input_mode = NJ_INPUT_MODE_DISTANCE;
+ break;
+
+ case 'a':
+ nj_args.input_mode = NJ_INPUT_MODE_ALIGNED_SEQUENCES;
+ break;
+
+ case 'h':
+ nj_args.help = 1;
+ break;
+
+ case 'V':
+ nj_args.version = 1;
+ break;
+
+ case 'S':
+ nj_args.shuffle = 1;
+ break;
+
+ case 'I':
+ nj_args.stdin_flag = 1;
+ break;
+
+ case 'O':
+ nj_args.stdin_flag = 1;
+ break;
+
+ case 'r':
+ nj_args.norandom = 1;
+ break;
+
+ case 'D':
+ nj_args.dna_flag = 1;
+ break;
+
+ case 'P':
+ nj_args.protein_flag = 1;
+ break;
+
+ case 'j':
+ nj_args.jukes_flag = 1;
+ break;
+
+ case 'k':
+ nj_args.kimura_flag = 1;
+ break;
+
+ case 'N':
+ nj_args.neighbor = 1;
+ break;
+
+ case 'e':
+ nj_args.expblen = 1;
+ break;
+
+ case 'E':
+ nj_args.expdist = 1;
+ break;
+
+ default:
+ NJ_usage();
+ exit(-1);
+ }
+ }
+
+ if(optind < argc) {
+ fprintf(stderr, "Clearcut: Unknown command-line argument:\n --> %s\n", argv[optind]);
+ NJ_usage();
+ exit(-1);
+ }
+
+ if(nj_args.version) {
+ printf("Clearcut Version: %s\n", NJ_VERSION);
+ exit(0);
+ }
+
+ if(nj_args.help) {
+ NJ_usage();
+ exit(0);
+ }
+
+ /* if stdin & explicit filename are specified for input */
+ if(nj_args.stdin_flag) {
+ if(nj_args.infilename) {
+ fprintf(stderr, "Clearcut: Ambiguous input source specified. Specify input filename OR stdin.\n");
+ NJ_usage();
+ exit(-1);
+ }
+ }
+
+ /* if stdout & explicit filename are specified for output */
+ if(nj_args.stdout_flag) {
+ if(nj_args.outfilename) {
+ fprintf(stderr, "Clearcut: Ambiguous output specified. Specify output filename OR stdout.\n");
+ NJ_usage();
+ exit(-1);
+ }
+ }
+
+ /* if user did not specify stdin or filename, default to stdin */
+ if(!nj_args.stdin_flag) {
+ if(!nj_args.infilename) {
+
+ fprintf(stderr, "Clearcut: No input file specified. Using stdin.\n");
+ nj_args.stdin_flag = 1;
+ }
+ }
+
+ /* if user did not specify stdout or filename, default to stdout */
+ if(!nj_args.stdout_flag) {
+ if(!nj_args.outfilename) {
+
+ fprintf(stderr, "Clearcut: No output file specified. Using stdout.\n");
+ nj_args.stdout_flag = 1;
+ }
+ }
+
+ /* User must specify distance matrix or alignment */
+ if(nj_args.input_mode == NJ_INPUT_MODE_UNKNOWN) {
+ fprintf(stderr, "Clearcut: Must specify input type (--distance | --alignment)\n");
+ NJ_usage();
+ exit(-1);
+ }
+
+ /* do not allow protein or DNA options for distance matrix input */
+ if(nj_args.input_mode == NJ_INPUT_MODE_DISTANCE) {
+ if(nj_args.dna_flag || nj_args.protein_flag) {
+ fprintf(stderr, "Clearcut: Ambiguous arguments. (--protein | --DNA) do not apply to distance \n");
+ NJ_usage();
+ exit(-1);
+ }
+ }
+
+ /* make sure different filenames were specified for input and output */
+ if(!nj_args.stdin_flag && !nj_args.stdout_flag) {
+
+ if(!strcmp(nj_args.infilename, nj_args.outfilename)) {
+ fprintf(stderr, "Clearcut: Input filename and output filename must be unique.\n");
+ NJ_usage();
+ exit(-1);
+ }
+ }
+
+ /* make sure that user specifies DNA or Protein if dealing with alignment input */
+ if(nj_args.input_mode == NJ_INPUT_MODE_ALIGNED_SEQUENCES) {
+ if(!nj_args.dna_flag && !nj_args.protein_flag) {
+ fprintf(stderr, "Clearcut: Must specify protein or DNA for alignment input.\n");
+ NJ_usage();
+ exit(-1);
+ }
+ }
+
+ /* make sure that user does not specify both protein and DNA when dealing with alignment input */
+ if(nj_args.input_mode == NJ_INPUT_MODE_ALIGNED_SEQUENCES) {
+ if(nj_args.dna_flag && nj_args.protein_flag) {
+ fprintf(stderr, "Clearcut: Specifying protein and DNA sequences are mutually exclusive options\n");
+ NJ_usage();
+ exit(-1);
+ }
+ }
+
+ /* make sure verbose and quiet were not specified together */
+ if(nj_args.verbose_flag && nj_args.quiet_flag) {
+ fprintf(stderr, "Clearcut: Verbose and Quiet mode are mutually exclusive.\n");
+ NJ_usage();
+ exit(-1);
+ }
+
+ /* make sure that a correction model was specified only when providing an alignment */
+ if(nj_args.input_mode == NJ_INPUT_MODE_DISTANCE) {
+ if(nj_args.jukes_flag || nj_args.kimura_flag) {
+ fprintf(stderr, "Clearcut: Only specify correction model for alignment input.\n");
+ NJ_usage();
+ exit(-1);
+ }
+ } else {
+ if(nj_args.jukes_flag && nj_args.kimura_flag) {
+ fprintf(stderr, "Clearcut: Only specify one correction model\n");
+ NJ_usage();
+ exit(-1);
+ } else {
+ if(nj_args.jukes_flag && !nj_args.kimura_flag) {
+ nj_args.correction_model = NJ_MODEL_JUKES;
+ } else if(nj_args.kimura_flag && !nj_args.jukes_flag) {
+ nj_args.correction_model = NJ_MODEL_KIMURA;
+ } else {
+ nj_args.correction_model = NJ_MODEL_NONE; /* DEFAULT */
+ }
+ }
+ }
+
+ /* make sure that the number of output trees is reasonable */
+ if(nj_args.ntrees <= 0) {
+ fprintf(stderr, "Clearcut: Number of output trees must be a positive integer.\n");
+ NJ_usage();
+ exit(-1);
+ }
+
+ /*
+ * make sure that if exponential distances are specified,
+ * we are dealing with alignment input
+ */
+ if(nj_args.expdist && nj_args.input_mode != NJ_INPUT_MODE_ALIGNED_SEQUENCES) {
+ fprintf(stderr, "Clearcut: Exponential notation for distance matrix output requires that input be an alignment\n");
+ NJ_usage();
+ exit(-1);
+ }
+
+ return(&nj_args);
+}
+
+
+
+
+
+/*
+ * NJ_print_args() -
+ *
+ */
+void
+NJ_print_args(NJ_ARGS *nj_args) {
+
+ char input_mode[32];
+
+ switch (nj_args->input_mode) {
+ case NJ_INPUT_MODE_DISTANCE:
+ sprintf(input_mode, "Distance Matrix");
+ break;
+ case NJ_INPUT_MODE_UNALIGNED_SEQUENCES:
+ sprintf(input_mode, "Unaligned Sequences");
+ break;
+ case NJ_INPUT_MODE_ALIGNED_SEQUENCES:
+ sprintf(input_mode, "Aligned Sequences");
+ break;
+ default:
+ sprintf(input_mode, "UNKNOWN");
+ break;
+ }
+
+ printf("\n*** Command Line Arguments ***\n");
+
+ printf("Input Mode: %s\n", input_mode);
+
+ if(nj_args->stdin_flag) {
+ printf("Input from STDIN\n");
+ } else {
+ printf("Input File: %s\n", nj_args->infilename);
+ }
+
+ if(nj_args->stdout_flag) {
+ printf("Output from STDOUT\n");
+ } else {
+ printf("Output File: %s\n", nj_args->outfilename);
+ }
+
+ if(nj_args->input_mode != NJ_INPUT_MODE_DISTANCE) {
+ if(nj_args->aligned_flag) {
+ printf("Input Sequences Aligned: YES\n");
+ } else {
+ printf("Input Sequences Aligned: NO\n");
+ }
+ }
+
+ if(nj_args->verbose_flag) {
+ printf("Verbose Mode: ON\n");
+ } else {
+ printf("Verbose Mode: OFF\n");
+ }
+
+ if(nj_args->quiet_flag) {
+ printf("Quiet Mode: ON\n");
+ } else {
+ printf("Quiet Mode: OFF\n");
+ }
+
+ if(nj_args->seed) {
+ printf("Random Seed: %d\n", nj_args->seed);
+ }
+
+ printf("\n*******\n");
+
+ return;
+}
+
+
+
+
+/*
+ * NJ_usage() -
+ *
+ * Print a usage message
+ *
+ */
+void
+NJ_usage(void) {
+
+ printf("Usage: clearcut --in=<infilename> --out=<outfilename> [options]...\n");
+ printf("GENERAL OPTIONS:\n");
+ printf(" -h, --help Display this information.\n");
+ printf(" -V, --version Print the version of this program.\n");
+ printf(" -v, --verbose More output. (Default: OFF)\n");
+ printf(" -q, --quiet Silent operation. (Default: ON)\n");
+ printf(" -s, --seed=<seed> Explicitly set the PRNG seed to a specific value.\n");
+ printf(" -r, --norandom Attempt joins deterministically. (Default: OFF)\n");
+ printf(" -S, --shuffle Randomly shuffle the distance matrix. (Default: OFF)\n");
+ printf(" -N, --neighbor Use traditional Neighbor-Joining algorithm. (Default: OFF)\n");
+
+ printf("\n");
+ printf("INPUT OPTIONS:\n");
+ printf(" -I, --stdin Read input from STDIN.\n");
+ printf(" -d, --distance Input file is a distance matrix. (Default: ON)\n");
+ printf(" -a, --alignment Input file is a set of aligned sequences. (Default: OFF)\n");
+ printf(" -D, --DNA Input alignment are DNA sequences.\n");
+ printf(" -P, --protein Input alignment are protein sequences.\n");
+
+ printf("\n");
+ printf("CORRECTION MODEL FOR COMPUTING DISTANCE MATRIX (Default: NO Correction):\n");
+ printf(" -j, --jukes Use Jukes-Cantor correction for computing distance matrix.\n");
+ printf(" -k, --kimura Use Kimura correction for distance matrix.\n");
+
+ printf("\n");
+ printf("OUTPUT OPTIONS:\n");
+ printf(" -O, --stdout Output tree to STDOUT.\n");
+ printf(" -m, --matrixout=<file> Output distance matrix to specified file.\n");
+ printf(" -n, --ntrees=<n> Output n trees. (Default: 1)\n");
+ printf(" -e, --expblen Exponential notation for branch lengths. (Default: OFF)\n");
+ printf(" -E, --expdist Exponential notation in distance output. (Default: OFF)\n");
+
+ printf("\n");
+ printf("EXAMPLES:\n");
+ printf(" Compute tree by supplying distance matrix via stdin:\n");
+ printf(" clearcut --distance < distances.txt > treefile.tre\n");
+ printf("\n");
+ printf(" Compute tree by supplying an alignment of DNA sequences from a file:\n");
+ printf(" clearcut --alignment --DNA --in=alignment.txt --out=treefile.tre\n");
+
+ return;
+}
+
+
+
--- /dev/null
+/*
+ * njdist.h
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+
+#ifndef _INC_NJ_CMDARGS_H_
+#define _INC_NJ_CMDARGS_H_ 1
+
+#include "clearcut.h"
+
+
+/* some datatypes */
+typedef struct _STRUCT_NJ_ARGS {
+
+ char *infilename; /* the name of the input file */
+ char *outfilename; /* the name of the output tree */
+ char *matrixout; /* the name of the distance matrix output file */
+
+ int input_mode;
+ int aligned_flag;
+
+ int verbose_flag;
+ int quiet_flag;
+
+ int stdin_flag;
+ int stdout_flag;
+
+ int help;
+ int version;
+
+ int norandom;
+ int shuffle;
+
+ int dna_flag;
+ int protein_flag;
+
+ int seed;
+
+ /* correction models for distance */
+ int correction_model;
+ int jukes_flag;
+ int kimura_flag;
+
+ /* flag for using traditional neighbor-joining */
+ int neighbor;
+
+ /* number of trees to output */
+ int ntrees;
+
+ /* exponential notation output */
+ int expblen; /* exp notation for tree branch lengths */
+ int expdist; /* exp notation for distances in matrix output */
+
+} NJ_ARGS;
+
+
+
+/* some function prototypes */
+
+NJ_ARGS *
+NJ_handle_args(int argc,
+ char *argv[]);
+
+void
+NJ_print_args(NJ_ARGS *nj_args);
+
+void
+NJ_usage(void);
+
+
+#endif /* _INC_NJ_CMDARGS_H_ */
+
--- /dev/null
+/*
+ * common.h
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * A header file filled with common definitions and simple inline functions
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+
+#ifndef _INC_NJ_COMMON_H_
+#define _INC_NJ_COMMON_H_ 1
+
+#include <math.h>
+#include <float.h>
+
+
+#define NJ_AMBIGUITY_CHAR 63 /* ? character */
+
+
+/*
+ * this macro defines the number of cells in the diagonal matrix
+ * based on the number of taxa involved
+ *
+ */
+#define NJ_NCELLS(a) ( ((a)*(a+1))/2 )
+
+
+
+
+/*
+ * NJ_MAP() -
+ *
+ * Thus function maps i, j coordinates to the correct offset into
+ * the distance matrix
+ *
+ */
+static inline
+long int
+NJ_MAP(long int i,
+ long int j,
+ long int ntaxa) {
+
+ return((i*(2*ntaxa-i-1))/2 + j);
+}
+
+
+static inline
+int
+NJ_FLT_EQ(float x,
+ float y) {
+
+ if(fabs(x - y)<FLT_EPSILON) {
+ return(1);
+ } else {
+ return(0);
+ }
+}
+
+
+
+static inline
+int
+NJ_FLT_LT(float x,
+ float y) {
+
+ if(NJ_FLT_EQ(x, y)) {
+ return(0);
+ } else {
+ if(x < y) {
+ return(1);
+ } else {
+ return(0);
+ }
+ }
+}
+
+
+static inline
+int
+NJ_FLT_GT(float x,
+ float y) {
+
+ if(NJ_FLT_EQ(x, y)) {
+ return(0);
+ } else {
+ if(x > y) {
+ return(1);
+ } else {
+ return(0);
+ }
+ }
+}
+
+
+
+
+#endif /* _INC_NJ_COMMON_H_ */
+
+
+
--- /dev/null
+/*
+ * dayhoff.h
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+
+#ifndef _INC_NJ_DAYHOFF_H_
+#define _INC_NJ_DAYHOFF_H_ 1
+
+/*
+ * As sequence divergence increases, we need to correct for multiple hits
+ * when using Kimura distance correction method for amino acid sequences.
+ *
+ * This matrix of values represents the estimated "Accepted Point Mutations"
+ * or PAMs for a range of amino acid sequence divergence, starting at 75%
+ * up through 93% (in 0.1% increments).
+ *
+ * This model is derived from Dayhoff (1978).
+ *
+ * This Dayhoff matrix and the shortcut methods for dealing with Kimura
+ * correction at high sequence divergence (> 75%) are derived from similar
+ * work in Clustal W:
+ *
+ * Thompson, J.D., Higgins, D.G., Gibson, T.J., "CLUSTAL W:
+ * improving the sensitivity of progressive multiple sequence
+ * alignment through sequence weighting, position-specific gap
+ * penalties and weight matrix choice.",
+ * Nucleic Acids Research, 22:4673-4680, 1994
+ *
+ */
+
+
+int NJ_dayhoff[]={
+ 195, 196, 197, 198, 199, 200, 200, 201, 202, 203,
+ 204, 205, 206, 207, 208, 209, 209, 210, 211, 212,
+ 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
+ 223, 224, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 236, 237, 238, 239, 240, 241, 243, 244, 245,
+ 246, 248, 249, 250, 252, 253, 254, 255, 257, 258,
+ 260, 261, 262, 264, 265, 267, 268, 270, 271, 273,
+ 274, 276, 277, 279, 281, 282, 284, 285, 287, 289,
+ 291, 292, 294, 296, 298, 299, 301, 303, 305, 307,
+ 309, 311, 313, 315, 317, 319, 321, 323, 325, 328,
+ 330, 332, 335, 337, 339, 342, 344, 347, 349, 352,
+ 354, 357, 360, 362, 365, 368, 371, 374, 377, 380,
+ 383, 386, 389, 393, 396, 399, 403, 407, 410, 414,
+ 418, 422, 426, 430, 434, 438, 442, 447, 451, 456,
+ 461, 466, 471, 476, 482, 487, 493, 498, 504, 511,
+ 517, 524, 531, 538, 545, 553, 560, 569, 577, 586,
+ 595, 605, 615, 626, 637, 649, 661, 675, 688, 703,
+ 719, 736, 754, 775, 796, 819, 845, 874, 907, 945,
+ 988
+};
+
+
+
+#endif /* _INC_NJ_DAYHOFF_H_ */
+
+
+
--- /dev/null
+/*
+ * dist.c
+ *
+ * $Id$
+ *
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * Compute a distance matrix given a set of sequences
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "common.h"
+#include "dayhoff.h"
+#include "fasta.h"
+#include "dist.h"
+
+
+
+
+/*
+ * NJ_build_distance_matrix() -
+ *
+ * Given a filename for an alignment, read the alignment
+ * into memory and then compute the distance matrix
+ * using the appropriate correction model
+ */
+DMAT *
+NJ_build_distance_matrix(NJ_ARGS *nj_args) {
+
+ DMAT *dmat;
+ NJ_alignment *alignment;
+
+ /* Read an alignment in FASTA format */
+ alignment =
+ NJ_read_fasta(nj_args);
+
+ if(!alignment) {
+ return(NULL);
+ }
+
+ /*
+ * Given a global multiple sequence alignment (MSA) and
+ * a specified distance correction model, compute a
+ * corrected distance matrix
+ *
+ * From proteins, we may want to allow users to specify
+ * a substitution matrix (feature)
+ */
+ dmat =
+ NJ_compute_dmat(nj_args,
+ alignment);
+
+ // NJ_print_taxanames(dmat);
+
+ if(!dmat) {
+ fprintf(stderr, "Clearcut: Error computing distance matrix\n");
+ }
+
+ /* now free the memory associated with the alignment */
+ NJ_free_alignment(alignment);
+
+ return(dmat);
+}
+
+
+
+
+
+/*
+ * NJ_compute_dmat() -
+ *
+ * Given an alignment and a correction model, compute the
+ * distance matrix and return it
+ *
+ */
+DMAT *
+NJ_compute_dmat(NJ_ARGS *nj_args,
+ NJ_alignment *alignment) {
+
+ DMAT *dmat;
+ long int i;
+
+
+ /* allocate distance matrix here */
+ dmat = (DMAT *)calloc(1, sizeof(DMAT));
+ if(!dmat) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
+ return(NULL);
+ }
+
+ dmat->ntaxa = alignment->nseq;
+ dmat->size = alignment->nseq;
+
+ /* allocate memory to hold the taxa names */
+ dmat->taxaname = (char **)calloc(alignment->nseq, sizeof(char *));
+ if(!dmat->taxaname) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
+ return(NULL);
+ }
+
+ /* copy sequence titles */
+ for(i=0;i<alignment->nseq;i++) {
+ dmat->taxaname[i] = (char *)calloc(strlen(alignment->titles[i])+1, sizeof(char));
+ if(!dmat->taxaname[i]) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
+ return(NULL);
+ }
+
+ strncpy(dmat->taxaname[i], alignment->titles[i], strlen(alignment->titles[i]));
+ }
+
+ /* allocate val matrix in dmat */
+ dmat->val = (float *)calloc(dmat->ntaxa*dmat->ntaxa, sizeof(float));
+ if(!dmat->val) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
+ return(NULL);
+ }
+
+ /* now lets allocate space for the r and r2 columns */
+ dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float));
+ dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float));
+
+ /* track some memory addresses */
+ dmat->rhandle = dmat->r;
+ dmat->r2handle = dmat->r2;
+ dmat->valhandle = dmat->val;
+
+ /* apply model correction to matrix */
+ switch(nj_args->correction_model) {
+
+ case NJ_MODEL_JUKES:
+
+ if(nj_args->dna_flag) {
+ NJ_DNA_jc_correction(dmat, alignment);
+ } else if(nj_args->protein_flag) {
+ NJ_PROTEIN_jc_correction(dmat, alignment);
+ } else {
+ fprintf(stderr, "Clearcut: Need to know sequence type for Jukes-Cantor model correction.\n");
+ return(NULL);
+ }
+
+ break;
+
+ case NJ_MODEL_KIMURA:
+
+ if(nj_args->dna_flag) {
+ NJ_DNA_k2p_correction(dmat, alignment);
+ } else if(nj_args->protein_flag) {
+ NJ_PROTEIN_kimura_correction(dmat, alignment);
+ } else {
+ fprintf(stderr, "Clearcut: Need to know sequence type for Kimura model correction.\n");
+ return(NULL);
+ }
+
+ break;
+
+ case NJ_MODEL_NONE:
+
+ NJ_no_correction(dmat, alignment);
+
+ break;
+
+ default:
+ fprintf(stderr, "Clearcut: Invalid distance correction model.\n");
+ return(NULL);
+ }
+
+ return(dmat);
+}
+
+
+
+
+
+/*
+ * NJ_no_correction() -
+ *
+ * Compute the distance matrix without correction
+ * (straight percent ID)
+ *
+ * Resolve ambiguities in sequence data by skipping
+ * those nucleotides/residues
+ *
+ */
+void
+NJ_no_correction(DMAT *dmat,
+ NJ_alignment *alignment) {
+
+ long int i, j;
+ float pdiff;
+
+ /* compute pairwise percent identity */
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+ pdiff = 1.0 - NJ_pw_percentid(alignment, i, j);
+ dmat->val[NJ_MAP(i, j, dmat->size)] = pdiff;
+ }
+ }
+
+ return;
+}
+
+
+
+
+/*
+ * NJ_DNA_jc_correction() -
+ *
+ * Compute the distance matrix with jukes-cantor correction
+ * and assign high distance if sequence divergence exceeds
+ * 0.75
+ *
+ * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.),
+ * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132.
+ * New York: Academic Press
+ *
+ */
+void
+NJ_DNA_jc_correction(DMAT *dmat,
+ NJ_alignment *alignment) {
+
+ long int i, j;
+ long int k;
+ float d, cutoff, dist;
+ long int residues;
+
+ cutoff = 0.75;
+
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+
+ k = NJ_pw_differences(alignment, i, j, &residues);
+ d = 1.0 - NJ_pw_percentid(alignment, i, j);
+
+ if(d > cutoff) {
+ dist = NJ_BIGDIST;
+ } else {
+ dist = (-0.75) * log(1.0 - (4.0/3.0)*d);
+ }
+
+ if(fabs(dist) < FLT_EPSILON) {
+ dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0;
+ } else {
+ dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
+ }
+ }
+ }
+
+
+
+ return;
+}
+
+
+
+
+
+
+/*
+ * NJ_PROTEIN_jc_correction() -
+ *
+ * This function performs modified jukes/cantor correction on
+ * a protein alignment
+ *
+ * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.),
+ * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132.
+ * New York: Academic Press
+ *
+ */
+void
+NJ_PROTEIN_jc_correction(DMAT *dmat,
+ NJ_alignment *alignment) {
+
+ long int i, j;
+ long int residues;
+ long int diff;
+ float dist, x;
+
+
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+
+ diff = NJ_pw_differences(alignment, i, j, &residues);
+
+ if(!diff || !residues) {
+ dist = 0.0;
+ } else {
+
+ dist = (float)diff/(float)residues;
+ x = ((20.0/19.0)*dist);
+
+ if(NJ_FLT_GT(x, 1.0)) {
+ dist = NJ_BIGDIST;
+ } else {
+ dist = -(19.0/20.0) * log(1.0 - x);
+ }
+ }
+
+ dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
+ }
+ }
+
+ return;
+}
+
+
+
+
+
+
+/*
+ * NJ_DNA_k2p_correction() -
+ *
+ * Correct a distance matrix using k2p correction using
+ * cutoffs to avoid problems with logarithms.
+ *
+ * dist = -0.5ln(1-2P-Q) - 0.25ln(1-2Q)
+ *
+ * But due to the logarithms, this is only valid when
+ *
+ * (2P+Q <= 1) &&
+ * (2Q <= 1)
+ *
+ * So assign arbitary distances when these constraints are
+ * not strictly followed.
+ *
+ * Kimura, M. (1980), A simple method for estimating evolutionary
+ * rates of base substitutions through comparative studies of
+ * nucleotide sequences. J. Mol. Evol., 16, 111-120
+ *
+ */
+void
+NJ_DNA_k2p_correction(DMAT *dmat,
+ NJ_alignment *alignment) {
+
+ long int i, j;
+ float P; /* proportion of transitions */
+ float Q; /* proportion of transversions */
+ long int nucleotides;
+ long int transitions, transversions;
+ float dist;
+ float log_x = 0.0; /* the params for the first log */
+ float log_y = 0.0; /* the params for the second log */
+
+ int blowup; /* a flag to specify if we have a log blowup */
+
+
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+
+ blowup = 0;
+
+ /* count the number of transitions and transversions */
+ NJ_DNA_count_tt(alignment, i, j, &transitions, &transversions, &nucleotides);
+
+ if(!nucleotides) { /* sequences have no non-ambiguous overlap in alignment */
+ P = 0.0;
+ Q = 0.0;
+ } else {
+ P = (float)transitions / (float)nucleotides;
+ Q = (float)transversions / (float)nucleotides;
+ }
+
+ /* the first log blows up if 2*P+Q = 1.0 */
+ if(NJ_FLT_EQ((2.0 * P + Q), 1.0)) {
+ blowup = 1;
+ } else {
+ if( NJ_FLT_LT(1.0 - 2.0*P - Q, 0.0) ) {
+ blowup = 1;
+ } else {
+ log_x = log(1.0 - 2.0*P - Q);
+ }
+ }
+
+ /* the second log blows up if 2*Q >= 1.0 */
+ if( NJ_FLT_EQ((2.0 * Q), 1.0) ||
+ NJ_FLT_GT((2.0 * Q), 1.0) ) {
+ blowup = 1;
+ } else {
+ log_y = log(1.0 - 2.0*Q);
+ }
+
+ /* if our logarithms blow up, we just set the distance to the max */
+ if(blowup) {
+ dist = NJ_BIGDIST;
+ } else {
+ dist = (-0.5)*log_x - 0.25*log_y;
+ }
+
+ if(fabs(dist) < FLT_EPSILON) {
+ dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0;
+ } else {
+ dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
+ }
+ }
+ }
+
+ return;
+}
+
+
+
+
+/*
+ * NJ_PROTEIN_kimura_correction() -
+ *
+ * Perform Kimura correction for distances derived from protein
+ * alignments.
+ *
+ * Kimura, M. (1983), The Neutral Theory of Molecular Evolution.
+ * p. 75., Cambridge University Press, Cambridge, England
+ *
+ */
+void
+NJ_PROTEIN_kimura_correction(DMAT *dmat,
+ NJ_alignment *alignment) {
+
+ long int i, j;
+ long int residues;
+ long int diff;
+ float dist;
+
+
+ printf("NJ_PROTEIN_kimura_correction()\n");
+
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+ diff = NJ_pw_differences(alignment, i, j, &residues);
+
+ if(!diff || !residues) {
+ dist = 0.0;
+ } else {
+ dist = (float)diff/(float)residues;
+ }
+
+ if(NJ_FLT_LT(dist, 0.75)) {
+ if(NJ_FLT_GT(dist, 0.0) ) {
+ dist = -log(1.0 - dist - (dist * dist/5.0) );
+ }
+ } else {
+ if(NJ_FLT_GT(dist, 0.93) ) {
+ dist = 10.0;
+ } else {
+ dist = (float)NJ_dayhoff[ (int)((dist*1000.0)-750.0) ] / 100.0 ;
+ }
+ }
+
+ dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
+ }
+ }
+
+ return;
+}
+
+
+
+
+
+/*
+ * NJ_DNA_count_tt() -
+ *
+ * Count the number of transitions and transversions
+ * between two aligned DNA sequences
+ *
+ * This routine automatically skips ambiguities when
+ * counting transitions and transversions.
+ *
+ */
+void
+NJ_DNA_count_tt(NJ_alignment *alignment,
+ long int x,
+ long int y,
+ long int *transitions,
+ long int *transversions,
+ long int *residues) {
+
+ long int tmp_transitions = 0;
+ long int tmp_transversions = 0;
+ long int tmp_residues = 0;
+ char a, b;
+ long int i;
+
+ for(i=0;i<alignment->length;i++) {
+
+ a = toupper(alignment->data[x*alignment->length+i]);
+ b = toupper(alignment->data[y*alignment->length+i]);
+
+ if( (a == 'A' && b == 'T') ||
+ (a == 'T' && b == 'A') ||
+ (a == 'A' && b == 'C') ||
+ (a == 'C' && b == 'A') ||
+ (a == 'T' && b == 'G') ||
+ (a == 'G' && b == 'T') ||
+ (a == 'C' && b == 'G') ||
+ (a == 'G' && b == 'C') ) {
+ tmp_transversions++;
+ }
+
+ if( (a == 'C' && b == 'T') ||
+ (a == 'T' && b == 'C') ||
+ (a == 'G' && b == 'A') ||
+ (a == 'A' && b == 'G') ) {
+ tmp_transitions++;
+ }
+
+ /* count the number of residues */
+ if(a != NJ_AMBIGUITY_CHAR &&
+ b != NJ_AMBIGUITY_CHAR ) {
+ tmp_residues++;
+ }
+
+ }
+
+ *transitions = tmp_transitions;
+ *transversions = tmp_transversions;
+
+ if(residues) {
+ *residues = tmp_residues;
+ }
+
+ return;
+}
+
+
+
+
+
+/*
+ * NJ_pw_percentid() -
+ *
+ * Given an alignment and a specification
+ * for two rows, compute the pairwise
+ * percent identity between the two
+ *
+ */
+float
+NJ_pw_percentid(NJ_alignment *alignment,
+ long int x,
+ long int y) {
+
+ float pid;
+ long int i;
+ long int residues;
+ long int same;
+ char c1, c2;
+
+ residues = 0;
+ same = 0;
+ for(i=0;i<alignment->length;i++) {
+
+ c1 = alignment->data[x*alignment->length+i];
+ c2 = alignment->data[y*alignment->length+i];
+
+ if( c1 != NJ_AMBIGUITY_CHAR ||
+ c2 != NJ_AMBIGUITY_CHAR ) {
+
+ residues++;
+
+ if(c1 == c2) {
+ same++;
+ }
+ }
+
+ }
+
+ pid = (float)same/(float)residues;
+
+ return(pid);
+}
+
+
+
+/*
+ * NJ_pw_differences() -
+ *
+ * Given an alignment and a specification
+ * for two rows in the alignment, compute the
+ * number of differences between the two sequences
+ *
+ * With respect to ambiguity codes, we will want to
+ * disregard those sites entirely in our count.
+ *
+ */
+long int
+NJ_pw_differences(NJ_alignment *alignment,
+ long int x,
+ long int y,
+ long int *residues) {
+
+ long int i;
+ long int diff;
+ char c1, c2;
+ long int tmp_residues;
+
+ diff = 0;
+ tmp_residues = 0;
+ for(i=0;i<alignment->length;i++) {
+
+ c1 = alignment->data[x*alignment->length+i];
+ c2 = alignment->data[y*alignment->length+i];
+
+ if( c1 != NJ_AMBIGUITY_CHAR ||
+ c2 != NJ_AMBIGUITY_CHAR ) {
+
+ tmp_residues++;
+
+ if(c1 != c2) {
+ diff++;
+ }
+ }
+
+ }
+
+ *residues = tmp_residues;
+
+ return(diff);
+}
+
+
+
+
+
+
--- /dev/null
+/*
+ * dist.h
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * Compute a distance matrix given a set of sequences
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+
+#ifndef _INC_DIST_H_
+#define _INC_DIST_H_ 1
+
+#include "fasta.h"
+#include "clearcut.h"
+
+
+
+/*
+ * An arbitrarily large distance to represent distances
+ * which are too great to accurately correct.
+ */
+#define NJ_BIGDIST 10.0
+
+
+
+/* some function prototypes */
+DMAT *
+NJ_build_distance_matrix(NJ_ARGS *nj_args);
+
+DMAT *
+NJ_compute_dmat(NJ_ARGS *nj_args,
+ NJ_alignment *alignment);
+
+
+float
+NJ_pw_percentid(NJ_alignment *alignment,
+ long int x,
+ long int y);
+
+long int
+NJ_pw_differences(NJ_alignment *alignment,
+ long int x,
+ long int y,
+ long int *residues);
+
+void
+NJ_no_correction(DMAT *dmat,
+ NJ_alignment *alignment);
+
+void
+NJ_DNA_jc_correction(DMAT *dmat,
+ NJ_alignment *alignment);
+
+void
+NJ_PROTEIN_jc_correction(DMAT *dmat,
+ NJ_alignment *alignment);
+
+void
+NJ_DNA_k2p_correction(DMAT *dmat,
+ NJ_alignment *alignment);
+
+void
+NJ_PROTEIN_kimura_correction(DMAT *dmat,
+ NJ_alignment *alignment);
+
+void
+NJ_DNA_count_tt(NJ_alignment *alignment,
+ long int x,
+ long int y,
+ long int *transitions,
+ long int *transversions,
+ long int *residues);
+
+
+#endif /* _INC_DIST_H_ */
+
+
+
+
+
+
+
+
+
--- /dev/null
+/*
+ * dmat.c
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * Distance matrix parser
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <float.h>
+#include <errno.h>
+#include <string.h>
+
+
+
+#include "common.h"
+#include "clearcut.h"
+
+#include "dmat.h"
+
+
+
+
+
+/*
+ *
+ * NJ_is_alpha() - determine if character is an alphabetic character
+ *
+ * INPUT:
+ * ------
+ * c -- character to test
+ *
+ * RETURN:
+ * -------
+ * int -- 1 if character is alphabetic (A-Z || a-z)
+ * 0 if character is NOT alphabetic
+ *
+ */
+static inline
+int
+NJ_is_alpha(char c) {
+
+ if( (c >= 'A' && c <= 'Z') ||
+ (c >= 'a' && c <= 'z') ) {
+ return(1);
+ } else {
+ return(0);
+ }
+}
+
+
+
+/*
+ *
+ * NJ_is_whitespace() - determine if character is a whitespace character
+ *
+ * INPUT:
+ * ------
+ * c -- character to test
+ *
+ * RETURN:
+ * -------
+ * int -- 1 if character is whitespace (space, tab, CR, LF)
+ * 0 if character is NOT whitespace
+ *
+ */
+static inline
+int
+NJ_is_whitespace(char c) {
+
+ if( c == ' ' || /* space */
+ c == '\n' || /* newline */
+ c == '\r' || /* carriage-return */
+ c == '\v' || /* vertical tab */
+ c == '\f' || /* form feed */
+ c == '\t' ) { /* horizontal tab */
+ return(1);
+ } else {
+ return(0);
+ }
+
+}
+
+
+
+
+/*
+ *
+ * NJ_is_number() - determine if character is a number
+ *
+ * INPUT:
+ * ------
+ * c -- character to test
+ *
+ * RETURN:
+ * -------
+ * int -- 1 if character is a number (0-9)
+ * 0 if character is NOT a number
+ *
+ */
+static inline
+int
+NJ_is_number(char c) {
+ if(c >= '0' && c <= '9') {
+ return(1);
+ } else {
+ return(0);
+ }
+}
+
+
+
+/*
+ * NJ_is_distance() - check if string is a properly formatted distance value
+ *
+ */
+static inline
+int
+NJ_is_distance(char *token) {
+
+ int i;
+ char c;
+ int exponent_state;
+ int expsign_state;
+ int dpoint_state;
+
+ /* if token is NULL return failure */
+ if(!token) {
+ return(0);
+ }
+
+ exponent_state = 0;
+ expsign_state = 0;
+ dpoint_state = 0;
+
+ /* The first character must be a number, a decimal point or a sign */
+ c = token[0];
+ if(!NJ_is_number(c) &&
+ c != '.' &&
+ c != '-' &&
+ c != '+' ) {
+
+ goto BAD;
+ }
+
+ /*
+ * if the first character is not a number, and string is only one
+ * character long, then we return failure.
+ */
+ if(strlen(token) == 1) {
+ if(!NJ_is_number(c)) {
+ goto BAD;
+ }
+ }
+
+ for(i=0;i<strlen(token);i++) {
+
+ c = token[i];
+
+ /* make sure that all chars in dist string are in list of valid chars */
+ if(!NJ_is_number(c) &&
+ c != '.' &&
+ c != '-' &&
+ c != '+' &&
+ c != 'e' &&
+ c != 'E' ) {
+
+ goto BAD;
+ }
+
+ /* not the first char and we are not in exponent state but read (+,-) */
+ if(i>0 && !exponent_state) {
+ if(c == '-' ||
+ c == '+') {
+ goto BAD;
+ }
+ }
+
+ /* if we are in the exponent state, and we've already seen a sign */
+ if(exponent_state && expsign_state) {
+ if(c == '-' ||
+ c == '+') {
+ goto BAD;
+ }
+ }
+
+ /* if we are in the exponent state and we see a decimal point */
+ if(exponent_state) {
+ if(c == '.') {
+ goto BAD;
+ }
+ }
+
+ /* if we are in the exponent state and see another e or E */
+ if(exponent_state) {
+ if(c == 'e' ||
+ c == 'E') {
+ goto BAD;
+ }
+ }
+
+ /* if we are dpoint_state and see another decimal point */
+ if(dpoint_state) {
+ if(c == '.') {
+ goto BAD;
+ }
+ }
+
+
+ /* enter the exponent state if we need to */
+ if(!exponent_state) {
+ if(c == 'e' ||
+ c == 'E') {
+ exponent_state = 1;
+ }
+ }
+
+ /* enter the expsign_state if we need to */
+ if(exponent_state && !expsign_state) {
+ if(c == '-' ||
+ c == '+') {
+ expsign_state = 1;
+ }
+ }
+
+ /* if not in dpoint state and we see a dpoint */
+ if(!dpoint_state) {
+ if(c == '.') {
+ dpoint_state = 1;
+ }
+ }
+
+ }
+
+ /* the token must end in a number char */
+ if(!NJ_is_number(token[strlen(token)-1])) {
+ goto BAD;
+ }
+
+ /* token is a valid numerical distance */
+ return(1);
+
+ BAD:
+
+ /* token is invalid distance format */
+ return(0);
+}
+
+
+
+
+/*
+ * NJ_is_label() -
+ *
+ * Simply, if token is not a valid number, then it is a name
+ *
+ */
+static inline
+int
+NJ_is_label(char *token) {
+ if(NJ_is_distance(token)) {
+ return(0);
+ } else {
+ return(1);
+ }
+}
+
+
+
+/*
+ * NJ_get_token() - get a token from an input stream
+ *
+ */
+static inline
+int
+NJ_get_token(FILE *fp,
+ NJ_DIST_TOKEN *token) {
+
+ char c;
+ int index;
+
+ c = fgetc(fp);
+ if(feof(fp)) {
+ token->type = NJ_EOF_STATE;
+ return(token->type);
+ }
+
+ if(NJ_is_whitespace(c)) {
+ token->buf[0] = c;
+ token->buf[1] = '\0';
+ token->type = NJ_WS_STATE;
+
+ return NJ_WS_STATE;
+ }
+
+ index = 0;
+ while(!NJ_is_whitespace(c)) {
+
+ /* reallocate our buffer if necessary */
+ if(index >= token->bufsize) {
+ token->bufsize *= 2;
+ token->buf = (char *)realloc(token->buf, token->bufsize*sizeof(char));
+ if(!token->buf) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_get_token()\n");
+ exit(-1);
+ }
+ }
+
+ token->buf[index++] = c;
+
+ c = fgetc(fp);
+ if(feof(fp)) {
+ token->type = NJ_EOF_STATE;
+ break;
+ }
+ }
+
+ token->buf[index] = '\0';
+
+ if(token->type != NJ_EOF_STATE) {
+
+ if(NJ_is_distance(token->buf)) {
+ token->type = NJ_FLOAT_STATE;
+ } else {
+ token->type = NJ_NAME_STATE;
+ }
+
+ }
+
+ return(token->type);
+}
+
+
+
+
+
+
+/*
+ * NJ_parse_distance_matrix() -- Takes a filename and returns a distance matrix
+ *
+ *
+ * INPUT:
+ * ------
+ * nj_args -- a pointer to a structure containing the command-line arguments
+ *
+ * OUTPUT:
+ * -------
+ * <DMAT *> -- NULL (failure)
+ * -- A pointer to a populated distance matrix
+ *
+ * DESCRIPTION:
+ * ------------
+ * This function implements a simple state machine to parse a distance matrix
+ * in approximate PHYLIP format. This function auto-detects whether the
+ * distance matrix is in upper, lower, or fully-symmetric format and handles
+ * it accordingly. For full/symmetric matrices, values must be symmetric
+ * around the diagonal, which is required to be zeroes. Names and values must
+ * be separated by whitespace (space, tab, newlines, etc.). Taxon labels can
+ * include numbers, but must start with non-numerical symbols.
+ *
+ *
+ * *** UPPER FORMAT EXAMPLE ***
+ *
+ * 4
+ * seq1 0.2 0.3 0.1
+ * seq2 0.2 0.3
+ * seq3 0.1
+ * seq4
+ *
+ * *** LOWER FORMAT EXAMPLE ***
+ *
+ * 4
+ * seq1
+ * seq2 0.3
+ * seq3 0.2 0.4
+ * seq4 0.3 0.1 0.3
+ *
+ * *** SYMMETRIC (FULL) EXAMPLE ***
+ *
+ * 4
+ * seq1 0.0 0.3 0.5 0.3
+ * seq2 0.3 0.0 0.1 0.2
+ * seq3 0.5 0.1 0.0 0.9
+ * seq4 0.3 0.2 0.9 0.0
+ *
+ * Values in the distance matrix can be positive or negative, integers or
+ * real values. Values can also be parsed in exponential notation form.
+ *
+ */
+DMAT *
+NJ_parse_distance_matrix(NJ_ARGS *nj_args) {
+
+ DMAT *dmat = NULL;
+ FILE *fp = NULL;
+ NJ_DIST_TOKEN *token = NULL;
+
+ int state, dmat_type;
+ int row;
+ int fltcnt;
+ int x, y, i;
+ int numvalread;
+ int expectedvalues = -1;
+ float val;
+ int first_state = 0;
+
+
+ /* allocate our distance matrix and token structure */
+ dmat = (DMAT *)calloc(1, sizeof(DMAT));
+ token = (NJ_DIST_TOKEN *)calloc(1, sizeof(NJ_DIST_TOKEN));
+ if(token) {
+ token->bufsize = NJ_INITIAL_BUFSIZE;
+ token->buf = (char *)calloc(token->bufsize, sizeof(char));
+ }
+ if(!dmat || !token || !token->buf) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
+ goto XIT_BAD;
+ }
+
+ /* open distance matrix file here */
+ if(nj_args->stdin_flag) {
+ fp = stdin;
+ } else {
+ fp = fopen(nj_args->infilename, "r");
+ if(!fp) {
+ fprintf(stderr, "Clearcut: Could not open distance matrix: %s\n", nj_args->infilename);
+ perror("Clearcut");
+ goto XIT_BAD;
+ }
+ }
+
+ /* get the number of taxa in this file */
+ fscanf(fp, "%ld\n", &dmat->ntaxa);
+ if(dmat->ntaxa < 2) {
+ fprintf(stderr, "Clearcut: Invalid number of taxa in distance matrix\n");
+
+ goto XIT_BAD;
+ }
+
+ /* set our initial working size according to the # of taxa */
+ dmat->size = dmat->ntaxa;
+
+ /* allocate space for the distance matrix values here */
+ dmat->val =
+ (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float));
+ if(!dmat->val) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
+ goto XIT_BAD;
+ }
+
+ /* taxa names */
+ dmat->taxaname = (char **)calloc(dmat->ntaxa, sizeof(char *));
+ if(!dmat->taxaname) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
+ goto XIT_BAD;
+ }
+
+ /* set the initial state of our state machine */
+ dmat_type = NJ_PARSE_UNKNOWN;
+ row = -1;
+ fltcnt = 0;
+ numvalread = 0;
+
+
+ /* read the input one character at a time to drive simple state machine */
+ state = NJ_get_token(fp, token);
+ while(state != NJ_EOF_STATE) {
+
+ switch(state) {
+
+ case NJ_NAME_STATE:
+
+ if(first_state == 0) {
+ first_state = 1;
+ }
+
+ row++;
+
+ if(row > 0 && dmat_type == NJ_PARSE_UNKNOWN) {
+
+ if(fltcnt == dmat->ntaxa) {
+
+ dmat_type = NJ_PARSE_SYMMETRIC;
+ expectedvalues = dmat->ntaxa * dmat->ntaxa;
+
+ } else if (fltcnt == dmat->ntaxa-1) {
+
+ dmat_type = NJ_PARSE_UPPER;
+ expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2;
+
+ /* shift everything in first row by one char */
+ for(i=dmat->ntaxa-2;i>=0;i--) {
+ dmat->val[i+1] = dmat->val[i];
+ }
+
+ } else if (fltcnt == 0) {
+
+ dmat_type = NJ_PARSE_LOWER;
+ expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2;
+
+ } else {
+ goto XIT_BAD;
+ }
+ }
+
+ if(row >= dmat->ntaxa) {
+ goto XIT_BAD;
+ }
+
+ /* allocate space for this taxon label */
+ dmat->taxaname[row] = (char *)calloc(strlen(token->buf)+1, sizeof(char));
+ if(!dmat->taxaname[row]) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
+ goto XIT_BAD;
+ }
+
+ strcpy(dmat->taxaname[row], token->buf);
+
+ fltcnt = 0;
+
+ break;
+
+
+ case NJ_FLOAT_STATE:
+
+ if(first_state == 0) {
+ goto XIT_BAD;
+ }
+
+ val = atof(token->buf);
+ if(errno) {
+ fprintf(stderr, "Clearcut: Distance value out-of-range.\n");
+ goto XIT_BAD;
+ }
+
+ x = row;
+ y = fltcnt;
+
+ switch(dmat_type) {
+
+ case NJ_PARSE_UNKNOWN:
+
+ dmat->val[NJ_MAP(x, y, dmat->size)] = val;
+
+ break;
+
+ case NJ_PARSE_SYMMETRIC:
+
+ if(fltcnt >= dmat->ntaxa) {
+ fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n");
+ goto XIT_BAD;
+ }
+
+ if(x < y) {
+ dmat->val[NJ_MAP(x, y, dmat->size)] = val;
+ } else if(x > y) {
+ if(!NJ_FLT_EQ(val, dmat->val[NJ_MAP(y, x, dmat->size)])) {
+ fprintf(stderr, "Clearcut: Full matrices must be symmetric.\n");
+ goto XIT_BAD;
+ }
+ } else {
+ if(!NJ_FLT_EQ(val, 0.0)) {
+ fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n");
+ goto XIT_BAD;
+
+ }
+ }
+
+ break;
+
+ case NJ_PARSE_UPPER:
+
+ if(fltcnt > dmat->ntaxa-row) {
+ fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n");
+ goto XIT_BAD;
+ }
+
+ dmat->val[NJ_MAP(x, x+y+1, dmat->size)] = val;
+
+ break;
+
+ case NJ_PARSE_LOWER:
+
+ if(fltcnt > row-1) {
+ fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n");
+ goto XIT_BAD;
+ }
+
+ dmat->val[NJ_MAP(y, x, dmat->size)] = val;
+
+ break;
+
+ default:
+
+ goto XIT_BAD;
+
+ break;
+ }
+
+ fltcnt++;
+ numvalread++;
+
+ break;
+
+ case NJ_WS_STATE:
+
+ break;
+
+ case NJ_EOF_STATE:
+
+ if(first_state == 0) {
+ goto XIT_BAD;
+ }
+
+ break;
+
+ default:
+
+ fprintf(stderr, "Clearcut: Unknown state in distance matrix parser.\n");
+ break;
+
+ }
+
+ /* get next token from stream */
+ state = NJ_get_token(fp, token);
+ }
+
+
+ /*
+ * At the end, if we have not read the number of values that we predicted
+ * we would need, then there was a problem and we need to punt.
+ */
+ if(numvalread != expectedvalues) {
+ fprintf(stderr, "Clearcut: Incorrect number of values in the distance matrix.\n");
+ goto XIT_BAD;
+ }
+
+ /* special check to make sure first value read is 0.0 */
+ if(dmat_type == NJ_PARSE_SYMMETRIC) {
+ if(!NJ_FLT_EQ(dmat->val[NJ_MAP(0, 0, dmat->size)], 0.0)) {
+ fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n");
+ goto XIT_BAD;
+ }
+ }
+
+
+ /* now lets allocate space for the r and r2 columns */
+ dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float));
+ dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float));
+ if(!dmat->r || !dmat->r2) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
+ goto XIT_BAD;
+ }
+
+ /* track some memory addresses */
+ dmat->rhandle = dmat->r;
+ dmat->r2handle = dmat->r2;
+ dmat->valhandle = dmat->val;
+
+ /* close matrix file here */
+ if(!nj_args->stdin_flag) {
+ fclose(fp);
+ }
+
+ if(token) {
+ if(token->buf) {
+ free(token->buf);
+ }
+ free(token);
+ }
+
+ return(dmat);
+
+
+
+ /* clean up our partial progress */
+ XIT_BAD:
+
+ if(fp) {
+ fprintf(stderr, "Clearcut: Syntax error in distance matrix at offset %ld.\n", ftell(fp));
+ }
+
+ /* close matrix file here */
+ if(!nj_args->stdin_flag) {
+ if(fp) {
+ fclose(fp);
+ }
+ }
+
+ /* if we have a valid dmat (partial or complete), we need to free it */
+ if(dmat) {
+ NJ_free_dmat(dmat);
+ }
+
+ if(token) {
+ if(token->buf) {
+ free(token->buf);
+ }
+ free(token);
+ }
+
+ return(NULL);
+}
+
+
+
+
+
+
+
+/*
+ * NJ_output_matrix() - Output a distance matrix to the specified file
+ *
+ *
+ * INPUTS:
+ * -------
+ * nj_args -- a pointer to a data structure holding the command-line args
+ * dmat -- a pointer to a distance matrix
+ *
+ *
+ * RETURNS:
+ * --------
+ * NOTHING
+ *
+ *
+ * DESCRIPTION:
+ * ------------
+ * If the appropriate flag was specified in the command-line, this function
+ * now outputs the parsed or computed distance matrix to a file. This
+ * can be useful if generating a distance matrix was the primary goal of
+ * running the program, or if one wanted to debug and/or verify the
+ * correctness of the program.
+ *
+ * Currently this function outputs full/symmetric matrices only.
+ *
+ */
+void
+NJ_output_matrix(NJ_ARGS *nj_args,
+ DMAT *dmat) {
+
+ FILE *fp = NULL;
+ long int i, j;
+
+
+
+ /* if we haven't specieid matrixout, return immediately */
+ if(!nj_args->matrixout) {
+ return;
+ }
+
+ /* open the specified matrix file for writing */
+ fp = fopen(nj_args->matrixout, "w");
+ if(!fp) {
+ fprintf(stderr, "Clearcut: Could not open matrix file %s for output.\n", nj_args->matrixout);
+ return;
+ }
+
+ /* output the number of taxa in the matrix */
+ fprintf(fp, " %ld\n", dmat->size);
+
+ fprintf(fp, "%s\n", dmat->taxaname[0]); // print the first taxon name outside of the main loop
+
+ for(i=1;i<dmat->size;i++) {
+
+ /* output taxaname */
+ fprintf(fp, "%s\t", dmat->taxaname[i]);
+
+ for(j=0;j<i;j++) {
+ if(nj_args->expdist) { /* exponential notation (or not) */
+ fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]);
+ } else {
+ fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]);
+ }
+ }
+
+ fprintf(fp, "\n");
+ }
+
+#ifdef FULL_SYMMETRIC_MATRIX
+
+ /* output the number of taxa in the matrix */
+ fprintf(fp, " %ld\n", dmat->size);
+ for(i=0;i<dmat->size;i++) {
+
+ /* output taxaname */
+ fprintf(fp, "%s\t", dmat->taxaname[i]);
+
+ for(j=0;j<dmat->size;j++) {
+ if(i>j) {
+ if(nj_args->expdist) { /* exponential notation (or not) */
+ fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]);
+ } else {
+ fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]);
+ }
+ } else if(i<j) {
+ if(nj_args->expdist) { /* exponential notation (or not) */
+ fprintf(fp, "%e ", dmat->val[NJ_MAP(i,j,dmat->size)]);
+ } else {
+ fprintf(fp, "%f ", dmat->val[NJ_MAP(i,j,dmat->size)]);
+ }
+ } else {
+ if(nj_args->expdist) { /* exponential notation (or not) */
+ fprintf(fp, "%e ", 0.0);
+ } else {
+ fprintf(fp, "%f ", 0.0);
+ }
+ }
+ }
+
+ fprintf(fp, "\n");
+ }
+
+#endif // FULL_SYMMETRIC_MATRIX
+
+ /* close the file here */
+ if(fp) {
+ fclose(fp);
+ }
+
+ return;
+}
+
+
+
+
+
--- /dev/null
+/*
+ * dmat.h
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * Distance matrix parser header file
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ */
+
+
+#ifndef _INC_DMAT_H_
+#define _INC_DMAT_H_ 1
+
+#include "clearcut.h"
+
+
+#define NJ_INITIAL_BUFSIZE 32
+
+#define NJ_NAME_STATE 100
+#define NJ_FLOAT_STATE 101
+#define NJ_WS_STATE 102
+#define NJ_EOF_STATE 103
+
+#define NJ_PARSE_SYMMETRIC 100
+#define NJ_PARSE_LOWER 101
+#define NJ_PARSE_UPPER 102
+#define NJ_PARSE_UNKNOWN 103
+
+
+/* some data structures */
+typedef struct _NJ_DIST_TOKEN_STRUCT {
+
+ char *buf;
+ long int bufsize;
+ int type;
+
+} NJ_DIST_TOKEN;
+
+
+
+/* some function prototypes */
+
+DMAT *
+NJ_parse_distance_matrix(NJ_ARGS *nj_args);
+
+void
+NJ_output_matrix(NJ_ARGS *nj_args,
+ DMAT *dmat);
+
+
+#endif /* _INC_DMAT_H_ */
+
--- /dev/null
+/*
+ * fasta.c
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * Functions for parsing FASTA formatted alignment files
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "clearcut.h"
+#include "common.h"
+#include "fasta.h"
+
+
+#define NJ_NUM_DNA_AMBIGUITY_SYMS 14
+static const char NJ_dna_ambiguity_syms[NJ_NUM_DNA_AMBIGUITY_SYMS] =
+{
+ 'M', 'R', 'W', 'S', 'Y', 'K',
+ 'V', 'H', 'D', 'B', 'X', 'N',
+ '-', '.'
+};
+
+
+#define NJ_NUM_PROTEIN_AMBIGUITY_SYMS 6
+static const char NJ_protein_ambiguity_syms[NJ_NUM_PROTEIN_AMBIGUITY_SYMS] =
+{
+ 'X', 'B', 'Z', '*', '-', '.'
+};
+
+#define NJ_NUM_DNA_SYMS 5
+static const char NJ_dna_syms[NJ_NUM_DNA_SYMS] =
+{
+ 'A', 'G', 'C', 'T', 'U'
+};
+
+
+#define NJ_NUM_PROTEIN_SYMS 20
+static const char NJ_protein_syms[NJ_NUM_PROTEIN_SYMS] =
+{
+ 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
+ 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'
+};
+
+
+
+
+
+/*
+ * NJ_is_whitespace() - Check to see if character is whitespace
+ *
+ * INPUTS:
+ * -------
+ * c -- character to check
+ *
+ * RETURNS:
+ * --------
+ * int -- 0 if not whitespace
+ * 1 if whitespace
+ */
+static inline
+int
+NJ_is_whitespace(char c) {
+ if( c == ' ' || /* space */
+ c == '\n' || /* newline */
+ c == '\r' || /* carriage-return */
+ c == '\v' || /* vertical tab */
+ c == '\f' || /* form feed */
+ c == '\t' ) { /* horizontal tab */
+ return(1);
+ } else {
+ return(0);
+ }
+}
+
+
+
+
+
+/*
+ * NJ_is_dna() -
+ *
+ * Determines if the given symbol is DNA
+ *
+ * RETURNS: 1 if DNA
+ * 0 if not DNA
+ *
+ */
+static inline
+int
+NJ_is_dna(char c) {
+
+ int i;
+ char up_c;
+
+ up_c = toupper(c);
+
+ for(i=0;i<NJ_NUM_DNA_SYMS;i++) {
+ if(up_c == NJ_dna_syms[i]) {
+ return(1);
+ }
+ }
+
+ return(0);
+}
+
+
+
+
+
+/*
+ * NJ_is_dna_ambiguity() -
+ *
+ * Determines if the given symbol is a
+ * DNA ambiguity code
+ *
+ * RETURNS: 1 if DNA Ambiguity Code
+ * 0 if not DNA Ambiguity Code
+ *
+ */
+static inline
+int
+NJ_is_dna_ambiguity(char c) {
+
+ int i;
+ char up_c;
+
+ up_c = toupper(c);
+
+ for(i=0;i<NJ_NUM_DNA_AMBIGUITY_SYMS;i++) {
+ if(up_c == NJ_dna_ambiguity_syms[i]) {
+ return(1);
+ }
+ }
+
+ return(0);
+}
+
+
+
+/*
+ * NJ_is_protein() -
+ *
+ * Determines if supplied symbol is amino acid symbol
+ *
+ * RETURNS: 1 if amino acid
+ * 0 if not amino acid
+ *
+ */
+static inline
+int
+NJ_is_protein(char c) {
+
+ int i;
+ char up_c;
+
+ up_c = toupper(c);
+
+ for(i=0;i<NJ_NUM_PROTEIN_SYMS;i++) {
+ if(up_c == NJ_protein_syms[i]) {
+ return(1);
+ }
+ }
+
+ return(0);
+}
+
+
+
+
+/*
+ * NJ_is_protein_ambiguity() -
+ *
+ * Determines if supplied symbol is amino acid ambiguity symbol
+ *
+ * RETURNS: 1 if amino acid ambiguity symbol
+ * 0 if not amino acid ambiguity symbol
+ *
+ */
+static inline
+int
+NJ_is_protein_ambiguity(char c) {
+
+ int i;
+ char up_c;
+
+ up_c = toupper(c);
+
+ for(i=0;i<NJ_NUM_PROTEIN_AMBIGUITY_SYMS;i++) {
+ if(up_c == NJ_protein_ambiguity_syms[i]) {
+ return(1);
+ }
+ }
+
+ return(0);
+}
+
+
+
+
+
+
+/*
+ * NJ_read_fasta() - A function for inputing FASTA sequences into an alignment
+ * data structure
+ *
+ *
+ * INPUTS:
+ * -------
+ * nj_args -- A pointer to a data structure containing command-line args
+ *
+ * RETURNS:
+ * --------
+ * NJ_alignment * -- A pointer to a multiple sequence alignment
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ *
+ * This function implements a state machine parser for parsing FASTA-formatted
+ * multiple sequence alignment files.
+ *
+ * Example Input:
+ *
+ * > sequence1
+ * ATAGATATAGATTAGAATAT----TATAGATAT----ATATAT-TTT-
+ * > sequence2
+ * --ATAGATA---ATATATATATTTT--GTCTCATAGT---ATATGCTT
+ * > sequence3
+ * TTATAGATA---ATATATATATTTTAAGTCTCATAGT-A-ATATGC--
+ *
+ * This function will parse alignments for DNA or protein, and will do
+ * so mindful of ambiguity codes for these kinds of sequences. All
+ * ambiguity codes are ignored by this program for the purposes of
+ * computing a distance matrix from a multiple alignment. By design,
+ * this program does not auto-detect DNA vs. Protein, and requires that
+ * the user explictly specify that on the command-line.
+ *
+ * Gaps can be represented either with the '-' or '.' characters.
+ *
+ * Newlines and other whitespace are allowed to be interspersed
+ * throughout the sequences.
+ *
+ * Taxon labels are required to be unique, and they must start with
+ * an alphabetic character (not a number, etc.). The parser will read
+ * the first token after the > character in the description line up until the
+ * first whitespace and use that for the taxon label.
+ *
+ * For example, in the line "> taxon1 is homo sapien", the taxon label will be
+ * "taxon1"
+ *
+ */
+NJ_alignment *
+NJ_read_fasta(NJ_ARGS *nj_args) {
+
+ FILE *fp = NULL;
+ char *buf = NULL;
+ char *ptr = NULL;
+ NJ_alignment *alignment = NULL;
+
+ char c;
+ int state;
+ long int index, x, seq;
+ long int i;
+ long int bufsize, nseqs = NJ_INITIAL_NSEQS;
+ int first_sequence_flag;
+
+
+
+
+ /*
+ * In this function, we implement a FASTA alignment parser which
+ * reads in an alignment character-by-character, maintaining state
+ * information which guides the parser.
+ *
+ * The program reads either DNA or Protein alignments. All title lines
+ * and sequences can be arbitrarily long. Gaps can be represented by
+ * "-" or "." characters.
+ *
+ * Ambiguity codes are also handled.
+ *
+ */
+
+ /*
+ * We can't handle reading fasta input unless the user explicity
+ * specifies the input type...just to be sure.
+ */
+ if( (!nj_args->dna_flag && !nj_args->protein_flag) ||
+ (nj_args->dna_flag && nj_args->protein_flag) ) {
+ fprintf(stderr, "Clearcut: Explicitly specify protein or DNA\n");
+ goto XIT_BAD;
+ }
+
+ /* open specified fasta file here */
+ if(nj_args->stdin_flag) {
+ fp = stdin;
+ } else {
+ fp = fopen(nj_args->infilename, "r");
+ if(!fp) {
+ fprintf(stderr, "Clearcut: Failed to open input FASTA file: %s\n", nj_args->infilename);
+ perror("Clearcut");
+ goto XIT_BAD;
+ }
+ }
+
+ /* allocate the initial buffer */
+ bufsize = NJ_INITIAL_BUFFER_SIZE;
+ buf = (char *)calloc(bufsize, sizeof(char));
+
+ /* allocate the alignment container here */
+ alignment = (NJ_alignment *)calloc(1, sizeof(NJ_alignment));
+
+ /* allocate initial title array */
+ // printf("allocating initial title array\n");
+ alignment->titles = (char **)calloc(NJ_INITIAL_NSEQS, sizeof(char *));
+
+ /* make sure that we successfully allocated memory */
+ if(!buf || !alignment || !alignment->titles) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+
+ /* a flag */
+ first_sequence_flag = 1;
+
+ index = 0; /* tracks the position in buffer */
+ x = 0; /* tracks the position on sequence */
+ seq = 0; /* tracks the active sequence */
+
+ /* intitial state of state machine */
+ state = NJ_FASTA_MODE_UNKNOWN;
+
+ while(1) {
+
+ /* get the next character */
+ c = fgetc(fp);
+ if(feof(fp)) {
+
+ if(state == NJ_FASTA_MODE_SEQUENCE) {
+ buf[index+1] = '\0';
+
+ /* copy buf to alignment */
+ for(i=1;i<=alignment->length;i++) {
+ alignment->data[seq*alignment->length+i-1] = buf[i];
+ }
+ }
+
+ break;
+ }
+
+ /* make sure our dynamic buffer is big enough */
+ if(index >= bufsize) {
+ bufsize *= 2;
+ buf = (char *)realloc(buf, bufsize);
+ if(!buf) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+ }
+
+ switch(state) {
+
+ case NJ_FASTA_MODE_UNKNOWN:
+
+ if(!NJ_is_whitespace(c)) {
+ if(c == '>') {
+ state = NJ_FASTA_MODE_TITLE;
+ buf[0] = '>';
+ } else {
+ goto XIT_BAD;
+ }
+ }
+
+ break;
+
+ case NJ_FASTA_MODE_TITLE:
+
+ if( c == '\n' ||
+ c == '\r' ) {
+
+ buf[index] = '\0';
+ state = NJ_FASTA_MODE_SEQUENCE;
+ index = 0;
+ x = -1;
+
+ /* make sure we've allocated enough space for titles and sequences */
+ if(seq == nseqs) {
+
+ // printf("realloc(). seq = %d, nseqs = %d\n", seq, nseqs);
+
+ nseqs *= 2;
+
+ alignment->titles = (char **)realloc(alignment->titles, nseqs*sizeof(char *));
+ if(!alignment->titles) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+
+ alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
+ if(!alignment->data) {
+ fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+ }
+
+ // printf("Allocating %d bytes for title %d: %s\n", (int)strlen(buf), (int)seq, buf);
+
+ alignment->titles[seq] = (char *)calloc(strlen(buf), sizeof(char));
+ if(!alignment->titles[seq]) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+
+ /* lets forward to the first non-space (space/tab) character after the '>' */
+
+ if(first_sequence_flag) {
+ ptr = buf;
+ } else {
+ ptr = &buf[1];
+ }
+ while(*ptr == '\t' || *ptr == ' ') {
+ ptr++;
+ }
+ sscanf(ptr, "%s", alignment->titles[seq]); /* get the first word and use as the title */
+
+ alignment->nseq++;
+ }
+
+ buf[index++] = c;
+
+ break;
+
+
+ case NJ_FASTA_MODE_SEQUENCE:
+
+ if(c == '>') {
+
+ if(first_sequence_flag) {
+ first_sequence_flag = 0;
+
+ /* allocate our alignment data section here */
+ alignment->length = index-1;
+
+ nseqs = NJ_INITIAL_NSEQS;
+ alignment->data = (char *)calloc(alignment->length*nseqs, sizeof(char));
+ if(!alignment->data) {
+ fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+ }
+
+ if(!first_sequence_flag) {
+ if(index-1 < alignment->length) {
+ fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
+ goto XIT_BAD;
+ }
+ }
+
+ /* re-allocate if necessary */
+ /*
+ if(seq >= nseqs) {
+ nseqs *= 2;
+ alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
+ if(!alignment->data) {
+ fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+ }
+ */
+
+ /* copy buf to alignment */
+ for(i=1;i<=alignment->length;i++) {
+ alignment->data[seq*alignment->length+i-1] = buf[i];
+ }
+
+ state = NJ_FASTA_MODE_TITLE;
+ index = 1;
+ x = 1;
+
+ buf[0] = c;
+
+ seq++;
+
+ } else {
+
+ if(NJ_is_whitespace(c)) {
+ break;
+ }
+
+ if(!first_sequence_flag) {
+ if(index-1 >= alignment->length) {
+ fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
+ goto XIT_BAD;
+ }
+ }
+
+
+ /*
+ * Here we check to make sure that the symbol read is appropriate
+ * for the type of data the user specified. (dna or protein).
+ * We also handle ambiguity codes by converting them to a specific
+ * assigned ambiguity code character. Ambiguity codes are ignored
+ * when computing distances
+ */
+
+ if(nj_args->dna_flag) {
+ if(NJ_is_dna(c)) {
+ buf[index++] = toupper(c);
+ } else {
+ if(NJ_is_dna_ambiguity(c)) {
+ buf[index++] = NJ_AMBIGUITY_CHAR;
+ } else {
+ fprintf(stderr, "Clearcut: Unknown symbol '%c' in nucleotide sequence %ld.\n", c, seq);
+ goto XIT_BAD;
+ }
+ }
+ } else if(nj_args->protein_flag) {
+ if(NJ_is_protein(c)) {
+ buf[index++] = toupper(c);
+ } else {
+ if(NJ_is_protein_ambiguity(c)) {
+ buf[index++] = NJ_AMBIGUITY_CHAR;
+ } else {
+ fprintf(stderr, "Clearcut: Unknown symbol '%c' in protein sequence %ld.\n", c, seq);
+ goto XIT_BAD;
+ }
+ }
+ }
+
+ }
+
+ break;
+
+ default:
+ goto XIT_BAD;
+
+ break;
+
+ }
+ }
+
+ if(index-1 != alignment->length) {
+ fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
+ goto XIT_BAD;
+ }
+
+ /* check for duplicate taxon labels */
+ if(!NJ_taxaname_unique(alignment)) {
+ goto XIT_BAD;
+ }
+
+ return(alignment);
+
+
+ XIT_BAD:
+
+ if(fp) {
+ fprintf(stderr, "Clearcut: Fatal error parsing FASTA file at file offset %ld.\n", ftell(fp));
+ }
+
+ if(buf) {
+ free(buf);
+ }
+
+ NJ_free_alignment(alignment);
+
+ return(NULL);
+}
+
+
+
+
+/*
+ * NJ_print_alignment() - Print multiple sequence alignment (for debugging)
+ *
+ * INPUTS:
+ * -------
+ * alignment -- A pointer to the alignment
+ *
+ * RETURNS:
+ * --------
+ * NONE
+ *
+ */
+void
+NJ_print_alignment(NJ_alignment *alignment) {
+
+ long int i, j;
+
+ printf("nseq = %ld, length = %ld\n", alignment->nseq, alignment->length);
+
+ for(i=0;i<alignment->nseq;i++) {
+
+ printf("> %s\n", alignment->titles[i]);
+
+ for(j=0;j<alignment->length;j++) {
+ printf("%c", alignment->data[i*alignment->length+j]);
+ }
+
+ printf("\n");
+ }
+
+ return;
+}
+
+
+
+
+
+
+
+/*
+ *
+ * NJ_free_alignment() - Free all of the memory allocated for the
+ * multiple sequence alignment
+ *
+ * INPUTS:
+ * -------
+ * alignment -- A pointer to the multiple sequence alignment
+ *
+ * RETURNS:
+ * --------
+ * NONE
+ *
+ */
+void
+NJ_free_alignment(NJ_alignment *alignment) {
+
+ long int i;
+
+ if(alignment) {
+
+ /* free the allocated titles */
+ if(alignment->titles) {
+ for(i=0;i<alignment->nseq;i++) {
+
+ if(alignment->titles[i]) {
+ free(alignment->titles[i]);
+ }
+ }
+
+ free(alignment->titles);
+ }
+
+ /* free the alignment data */
+ if(alignment->data) {
+ free(alignment->data);
+ }
+
+ /* free the alignment itself */
+ free(alignment);
+ }
+
+ return;
+}
+
+
+
+
+/*
+ * NJ_taxaname_unique() - Check to see if taxanames are unique in alignment
+ *
+ * INPUTS:
+ * -------
+ * alignment -- a pointer to a multiple sequence alignment
+ *
+ * OUTPUTS:
+ * --------
+ * int -- 0 if all taxanames in alignment are unique
+ * 1 if all taxanames in alignment are NOT unique
+ *
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * Check to see if the taxanames in the alignment are unique. It
+ * will be impossible to make sense of the final tree if the taxon
+ * labels are not unqiue.
+ *
+ */
+int
+NJ_taxaname_unique(NJ_alignment *alignment) {
+
+ long int i, j;
+
+ for(i=0;i<alignment->nseq;i++) {
+ for(j=i+1;j<alignment->nseq;j++) {
+ if(!strcmp(alignment->titles[i],
+ alignment->titles[j])) {
+ fprintf(stderr, "Clearcut: Taxa %ld and %ld (%s) do not have unique taxon labels.\n",
+ i, j, alignment->titles[i]);
+ return(0);
+ }
+ }
+ }
+
+ return(1);
+}
+
+
+void
+NJ_print_titles(NJ_alignment *alignment) {
+
+ int i;
+
+ for(i=0;i<alignment->nseq;i++) {
+ printf("%d: %s\n", i, alignment->titles[i]);
+ }
+
+ return;
+}
--- /dev/null
+/*
+ * fasta.h
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+
+#ifndef _INC_NJ_FASTA_H_
+#define _INC_NJ_FASTA_H_ 1
+
+#include "clearcut.h"
+
+#define NJ_INITIAL_BUFFER_SIZE 512
+#define NJ_INITIAL_NSEQS 64
+
+#define NJ_FASTA_MODE_TITLE 100
+#define NJ_FASTA_MODE_SEQUENCE 101
+#define NJ_FASTA_MODE_NEWLINE 102
+#define NJ_FASTA_MODE_UNKNOWN 103
+
+
+typedef struct _STRUCT_NJ_ALIGNMENT {
+
+ long int nseq;
+ long int length;
+
+ char **titles;
+
+ char *data;
+
+} NJ_alignment;
+
+
+NJ_alignment *
+NJ_read_fasta(NJ_ARGS *nj_args);
+
+void
+NJ_print_alignment(NJ_alignment *alignment);
+
+void
+NJ_free_alignment(NJ_alignment *alignment);
+
+int
+NJ_taxaname_unique(NJ_alignment *alignment);
+
+#endif /* _INC_NJ_FASTA_H_ */
+
+
+
+
--- /dev/null
+/*
+ This getopt_long() is compatible with GNU's, however, added original
+ extention (short 1 byte option).
+
+
+ Copyright (c) 2004 Koji Arai
+
+ Permission is hereby granted, free of charge, to any person
+ obtaining a copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction,
+ including without limitation the rights to use, copy, modify, merge,
+ publish, distribute, sublicense, and/or sell copies of the Software,
+ and to permit persons to whom the Software is furnished to do so,
+ subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+
+ Compilation for Test:
+
+ GNU:
+ cc -DUSE_GNU -DDEBUG getopt_long.c -o test_getopt_long_gnu
+
+ not GNU:
+ cc -I. -DDEBUG getopt_long.c -o test_getopt_long
+
+ ./test_getopt_long
+ ./test_getopt_long_gnu
+
+ BUGS:
+ * not implemented any features for getopt() and getopt_long().
+*/
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#if DEBUG
+static int
+puts_argv(char **argv)
+{
+ int i;
+
+ for (i = 0; argv[i]; i++) {
+ if (i) printf(" ");
+
+ printf("%s", argv[i]);
+ }
+ printf("\n");
+
+ return 0;
+}
+#endif
+
+#ifndef USE_GNU
+#include <stdio.h>
+#include "getopt_long.h"
+
+char *optarg;
+int optind;
+
+int opterr;
+int optopt;
+
+/*
+ return value 0: no option (include '-')
+ 1: short option like '-x'
+ 2: long option like '--xxx' and just '--'
+*/
+static int
+is_option(char *arg)
+{
+ if (arg[0] == '-') {
+ switch (arg[1]) {
+ case 0: /* just "-" */
+ return 0;
+ case '-': /* long option (include just "--")*/
+ return 2;
+ default: /* short option */
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int
+insert_argv(char **argv, int src, int dest)
+{
+ int i;
+ char *tmp = argv[src];
+
+ if (src > dest) {
+ for (i = src; i > dest; i--)
+ argv[i] = argv[i-1];
+ }
+ if (src < dest) {
+ for (i = src; i < dest; i++)
+ argv[i] = argv[i+1];
+ }
+
+ argv[dest] = tmp;
+
+ return 0;
+}
+
+static int
+search_longopt(char *arg, struct option *longopts)
+{
+ int i, found = -1;
+ int len;
+ for (len = 0; arg[len] && arg[len] != '='; len++)
+ ;
+
+ for (i = 0; longopts[i].name; i++) {
+ if (strncmp(arg, longopts[i].name, len) == 0) {
+ if (found != -1)
+ return -1; /* found some candidate */
+ found = i;
+ }
+ }
+ return found;
+}
+
+/*
+ * implemented my extention feature.
+ * optional 1 byte argument with [...]
+ * e.g.) shortopts = "a[0123]b"
+ * accepts "-a0 -a1b" (same as "-a0 -a1 -b")
+ */
+static int
+has_argument_short(char *arg, const char *shortopts)
+{
+ int i;
+ int open_bracket = 0;
+ for (i = 0; shortopts[i]; i++) {
+ switch (shortopts[i]) {
+ case '[':
+ open_bracket++;
+ continue;
+ case ']':
+ if (open_bracket <= 0) {
+ fprintf(stderr, "getopt_long() -- unbalanced bracket in short options");
+ return -1;
+ }
+ open_bracket--;
+ continue;
+ }
+ if (open_bracket) continue;
+ if (*arg != shortopts[i]) continue;
+
+ switch (shortopts[i+1]) {
+ case ':':
+ if (shortopts[i+2] != ':') {
+ if (arg[1])
+ return 1; /* following string is argument */
+ else
+ return 2; /* next argv is argument */
+ }
+ else {
+ /* '::' means optional argument (GNU extention) */
+ if (arg[1])
+ return 1;
+ else
+ return 0; /* no argument */
+ }
+ case '[':
+ if (arg[1] == '\0')
+ return 0; /* no argument */
+ /* my extention */
+ for (i++; shortopts[i] && shortopts[i] != ']'; i++) {
+ if (arg[1] == shortopts[i])
+ return 3; /* has 1 byte argument */
+ }
+ if (!shortopts[i]) {
+ fprintf(stderr, "getopt_long() -- unbalanced bracket in short options");
+ return -1;
+ }
+ break;
+ default:
+ return 0; /* no argument */
+ }
+ }
+ /* Invalid option */
+ return -1;
+}
+
+static int
+has_argument_long(char *arg, struct option *longopts)
+{
+ int i;
+
+ i = search_longopt(arg, longopts);
+ if (i == -1) {
+ /* Invalid option */
+ return -1;
+ }
+ else {
+ int len = strlen(arg);
+ char *p = strchr(arg, '=');
+ if (p) {
+ len = p - arg;
+ }
+
+ switch (longopts[i].has_arg) {
+ case no_argument:
+ return 0;
+ case required_argument:
+ if (arg[len] == '=')
+ return 1;
+ else
+ return 2;
+ case optional_argument:
+ if (arg[len] == '=')
+ return 1;
+ else
+ return 0;
+ default:
+ assert(0);
+ }
+ }
+}
+
+/*
+ -1: no option
+ 0: no argument
+ 1: has argument in this argv
+ 2: has argument in next argv
+ 3: has 1 byte argument in this argv
+*/
+static int
+has_argument(char *arg,
+ const char *shortopts,
+ struct option *longopts)
+{
+ int i, n;
+
+ switch (is_option(arg)) {
+ case 0: /* no option */
+ return -1;
+ case 1:
+ /* short option */
+ n = -1;
+ for (i = 1; arg[i]; i++) {
+ n = has_argument_short(arg+i, shortopts);
+ if (n == 0 && arg[i+1]) continue;
+ if (n == 3 && arg[i+2]) { i++; continue; }
+ break;
+ }
+ return n;
+ case 2:
+ /* long option */
+ return has_argument_long(arg+2, longopts);
+ break;
+ default:
+ assert(0);
+ }
+}
+
+int
+getopt_long(int argc, char **argv,
+ const char *shortopts,
+ struct option *longopts,
+ int *indexptr)
+{
+ char *opt;
+ int i;
+ static int shortoptind;
+ static int no_optind = 0;
+
+ if (optind == 0) { /* skip first argument (command name) */
+ optind++;
+ no_optind = 0;
+ shortoptind = 0;
+ }
+
+ optarg = 0;
+
+ if (no_optind && !shortoptind) {
+ while (!is_option(argv[no_optind]))
+ insert_argv(argv, no_optind, optind-1);
+
+ if (has_argument(argv[no_optind], shortopts, longopts) == 2)
+ no_optind += 2;
+ else
+ no_optind++;
+
+ if (argv[optind] && strcmp(argv[optind], "--") == 0) {
+ while (!is_option(argv[no_optind]))
+ insert_argv(argv, no_optind, optind);
+ optind = no_optind;
+ no_optind = 0;
+ }
+ }
+
+ if (optind >= argc)
+ goto end_of_option;
+
+ retry:
+ /*
+ puts_argv(&argv[optind]);
+ */
+ opt = argv[optind];
+
+ if (shortoptind == 0 && is_option(opt) == 1) {
+ shortoptind++;
+ }
+
+ if (shortoptind) {
+ /* short option */
+ char *p = &opt[shortoptind];
+
+ if (*p == '\0')
+ assert(0);
+
+ switch (has_argument_short(p, shortopts)) {
+ case 0:
+ /* no argument */
+ optarg = 0;
+
+ shortoptind++;
+ if (opt[shortoptind] == '\0')
+ optind++, shortoptind = 0;
+ return *p;
+ case 1:
+ /* following character is argument */
+ optind++, shortoptind = 0;
+ optarg = &p[1];
+ return *p;
+ case 2:
+ /* next argv is argument */
+ optind++, shortoptind = 0;
+ optarg = argv[optind++];
+ return *p;
+ case 3:
+ /* has 1 byte argument */
+ optarg = &p[1];
+ if (p[2] == 0)
+ optind++, shortoptind = 0;
+ else
+ shortoptind += 2;
+ return *p;
+ default:
+ /* Invalid option */
+ if (opterr)
+ fprintf(stderr,
+ "%s: invalid option -- %c\n",
+ argv[0],
+ *p);
+
+ optind++, shortoptind = 0;
+ optopt = *p;
+ return '?';
+ }
+ }
+ else if (opt[0] == '-' && opt[1] == '-') {
+ /* long option */
+
+ if (opt[2] == '\0') {
+ /* end of command line switch */
+ optind++;
+ return -1;
+ }
+
+ opt += 2;
+
+ i = search_longopt(opt, longopts);
+ if (i == -1) {
+ optind++;
+ optopt = 0;
+ return '?';
+ }
+ else {
+ int len = strlen(opt);
+ char *p = strchr(opt, '=');
+ if (p) {
+ len = p - opt;
+ }
+
+ switch (longopts[i].has_arg) {
+ case no_argument:
+ break;
+ case required_argument:
+ if (opt[len] == '=')
+ optarg = opt + len + 1;
+ else {
+ optind++;
+ optarg = argv[optind];
+ if (optarg == 0) {
+ if (opterr)
+ fprintf(stderr,
+ "%s: option `--%s' requires an argument\n",
+ argv[0],
+ opt);
+
+ optopt = 0;
+ return '?'; /* no argument */
+ }
+ }
+ break;
+ case optional_argument:
+ if (opt[len] == '=')
+ optarg = opt + len + 1;
+ else {
+ optarg = 0;
+ }
+ break;
+ default:
+ break;
+ }
+
+ *indexptr = i;
+ optind++;
+ if (longopts[i].flag) {
+ *longopts[i].flag = longopts[i].val;
+ return 0;
+ }
+ else {
+ return longopts[i].val;
+ }
+ }
+
+ optind++;
+ optopt = 0;
+ return '?';
+ }
+
+ /* not option */
+ if (no_optind == 0)
+ no_optind = optind;
+
+ for (i = optind; argv[i]; i++) {
+ if (is_option(argv[i])) {
+ optind = i;
+ goto retry;
+ }
+ }
+
+ end_of_option:
+ if (no_optind) {
+ optind = no_optind;
+ no_optind = 0;
+ }
+
+ return -1;
+}
+#endif /* USE_GNU */
+
+#if DEBUG
+
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#if USE_GNU
+#include <getopt.h> /* use GNU getopt_long() */
+#endif
+
+static int verbose_flag;
+static int option_index;
+int argc;
+char *argv[50];
+char **p;
+int c;
+static struct option long_options[] = {
+ {"verbose", no_argument, &verbose_flag, 1},
+ {"brief", no_argument, &verbose_flag, 0},
+ {"add", required_argument, 0, 'a'},
+ {"append", no_argument, 0, 0},
+ {"delete", required_argument, 0, 0},
+ {"create", optional_argument, 0, 0},
+ {"change", optional_argument, 0, 0},
+ {0, 0, 0, 0}
+};
+
+int
+call_getopt_long(int argc, char **argv,
+ const char *shortopts,
+ struct option *longopts,
+ int *indexptr)
+{
+ int c;
+ c = getopt_long(argc, argv, shortopts, longopts, indexptr);
+ puts_argv(argv);
+ printf("ret=%d(%c) option_index=%d ", c, c, option_index);
+ printf("optind=%d optarg=[%s] opterr=%d optopt=%d(%c)\n",
+ optind, optarg, opterr, optopt, optopt);
+ if (c == 0) {
+ struct option *opt;
+ opt = &longopts[*indexptr];
+ printf("long option: --%s has_arg=%d\n", opt->name, opt->has_arg);
+ if (opt->flag)
+ printf(" flag=[%8p] val=%d\n", opt->flag, *opt->flag);
+ }
+
+ return c;
+}
+
+void
+test1()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "-a";
+ argc++; *p++ = "-bcd";
+ argc++; *p++ = "-d";
+ argc++; *p++ = "-e";
+ argc++; *p++ = "-f";
+ argc++; *p++ = "-g";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 'a');
+ assert(option_index == 0);
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 'b');
+ assert(option_index == 0);
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 'c');
+ assert(option_index == 0);
+ assert(optind == 3);
+ assert(optarg == &argv[2][3]);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 'd');
+ assert(option_index == 0);
+ assert(optind == 5);
+ assert(optarg == argv[4]);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == '?');
+ assert(option_index == 0);
+ assert(optind == 6);
+ assert(optarg == 0);
+ assert(optopt == 'f');
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == '?');
+ assert(option_index == 0);
+ assert(optind == 7);
+ assert(optarg == 0);
+ assert(optopt == 'g');
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == -1);
+ assert(option_index == 0);
+ assert(optind == 7);
+ assert(optarg == 0);
+ assert(optopt == 'g'); /* no changed */
+}
+
+void
+test2()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "--verbose";
+ argc++; *p++ = "--brief";
+ argc++; *p++ = "--add";
+ argc++; *p++ = "add_argument";
+ argc++; *p++ = "--add=add_argument";
+ argc++; *p++ = "--append";
+ argc++; *p++ = "--delete=del_argument";
+ argc++; *p++ = "--create=cre_argument";
+ argc++; *p++ = "--create";
+ argc++; *p++ = "files...";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 0);
+ assert(option_index == 0);
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 'g'); /* no changed */
+ assert(strcmp(long_options[option_index].name, "verbose") == 0);
+ assert(*long_options[option_index].flag == 1);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 0);
+ assert(option_index == 1);
+ assert(optind == 3);
+ assert(optarg == 0);
+ assert(optopt == 'g'); /* no changed */
+ assert(strcmp(long_options[option_index].name, "brief") == 0);
+ assert(*long_options[option_index].flag == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 'a');
+ assert(option_index == 2);
+ assert(optind == 5);
+ assert(optarg == argv[4]);
+ assert(optopt == 'g'); /* no changed */
+ assert(strcmp(long_options[option_index].name, "add") == 0);
+ assert(long_options[option_index].flag == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 'a');
+ assert(option_index == 2);
+ assert(optind == 6);
+ assert(optarg == argv[5]+6);
+ assert(optopt == 'g'); /* no changed */
+ assert(strcmp(long_options[option_index].name, "add") == 0);
+ assert(long_options[option_index].flag == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 0);
+ assert(option_index == 3);
+ assert(optind == 7);
+ assert(optarg == 0);
+ assert(optopt == 'g'); /* no changed */
+ assert(strcmp(long_options[option_index].name, "append") == 0);
+ assert(long_options[option_index].flag == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 0);
+ assert(option_index == 4);
+ assert(optind == 8);
+ assert(optarg == argv[7]+9);
+ assert(optopt == 'g'); /* no changed */
+ assert(strcmp(long_options[option_index].name, "delete") == 0);
+ assert(long_options[option_index].flag == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 0);
+ assert(option_index == 5);
+ assert(optind == 9);
+ assert(optarg == argv[8]+9);
+ assert(optopt == 'g'); /* no changed */
+ assert(strcmp(long_options[option_index].name, "create") == 0);
+ assert(long_options[option_index].flag == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == 0);
+ assert(option_index == 5);
+ assert(optind == 10);
+ assert(optarg == 0);
+ assert(optopt == 'g'); /* no changed */
+ assert(strcmp(long_options[option_index].name, "create") == 0);
+ assert(long_options[option_index].flag == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == -1);
+ assert(option_index == 5); /* no changed */
+ assert(optind == 10);
+ assert(optarg == 0);
+ assert(optopt == 'g'); /* no changed */
+ assert(strcmp(argv[optind], "files...") == 0);
+
+}
+
+void
+test3()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "--delete"; /* required argument has no argument */
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == '?');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 2); /* changed */
+ assert(optarg == 0);
+ assert(optopt == 0); /* changed */
+ assert(argv[optind] == 0);
+
+ /* */
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "--file"; /* not option */
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ assert(c == '?');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+ assert(argv[optind] == 0);
+}
+
+void
+test4()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "-a";
+ argc++; *p++ = "a1";
+ argc++; *p++ = "a2";
+ argc++; *p++ = "-b";
+ argc++; *p++ = "b";
+ argc++; *p++ = "-efg"; /* some options in a argument */
+ argc++; *p++ = "g";
+ argc++; *p++ = "-c";
+ argc++; *p++ = "c";
+ argc++; *p++ = "d";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "a1") == 0);
+ assert(strcmp(*p++, "a2") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(strcmp(*p++, "b") == 0);
+ assert(strcmp(*p++, "-efg") == 0);
+ assert(strcmp(*p++, "g") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'a');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "a1") == 0);
+ assert(strcmp(*p++, "a2") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(strcmp(*p++, "b") == 0);
+ assert(strcmp(*p++, "-efg") == 0);
+ assert(strcmp(*p++, "g") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'b');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 5);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(strcmp(*p++, "a1") == 0);
+ assert(strcmp(*p++, "a2") == 0);
+ assert(strcmp(*p++, "b") == 0);
+ assert(strcmp(*p++, "-efg") == 0);
+ assert(strcmp(*p++, "g") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'e');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 6);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(strcmp(*p++, "a1") == 0);
+ assert(strcmp(*p++, "a2") == 0);
+ assert(strcmp(*p++, "b") == 0);
+ assert(strcmp(*p++, "-efg") == 0);
+ assert(strcmp(*p++, "g") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'f');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 6);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(strcmp(*p++, "a1") == 0);
+ assert(strcmp(*p++, "a2") == 0);
+ assert(strcmp(*p++, "b") == 0);
+ assert(strcmp(*p++, "-efg") == 0);
+ assert(strcmp(*p++, "g") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'g');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 8);
+ assert(optarg == argv[7]);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(strcmp(*p++, "-efg") == 0);
+ assert(strcmp(*p++, "g") == 0);
+ assert(strcmp(*p++, "a1") == 0);
+ assert(strcmp(*p++, "a2") == 0);
+ assert(strcmp(*p++, "b") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'c');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 10);
+ assert(optarg == argv[9]);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(strcmp(*p++, "-efg") == 0);
+ assert(strcmp(*p++, "g") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "a1") == 0);
+ assert(strcmp(*p++, "a2") == 0);
+ assert(strcmp(*p++, "b") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 5); /* no changed */
+ assert(optind == 7);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+}
+
+void
+test5()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "-a";
+ argc++; *p++ = "-";
+ argc++; *p++ = "-b";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(*p == 0);
+
+ assert(c == 'a');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(*p == 0);
+
+ assert(c == 'b');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 4);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 5); /* no changed */
+ assert(optind == 3);
+ assert(optarg == 0);
+ assert(optopt == 0);
+}
+
+void
+test6()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "-a";
+ argc++; *p++ = "-";
+ argc++; *p++ = "-";
+ argc++; *p++ = "-b";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(*p == 0);
+
+ assert(c == 'a');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(*p == 0);
+
+ assert(c == 'b');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 5);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-b") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 5); /* no changed */
+ assert(optind == 3);
+ assert(optarg == 0);
+ assert(optopt == 0);
+}
+
+void
+test7()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "-a";
+ argc++; *p++ = "-";
+ argc++; *p++ = "-";
+ argc++; *p++ = "-c";
+ argc++; *p++ = "c";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(*p == 0);
+
+ assert(c == 'a');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(*p == 0);
+
+ assert(c == 'c');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 6);
+ assert(optarg == argv[5]);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 5); /* no changed */
+ assert(optind == 4);
+ assert(optarg == 0);
+ assert(optopt == 0);
+}
+
+void
+test8()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "-a";
+ argc++; *p++ = "-c";
+ argc++; *p++ = "c";
+ argc++; *p++ = "--";
+ argc++; *p++ = "-d";
+ argc++; *p++ = "d";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'a');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'c');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 4);
+ assert(optarg == argv[3]);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 5); /* no changed */
+ assert(optind == 5);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'd');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 7);
+ assert(optarg == argv[6]);
+ assert(optopt == 0);
+}
+
+void
+test9()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "-a";
+ argc++; *p++ = "-";
+ argc++; *p++ = "-";
+ argc++; *p++ = "-c";
+ argc++; *p++ = "c";
+ argc++; *p++ = "--";
+ argc++; *p++ = "-d";
+ argc++; *p++ = "d";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'a');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'c');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 6);
+ assert(optarg == argv[5]);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 5); /* no changed */
+ assert(optind == 5);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "c") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'd');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 9);
+ assert(optarg == argv[8]);
+ assert(optopt == 0);
+}
+
+void
+test10()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "-a";
+ argc++; *p++ = "-cc";
+ argc++; *p++ = "-d";
+ argc++; *p++ = "d";
+ argc++; *p++ = "-c"; /* no argument */
+ argc++; *p++ = "-d"; /* at last */
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-cc") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'a');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-cc") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'c');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 3);
+ assert(optarg == argv[2]+2);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-cc") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'd');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 4);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-cc") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'c');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 6);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-cc") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'd');
+ assert(option_index == 5); /* no changed */
+ assert(optind == 7);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-cc") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "-c") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 5); /* no changed */
+ assert(optind == 6);
+ assert(optarg == 0);
+ assert(optopt == 0);
+}
+
+void
+test11()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "--verbose";
+ argc++; *p++ = "--create=c";
+ argc++; *p++ = "--change";
+ argc++; *p++ = "d";
+ argc++; *p++ = "--create"; /* no argument */
+ argc++; *p++ = "--change"; /* at last */
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 0);
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 5);
+ assert(optind == 3);
+ assert(optarg == argv[2]+9);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 6);
+ assert(optind == 4);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 5);
+ assert(optind == 6);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 6);
+ assert(optind == 7);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 6);
+ assert(optind == 6);
+ assert(optarg == 0);
+ assert(optopt == 0);
+}
+
+void
+test12()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "--verbose";
+ argc++; *p++ = "--create=c";
+ argc++; *p++ = "files...";
+ argc++; *p++ = "--delete"; /* required argument */
+ argc++; *p++ = "d";
+ argc++; *p++ = "--create"; /* no argument */
+ argc++; *p++ = "--change"; /* at last */
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(strcmp(*p++, "--delete") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 0);
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(strcmp(*p++, "--delete") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 5);
+ assert(optind == 3);
+ assert(optarg == argv[2]+9);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(strcmp(*p++, "--delete") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 4);
+ assert(optind == 6);
+ assert(optarg == argv[5]);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "--delete") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 5);
+ assert(optind == 7);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "--delete") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 6);
+ assert(optind == 8);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "--delete") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--create") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 6);
+ assert(optind == 7);
+ assert(optarg == 0);
+ assert(optopt == 0);
+}
+
+void
+test13()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "--verbose";
+ argc++; *p++ = "--create=c";
+ argc++; *p++ = "files...";
+ argc++; *p++ = "--delete";
+ argc++; *p++ = "d";
+ argc++; *p++ = "--"; /* option terminator */
+ argc++; *p++ = "--change";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(strcmp(*p++, "--delete") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 0);
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(strcmp(*p++, "--delete") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 5);
+ assert(optind == 3);
+ assert(optarg == argv[2]+9);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(strcmp(*p++, "--delete") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == 0);
+ assert(option_index == 4);
+ assert(optind == 6);
+ assert(optarg == argv[5]);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "--verbose") == 0);
+ assert(strcmp(*p++, "--create=c") == 0);
+ assert(strcmp(*p++, "--delete") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "--") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(strcmp(*p++, "--change") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 4);
+ assert(optind == 6);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+}
+
+void
+test14()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "-o5";
+ argc++; *p++ = "files...";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "o[567]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-o5") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(*p == 0);
+
+ assert(c == 'o');
+ assert(option_index == 4); /* no changed */
+ assert(optind == 2);
+ assert(optarg == argv[1]+2);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-o5") == 0);
+ assert(strcmp(*p++, "files...") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 4); /* no changed */
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+}
+
+void
+test15()
+{
+ optind = 0;
+ argc = 0;
+ p = argv;
+
+ argc++; *p++ = "command_name";
+ argc++; *p++ = "-a";
+ argc++; *p++ = "-ccd";
+ argc++; *p++ = "-ce";
+ argc++; *p++ = "-d";
+ argc++; *p++ = "d";
+ argc++; *p++ = "-cdd";
+ argc++; *p++ = "-d";
+ *p = 0;
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-ccd") == 0);
+ assert(strcmp(*p++, "-ce") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-cdd") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'a');
+ assert(option_index == 4); /* no changed */
+ assert(optind == 2);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-ccd") == 0);
+ assert(strcmp(*p++, "-ce") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-cdd") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'c');
+ assert(option_index == 4); /* no changed */
+ assert(optind == 2);
+ assert(optarg == argv[2]+2);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-ccd") == 0);
+ assert(strcmp(*p++, "-ce") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-cdd") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'd');
+ assert(option_index == 4); /* no changed */
+ assert(optind == 3);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-ccd") == 0);
+ assert(strcmp(*p++, "-ce") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-cdd") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'c');
+ assert(option_index == 4); /* no changed */
+ assert(optind == 4);
+ assert(optarg == argv[3]+2);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-ccd") == 0);
+ assert(strcmp(*p++, "-ce") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-cdd") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'd');
+ assert(option_index == 4); /* no changed */
+ assert(optind == 5);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-ccd") == 0);
+ assert(strcmp(*p++, "-ce") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-cdd") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'c');
+ assert(option_index == 4); /* no changed */
+ assert(optind == 6);
+ assert(optarg == argv[6]+2);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-ccd") == 0);
+ assert(strcmp(*p++, "-ce") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-cdd") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'd');
+ assert(option_index == 4); /* no changed */
+ assert(optind == 7);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-ccd") == 0);
+ assert(strcmp(*p++, "-ce") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "-cdd") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(*p == 0);
+
+ assert(c == 'd');
+ assert(option_index == 4); /* no changed */
+ assert(optind == 8);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+
+ /*************************/
+ c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index);
+
+ p = argv;
+ assert(strcmp(*p++, "command_name") == 0);
+ assert(strcmp(*p++, "-a") == 0);
+ assert(strcmp(*p++, "-ccd") == 0);
+ assert(strcmp(*p++, "-ce") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "-cdd") == 0);
+ assert(strcmp(*p++, "-d") == 0);
+ assert(strcmp(*p++, "d") == 0);
+ assert(*p == 0);
+
+ assert(c == -1);
+ assert(option_index == 4); /* no changed */
+ assert(optind == 7);
+ assert(optarg == 0);
+ assert(optopt == 0);
+
+
+}
+
+int
+main()
+{
+ opterr = 0;
+ optopt = 0;
+
+ test1();
+ test2();
+ test3();
+ test4();
+ test5();
+ test6();
+ test7();
+ test8();
+ test9();
+ test10();
+ test11();
+ test12();
+ test13();
+#ifndef USE_GNU
+ test14();
+ test15();
+#endif
+
+ return 0;
+}
+#endif
--- /dev/null
+/*
+ This getopt_long() is compatible with GNU's, however, added original
+ extention (short 1 byte option).
+
+
+ Copyright (c) 2004 Koji Arai
+
+ Permission is hereby granted, free of charge, to any person
+ obtaining a copy of this software and associated documentation files
+ (the "Software"), to deal in the Software without restriction,
+ including without limitation the rights to use, copy, modify, merge,
+ publish, distribute, sublicense, and/or sell copies of the Software,
+ and to permit persons to whom the Software is furnished to do so,
+ subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef _GETOPT_H
+
+struct option {
+ const char *name;
+ int has_arg;
+
+ /* values of has_arg */
+#define no_argument 0
+#define required_argument 1
+#define optional_argument 2
+
+ int *flag;
+ int val;
+};
+
+extern char *optarg;
+extern int optind;
+
+int getopt_long(int argc, char **argv,
+ const char *shortopts,
+ struct option *longopts,
+ int *indexptr);
+
+#endif /* _GETOPT_H */
if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0; }
if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0; }
- //clear sizes since you only needed this info to build the alignSeqs seqPNode structs
-// sizes.clear();
-
+ string fileroot = outputDir + getRootName(getSimpleName(fastafile));
+ string newFastaFile = fileroot + "precluster" + getExtension(fastafile);
+ string newNamesFile = fileroot + "precluster.names";
+ ofstream outFasta;
+ ofstream outNames;
+
+ openOutputFile(newFastaFile, outFasta);
+ openOutputFile(newNamesFile, outNames);
+
//sort seqs by number of identical seqs
- sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
+ alignSeqs.sort(comparePriority);
int count = 0;
+ int i = 0;
//think about running through twice...
- for (int i = 0; i < numSeqs; i++) {
-
- //are you active
- // itActive = active.find(alignSeqs[i].seq.getName());
-
- if (alignSeqs[i].active) { //this sequence has not been merged yet
-
- //try to merge it with all smaller seqs
- for (int j = i+1; j < numSeqs; j++) {
+ list<seqPNode>::iterator itList;
+ list<seqPNode>::iterator itList2;
+ for (itList = alignSeqs.begin(); itList != alignSeqs.end(); itList++) {
+ //try to merge it with all smaller seqs
+ for (itList2 = alignSeqs.begin(); itList2 != alignSeqs.end(); itList2++) {
- if (m->control_pressed) { return 0; }
+ if (m->control_pressed) { outFasta.close(); outNames.close(); remove(newFastaFile.c_str()); remove(newNamesFile.c_str()); return 0; }
- if (alignSeqs[j].active) { //this sequence has not been merged yet
- //are you within "diff" bases
- int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
+ if ((*itList).seq.getName() != (*itList2).seq.getName()) { //you don't want to merge with yourself
+ //are you within "diff" bases
+ int mismatch = calcMisMatches((*itList).seq.getAligned(), (*itList2).seq.getAligned());
- if (mismatch <= diffs) {
- //merge
- alignSeqs[i].names += ',' + alignSeqs[j].names;
- alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
+ if (mismatch <= diffs) {
+ //merge
+ (*itList).names += ',' + (*itList2).names;
+ (*itList).numIdentical += (*itList2).numIdentical;
- alignSeqs[j].active = 0;
- alignSeqs[j].numIdentical = 0;
- count++;
- }
- }//end if j active
- }//end if i != j
+ alignSeqs.erase(itList2); itList2--;
+ count++;
+ }
+ }
+ }
+
+ //ouptut this sequence
+ printData(outFasta, outNames, (*itList));
+
+ //remove sequence
+ alignSeqs.erase(itList); itList--;
- //remove from active list
- alignSeqs[i].active = 0;
- }//end if active i
- if(i % 100 == 0) { cout << i << '\t' << numSeqs - count << '\t' << count << endl; }
+ i++;
+
+ if(i % 100 == 0) { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
}
+
+ if(i % 100 != 0) { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
- string fileroot = outputDir + getRootName(getSimpleName(fastafile));
-
- string newFastaFile = fileroot + "precluster" + getExtension(fastafile);
- string newNamesFile = fileroot + "precluster.names";
+ outFasta.close();
+ outNames.close();
- if (m->control_pressed) { return 0; }
+ if (m->control_pressed) { remove(newFastaFile.c_str()); remove(newNamesFile.c_str()); return 0; }
+
- m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
+ m->mothurOut("Total number of sequences before precluster was " + toString(numSeqs) + "."); m->mothurOutEndLine();
m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine();
- printData(newFastaFile, newNamesFile);
if (m->control_pressed) { remove(newFastaFile.c_str()); remove(newNamesFile.c_str()); return 0; }
else{
seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
alignSeqs.push_back(tempNode);
- if (seq.getAligned().length() > length) { length = alignSeqs[0].seq.getAligned().length(); }
+ if (seq.getAligned().length() > length) { length = seq.getAligned().length(); }
}
}else { //no names file, you are identical to yourself
seqPNode tempNode(1, seq, seq.getName());
alignSeqs.push_back(tempNode);
- if (seq.getAligned().length() > length) { length = alignSeqs[0].seq.getAligned().length(); }
+ if (seq.getAligned().length() > length) { length = seq.getAligned().length(); }
}
}
}
/**************************************************************************************************/
-void PreClusterCommand::printData(string newfasta, string newname){
+void PreClusterCommand::printData(ofstream& outFasta, ofstream& outNames, seqPNode thisSeq){
try {
- ofstream outFasta;
- ofstream outNames;
-
- openOutputFile(newfasta, outFasta);
- openOutputFile(newname, outNames);
-
-
- for (int i = 0; i < alignSeqs.size(); i++) {
- if (alignSeqs[i].numIdentical != 0) {
- alignSeqs[i].seq.printSequence(outFasta);
- outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
- }
- }
-
- outFasta.close();
- outNames.close();
-
+ thisSeq.seq.printSequence(outFasta);
+ outNames << thisSeq.seq.getName() << '\t' << thisSeq.names << endl;
}
catch(exception& e) {
m->errorOut(e, "PreClusterCommand", "printData");
exit(1);
}
}
-
/**************************************************************************************************/
void PreClusterCommand::readNameFile(){
int numIdentical;
Sequence seq;
string names;
- bool active;
seqPNode() {}
- seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm), active(1) {}
+ seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm) {}
~seqPNode() {}
};
/************************************************************/
int diffs, length;
bool abort;
string fastafile, namefile, outputDir;
- vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
+ list<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
map<string, string> names; //represents the names file first column maps to second column
map<string, int> sizes; //this map a seq name to the number of identical seqs in the names file
map<string, int>::iterator itSize;
void readNameFile();
//int readNamesFASTA();
int calcMisMatches(string, string);
- void printData(string, string); //fasta filename, names file name
+ void printData(ofstream&, ofstream&, seqPNode); //fasta filename, names file name
};
/************************************************************/
--- /dev/null
+/*
+ A C-program for MT19937, with initialization improved 2002/1/26.
+ Coded by Takuji Nishimura and Makoto Matsumoto.
+
+ Before using, initialize the state by using init_genrand(seed)
+ or init_by_array(init_key, key_length).
+
+ Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ 3. The names of its contributors may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+ Any feedback is very welcome.
+ http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+ email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+*/
+
+#include <stdio.h>
+
+#include "prng.h"
+
+/* Period parameters */
+#define N 624
+#define M 397
+#define MATRIX_A 0x9908b0dfUL /* constant vector a */
+#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
+#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
+
+#define NJ_RAND_MAX 0x7fffffffUL
+
+
+static unsigned long mt[N]; /* the array for the state vector */
+static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
+
+/* initializes mt[N] with a seed */
+void init_genrand(unsigned long s)
+{
+ mt[0]= s & 0xffffffffUL;
+ for (mti=1; mti<N; mti++) {
+ mt[mti] =
+ (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
+ /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
+ /* In the previous versions, MSBs of the seed affect */
+ /* only MSBs of the array mt[]. */
+ /* 2002/01/09 modified by Makoto Matsumoto */
+ mt[mti] &= 0xffffffffUL;
+ /* for >32 bit machines */
+ }
+}
+
+/* initialize by an array with array-length */
+/* init_key is the array for initializing keys */
+/* key_length is its length */
+/* slight change for C++, 2004/2/26 */
+void init_by_array(unsigned long init_key[], int key_length)
+{
+ int i, j, k;
+ init_genrand(19650218UL);
+ i=1; j=0;
+ k = (N>key_length ? N : key_length);
+ for (; k; k--) {
+ mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
+ + init_key[j] + j; /* non linear */
+ mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
+ i++; j++;
+ if (i>=N) { mt[0] = mt[N-1]; i=1; }
+ if (j>=key_length) j=0;
+ }
+ for (k=N-1; k; k--) {
+ mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
+ - i; /* non linear */
+ mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
+ i++;
+ if (i>=N) { mt[0] = mt[N-1]; i=1; }
+ }
+
+ mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
+}
+
+/* generates a random number on [0,0xffffffff]-interval */
+unsigned long genrand_int32(void)
+{
+ unsigned long y;
+ static unsigned long mag01[2]={0x0UL, MATRIX_A};
+ /* mag01[x] = x * MATRIX_A for x=0,1 */
+
+ if (mti >= N) { /* generate N words at one time */
+ int kk;
+
+ if (mti == N+1) /* if init_genrand() has not been called, */
+ init_genrand(5489UL); /* a default initial seed is used */
+
+ for (kk=0;kk<N-M;kk++) {
+ y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
+ mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
+ }
+ for (;kk<N-1;kk++) {
+ y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
+ mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
+ }
+ y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
+ mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
+
+ mti = 0;
+ }
+
+ y = mt[mti++];
+
+ /* Tempering */
+ y ^= (y >> 11);
+ y ^= (y << 7) & 0x9d2c5680UL;
+ y ^= (y << 15) & 0xefc60000UL;
+ y ^= (y >> 18);
+
+ return y;
+}
+
+/* generates a random number on [0,0x7fffffff]-interval */
+long int genrand_int31(void)
+{
+ return (long)(genrand_int32()>>1);
+}
+
+/* These real versions are due to Isaku Wada, 2002/01/09 added */
+
+/* generates a random number on [0,1]-real-interval */
+double genrand_real1(void)
+{
+ return genrand_int32()*(1.0/4294967295.0);
+ /* divided by 2^32-1 */
+}
+
+/* generates a random number on [0,1)-real-interval */
+double genrand_real2(void)
+{
+ return genrand_int32()*(1.0/4294967296.0);
+ /* divided by 2^32 */
+}
+
+/* generates a random number on (0,1)-real-interval */
+double genrand_real3(void)
+{
+ return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0);
+ /* divided by 2^32 */
+}
+
+/* generates a random number on [0,1) with 53-bit resolution*/
+double genrand_res53(void)
+{
+ unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6;
+ return(a*67108864.0+b)*(1.0/9007199254740992.0);
+}
+
+
+
+
+/*
+ * NJ_genrand_int31_top() - Returns an int in the range 0..top
+ *
+ * This function attempts to remove bias in selecting random
+ * integers in a range.
+ *
+ */
+long int
+NJ_genrand_int31_top(long int top) {
+
+ long int overflow;
+ long int r;
+ long int retval;
+
+ if(top <= 0) {
+ return(0);
+ } else {
+ overflow = (NJ_RAND_MAX / top) * top;
+ }
+
+ while(1) {
+ r = genrand_int31();
+ if(r < overflow) {
+ break;
+ }
+ }
+
+ retval = r % top;
+
+ return(retval);
+}
+
+
+
+
+
+
+
+
--- /dev/null
+/*
+ * prng.h
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * Some function prototypes for the Mersenne Twister PRNG
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+
+#ifndef _INC_PRNG_H_
+#define _INC_PRNG_H_ 1
+
+#define NJ_RAND_MAX 0x7fffffffUL
+
+
+/* some function prototypes */
+void
+init_genrand(unsigned long s);
+
+void
+init_by_array(unsigned long init_key[],
+ int key_length);
+
+unsigned long
+genrand_int32(void);
+
+long int
+genrand_int31(void);
+
+double
+genrand_real1(void);
+
+double
+genrand_real2(void);
+
+double
+genrand_real3(void);
+
+double
+genrand_res53(void);
+
+long int
+NJ_genrand_int31_top(long int top);
+
+#endif /* _INC_PRNG_H_ */
+
+
+
+