From: westcott Date: Thu, 26 Aug 2010 15:18:13 +0000 (+0000) Subject: added clearcut source to mothur, fixed mislabeled sharedcalcs X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;ds=sidebyside;h=1b4aa03040e61f4c2d47ce32cb12726dcdadb917;hp=87537e4ee46cae9b7041ae0ed4dbbc229936d37c;p=mothur.git added clearcut source to mothur, fixed mislabeled sharedcalcs --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index ebe8a90..bdcb7e6 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -30,6 +30,7 @@ A7118EE611CFCAC000CFDE03 /* degapseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = degapseqscommand.cpp; sourceTree = ""; }; A7118F0F11CFD5DC00CFDE03 /* getrelabundcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getrelabundcommand.h; sourceTree = ""; }; A7118F1011CFD5DC00CFDE03 /* getrelabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getrelabundcommand.cpp; sourceTree = ""; }; + A71BA75F1225694E00C57506 /* clearcut.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clearcut.cpp; sourceTree = ""; }; A71D924211AEB42400D00CBC /* clustersplitcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clustersplitcommand.cpp; sourceTree = ""; }; A71D924311AEB42400D00CBC /* clustersplitcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clustersplitcommand.h; sourceTree = ""; }; A71D924411AEB42400D00CBC /* splitabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = splitabundcommand.cpp; sourceTree = ""; }; @@ -466,21 +467,20 @@ A7DA217A113FECD400BF472F /* weighted.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = weighted.h; sourceTree = ""; }; A7DA217B113FECD400BF472F /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = ""; }; A7DA217C113FECD400BF472F /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = ""; }; - A7DF0AD3121EBB14004A03EA /* clearcut.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = clearcut.c; sourceTree = ""; }; A7DF0AD4121EBB14004A03EA /* clearcut.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clearcut.h; sourceTree = ""; }; - A7DF0AD5121EBB14004A03EA /* cmdargs.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = cmdargs.c; sourceTree = ""; }; + A7DF0AD5121EBB14004A03EA /* cmdargs.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; path = cmdargs.cpp; sourceTree = ""; }; A7DF0AD6121EBB14004A03EA /* cmdargs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cmdargs.h; sourceTree = ""; }; A7DF0AD7121EBB14004A03EA /* common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = common.h; sourceTree = ""; }; A7DF0AD8121EBB14004A03EA /* dayhoff.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dayhoff.h; sourceTree = ""; }; - A7DF0AD9121EBB14004A03EA /* distclearcut.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = distclearcut.c; sourceTree = ""; }; + A7DF0AD9121EBB14004A03EA /* distclearcut.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distclearcut.cpp; sourceTree = ""; }; A7DF0ADA121EBB14004A03EA /* distclearcut.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = distclearcut.h; sourceTree = ""; }; - A7DF0ADB121EBB14004A03EA /* dmat.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = dmat.c; sourceTree = ""; }; + A7DF0ADB121EBB14004A03EA /* dmat.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = dmat.cpp; sourceTree = ""; }; A7DF0ADC121EBB14004A03EA /* dmat.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dmat.h; sourceTree = ""; }; - A7DF0ADD121EBB14004A03EA /* fasta.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fasta.c; sourceTree = ""; }; + A7DF0ADD121EBB14004A03EA /* fasta.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fasta.cpp; sourceTree = ""; }; A7DF0ADE121EBB14004A03EA /* fasta.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fasta.h; sourceTree = ""; }; - A7DF0ADF121EBB14004A03EA /* getopt_long.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = getopt_long.c; sourceTree = ""; }; + A7DF0ADF121EBB14004A03EA /* getopt_long.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getopt_long.cpp; sourceTree = ""; }; A7DF0AE0121EBB14004A03EA /* getopt_long.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getopt_long.h; sourceTree = ""; }; - A7DF0AE1121EBB14004A03EA /* prng.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = prng.c; sourceTree = ""; }; + A7DF0AE1121EBB14004A03EA /* prng.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = prng.cpp; sourceTree = ""; }; A7DF0AE2121EBB14004A03EA /* prng.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = prng.h; sourceTree = ""; }; A7E8338B115BBDAA00739EC4 /* parsesffcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parsesffcommand.cpp; sourceTree = ""; }; A7E8338C115BBDAA00739EC4 /* parsesffcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsesffcommand.h; sourceTree = ""; }; @@ -1008,21 +1008,21 @@ isa = PBXGroup; children = ( A7DF0AD4121EBB14004A03EA /* clearcut.h */, - A7DF0AD3121EBB14004A03EA /* clearcut.c */, - A7DF0AD6121EBB14004A03EA /* cmdargs.h */, + A71BA75F1225694E00C57506 /* clearcut.cpp */, A7DF0AD7121EBB14004A03EA /* common.h */, A7DF0AD8121EBB14004A03EA /* dayhoff.h */, - A7DF0AD5121EBB14004A03EA /* cmdargs.c */, - A7DF0AD9121EBB14004A03EA /* distclearcut.c */, + A7DF0AD6121EBB14004A03EA /* cmdargs.h */, + A7DF0AD5121EBB14004A03EA /* cmdargs.cpp */, + A7DF0AD9121EBB14004A03EA /* distclearcut.cpp */, A7DF0ADA121EBB14004A03EA /* distclearcut.h */, - A7DF0ADB121EBB14004A03EA /* dmat.c */, + A7DF0ADB121EBB14004A03EA /* dmat.cpp */, A7DF0ADC121EBB14004A03EA /* dmat.h */, - A7DF0ADD121EBB14004A03EA /* fasta.c */, + A7DF0ADD121EBB14004A03EA /* fasta.cpp */, A7DF0ADE121EBB14004A03EA /* fasta.h */, - A7DF0ADF121EBB14004A03EA /* getopt_long.c */, + A7DF0ADF121EBB14004A03EA /* getopt_long.cpp */, A7DF0AE0121EBB14004A03EA /* getopt_long.h */, A7DF0AE2121EBB14004A03EA /* prng.h */, - A7DF0AE1121EBB14004A03EA /* prng.c */, + A7DF0AE1121EBB14004A03EA /* prng.cpp */, ); name = clearcutsource; sourceTree = ""; diff --git a/clearcut.c b/clearcut.c deleted file mode 100644 index 3a13ab6..0000000 --- a/clearcut.c +++ /dev/null @@ -1,2158 +0,0 @@ -/* - * clearcut.c - * - * $Id$ - * - ***************************************************************************** - * - * Copyright (c) 2004, Luke Sheneman - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * + Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * + Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * + The names of its contributors may not be used to endorse or promote - * products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - ***************************************************************************** - * - * An implementation of the Relaxed Neighbor-Joining algorithm - * of Evans, J., Sheneman, L., and Foster, J. - * - * - * AUTHOR: - * - * Luke Sheneman - * sheneman@cs.uidaho.edu - * - */ - - - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "dist.h" -#include "dmat.h" -#include "fasta.h" -#include "cmdargs.h" -#include "common.h" -#include "clearcut.h" -#include "prng.h" - - - -/* - * main() - - * - * The entry point to the program. - * - */ -int -main(int argc, - char *argv[]) { - - DMAT *dmat; /* The working distance matrix */ - DMAT *dmat_backup = NULL;/* A backup distance matrix */ - NJ_TREE *tree; /* The phylogenetic tree */ - NJ_ARGS *nj_args; /* Structure for holding command-line arguments */ - long int i; - - /* some variables for tracking time */ - struct timeval tv; - unsigned long long startUs, endUs; - - - /* check and parse supplied command-line arguments */ - nj_args = NJ_handle_args(argc, argv); - if(!nj_args) { - fprintf(stderr, "Clearcut: Error processing command-line arguments.\n"); - exit(-1); - } - - /* for verbose reporting, print the random number seed to stdout */ - if(nj_args->verbose_flag) { - printf("PRNG SEED: %d\n", nj_args->seed); - } - - /* Initialize Mersenne Twister PRNG */ - init_genrand(nj_args->seed); - - - switch(nj_args->input_mode) { - - /* If the input type is a distance matrix */ - case NJ_INPUT_MODE_DISTANCE: - - /* parse the distance matrix */ - dmat = NJ_parse_distance_matrix(nj_args); - if(!dmat) { - exit(-1); - } - - break; - - /* If the input type is a multiple sequence alignment */ - case NJ_INPUT_MODE_ALIGNED_SEQUENCES: - - /* build a distance matrix from a multiple sequence alignment */ - dmat = NJ_build_distance_matrix(nj_args); - if(!dmat) { - fprintf(stderr, "Clearcut: Failed to build distance matrix from alignment.\n"); - exit(-1); - } - - break; - - default: - - fprintf(stderr, "Clearcut: Could not determine how to process input\n"); - exit(-1); - } - - /* - * Output the computed distance matrix, - * if the user specified one. - */ - if(nj_args->matrixout) { - NJ_output_matrix(nj_args, dmat); - } - - /* - * If we are going to generate multiple trees from - * the same distance matrix, we need to make a backup - * of the original distance matrix. - */ - if(nj_args->ntrees > 1) { - dmat_backup = NJ_dup_dmat(dmat); - } - - /* process n trees */ - for(i=0;intrees;i++) { - - /* - * If the user has specified matrix shuffling, we need - * to randomize the distance matrix - */ - if(nj_args->shuffle) { - NJ_shuffle_distance_matrix(dmat); - } - - /* RECORD THE PRECISE TIME OF THE START OF THE NEIGHBOR-JOINING */ - gettimeofday(&tv, NULL); - startUs = ((unsigned long long) tv.tv_sec * 1000000ULL) - + ((unsigned long long) tv.tv_usec); - - - /* - * Invoke either the Relaxed Neighbor-Joining algorithm (default) - * or the "traditional" Neighbor-Joining algorithm - */ - if(nj_args->neighbor) { - tree = NJ_neighbor_joining(nj_args, dmat); - } else { - tree = NJ_relaxed_nj(nj_args, dmat); - } - - if(!tree) { - fprintf(stderr, "Clearcut: Failed to construct tree.\n"); - exit(0); - } - - /* RECORD THE PRECISE TIME OF THE END OF THE NEIGHBOR-JOINING */ - gettimeofday(&tv, NULL); - endUs = ((unsigned long long) tv.tv_sec * 1000000ULL) - + ((unsigned long long) tv.tv_usec); - - /* print the time taken to perform the neighbor join */ - if(nj_args->verbose_flag) { - if(nj_args->neighbor) { - fprintf(stderr, "NJ tree built in %llu.%06llu secs\n", - (endUs - startUs) / 1000000ULL, - (endUs - startUs) % 1000000ULL); - } else { - fprintf(stderr, "RNJ tree built in %llu.%06llu secs\n", - (endUs - startUs) / 1000000ULL, - (endUs - startUs) % 1000000ULL); - } - } - - /* Output the neighbor joining tree here */ - NJ_output_tree(nj_args, tree, dmat, i); - - NJ_free_tree(tree); /* Free the tree */ - NJ_free_dmat(dmat); /* Free the working distance matrix */ - - /* - * If we need to do another iteration, lets re-initialize - * our working distance matrix. - */ - if(nj_args->ntrees > 1 && i<(nj_args->ntrees-1) ) { - dmat = NJ_dup_dmat(dmat_backup); - } - } - - /* Free the backup distance matrix */ - if(nj_args->ntrees > 1) { - NJ_free_dmat(dmat_backup); - } - - /* If verbosity, describe where the tree output is */ - if(nj_args->verbose_flag) { - if(nj_args->neighbor) { - printf("NJ tree(s) in %s\n", nj_args->outfilename); - } else { - printf("Relaxed NJ tree(s) in %s\n", nj_args->outfilename); - } - } - - exit(0); -} - - - - - -/* - * NJ_find_hmin() - Find minimum transformed values along horizontal - * - * - * INPUTS: - * ------- - * dmat -- The distance matrix - * a -- The index of the specific taxon in the distance matrix - * - * RETURNS: - * -------- - * -- The value of the selected minimum - * min -- Used to transport the index of the minima out - * of the function (by reference) - * hmincount -- Return the number of minima along the horizontal - * (by reference) - * - * - * DESCRIPTION: - * ------------ - * - * A fast, inline function to find the smallest transformed value - * along the "horizontal" portion of an entry in a distance matrix. - * - * Distance matrices are stored internally as continguously-allocated - * upper-diagonal structures. With the exception of the taxa at - * row 0 of this upper-diagonal matrix, all taxa have both a horizontal - * and vertical component in the distance matrix. This function - * scans the horizonal portion of the entry in the distance matrix - * for the specified taxon and finds the minimum transformed value - * along that horizontal component. - * - * Since multiple minima can exist along the horizontal portion - * of the entry, I consider all minima and break ties - * stochastically to help avoid systematic bias. - * - * Just searching along the horizontal portion of a row is very fast - * since the data is stored linearly and contiguously in memory and - * cache locality is exploited in the distance matrix representation. - * - * Look at nj.h for more information on how the distance matrix - * is architected. - * - */ -static inline -float -NJ_find_hmin(DMAT *dmat, - long int a, - long int *min, - long int *hmincount) { - - long int i; /* index variable for looping */ - int size; /* current size of distance matrix */ - int mindex = 0; /* holds the current index to the chosen minimum */ - float curval; /* used to hold current transformed values */ - float hmin; /* the value of the transformed minimum */ - - float *ptr, *r2, *val; /* pointers used to reduce dereferencing in inner loop */ - - /* values used for stochastic selection among multiple minima */ - float p, x; - long int smallcnt; - - /* initialize the min to something large */ - hmin = (float)HUGE_VAL; - - /* setup some pointers to limit dereferencing later */ - r2 = dmat->r2; - val = dmat->val; - size = dmat->size; - - /* initialize values associated with minima tie breaking */ - p = 1.0; - smallcnt = 0; - - - ptr = &(val[NJ_MAP(a, a+1, size)]); /* at the start of the horiz. part */ - for(i=a+1;i -- The value of the selected minimum - * min -- Used to transport the index of the minima out - * of the function (by reference) - * vmincount -- The number of minima along the vertical - * return by reference. - * - * DESCRIPTION: - * ------------ - * - * A fast, inline function to find the smallest transformed value - * along the "vertical" portion of an entry in a distance matrix. - * - * Distance matrices are stored internally as continguously-allocated - * upper-diagonal matrices. With the exception of the taxa at - * row 0 of this upper-diagonal matrix, all taxa have both a horizontal - * and vertical component in the distance matrix. This function - * scans the vertical portion of the entry in the distance matrix - * for the specified taxon and finds the minimum transformed value - * along that vertical component. - * - * Since multiple minima can exist along the vertical portion - * of the entry, I consider all minima and break ties - * stochastically to help avoid systematic bias. - * - * Due to cache locality reasons, searching along the vertical - * component is going to be considerably slower than searching - * along the horizontal. - * - * Look at nj.h for more information on how the distance matrix - * is architected. - * - */ -static inline -float -NJ_find_vmin(DMAT *dmat, - long int a, - long int *min, - long int *vmincount) { - - long int i; /* index variable used for looping */ - long int size; /* track the size of the matrix */ - long int mindex = 0;/* track the index to the minimum */ - float curval; /* track value of current transformed distance */ - float vmin; /* the index to the smallest "vertical" minimum */ - - /* pointers which are used to reduce pointer dereferencing in inner loop */ - float *ptr, *r2, *val; - - /* values used in stochastically breaking ties */ - float p, x; - long int smallcnt; - - /* initialize the vertical min to something really big */ - vmin = (float)HUGE_VAL; - - /* save off some values to limit dereferencing later */ - r2 = dmat->r2; - val = dmat->val; - size = dmat->size; - - p = 1.0; - smallcnt = 0; - - /* start on the first row and work down */ - ptr = &(val[NJ_MAP(0, a, size)]); - for(i=0;isize; - val = dmat->val; - r = dmat->r+a+1; - - /* - * Loop through the rows and decrement the stored r values - * by the distances stored in the rows and columns of the distance - * matrix which are being removed post-join. - * - * We do the rows altogether in order to benefit from cache locality. - */ - ptrx = &(val[NJ_MAP(a, a+1, size)]); - ptry = &(val[NJ_MAP(b, b+1, size)]); - - for(i=a+1;ib) { - *r -= *(ptry++); - } - - r++; - } - - /* Similar to the above loop, we now do the columns */ - ptrx = &(val[NJ_MAP(0, a, size)]); - ptry = &(val[NJ_MAP(0, b, size)]); - r = dmat->r; - for(i=0;isize-1) { - /* if we can't do a row here, lets do a column */ - if(a==0) { - if(b==1) { - target = 2; - } else { - target = 1; - } - } else { - target = 0; - } - } else { - target = b+1; - } - - - /* distance between a and the root of clade (a,b) */ - a2clade = - ( (dmat->val[NJ_MAP(a, b, dmat->size)]) + - (dmat->r2[a] - dmat->r2[b]) ) / 2.0; - - /* distance between b and the root of clade (a,b) */ - b2clade = - ( (dmat->val[NJ_MAP(a, b, dmat->size)]) + - (dmat->r2[b] - dmat->r2[a]) ) / 2.0; - - /* distance between the clade (a,b) and the target taxon */ - if(bval[NJ_MAP(a, target, dmat->size)] - a2clade) + - (dmat->val[NJ_MAP(b, target, dmat->size)] - b2clade) ) / 2.0; - - /* - * Check to see that distance from clade root to target + distance from - * b to clade root are equal to the distance from b to the target - */ - if(NJ_FLT_EQ(dmat->val[NJ_MAP(b, target, dmat->size)], - (clade_dist + b2clade))) { - return(1); /* join is legitimate */ - } else { - return(0); /* join is illigitimate */ - } - - } else { - - /* compute the distance from the clade root to the target */ - clade_dist = - ( (dmat->val[NJ_MAP(target, a, dmat->size)] - a2clade) + - (dmat->val[NJ_MAP(target, b, dmat->size)] - b2clade) ) / 2.0; - - /* - * Check to see that distance from clade root to target + distance from - * b to clade root are equal to the distance from b to the target - */ - if(NJ_FLT_EQ(dmat->val[NJ_MAP(target, b, dmat->size)], - (clade_dist + b2clade))) { - return(1); /* join is legitimate */ - } else { - return(0); /* join is illegitimate */ - } - } -} - - - - - - - -/* - * NJ_check() - Check to see if two taxa can be joined - - * - * INPUTS: - * ------- - * nj_args -- Pointer to the data structure holding command-line args - * dmat -- distance matrix - * a -- index into dmat for one of the rows to be joined - * b -- index into dmat for another row to be joined - * min -- the minimum value found - * additivity -- a flag (0 = not additive mode, 1 = additive mode) - * - * OUTPUTS: - * -------- - * int 1 if join is okay - * 0 if join is not okay - * - * DESCRIPTION: - * ------------ - * - * This function ultimately takes two rows and makes sure that the - * intersection of those two rows, which has a transformed distance of - * "min", is actually the smallest (or equal to the smallest) - * transformed distance for both rows (a, b). If so, it returns - * 1, else it returns 0. - * - * Basically, we want to join two rows only if the minimum - * transformed distance on either row is at the intersection of - * those two rows. - * - */ -static inline -int -NJ_check(NJ_ARGS *nj_args, - DMAT *dmat, - long int a, - long int b, - float min, - int additivity) { - - - long int i, size; - float *ptr, *val, *r2; - - - /* some aliases for speed and readability reasons */ - val = dmat->val; - r2 = dmat->r2; - size = dmat->size; - - - /* now determine if joining a, b will result in broken distances */ - if(additivity) { - if(!NJ_check_additivity(dmat, a, b)) { - return(0); - } - } - - /* scan the horizontal of row b, punt if anything < min */ - ptr = &(val[NJ_MAP(b, b+1, size)]); - for(i=b+1;inorandom) { /* if we are doing random joins, we checked this */ - ptr = val + a; - for(i=0;i reduce pointer dereferencing */ - float a2clade; /* distance from a to the new node that joins a and b */ - float b2clade; /* distance from b to the new node that joins a and b */ - float cval; /* stores distance information during loop */ - float *vptr; /* pointer to elements in first row of dist matrix */ - float *ptra; /* pointer to elements in row a of distance matrix */ - float *ptrb; /* pointer to elements in row b of distance matrix */ - - float *val, *r, *r2; /* simply used to limit pointer dereferencing */ - - - /* We must assume that a < b */ - if(a >= b) { - fprintf(stderr, "Clearcut: (aval; - r = dmat->r; - r2 = dmat->r2; - size = dmat->size; - - /* compute the distance from the clade components (a, b) to the new node */ - a2clade = - ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[a] - dmat->r2[b]) ) / 2.0; - b2clade = - ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[b] - dmat->r2[a]) ) / 2.0; - - - r[a] = 0.0; /* we are removing row a, so clear dist. in r */ - - /* - * Fill the horizontal part of the "a" row and finish computing r and r2 - * we handle the horizontal component first to maximize cache locality - */ - ptra = &(val[NJ_MAP(a, a+1, size)]); /* start ptra at the horiz. of a */ - ptrb = &(val[NJ_MAP(a+1, b, size)]); /* start ptrb at comparable place */ - for(i=a+1;ir = r+1; - - - /* - * Collapse r2 here by copying contents of r2[0] into r2[b] and - * incrementing pointer to the beginning of r2 by one row - */ - r2[b] = r2[0]; - dmat->r2 = r2+1; - - /* increment dmat pointer to next row */ - dmat->val += size; - - /* decrement the total size of the distance matrix by one row */ - dmat->size--; - - return; -} - - - - - - - - - -/* - * NJ_neighbor_joining() - Perform a traditional Neighbor-Joining - * - * - * INPUTS: - * ------- - * nj_args -- A pointer to a structure containing the command-line arguments - * dmat -- A pointer to the distance matrix - * - * RETURNS: - * -------- - * NJ_TREE * -- A pointer to the Neighbor-Joining tree. - * - * DESCRIPTION: - * ------------ - * - * This function performs a traditional Neighbor-Joining operation in which - * the distance matrix is exhaustively searched for the global minimum - * transformed distance. The two nodes which intersect at the global - * minimum transformed distance are then joined and the distance - * matrix is collapsed. This process continues until there are only - * two nodes left, at which point those nodes are joined. - * - */ -NJ_TREE * -NJ_neighbor_joining(NJ_ARGS *nj_args, - DMAT *dmat) { - - - NJ_TREE *tree = NULL; - NJ_VERTEX *vertex = NULL; - - long int a, b; - float min; - - - /* initialize the r and r2 vectors */ - NJ_init_r(dmat); - - /* allocate and initialize our vertex vector used for tree construction */ - vertex = NJ_init_vertex(dmat); - if(!vertex) { - fprintf(stderr, "Clearcut: Could not initialize vertex in NJ_neighbor_joining()\n"); - return(NULL); - } - - /* we iterate until the working distance matrix has only 2 entries */ - while(vertex->nactive > 2) { - - /* - * Find the global minimum transformed distance from the distance matrix - */ - min = NJ_min_transform(dmat, &a, &b); - - /* - * Build the tree by removing nodes a and b from the vertex array - * and inserting a new internal node which joins a and b. Collapse - * the vertex array similarly to how the distance matrix and r and r2 - * are compacted. - */ - NJ_decompose(dmat, vertex, a, b, 0); - - /* decrement the r and r2 vectors by the distances corresponding to a, b */ - NJ_compute_r(dmat, a, b); - - /* compact the distance matrix and the r and r2 vectors */ - NJ_collapse(dmat, vertex, a, b); - } - - /* Properly join the last two nodes on the vertex list */ - tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST); - - /* return the computed tree to the calling function */ - return(tree); -} - - - - - - - - -/* - * NJ_relaxed_nj() - Construct a tree using the Relaxed Neighbor-Joining - * - * INPUTS: - * ------- - * nj_args -- A pointer to a data structure containing the command-line args - * dmat -- A pointer to the distance matrix - * - * RETURNS: - * -------- - * - * NJ_TREE * -- A pointer to a Relaxed Neighbor-Joining tree - * - * DESCRIPTION: - * ------------ - * - * This function implements the Relaxed Neighbor-Joining algorithm of - * Evans, J., Sheneman, L., and Foster, J. - * - * Relaxed Neighbor-Joining works by choosing a local minimum transformed - * distance when determining when to join two nodes. (Traditional - * Neighbor-Joining chooses a global minimum transformed distance). - * - * The algorithm shares the property with traditional NJ that if the - * input distances are additive (self-consistent), then the algorithm - * will manage to construct the true tree consistent with the additive - * distances. Additivity state is tracked and every proposed join is checked - * to make sure it maintains additivity constraints. If no - * additivity-preserving join is possible in a single pass, then the distance - * matrix is non-additive, and additivity checking is abandoned. - * - * The algorithm will either attempt joins randomly, or it will perform joins - * in a particular order. The default behavior is to perform joins randomly, - * but this can be switched off with a command-line switch. - * - * For randomized joins, all attempts are made to alleviate systematic bias - * for the choice of rows to joins. All tie breaking is done in a way which - * is virtually free of bias. - * - * To perform randomized joins, a random permutation is constructed which - * specifies the order in which to attempt joins. I iterate through the - * random permutation, and for each row in the random permutation, I find - * the minimum transformed distance for that row. If there are multiple - * minima, I break ties evenly. For the row which intersects our - * randomly chosen row at the chosen minimum, if we are are still in - * additivity mode, I check to see if joining the two rows will break - * our additivity constraints. If not, I check to see if there exists - * a transformed distance which is smaller than the minimum found on the - * original row. If there is, then we proceed through the random permutation - * trying additional rows in the random order specified in the permutation. - * If there is no smaller minimum transformed distance on either of the - * two rows, then we join them, collapse the distance matrix, and compute - * a new random permutation. - * - * If the entire random permutation is traversed and no joins are possible - * due to additivity constraints, then the distance matrix is not - * additive, and additivity constraint-checking is disabled. - * - */ -NJ_TREE * -NJ_relaxed_nj(NJ_ARGS *nj_args, - DMAT *dmat) { - - - NJ_TREE *tree; - NJ_VERTEX *vertex; - long int a, b, t, bh, bv, i; - float hmin, vmin, hvmin; - float p, q, x; - int join_flag; - int additivity_mode; - long int hmincount, vmincount; - long int *permutation = NULL; - - - - /* initialize the r and r2 vectors */ - NJ_init_r(dmat); - - additivity_mode = 1; - - /* allocate the permutation vector, if we are in randomize mode */ - if(!nj_args->norandom) { - permutation = (long int *)calloc(dmat->size, sizeof(long int)); - if(!permutation) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_relaxed_nj()\n"); - return(NULL); - } - } - - /* allocate and initialize our vertex vector used for tree construction */ - vertex = NJ_init_vertex(dmat); - - /* loop until there are only 2 nodes left to join */ - while(vertex->nactive > 2) { - - switch(nj_args->norandom) { - - /* RANDOMIZED JOINS */ - case 0: - - join_flag = 0; - - NJ_permute(permutation, dmat->size-1); - for(i=0;isize-1 && (vertex->nactive>2) ;i++) { - - a = permutation[i]; - - /* find min trans dist along horiz. of row a */ - hmin = NJ_find_hmin(dmat, a, &bh, &hmincount); - if(a) { - /* find min trans dist along vert. of row a */ - vmin = NJ_find_vmin(dmat, a, &bv, &vmincount); - } else { - vmin = hmin; - bv = bh; - vmincount = 0; - } - - if(NJ_FLT_EQ(hmin, vmin)) { - - /* - * The minima along the vertical and horizontal are - * the same. Compute the proportion of minima along - * the horizonal (p) and the proportion of minima - * along the vertical (q). - * - * If the same minima exist along the horizonal and - * vertical, we break the tie in a way which is - * non-biased. That is, we break the tie based on the - * proportion of horiz. minima versus vertical minima. - * - */ - p = (float)hmincount / ((float)hmincount + (float)vmincount); - q = 1.0 - p; - x = genrand_real2(); - - if(x < p) { - hvmin = hmin; - b = bh; - } else { - hvmin = vmin; - b = bv; - } - } else if(NJ_FLT_LT(hmin, vmin) ) { - hvmin = hmin; - b = bh; - } else { - hvmin = vmin; - b = bv; - } - - if(NJ_check(nj_args, dmat, a, b, hvmin, additivity_mode)) { - - /* swap a and b, if necessary, to make sure a < b */ - if(b < a) { - t = a; - a = b; - b = t; - } - - join_flag = 1; - - /* join taxa from rows a and b */ - NJ_decompose(dmat, vertex, a, b, 0); - - /* collapse matrix */ - NJ_compute_r(dmat, a, b); - NJ_collapse(dmat, vertex, a, b); - - NJ_permute(permutation, dmat->size-1); - } - } - - /* turn off additivity if go through an entire cycle without joining */ - if(!join_flag) { - additivity_mode = 0; - } - - break; - - - - /* DETERMINISTIC JOINS */ - case 1: - - join_flag = 0; - - for(a=0;asize-1 && (vertex->nactive > 2) ;) { - - /* find the min along the horizontal of row a */ - hmin = NJ_find_hmin(dmat, a, &b, &hmincount); - - if(NJ_check(nj_args, dmat, a, b, hmin, additivity_mode)) { - - join_flag = 1; - - /* join taxa from rows a and b */ - NJ_decompose(dmat, vertex, a, b, 0); - - /* collapse matrix */ - NJ_compute_r(dmat, a, b); - NJ_collapse(dmat, vertex, a, b); - - if(a) { - a--; - } - - } else { - a++; - } - } - - /* turn off additivity if go through an entire cycle without joining */ - if(!join_flag) { - additivity_mode = 0; - } - - break; - } - - } /* WHILE */ - - /* Join the last two nodes on the vertex list */ - tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST); - - if(nj_args->verbose_flag) { - if(additivity_mode) { - printf("Tree is additive\n"); - } else { - printf("Tree is not additive\n"); - } - } - - if(vertex) { - NJ_free_vertex(vertex); - } - - if(!nj_args->norandom && permutation) { - free(permutation); - } - - return(tree); -} - - - - - - -/* - * NJ_print_distance_matrix() - - * - * Print a distance matrix - * - */ -void -NJ_print_distance_matrix(DMAT *dmat) { - - long int i, j; - - printf("ntaxa: %ld\n", dmat->ntaxa); - printf(" size: %ld\n", dmat->size); - - for(i=0;isize;i++) { - - for(j=0;jsize;j++) { - if(j>i) { - printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)]); - } else { - printf(" -"); - } - } - - - if(dmat->r && dmat->r2) { - printf("\t\t%0.4f", dmat->r[i]); - printf("\t%0.4f", dmat->r2[i]); - - - - printf("\n"); - - for(j=0;jsize;j++) { - if(j>i) { - printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)] - (dmat->r2[i] + dmat->r2[j])); - } else { - printf(" "); - } - } - - printf("\n\n"); - } - } - - printf("\n"); - - return; -} - - - - - - - -/* - * NJ_output_tree() - - * - * A wrapper for the function that really prints the tree, - * basically to get a newline in there conveniently. :-) - * - * Print n trees, as specified in command-args - * using "count" variable from 0 to (n-1) - * - */ -void -NJ_output_tree(NJ_ARGS *nj_args, - NJ_TREE *tree, - DMAT *dmat, - long int count) { - - FILE *fp; - - if(nj_args->stdout_flag) { - fp = stdout; - } else { - - if(count == 0) { - fp = fopen(nj_args->outfilename, "w"); /* open for writing */ - } else { - fp = fopen(nj_args->outfilename, "a"); /* open for appending */ - } - - if(!fp) { - fprintf(stderr, "Clearcut: Failed to open outfile %s\n", nj_args->outfilename); - exit(-1); - } - } - - NJ_output_tree2(fp, nj_args, tree, tree, dmat); - fprintf(fp, ";\n"); - - if(!nj_args->stdout_flag) { - fclose(fp); - } - - return; -} - - - - - -/* - * NJ_output_tree2() - - * - * - */ -void -NJ_output_tree2(FILE *fp, - NJ_ARGS *nj_args, - NJ_TREE *tree, - NJ_TREE *root, - DMAT *dmat) { - - if(!tree) { - return; - } - - if(tree->taxa_index != NJ_INTERNAL_NODE) { - - if(nj_args->expblen) { - fprintf(fp, "%s:%e", - dmat->taxaname[tree->taxa_index], - tree->dist); - } else { - fprintf(fp, "%s:%f", - dmat->taxaname[tree->taxa_index], - tree->dist); - } - - } else { - - - if(tree->left && tree->right) { - fprintf(fp, "("); - } - if(tree->left) { - NJ_output_tree2(fp, nj_args, tree->left, root, dmat); - } - - if(tree->left && tree->right) { - fprintf(fp, ","); - } - if(tree->right) { - NJ_output_tree2(fp, nj_args, tree->right, root, dmat); - } - - if(tree != root->left) { - if(tree->left && tree->right) { - if(tree != root) { - if(nj_args->expblen) { - fprintf(fp, "):%e", tree->dist); - } else { - fprintf(fp, "):%f", tree->dist); - } - } else { - fprintf(fp, ")"); - } - } - } else { - fprintf(fp, ")"); - } - } - - return; -} - - - - - - - -/* - * NJ_init_r() - * - * This function computes the r column in our matrix - * - */ -void -NJ_init_r(DMAT *dmat) { - - long int i, j, size; - long int index; - float *r, *r2, *val; - long int size1; - float size2; - - r = dmat->r; - r2 = dmat->r2; - val = dmat->val; - size = dmat->size; - size1 = size-1; - size2 = (float)(size-2); - - index = 0; - for(i=0;inodes = (NJ_TREE **)calloc(dmat->ntaxa, sizeof(NJ_TREE *)); - vertex->nodes_handle = vertex->nodes; - - /* initialize our size and active variables */ - vertex->nactive = dmat->ntaxa; - vertex->size = dmat->ntaxa; - - /* initialize the nodes themselves */ - for(i=0;intaxa;i++) { - - vertex->nodes[i] = (NJ_TREE *)calloc(1, sizeof(NJ_TREE)); - - vertex->nodes[i]->left = NULL; - vertex->nodes[i]->right = NULL; - - vertex->nodes[i]->taxa_index = i; - } - - return(vertex); -} - - - - - -/* - * NJ_decompose() - - * - * This function decomposes the star by creating new internal nodes - * and joining two existing tree nodes to it - * - */ -NJ_TREE * -NJ_decompose(DMAT *dmat, - NJ_VERTEX *vertex, - long int x, - long int y, - int last_flag) { - - NJ_TREE *new_node; - float x2clade, y2clade; - - /* compute the distance from the clade components to the new node */ - if(last_flag) { - x2clade = - (dmat->val[NJ_MAP(x, y, dmat->size)]); - } else { - x2clade = - (dmat->val[NJ_MAP(x, y, dmat->size)])/2 + - ((dmat->r2[x] - dmat->r2[y])/2); - } - - vertex->nodes[x]->dist = x2clade; - - if(last_flag) { - y2clade = - (dmat->val[NJ_MAP(x, y, dmat->size)]); - } else { - y2clade = - (dmat->val[NJ_MAP(x, y, dmat->size)])/2 + - ((dmat->r2[y] - dmat->r2[x])/2); - } - - vertex->nodes[y]->dist = y2clade; - - /* allocate new node to connect two sub-clades */ - new_node = (NJ_TREE *)calloc(1, sizeof(NJ_TREE)); - - new_node->left = vertex->nodes[x]; - new_node->right = vertex->nodes[y]; - new_node->taxa_index = NJ_INTERNAL_NODE; /* this is not a terminal node, no taxa index */ - - if(last_flag) { - return(new_node); - } - - vertex->nodes[x] = new_node; - vertex->nodes[y] = vertex->nodes[0]; - - vertex->nodes = &(vertex->nodes[1]); - - vertex->nactive--; - - return(new_node); -} - - - -/* - * NJ_print_vertex() - - * - * For debugging, print the contents of the vertex - * - */ -void -NJ_print_vertex(NJ_VERTEX *vertex) { - - long int i; - - printf("Number of active nodes: %ld\n", vertex->nactive); - - for(i=0;inactive;i++) { - printf("%ld ", vertex->nodes[i]->taxa_index); - } - printf("\n"); - - return; -} - - - - - - - - - -/* - * NJ_print_r() - - * - */ -void -NJ_print_r(DMAT *dmat) { - - long int i; - - printf("\n"); - for(i=0;isize;i++) { - printf("r[%ld] = %0.2f\n", i, dmat->r[i]); - } - printf("\n"); - - return; -} - - - - - -/* - * NJ_print_taxanames() - - * - * Print taxa names here - * - */ -void -NJ_print_taxanames(DMAT *dmat) { - - long int i; - - printf("Number of taxa: %ld\n", dmat->ntaxa); - - for(i=0;intaxa;i++) { - printf("%ld) %s\n", i, dmat->taxaname[i]); - } - - printf("\n"); - - return; -} - - - - -/* - * NJ_shuffle_distance_matrix() - - * - * Randomize a distance matrix here - * - */ -void -NJ_shuffle_distance_matrix(DMAT *dmat) { - - - long int *perm = NULL; - char **tmp_taxaname = NULL; - float *tmp_val = NULL; - long int i, j; - - - /* alloc the random permutation and a new matrix to hold the shuffled vals */ - perm = (long int *)calloc(dmat->size, sizeof(long int)); - tmp_taxaname = (char **)calloc(dmat->size, sizeof(char *)); - tmp_val = (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float)); - if(!tmp_taxaname || !perm || !tmp_val) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_shuffle_distance_matrix()\n"); - exit(-1); - } - - /* compute a permutation which will describe how to shuffle the matrix */ - NJ_permute(perm, dmat->size); - - for(i=0;isize;i++) { - for(j=i+1;jsize;j++) { - - if(perm[j] < perm[i]) { - tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[j], perm[i], dmat->size)]; - } else { - tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[i], perm[j], dmat->size)]; - } - - } - - tmp_taxaname[i] = dmat->taxaname[perm[i]]; - } - - /* free our random permutation */ - if(perm) { - free(perm); - } - - /* free the old value matrix */ - if(dmat->val) { - free(dmat->val); - } - - /* re-assign the value matrix pointers */ - dmat->val = tmp_val; - dmat->valhandle = dmat->val; - - /* - * Free our old taxaname with its particular ordering - * and re-assign to the new. - */ - if(dmat->taxaname) { - free(dmat->taxaname); - } - dmat->taxaname = tmp_taxaname; - - return; -} - - - -/* - * NJ_free_tree() - - * - * Free a given NJ tree - */ -void -NJ_free_tree(NJ_TREE *node) { - - if(!node) { - return; - } - - if(node->left) { - NJ_free_tree(node->left); - } - - if(node->right) { - NJ_free_tree(node->right); - } - - free(node); - - return; -} - - - - - - - - - -/* - * NJ_print_permutation() - * - * Print a permutation - * - */ -void -NJ_print_permutation(long int *perm, - long int size) { - - long int i; - - for(i=0;intaxa = src->ntaxa; - dest->size = src->size; - - /* allocate space for array of pointers to taxanames */ - dest->taxaname = (char **)calloc(dest->ntaxa, sizeof(char *)); - if(!dest->taxaname) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); - goto XIT_BAD; - } - - /* allocate space for the taxanames themselves */ - for(i=0;intaxa;i++) { - dest->taxaname[i] = (char *)calloc(strlen(src->taxaname[i])+1, sizeof(char)); - if(!dest->taxaname[i]) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); - goto XIT_BAD; - } - } - - /* allocate space for the distance values */ - dest->val = (float *)calloc(NJ_NCELLS(src->ntaxa), sizeof(float)); - if(!dest->val) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); - goto XIT_BAD; - } - - /* allocate space for the r and r2 vectors */ - dest->r = (float *)calloc(src->ntaxa, sizeof(float)); - dest->r2 = (float *)calloc(src->ntaxa, sizeof(float)); - - /* copy titles */ - for(i=0;intaxa;i++) { - strcpy(dest->taxaname[i], src->taxaname[i]); - } - - /* copy values */ - memcpy(dest->val, src->valhandle, NJ_NCELLS(src->ntaxa)*sizeof(float)); - - /* copy r and r2 */ - memcpy(dest->r, src->rhandle, src->ntaxa*sizeof(float)); - memcpy(dest->r2, src->r2handle, src->ntaxa*sizeof(float)); - - /* track some memory addresses */ - dest->valhandle = dest->val; - dest->rhandle = dest->r; - dest->r2handle = dest->r2; - - return(dest); - - XIT_BAD: - - /* free what we may have allocated */ - NJ_free_dmat(dest); - - return(NULL); -} - - - - -/* - * NJ_free_dmat() - - */ -void -NJ_free_dmat(DMAT *dmat) { - - long int i; - - if(dmat) { - - if(dmat->taxaname) { - - for(i=0;intaxa;i++) { - if(dmat->taxaname[i]) { - free(dmat->taxaname[i]); - } - } - - free(dmat->taxaname); - } - - if(dmat->valhandle) { - free(dmat->valhandle); - } - - if(dmat->rhandle) { - free(dmat->rhandle); - } - - if(dmat->r2handle) { - free(dmat->r2handle); - } - - free(dmat); - } - - return; -} - - - - - -/* - * NJ_free_vertex() - - * - * Free the vertex data structure - * - */ -void -NJ_free_vertex(NJ_VERTEX *vertex) { - - if(vertex) { - if(vertex->nodes_handle) { - free(vertex->nodes_handle); - } - free(vertex); - } - - return; -} - - - - - - - - - -/* - * - * NJ_min_transform() - Find the smallest transformed value to identify - * which nodes to join. - * - * INPUTS: - * ------- - * dmat -- The distance matrix - * - * RETURNS: - * -------- - * -- The minimimum transformed distance - * ret_i -- The row of the smallest transformed distance (by reference) - * ret_j -- The col of the smallest transformed distance (by reference) - * - * - * DESCRIPTION: - * ------------ - * - * Used only with traditional Neighbor-Joining, this function checks the entire - * working distance matrix and identifies the smallest transformed distance. - * This requires traversing the entire diagonal matrix, which is itself a - * O(N^2) operation. - * - */ -float -NJ_min_transform(DMAT *dmat, - long int *ret_i, - long int *ret_j) { - - long int i, j; /* indices used for looping */ - long int tmp_i = 0;/* to limit pointer dereferencing */ - long int tmp_j = 0;/* to limit pointer dereferencing */ - float smallest; /* track the smallest trans. dist */ - float curval; /* the current trans. dist in loop */ - - float *ptr; /* pointer into distance matrix */ - float *r2; /* pointer to r2 matrix for computing transformed dists */ - - smallest = (float)HUGE_VAL; - - /* track these here to limit pointer dereferencing in inner loop */ - ptr = dmat->val; - r2 = dmat->r2; - - /* for every row */ - for(i=0;isize;i++) { - ptr++; /* skip diagonal */ - for(j=i+1;jsize;j++) { /* for every column */ - - /* find transformed distance in matrix at i, j */ - curval = *(ptr++) - (r2[i] + r2[j]); - - /* if the transformed distanance is less than the known minimum */ - if(curval < smallest) { - - smallest = curval; - tmp_i = i; - tmp_j = j; - } - } - } - - /* pass back (by reference) the coords of the min. transformed distance */ - *ret_i = tmp_i; - *ret_j = tmp_j; - - return(smallest); /* return the min transformed distance */ -} - - - - - diff --git a/clearcut.cpp b/clearcut.cpp new file mode 100644 index 0000000..b06aa5b --- /dev/null +++ b/clearcut.cpp @@ -0,0 +1,2159 @@ + +/* + * clearcut.c + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * An implementation of the Relaxed Neighbor-Joining algorithm + * of Evans, J., Sheneman, L., and Foster, J. + * + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "distclearcut.h" +#include "dmat.h" +#include "fasta.h" +#include "cmdargs.h" +#include "common.h" +#include "clearcut.h" +#include "prng.h" + + +/* + * main() - + * + * The entry point to the program. + * + */ +int clearcut_main(int argc, char *argv[]) { + + DMAT *dmat; /* The working distance matrix */ + DMAT *dmat_backup = NULL;/* A backup distance matrix */ + NJ_TREE *tree; /* The phylogenetic tree */ + NJ_ARGS *nj_args; /* Structure for holding command-line arguments */ + long int i; + + /* some variables for tracking time */ + struct timeval tv; + unsigned long long startUs, endUs; + + + /* check and parse supplied command-line arguments */ + nj_args = NJ_handle_args(argc, argv); + + if(!nj_args) { + fprintf(stderr, "Clearcut: Error processing command-line arguments.\n"); + exit(-1); + } + + /* for verbose reporting, print the random number seed to stdout */ + if(nj_args->verbose_flag) { + printf("PRNG SEED: %d\n", nj_args->seed); + } + + /* Initialize Mersenne Twister PRNG */ + init_genrand(nj_args->seed); + + + switch(nj_args->input_mode) { + + /* If the input type is a distance matrix */ + case NJ_INPUT_MODE_DISTANCE: + + /* parse the distance matrix */ + dmat = NJ_parse_distance_matrix(nj_args); + if(!dmat) { + exit(-1); + } + + break; + + /* If the input type is a multiple sequence alignment */ + case NJ_INPUT_MODE_ALIGNED_SEQUENCES: + + /* build a distance matrix from a multiple sequence alignment */ + dmat = NJ_build_distance_matrix(nj_args); + if(!dmat) { + fprintf(stderr, "Clearcut: Failed to build distance matrix from alignment.\n"); + exit(-1); + } + + break; + + default: + + fprintf(stderr, "Clearcut: Could not determine how to process input\n"); + exit(-1); + } + + /* + * Output the computed distance matrix, + * if the user specified one. + */ + if(nj_args->matrixout) { + NJ_output_matrix(nj_args, dmat); + } + + /* + * If we are going to generate multiple trees from + * the same distance matrix, we need to make a backup + * of the original distance matrix. + */ + if(nj_args->ntrees > 1) { + dmat_backup = NJ_dup_dmat(dmat); + } + + /* process n trees */ + for(i=0;intrees;i++) { + + /* + * If the user has specified matrix shuffling, we need + * to randomize the distance matrix + */ + if(nj_args->shuffle) { + NJ_shuffle_distance_matrix(dmat); + } + + /* RECORD THE PRECISE TIME OF THE START OF THE NEIGHBOR-JOINING */ + gettimeofday(&tv, NULL); + startUs = ((unsigned long long) tv.tv_sec * 1000000ULL) + + ((unsigned long long) tv.tv_usec); + + + /* + * Invoke either the Relaxed Neighbor-Joining algorithm (default) + * or the "traditional" Neighbor-Joining algorithm + */ + if(nj_args->neighbor) { + tree = NJ_neighbor_joining(nj_args, dmat); + } else { + tree = NJ_relaxed_nj(nj_args, dmat); + } + + if(!tree) { + fprintf(stderr, "Clearcut: Failed to construct tree.\n"); + exit(0); + } + + /* RECORD THE PRECISE TIME OF THE END OF THE NEIGHBOR-JOINING */ + gettimeofday(&tv, NULL); + endUs = ((unsigned long long) tv.tv_sec * 1000000ULL) + + ((unsigned long long) tv.tv_usec); + + /* print the time taken to perform the neighbor join */ + if(nj_args->verbose_flag) { + if(nj_args->neighbor) { + fprintf(stderr, "NJ tree built in %llu.%06llu secs\n", + (endUs - startUs) / 1000000ULL, + (endUs - startUs) % 1000000ULL); + } else { + fprintf(stderr, "RNJ tree built in %llu.%06llu secs\n", + (endUs - startUs) / 1000000ULL, + (endUs - startUs) % 1000000ULL); + } + } + + /* Output the neighbor joining tree here */ + NJ_output_tree(nj_args, tree, dmat, i); + + NJ_free_tree(tree); /* Free the tree */ + NJ_free_dmat(dmat); /* Free the working distance matrix */ + + /* + * If we need to do another iteration, lets re-initialize + * our working distance matrix. + */ + if(nj_args->ntrees > 1 && i<(nj_args->ntrees-1) ) { + dmat = NJ_dup_dmat(dmat_backup); + } + } + + /* Free the backup distance matrix */ + if(nj_args->ntrees > 1) { + NJ_free_dmat(dmat_backup); + } + + /* If verbosity, describe where the tree output is */ + if(nj_args->verbose_flag) { + if(nj_args->neighbor) { + printf("NJ tree(s) in %s\n", nj_args->outfilename); + } else { + printf("Relaxed NJ tree(s) in %s\n", nj_args->outfilename); + } + } + + //exit(0); +} + + + + + +/* + * NJ_find_hmin() - Find minimum transformed values along horizontal + * + * + * INPUTS: + * ------- + * dmat -- The distance matrix + * a -- The index of the specific taxon in the distance matrix + * + * RETURNS: + * -------- + * -- The value of the selected minimum + * min -- Used to transport the index of the minima out + * of the function (by reference) + * hmincount -- Return the number of minima along the horizontal + * (by reference) + * + * + * DESCRIPTION: + * ------------ + * + * A fast, inline function to find the smallest transformed value + * along the "horizontal" portion of an entry in a distance matrix. + * + * Distance matrices are stored internally as continguously-allocated + * upper-diagonal structures. With the exception of the taxa at + * row 0 of this upper-diagonal matrix, all taxa have both a horizontal + * and vertical component in the distance matrix. This function + * scans the horizonal portion of the entry in the distance matrix + * for the specified taxon and finds the minimum transformed value + * along that horizontal component. + * + * Since multiple minima can exist along the horizontal portion + * of the entry, I consider all minima and break ties + * stochastically to help avoid systematic bias. + * + * Just searching along the horizontal portion of a row is very fast + * since the data is stored linearly and contiguously in memory and + * cache locality is exploited in the distance matrix representation. + * + * Look at nj.h for more information on how the distance matrix + * is architected. + * + */ +static inline +float +NJ_find_hmin(DMAT *dmat, + long int a, + long int *min, + long int *hmincount) { + + long int i; /* index variable for looping */ + int size; /* current size of distance matrix */ + int mindex = 0; /* holds the current index to the chosen minimum */ + float curval; /* used to hold current transformed values */ + float hmin; /* the value of the transformed minimum */ + + float *ptr, *r2, *val; /* pointers used to reduce dereferencing in inner loop */ + + /* values used for stochastic selection among multiple minima */ + float p, x; + long int smallcnt; + + /* initialize the min to something large */ + hmin = (float)HUGE_VAL; + + /* setup some pointers to limit dereferencing later */ + r2 = dmat->r2; + val = dmat->val; + size = dmat->size; + + /* initialize values associated with minima tie breaking */ + p = 1.0; + smallcnt = 0; + + + ptr = &(val[NJ_MAP(a, a+1, size)]); /* at the start of the horiz. part */ + for(i=a+1;i -- The value of the selected minimum + * min -- Used to transport the index of the minima out + * of the function (by reference) + * vmincount -- The number of minima along the vertical + * return by reference. + * + * DESCRIPTION: + * ------------ + * + * A fast, inline function to find the smallest transformed value + * along the "vertical" portion of an entry in a distance matrix. + * + * Distance matrices are stored internally as continguously-allocated + * upper-diagonal matrices. With the exception of the taxa at + * row 0 of this upper-diagonal matrix, all taxa have both a horizontal + * and vertical component in the distance matrix. This function + * scans the vertical portion of the entry in the distance matrix + * for the specified taxon and finds the minimum transformed value + * along that vertical component. + * + * Since multiple minima can exist along the vertical portion + * of the entry, I consider all minima and break ties + * stochastically to help avoid systematic bias. + * + * Due to cache locality reasons, searching along the vertical + * component is going to be considerably slower than searching + * along the horizontal. + * + * Look at nj.h for more information on how the distance matrix + * is architected. + * + */ +static inline +float +NJ_find_vmin(DMAT *dmat, + long int a, + long int *min, + long int *vmincount) { + + long int i; /* index variable used for looping */ + long int size; /* track the size of the matrix */ + long int mindex = 0;/* track the index to the minimum */ + float curval; /* track value of current transformed distance */ + float vmin; /* the index to the smallest "vertical" minimum */ + + /* pointers which are used to reduce pointer dereferencing in inner loop */ + float *ptr, *r2, *val; + + /* values used in stochastically breaking ties */ + float p, x; + long int smallcnt; + + /* initialize the vertical min to something really big */ + vmin = (float)HUGE_VAL; + + /* save off some values to limit dereferencing later */ + r2 = dmat->r2; + val = dmat->val; + size = dmat->size; + + p = 1.0; + smallcnt = 0; + + /* start on the first row and work down */ + ptr = &(val[NJ_MAP(0, a, size)]); + for(i=0;isize; + val = dmat->val; + r = dmat->r+a+1; + + /* + * Loop through the rows and decrement the stored r values + * by the distances stored in the rows and columns of the distance + * matrix which are being removed post-join. + * + * We do the rows altogether in order to benefit from cache locality. + */ + ptrx = &(val[NJ_MAP(a, a+1, size)]); + ptry = &(val[NJ_MAP(b, b+1, size)]); + + for(i=a+1;ib) { + *r -= *(ptry++); + } + + r++; + } + + /* Similar to the above loop, we now do the columns */ + ptrx = &(val[NJ_MAP(0, a, size)]); + ptry = &(val[NJ_MAP(0, b, size)]); + r = dmat->r; + for(i=0;isize-1) { + /* if we can't do a row here, lets do a column */ + if(a==0) { + if(b==1) { + target = 2; + } else { + target = 1; + } + } else { + target = 0; + } + } else { + target = b+1; + } + + + /* distance between a and the root of clade (a,b) */ + a2clade = + ( (dmat->val[NJ_MAP(a, b, dmat->size)]) + + (dmat->r2[a] - dmat->r2[b]) ) / 2.0; + + /* distance between b and the root of clade (a,b) */ + b2clade = + ( (dmat->val[NJ_MAP(a, b, dmat->size)]) + + (dmat->r2[b] - dmat->r2[a]) ) / 2.0; + + /* distance between the clade (a,b) and the target taxon */ + if(bval[NJ_MAP(a, target, dmat->size)] - a2clade) + + (dmat->val[NJ_MAP(b, target, dmat->size)] - b2clade) ) / 2.0; + + /* + * Check to see that distance from clade root to target + distance from + * b to clade root are equal to the distance from b to the target + */ + if(NJ_FLT_EQ(dmat->val[NJ_MAP(b, target, dmat->size)], + (clade_dist + b2clade))) { + return(1); /* join is legitimate */ + } else { + return(0); /* join is illigitimate */ + } + + } else { + + /* compute the distance from the clade root to the target */ + clade_dist = + ( (dmat->val[NJ_MAP(target, a, dmat->size)] - a2clade) + + (dmat->val[NJ_MAP(target, b, dmat->size)] - b2clade) ) / 2.0; + + /* + * Check to see that distance from clade root to target + distance from + * b to clade root are equal to the distance from b to the target + */ + if(NJ_FLT_EQ(dmat->val[NJ_MAP(target, b, dmat->size)], + (clade_dist + b2clade))) { + return(1); /* join is legitimate */ + } else { + return(0); /* join is illegitimate */ + } + } +} + + + + + + + +/* + * NJ_check() - Check to see if two taxa can be joined + + * + * INPUTS: + * ------- + * nj_args -- Pointer to the data structure holding command-line args + * dmat -- distance matrix + * a -- index into dmat for one of the rows to be joined + * b -- index into dmat for another row to be joined + * min -- the minimum value found + * additivity -- a flag (0 = not additive mode, 1 = additive mode) + * + * OUTPUTS: + * -------- + * int 1 if join is okay + * 0 if join is not okay + * + * DESCRIPTION: + * ------------ + * + * This function ultimately takes two rows and makes sure that the + * intersection of those two rows, which has a transformed distance of + * "min", is actually the smallest (or equal to the smallest) + * transformed distance for both rows (a, b). If so, it returns + * 1, else it returns 0. + * + * Basically, we want to join two rows only if the minimum + * transformed distance on either row is at the intersection of + * those two rows. + * + */ +static inline +int +NJ_check(NJ_ARGS *nj_args, + DMAT *dmat, + long int a, + long int b, + float min, + int additivity) { + + + long int i, size; + float *ptr, *val, *r2; + + + /* some aliases for speed and readability reasons */ + val = dmat->val; + r2 = dmat->r2; + size = dmat->size; + + + /* now determine if joining a, b will result in broken distances */ + if(additivity) { + if(!NJ_check_additivity(dmat, a, b)) { + return(0); + } + } + + /* scan the horizontal of row b, punt if anything < min */ + ptr = &(val[NJ_MAP(b, b+1, size)]); + for(i=b+1;inorandom) { /* if we are doing random joins, we checked this */ + ptr = val + a; + for(i=0;i reduce pointer dereferencing */ + float a2clade; /* distance from a to the new node that joins a and b */ + float b2clade; /* distance from b to the new node that joins a and b */ + float cval; /* stores distance information during loop */ + float *vptr; /* pointer to elements in first row of dist matrix */ + float *ptra; /* pointer to elements in row a of distance matrix */ + float *ptrb; /* pointer to elements in row b of distance matrix */ + + float *val, *r, *r2; /* simply used to limit pointer dereferencing */ + + + /* We must assume that a < b */ + if(a >= b) { + fprintf(stderr, "Clearcut: (aval; + r = dmat->r; + r2 = dmat->r2; + size = dmat->size; + + /* compute the distance from the clade components (a, b) to the new node */ + a2clade = + ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[a] - dmat->r2[b]) ) / 2.0; + b2clade = + ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[b] - dmat->r2[a]) ) / 2.0; + + + r[a] = 0.0; /* we are removing row a, so clear dist. in r */ + + /* + * Fill the horizontal part of the "a" row and finish computing r and r2 + * we handle the horizontal component first to maximize cache locality + */ + ptra = &(val[NJ_MAP(a, a+1, size)]); /* start ptra at the horiz. of a */ + ptrb = &(val[NJ_MAP(a+1, b, size)]); /* start ptrb at comparable place */ + for(i=a+1;ir = r+1; + + + /* + * Collapse r2 here by copying contents of r2[0] into r2[b] and + * incrementing pointer to the beginning of r2 by one row + */ + r2[b] = r2[0]; + dmat->r2 = r2+1; + + /* increment dmat pointer to next row */ + dmat->val += size; + + /* decrement the total size of the distance matrix by one row */ + dmat->size--; + + return; +} + + + + + + + + + +/* + * NJ_neighbor_joining() - Perform a traditional Neighbor-Joining + * + * + * INPUTS: + * ------- + * nj_args -- A pointer to a structure containing the command-line arguments + * dmat -- A pointer to the distance matrix + * + * RETURNS: + * -------- + * NJ_TREE * -- A pointer to the Neighbor-Joining tree. + * + * DESCRIPTION: + * ------------ + * + * This function performs a traditional Neighbor-Joining operation in which + * the distance matrix is exhaustively searched for the global minimum + * transformed distance. The two nodes which intersect at the global + * minimum transformed distance are then joined and the distance + * matrix is collapsed. This process continues until there are only + * two nodes left, at which point those nodes are joined. + * + */ +NJ_TREE * +NJ_neighbor_joining(NJ_ARGS *nj_args, + DMAT *dmat) { + + + NJ_TREE *tree = NULL; + NJ_VERTEX *vertex = NULL; + + long int a, b; + float min; + + + /* initialize the r and r2 vectors */ + NJ_init_r(dmat); + + /* allocate and initialize our vertex vector used for tree construction */ + vertex = NJ_init_vertex(dmat); + if(!vertex) { + fprintf(stderr, "Clearcut: Could not initialize vertex in NJ_neighbor_joining()\n"); + return(NULL); + } + + /* we iterate until the working distance matrix has only 2 entries */ + while(vertex->nactive > 2) { + + /* + * Find the global minimum transformed distance from the distance matrix + */ + min = NJ_min_transform(dmat, &a, &b); + + /* + * Build the tree by removing nodes a and b from the vertex array + * and inserting a new internal node which joins a and b. Collapse + * the vertex array similarly to how the distance matrix and r and r2 + * are compacted. + */ + NJ_decompose(dmat, vertex, a, b, 0); + + /* decrement the r and r2 vectors by the distances corresponding to a, b */ + NJ_compute_r(dmat, a, b); + + /* compact the distance matrix and the r and r2 vectors */ + NJ_collapse(dmat, vertex, a, b); + } + + /* Properly join the last two nodes on the vertex list */ + tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST); + + /* return the computed tree to the calling function */ + return(tree); +} + + + + + + + + +/* + * NJ_relaxed_nj() - Construct a tree using the Relaxed Neighbor-Joining + * + * INPUTS: + * ------- + * nj_args -- A pointer to a data structure containing the command-line args + * dmat -- A pointer to the distance matrix + * + * RETURNS: + * -------- + * + * NJ_TREE * -- A pointer to a Relaxed Neighbor-Joining tree + * + * DESCRIPTION: + * ------------ + * + * This function implements the Relaxed Neighbor-Joining algorithm of + * Evans, J., Sheneman, L., and Foster, J. + * + * Relaxed Neighbor-Joining works by choosing a local minimum transformed + * distance when determining when to join two nodes. (Traditional + * Neighbor-Joining chooses a global minimum transformed distance). + * + * The algorithm shares the property with traditional NJ that if the + * input distances are additive (self-consistent), then the algorithm + * will manage to construct the true tree consistent with the additive + * distances. Additivity state is tracked and every proposed join is checked + * to make sure it maintains additivity constraints. If no + * additivity-preserving join is possible in a single pass, then the distance + * matrix is non-additive, and additivity checking is abandoned. + * + * The algorithm will either attempt joins randomly, or it will perform joins + * in a particular order. The default behavior is to perform joins randomly, + * but this can be switched off with a command-line switch. + * + * For randomized joins, all attempts are made to alleviate systematic bias + * for the choice of rows to joins. All tie breaking is done in a way which + * is virtually free of bias. + * + * To perform randomized joins, a random permutation is constructed which + * specifies the order in which to attempt joins. I iterate through the + * random permutation, and for each row in the random permutation, I find + * the minimum transformed distance for that row. If there are multiple + * minima, I break ties evenly. For the row which intersects our + * randomly chosen row at the chosen minimum, if we are are still in + * additivity mode, I check to see if joining the two rows will break + * our additivity constraints. If not, I check to see if there exists + * a transformed distance which is smaller than the minimum found on the + * original row. If there is, then we proceed through the random permutation + * trying additional rows in the random order specified in the permutation. + * If there is no smaller minimum transformed distance on either of the + * two rows, then we join them, collapse the distance matrix, and compute + * a new random permutation. + * + * If the entire random permutation is traversed and no joins are possible + * due to additivity constraints, then the distance matrix is not + * additive, and additivity constraint-checking is disabled. + * + */ +NJ_TREE * +NJ_relaxed_nj(NJ_ARGS *nj_args, + DMAT *dmat) { + + + NJ_TREE *tree; + NJ_VERTEX *vertex; + long int a, b, t, bh, bv, i; + float hmin, vmin, hvmin; + float p, q, x; + int join_flag; + int additivity_mode; + long int hmincount, vmincount; + long int *permutation = NULL; + + + + /* initialize the r and r2 vectors */ + NJ_init_r(dmat); + + additivity_mode = 1; + + /* allocate the permutation vector, if we are in randomize mode */ + if(!nj_args->norandom) { + permutation = (long int *)calloc(dmat->size, sizeof(long int)); + if(!permutation) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_relaxed_nj()\n"); + return(NULL); + } + } + + /* allocate and initialize our vertex vector used for tree construction */ + vertex = NJ_init_vertex(dmat); + + /* loop until there are only 2 nodes left to join */ + while(vertex->nactive > 2) { + + switch(nj_args->norandom) { + + /* RANDOMIZED JOINS */ + case 0: + + join_flag = 0; + + NJ_permute(permutation, dmat->size-1); + for(i=0;isize-1 && (vertex->nactive>2) ;i++) { + + a = permutation[i]; + + /* find min trans dist along horiz. of row a */ + hmin = NJ_find_hmin(dmat, a, &bh, &hmincount); + if(a) { + /* find min trans dist along vert. of row a */ + vmin = NJ_find_vmin(dmat, a, &bv, &vmincount); + } else { + vmin = hmin; + bv = bh; + vmincount = 0; + } + + if(NJ_FLT_EQ(hmin, vmin)) { + + /* + * The minima along the vertical and horizontal are + * the same. Compute the proportion of minima along + * the horizonal (p) and the proportion of minima + * along the vertical (q). + * + * If the same minima exist along the horizonal and + * vertical, we break the tie in a way which is + * non-biased. That is, we break the tie based on the + * proportion of horiz. minima versus vertical minima. + * + */ + p = (float)hmincount / ((float)hmincount + (float)vmincount); + q = 1.0 - p; + x = genrand_real2(); + + if(x < p) { + hvmin = hmin; + b = bh; + } else { + hvmin = vmin; + b = bv; + } + } else if(NJ_FLT_LT(hmin, vmin) ) { + hvmin = hmin; + b = bh; + } else { + hvmin = vmin; + b = bv; + } + + if(NJ_check(nj_args, dmat, a, b, hvmin, additivity_mode)) { + + /* swap a and b, if necessary, to make sure a < b */ + if(b < a) { + t = a; + a = b; + b = t; + } + + join_flag = 1; + + /* join taxa from rows a and b */ + NJ_decompose(dmat, vertex, a, b, 0); + + /* collapse matrix */ + NJ_compute_r(dmat, a, b); + NJ_collapse(dmat, vertex, a, b); + + NJ_permute(permutation, dmat->size-1); + } + } + + /* turn off additivity if go through an entire cycle without joining */ + if(!join_flag) { + additivity_mode = 0; + } + + break; + + + + /* DETERMINISTIC JOINS */ + case 1: + + join_flag = 0; + + for(a=0;asize-1 && (vertex->nactive > 2) ;) { + + /* find the min along the horizontal of row a */ + hmin = NJ_find_hmin(dmat, a, &b, &hmincount); + + if(NJ_check(nj_args, dmat, a, b, hmin, additivity_mode)) { + + join_flag = 1; + + /* join taxa from rows a and b */ + NJ_decompose(dmat, vertex, a, b, 0); + + /* collapse matrix */ + NJ_compute_r(dmat, a, b); + NJ_collapse(dmat, vertex, a, b); + + if(a) { + a--; + } + + } else { + a++; + } + } + + /* turn off additivity if go through an entire cycle without joining */ + if(!join_flag) { + additivity_mode = 0; + } + + break; + } + + } /* WHILE */ + + /* Join the last two nodes on the vertex list */ + tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST); + + if(nj_args->verbose_flag) { + if(additivity_mode) { + printf("Tree is additive\n"); + } else { + printf("Tree is not additive\n"); + } + } + + if(vertex) { + NJ_free_vertex(vertex); + } + + if(!nj_args->norandom && permutation) { + free(permutation); + } + + return(tree); +} + + + + + + +/* + * NJ_print_distance_matrix() - + * + * Print a distance matrix + * + */ +void +NJ_print_distance_matrix(DMAT *dmat) { + + long int i, j; + + printf("ntaxa: %ld\n", dmat->ntaxa); + printf(" size: %ld\n", dmat->size); + + for(i=0;isize;i++) { + + for(j=0;jsize;j++) { + if(j>i) { + printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)]); + } else { + printf(" -"); + } + } + + + if(dmat->r && dmat->r2) { + printf("\t\t%0.4f", dmat->r[i]); + printf("\t%0.4f", dmat->r2[i]); + + + + printf("\n"); + + for(j=0;jsize;j++) { + if(j>i) { + printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)] - (dmat->r2[i] + dmat->r2[j])); + } else { + printf(" "); + } + } + + printf("\n\n"); + } + } + + printf("\n"); + + return; +} + + + + + + + +/* + * NJ_output_tree() - + * + * A wrapper for the function that really prints the tree, + * basically to get a newline in there conveniently. :-) + * + * Print n trees, as specified in command-args + * using "count" variable from 0 to (n-1) + * + */ +void +NJ_output_tree(NJ_ARGS *nj_args, + NJ_TREE *tree, + DMAT *dmat, + long int count) { + + FILE *fp; + + if(nj_args->stdout_flag) { + fp = stdout; + } else { + + if(count == 0) { + fp = fopen(nj_args->outfilename, "w"); /* open for writing */ + } else { + fp = fopen(nj_args->outfilename, "a"); /* open for appending */ + } + + if(!fp) { + fprintf(stderr, "Clearcut: Failed to open outfile %s\n", nj_args->outfilename); + exit(-1); + } + } + + NJ_output_tree2(fp, nj_args, tree, tree, dmat); + fprintf(fp, ";\n"); + + if(!nj_args->stdout_flag) { + fclose(fp); + } + + return; +} + + + + + +/* + * NJ_output_tree2() - + * + * + */ +void +NJ_output_tree2(FILE *fp, + NJ_ARGS *nj_args, + NJ_TREE *tree, + NJ_TREE *root, + DMAT *dmat) { + + if(!tree) { + return; + } + + if(tree->taxa_index != NJ_INTERNAL_NODE) { + + if(nj_args->expblen) { + fprintf(fp, "%s:%e", + dmat->taxaname[tree->taxa_index], + tree->dist); + } else { + fprintf(fp, "%s:%f", + dmat->taxaname[tree->taxa_index], + tree->dist); + } + + } else { + + + if(tree->left && tree->right) { + fprintf(fp, "("); + } + if(tree->left) { + NJ_output_tree2(fp, nj_args, tree->left, root, dmat); + } + + if(tree->left && tree->right) { + fprintf(fp, ","); + } + if(tree->right) { + NJ_output_tree2(fp, nj_args, tree->right, root, dmat); + } + + if(tree != root->left) { + if(tree->left && tree->right) { + if(tree != root) { + if(nj_args->expblen) { + fprintf(fp, "):%e", tree->dist); + } else { + fprintf(fp, "):%f", tree->dist); + } + } else { + fprintf(fp, ")"); + } + } + } else { + fprintf(fp, ")"); + } + } + + return; +} + + + + + + + +/* + * NJ_init_r() + * + * This function computes the r column in our matrix + * + */ +void +NJ_init_r(DMAT *dmat) { + + long int i, j, size; + long int index; + float *r, *r2, *val; + long int size1; + float size2; + + r = dmat->r; + r2 = dmat->r2; + val = dmat->val; + size = dmat->size; + size1 = size-1; + size2 = (float)(size-2); + + index = 0; + for(i=0;inodes = (NJ_TREE **)calloc(dmat->ntaxa, sizeof(NJ_TREE *)); + vertex->nodes_handle = vertex->nodes; + + /* initialize our size and active variables */ + vertex->nactive = dmat->ntaxa; + vertex->size = dmat->ntaxa; + + /* initialize the nodes themselves */ + for(i=0;intaxa;i++) { + + vertex->nodes[i] = (NJ_TREE *)calloc(1, sizeof(NJ_TREE)); + + vertex->nodes[i]->left = NULL; + vertex->nodes[i]->right = NULL; + + vertex->nodes[i]->taxa_index = i; + } + + return(vertex); +} + + + + + +/* + * NJ_decompose() - + * + * This function decomposes the star by creating new internal nodes + * and joining two existing tree nodes to it + * + */ +NJ_TREE * +NJ_decompose(DMAT *dmat, + NJ_VERTEX *vertex, + long int x, + long int y, + int last_flag) { + + NJ_TREE *new_node; + float x2clade, y2clade; + + /* compute the distance from the clade components to the new node */ + if(last_flag) { + x2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)]); + } else { + x2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)])/2 + + ((dmat->r2[x] - dmat->r2[y])/2); + } + + vertex->nodes[x]->dist = x2clade; + + if(last_flag) { + y2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)]); + } else { + y2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)])/2 + + ((dmat->r2[y] - dmat->r2[x])/2); + } + + vertex->nodes[y]->dist = y2clade; + + /* allocate new node to connect two sub-clades */ + new_node = (NJ_TREE *)calloc(1, sizeof(NJ_TREE)); + + new_node->left = vertex->nodes[x]; + new_node->right = vertex->nodes[y]; + new_node->taxa_index = NJ_INTERNAL_NODE; /* this is not a terminal node, no taxa index */ + + if(last_flag) { + return(new_node); + } + + vertex->nodes[x] = new_node; + vertex->nodes[y] = vertex->nodes[0]; + + vertex->nodes = &(vertex->nodes[1]); + + vertex->nactive--; + + return(new_node); +} + + + +/* + * NJ_print_vertex() - + * + * For debugging, print the contents of the vertex + * + */ +void +NJ_print_vertex(NJ_VERTEX *vertex) { + + long int i; + + printf("Number of active nodes: %ld\n", vertex->nactive); + + for(i=0;inactive;i++) { + printf("%ld ", vertex->nodes[i]->taxa_index); + } + printf("\n"); + + return; +} + + + + + + + + + +/* + * NJ_print_r() - + * + */ +void +NJ_print_r(DMAT *dmat) { + + long int i; + + printf("\n"); + for(i=0;isize;i++) { + printf("r[%ld] = %0.2f\n", i, dmat->r[i]); + } + printf("\n"); + + return; +} + + + + + +/* + * NJ_print_taxanames() - + * + * Print taxa names here + * + */ +void +NJ_print_taxanames(DMAT *dmat) { + + long int i; + + printf("Number of taxa: %ld\n", dmat->ntaxa); + + for(i=0;intaxa;i++) { + printf("%ld) %s\n", i, dmat->taxaname[i]); + } + + printf("\n"); + + return; +} + + + + +/* + * NJ_shuffle_distance_matrix() - + * + * Randomize a distance matrix here + * + */ +void +NJ_shuffle_distance_matrix(DMAT *dmat) { + + + long int *perm = NULL; + char **tmp_taxaname = NULL; + float *tmp_val = NULL; + long int i, j; + + + /* alloc the random permutation and a new matrix to hold the shuffled vals */ + perm = (long int *)calloc(dmat->size, sizeof(long int)); + tmp_taxaname = (char **)calloc(dmat->size, sizeof(char *)); + tmp_val = (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float)); + if(!tmp_taxaname || !perm || !tmp_val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_shuffle_distance_matrix()\n"); + exit(-1); + } + + /* compute a permutation which will describe how to shuffle the matrix */ + NJ_permute(perm, dmat->size); + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + if(perm[j] < perm[i]) { + tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[j], perm[i], dmat->size)]; + } else { + tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[i], perm[j], dmat->size)]; + } + + } + + tmp_taxaname[i] = dmat->taxaname[perm[i]]; + } + + /* free our random permutation */ + if(perm) { + free(perm); + } + + /* free the old value matrix */ + if(dmat->val) { + free(dmat->val); + } + + /* re-assign the value matrix pointers */ + dmat->val = tmp_val; + dmat->valhandle = dmat->val; + + /* + * Free our old taxaname with its particular ordering + * and re-assign to the new. + */ + if(dmat->taxaname) { + free(dmat->taxaname); + } + dmat->taxaname = tmp_taxaname; + + return; +} + + + +/* + * NJ_free_tree() - + * + * Free a given NJ tree + */ +void +NJ_free_tree(NJ_TREE *node) { + + if(!node) { + return; + } + + if(node->left) { + NJ_free_tree(node->left); + } + + if(node->right) { + NJ_free_tree(node->right); + } + + free(node); + + return; +} + + + + + + + + + +/* + * NJ_print_permutation() + * + * Print a permutation + * + */ +void +NJ_print_permutation(long int *perm, + long int size) { + + long int i; + + for(i=0;intaxa = src->ntaxa; + dest->size = src->size; + + /* allocate space for array of pointers to taxanames */ + dest->taxaname = (char **)calloc(dest->ntaxa, sizeof(char *)); + if(!dest->taxaname) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); + goto XIT_BAD; + } + + /* allocate space for the taxanames themselves */ + for(i=0;intaxa;i++) { + dest->taxaname[i] = (char *)calloc(strlen(src->taxaname[i])+1, sizeof(char)); + if(!dest->taxaname[i]) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); + goto XIT_BAD; + } + } + + /* allocate space for the distance values */ + dest->val = (float *)calloc(NJ_NCELLS(src->ntaxa), sizeof(float)); + if(!dest->val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); + goto XIT_BAD; + } + + /* allocate space for the r and r2 vectors */ + dest->r = (float *)calloc(src->ntaxa, sizeof(float)); + dest->r2 = (float *)calloc(src->ntaxa, sizeof(float)); + + /* copy titles */ + for(i=0;intaxa;i++) { + strcpy(dest->taxaname[i], src->taxaname[i]); + } + + /* copy values */ + memcpy(dest->val, src->valhandle, NJ_NCELLS(src->ntaxa)*sizeof(float)); + + /* copy r and r2 */ + memcpy(dest->r, src->rhandle, src->ntaxa*sizeof(float)); + memcpy(dest->r2, src->r2handle, src->ntaxa*sizeof(float)); + + /* track some memory addresses */ + dest->valhandle = dest->val; + dest->rhandle = dest->r; + dest->r2handle = dest->r2; + + return(dest); + + XIT_BAD: + + /* free what we may have allocated */ + NJ_free_dmat(dest); + + return(NULL); +} + + + + +/* + * NJ_free_dmat() - + */ +void +NJ_free_dmat(DMAT *dmat) { + + long int i; + + if(dmat) { + + if(dmat->taxaname) { + + for(i=0;intaxa;i++) { + if(dmat->taxaname[i]) { + free(dmat->taxaname[i]); + } + } + + free(dmat->taxaname); + } + + if(dmat->valhandle) { + free(dmat->valhandle); + } + + if(dmat->rhandle) { + free(dmat->rhandle); + } + + if(dmat->r2handle) { + free(dmat->r2handle); + } + + free(dmat); + } + + return; +} + + + + + +/* + * NJ_free_vertex() - + * + * Free the vertex data structure + * + */ +void +NJ_free_vertex(NJ_VERTEX *vertex) { + + if(vertex) { + if(vertex->nodes_handle) { + free(vertex->nodes_handle); + } + free(vertex); + } + + return; +} + + + + + + + + + +/* + * + * NJ_min_transform() - Find the smallest transformed value to identify + * which nodes to join. + * + * INPUTS: + * ------- + * dmat -- The distance matrix + * + * RETURNS: + * -------- + * -- The minimimum transformed distance + * ret_i -- The row of the smallest transformed distance (by reference) + * ret_j -- The col of the smallest transformed distance (by reference) + * + * + * DESCRIPTION: + * ------------ + * + * Used only with traditional Neighbor-Joining, this function checks the entire + * working distance matrix and identifies the smallest transformed distance. + * This requires traversing the entire diagonal matrix, which is itself a + * O(N^2) operation. + * + */ +float +NJ_min_transform(DMAT *dmat, + long int *ret_i, + long int *ret_j) { + + long int i, j; /* indices used for looping */ + long int tmp_i = 0;/* to limit pointer dereferencing */ + long int tmp_j = 0;/* to limit pointer dereferencing */ + float smallest; /* track the smallest trans. dist */ + float curval; /* the current trans. dist in loop */ + + float *ptr; /* pointer into distance matrix */ + float *r2; /* pointer to r2 matrix for computing transformed dists */ + + smallest = (float)HUGE_VAL; + + /* track these here to limit pointer dereferencing in inner loop */ + ptr = dmat->val; + r2 = dmat->r2; + + /* for every row */ + for(i=0;isize;i++) { + ptr++; /* skip diagonal */ + for(j=i+1;jsize;j++) { /* for every column */ + + /* find transformed distance in matrix at i, j */ + curval = *(ptr++) - (r2[i] + r2[j]); + + /* if the transformed distanance is less than the known minimum */ + if(curval < smallest) { + + smallest = curval; + tmp_i = i; + tmp_j = j; + } + } + } + + /* pass back (by reference) the coords of the min. transformed distance */ + *ret_i = tmp_i; + *ret_j = tmp_j; + + return(smallest); /* return the min transformed distance */ +} + + + + + + + diff --git a/clearcut.h b/clearcut.h index 2710a8d..f3124e3 100644 --- a/clearcut.h +++ b/clearcut.h @@ -1,3 +1,6 @@ + + + /* * clearcut.h * @@ -47,10 +50,11 @@ #ifndef _INC_CLEARCUT_H_ #define _INC_CLEARCUT_H_ 1 +extern "C" { + #include "common.h" #include "cmdargs.h" - #define NJ_VERSION "1.0.9" @@ -180,11 +184,8 @@ typedef struct _STRUCT_NJ_VERTEX { } NJ_VERTEX; - - - - /* some function prototypes */ +int clearcut_main(int, char**); /* core function for performing Relaxed Neighbor Joining */ NJ_TREE * @@ -272,6 +273,8 @@ NJ_min_transform(DMAT *dmat, long int *ret_i, long int *ret_j); +} + #endif /* _INC_CLEARCUT_H_ */ @@ -279,3 +282,4 @@ NJ_min_transform(DMAT *dmat, + diff --git a/clearcutcommand.cpp b/clearcutcommand.cpp index d6c284d..7d0d539 100644 --- a/clearcutcommand.cpp +++ b/clearcutcommand.cpp @@ -8,6 +8,8 @@ */ #include "clearcutcommand.h" +#include "clearcut.h" + /**************************************************************************************/ ClearcutCommand::ClearcutCommand(string option) { @@ -177,52 +179,81 @@ int ClearcutCommand::execute() { //prepare filename string outputName = outputDir + m->getRootName(m->getSimpleName(inputFile)) + "tre"; - //get location of clearcut - GlobalData* globaldata = GlobalData::getInstance(); - string path = globaldata->argv; - path = path.substr(0, (path.find_last_of('m'))); + vector cPara; - string clearcutCommand = path + "clearcut "; + char* tempClearcut = new char[8]; strcpy(tempClearcut, "clearcut"); cPara.push_back(tempClearcut); //you gave us a distance matrix - if (phylipfile != "") { clearcutCommand += "--distance "; } + if (phylipfile != "") { char* temp = new char[10]; strcpy(temp, "--distance"); cPara.push_back(temp); } //you gave us a fastafile - if (fastafile != "") { clearcutCommand += "--alignment "; } + if (fastafile != "") { char* temp = new char[11]; strcpy(temp, "--alignment"); cPara.push_back(temp); } - if (version) { clearcutCommand += "--version "; } - if (verbose) { clearcutCommand += "--verbose "; } - if (quiet) { clearcutCommand += "--quiet "; } - if (seed != "*") { clearcutCommand += "--seed=" + seed + " "; } - if (norandom) { clearcutCommand += "--norandom "; } - if (shuffle) { clearcutCommand += "--shuffle "; } - if (neighbor) { clearcutCommand += "--neighbor "; } + if (version) { char* temp = new char[9]; strcpy(temp, "--version"); cPara.push_back(temp); } + if (verbose) { char* temp = new char[9]; strcpy(temp, "--verbose"); cPara.push_back(temp); } + if (quiet) { char* temp = new char[7]; strcpy(temp, "--quiet"); cPara.push_back(temp); } + if (seed != "*") { + string tempSeed = "--seed=" + seed; + char* temp = new char[tempSeed.length()]; + strcpy(temp, tempSeed.c_str()); + cPara.push_back(temp); + } + if (norandom) { char* temp = new char[10]; strcpy(temp, "--norandom"); cPara.push_back(temp); } + if (shuffle) { char* temp = new char[9]; strcpy(temp, "--shuffle"); cPara.push_back(temp); } + if (neighbor) { char* temp = new char[10]; strcpy(temp, "--neighbor"); cPara.push_back(temp); } + string tempIn = ""; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - clearcutCommand += "--in=" + inputFile + " "; + tempIn += "--in=" + inputFile; #else - clearcutCommand += "--in=\"" + inputFile + "\" "; + tempIn += "--in=\"" + inputFile + "\""; #endif - if (stdoutWanted) { clearcutCommand += "--stdout "; } + char* tempI = new char[tempIn.length()]; + strcpy(tempI, tempIn.c_str()); + cPara.push_back(tempI); + + if (stdoutWanted) { char* temp = new char[8]; strcpy(temp, "--stdout"); cPara.push_back(temp); } else{ + string tempOut = ""; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - clearcutCommand += "--out=" + outputName + " "; } + tempOut += "--out=" + outputName; #else - clearcutCommand += "--out=\"" + outputName + "\" "; } + tempOut += "--out=\"" + outputName + "\""; #endif + + char* temp = new char[tempOut.length()]; + strcpy(temp, tempOut.c_str()); + cPara.push_back(temp); + } + + if (DNA) { char* temp = new char[5]; strcpy(temp, "--DNA"); cPara.push_back(temp); } + if (protein) { char* temp = new char[9]; strcpy(temp, "--protein"); cPara.push_back(temp); } + if (jukes) { char* temp = new char[7]; strcpy(temp, "--jukes"); cPara.push_back(temp); } + if (kimura) { char* temp = new char[8]; strcpy(temp, "--kimura"); cPara.push_back(temp); } + if (matrixout != "") { + string tempMatrix = "--matrixout=" + matrixout; + char* temp = new char[tempMatrix.length()]; + strcpy(temp, tempMatrix.c_str()); + cPara.push_back(temp); + } + + if (ntrees != "1") { + string tempNtrees = "--ntrees=" + ntrees; + char* temp = new char[tempNtrees.length()]; + strcpy(temp, tempNtrees.c_str()); + cPara.push_back(temp); + } + + if (expblen) { char* temp = new char[9]; strcpy(temp, "--expblen"); cPara.push_back(temp); } + if (expdist) { char* temp = new char[9]; strcpy(temp, "--expdist"); cPara.push_back(temp); } + + char** clearcutParameters; + clearcutParameters = new char*[cPara.size()]; + for (int i = 0; i < cPara.size(); i++) { clearcutParameters[i] = cPara[i]; } + int numArgs = cPara.size(); - if (DNA) { clearcutCommand += "--DNA "; } - if (protein) { clearcutCommand += "--protein "; } - if (jukes) { clearcutCommand += "--jukes "; } - if (kimura) { clearcutCommand += "--kimura "; } - if (matrixout != "") { clearcutCommand += "--matrixout=" + matrixout + " "; } - if (ntrees != "1") { clearcutCommand += "--ntrees=" + ntrees + " "; } - if (expblen) { clearcutCommand += "--expblen "; } - if (expdist) { clearcutCommand += "--expdist "; } - - //run clearcut - system(clearcutCommand.c_str()); + clearcut_main(numArgs, clearcutParameters); if (!stdoutWanted) { m->mothurOutEndLine(); diff --git a/cmdargs.c b/cmdargs.c deleted file mode 100644 index 156b1da..0000000 --- a/cmdargs.c +++ /dev/null @@ -1,526 +0,0 @@ -/* - * cmdargs.c - * - * $Id$ - * - ***************************************************************************** - * - * Copyright (c) 2004, Luke Sheneman - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * + Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * + Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * + The names of its contributors may not be used to endorse or promote - * products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - ***************************************************************************** - * - * AUTHOR: - * - * Luke Sheneman - * sheneman@cs.uidaho.edu - * - */ - -#include -#include -#include -#include - - -#ifdef USE_GNU -#include -#else -#include "getopt_long.h" -#endif /* USE_GNU*/ - - -#include "clearcut.h" -#include "cmdargs.h" - - -/* - * NJ_handle_args() - - * - */ -NJ_ARGS * -NJ_handle_args(int argc, - char *argv[]) { - - static NJ_ARGS nj_args; - int option_index, c; - - struct option NJ_long_options[] = { - - /* These options don't set a flag */ - {"in", required_argument, NULL, 'i'}, - {"out", required_argument, NULL, 'o'}, - {"seed", required_argument, NULL, 's'}, - {"matrixout", required_argument, NULL, 'm'}, - {"ntrees", required_argument, NULL, 'n'}, - - /* These options set a flag */ - {"verbose", no_argument, &(nj_args.verbose_flag), 1}, - {"quiet", no_argument, &(nj_args.quiet_flag), 1}, - {"distance", no_argument, &(nj_args.input_mode), NJ_INPUT_MODE_DISTANCE}, - {"alignment", no_argument, &(nj_args.input_mode), NJ_INPUT_MODE_ALIGNED_SEQUENCES}, - {"help", no_argument, &(nj_args.help), 1}, - {"version", no_argument, &(nj_args.version), 1}, - {"norandom", no_argument, &(nj_args.norandom), 1}, - {"shuffle", no_argument, &(nj_args.shuffle), 1}, - {"stdin", no_argument, &(nj_args.stdin_flag), 1}, - {"stdout", no_argument, &(nj_args.stdout_flag), 1}, - {"dna", no_argument, &(nj_args.dna_flag), 1}, - {"DNA", no_argument, &(nj_args.dna_flag), 1}, - {"protein", no_argument, &(nj_args.protein_flag), 1}, - {"neighbor", no_argument, &(nj_args.neighbor), 1}, - {"expblen", no_argument, &(nj_args.expblen), 1}, - {"expdist", no_argument, &(nj_args.expdist), 1}, - - {"jukes", no_argument, &(nj_args.jukes_flag), 1}, - {"kimura", no_argument, &(nj_args.kimura_flag), 1}, - - {0, 0, 0, 0} - - }; - - /* initializes options to their default */ - nj_args.infilename = NULL; - nj_args.outfilename = NULL; - nj_args.matrixout = NULL; - nj_args.seed = time(0); - nj_args.verbose_flag = 0; - nj_args.quiet_flag = 0; - nj_args.input_mode = NJ_INPUT_MODE_DISTANCE; - nj_args.help = 0; - nj_args.version = 0; - nj_args.norandom = 0; - nj_args.shuffle = 0; - nj_args.stdin_flag = 0; - nj_args.stdout_flag = 0; - nj_args.dna_flag = 0; - nj_args.protein_flag = 0; - nj_args.correction_model = NJ_MODEL_NONE; - nj_args.jukes_flag = 0; - nj_args.kimura_flag = 0; - nj_args.neighbor = 0; - nj_args.ntrees = 1; - nj_args.expblen = 0; - nj_args.expdist = 0; - - while(1) { - c = getopt_long(argc, - argv, - "i:o:s:m:n:vqduahVSIOrDPjkNeE", - NJ_long_options, - &option_index); - - if(c == -1) { - break; - } - - switch(c) { - - case 0: - if(NJ_long_options[option_index].flag) { - break; - } - - printf("option %s", NJ_long_options[option_index].name); - if(optarg) { - printf(" with arg %s", optarg); - } - printf("\n"); - break; - - case 'i': - nj_args.infilename = optarg; - break; - - case 'o': - nj_args.outfilename = optarg; - break; - - case 's': - nj_args.seed = atoi(optarg); - break; - - case 'm': - nj_args.matrixout = optarg; - break; - - case 'n': - nj_args.ntrees = atoi(optarg); - break; - - case 'v': - nj_args.verbose_flag = 1; - break; - - case 'q': - nj_args.quiet_flag = 1; - break; - - case 'd': - nj_args.input_mode = NJ_INPUT_MODE_DISTANCE; - break; - - case 'a': - nj_args.input_mode = NJ_INPUT_MODE_ALIGNED_SEQUENCES; - break; - - case 'h': - nj_args.help = 1; - break; - - case 'V': - nj_args.version = 1; - break; - - case 'S': - nj_args.shuffle = 1; - break; - - case 'I': - nj_args.stdin_flag = 1; - break; - - case 'O': - nj_args.stdin_flag = 1; - break; - - case 'r': - nj_args.norandom = 1; - break; - - case 'D': - nj_args.dna_flag = 1; - break; - - case 'P': - nj_args.protein_flag = 1; - break; - - case 'j': - nj_args.jukes_flag = 1; - break; - - case 'k': - nj_args.kimura_flag = 1; - break; - - case 'N': - nj_args.neighbor = 1; - break; - - case 'e': - nj_args.expblen = 1; - break; - - case 'E': - nj_args.expdist = 1; - break; - - default: - NJ_usage(); - exit(-1); - } - } - - if(optind < argc) { - fprintf(stderr, "Clearcut: Unknown command-line argument:\n --> %s\n", argv[optind]); - NJ_usage(); - exit(-1); - } - - if(nj_args.version) { - printf("Clearcut Version: %s\n", NJ_VERSION); - exit(0); - } - - if(nj_args.help) { - NJ_usage(); - exit(0); - } - - /* if stdin & explicit filename are specified for input */ - if(nj_args.stdin_flag) { - if(nj_args.infilename) { - fprintf(stderr, "Clearcut: Ambiguous input source specified. Specify input filename OR stdin.\n"); - NJ_usage(); - exit(-1); - } - } - - /* if stdout & explicit filename are specified for output */ - if(nj_args.stdout_flag) { - if(nj_args.outfilename) { - fprintf(stderr, "Clearcut: Ambiguous output specified. Specify output filename OR stdout.\n"); - NJ_usage(); - exit(-1); - } - } - - /* if user did not specify stdin or filename, default to stdin */ - if(!nj_args.stdin_flag) { - if(!nj_args.infilename) { - - fprintf(stderr, "Clearcut: No input file specified. Using stdin.\n"); - nj_args.stdin_flag = 1; - } - } - - /* if user did not specify stdout or filename, default to stdout */ - if(!nj_args.stdout_flag) { - if(!nj_args.outfilename) { - - fprintf(stderr, "Clearcut: No output file specified. Using stdout.\n"); - nj_args.stdout_flag = 1; - } - } - - /* User must specify distance matrix or alignment */ - if(nj_args.input_mode == NJ_INPUT_MODE_UNKNOWN) { - fprintf(stderr, "Clearcut: Must specify input type (--distance | --alignment)\n"); - NJ_usage(); - exit(-1); - } - - /* do not allow protein or DNA options for distance matrix input */ - if(nj_args.input_mode == NJ_INPUT_MODE_DISTANCE) { - if(nj_args.dna_flag || nj_args.protein_flag) { - fprintf(stderr, "Clearcut: Ambiguous arguments. (--protein | --DNA) do not apply to distance \n"); - NJ_usage(); - exit(-1); - } - } - - /* make sure different filenames were specified for input and output */ - if(!nj_args.stdin_flag && !nj_args.stdout_flag) { - - if(!strcmp(nj_args.infilename, nj_args.outfilename)) { - fprintf(stderr, "Clearcut: Input filename and output filename must be unique.\n"); - NJ_usage(); - exit(-1); - } - } - - /* make sure that user specifies DNA or Protein if dealing with alignment input */ - if(nj_args.input_mode == NJ_INPUT_MODE_ALIGNED_SEQUENCES) { - if(!nj_args.dna_flag && !nj_args.protein_flag) { - fprintf(stderr, "Clearcut: Must specify protein or DNA for alignment input.\n"); - NJ_usage(); - exit(-1); - } - } - - /* make sure that user does not specify both protein and DNA when dealing with alignment input */ - if(nj_args.input_mode == NJ_INPUT_MODE_ALIGNED_SEQUENCES) { - if(nj_args.dna_flag && nj_args.protein_flag) { - fprintf(stderr, "Clearcut: Specifying protein and DNA sequences are mutually exclusive options\n"); - NJ_usage(); - exit(-1); - } - } - - /* make sure verbose and quiet were not specified together */ - if(nj_args.verbose_flag && nj_args.quiet_flag) { - fprintf(stderr, "Clearcut: Verbose and Quiet mode are mutually exclusive.\n"); - NJ_usage(); - exit(-1); - } - - /* make sure that a correction model was specified only when providing an alignment */ - if(nj_args.input_mode == NJ_INPUT_MODE_DISTANCE) { - if(nj_args.jukes_flag || nj_args.kimura_flag) { - fprintf(stderr, "Clearcut: Only specify correction model for alignment input.\n"); - NJ_usage(); - exit(-1); - } - } else { - if(nj_args.jukes_flag && nj_args.kimura_flag) { - fprintf(stderr, "Clearcut: Only specify one correction model\n"); - NJ_usage(); - exit(-1); - } else { - if(nj_args.jukes_flag && !nj_args.kimura_flag) { - nj_args.correction_model = NJ_MODEL_JUKES; - } else if(nj_args.kimura_flag && !nj_args.jukes_flag) { - nj_args.correction_model = NJ_MODEL_KIMURA; - } else { - nj_args.correction_model = NJ_MODEL_NONE; /* DEFAULT */ - } - } - } - - /* make sure that the number of output trees is reasonable */ - if(nj_args.ntrees <= 0) { - fprintf(stderr, "Clearcut: Number of output trees must be a positive integer.\n"); - NJ_usage(); - exit(-1); - } - - /* - * make sure that if exponential distances are specified, - * we are dealing with alignment input - */ - if(nj_args.expdist && nj_args.input_mode != NJ_INPUT_MODE_ALIGNED_SEQUENCES) { - fprintf(stderr, "Clearcut: Exponential notation for distance matrix output requires that input be an alignment\n"); - NJ_usage(); - exit(-1); - } - - return(&nj_args); -} - - - - - -/* - * NJ_print_args() - - * - */ -void -NJ_print_args(NJ_ARGS *nj_args) { - - char input_mode[32]; - - switch (nj_args->input_mode) { - case NJ_INPUT_MODE_DISTANCE: - sprintf(input_mode, "Distance Matrix"); - break; - case NJ_INPUT_MODE_UNALIGNED_SEQUENCES: - sprintf(input_mode, "Unaligned Sequences"); - break; - case NJ_INPUT_MODE_ALIGNED_SEQUENCES: - sprintf(input_mode, "Aligned Sequences"); - break; - default: - sprintf(input_mode, "UNKNOWN"); - break; - } - - printf("\n*** Command Line Arguments ***\n"); - - printf("Input Mode: %s\n", input_mode); - - if(nj_args->stdin_flag) { - printf("Input from STDIN\n"); - } else { - printf("Input File: %s\n", nj_args->infilename); - } - - if(nj_args->stdout_flag) { - printf("Output from STDOUT\n"); - } else { - printf("Output File: %s\n", nj_args->outfilename); - } - - if(nj_args->input_mode != NJ_INPUT_MODE_DISTANCE) { - if(nj_args->aligned_flag) { - printf("Input Sequences Aligned: YES\n"); - } else { - printf("Input Sequences Aligned: NO\n"); - } - } - - if(nj_args->verbose_flag) { - printf("Verbose Mode: ON\n"); - } else { - printf("Verbose Mode: OFF\n"); - } - - if(nj_args->quiet_flag) { - printf("Quiet Mode: ON\n"); - } else { - printf("Quiet Mode: OFF\n"); - } - - if(nj_args->seed) { - printf("Random Seed: %d\n", nj_args->seed); - } - - printf("\n*******\n"); - - return; -} - - - - -/* - * NJ_usage() - - * - * Print a usage message - * - */ -void -NJ_usage(void) { - - printf("Usage: clearcut --in= --out= [options]...\n"); - printf("GENERAL OPTIONS:\n"); - printf(" -h, --help Display this information.\n"); - printf(" -V, --version Print the version of this program.\n"); - printf(" -v, --verbose More output. (Default: OFF)\n"); - printf(" -q, --quiet Silent operation. (Default: ON)\n"); - printf(" -s, --seed= Explicitly set the PRNG seed to a specific value.\n"); - printf(" -r, --norandom Attempt joins deterministically. (Default: OFF)\n"); - printf(" -S, --shuffle Randomly shuffle the distance matrix. (Default: OFF)\n"); - printf(" -N, --neighbor Use traditional Neighbor-Joining algorithm. (Default: OFF)\n"); - - printf("\n"); - printf("INPUT OPTIONS:\n"); - printf(" -I, --stdin Read input from STDIN.\n"); - printf(" -d, --distance Input file is a distance matrix. (Default: ON)\n"); - printf(" -a, --alignment Input file is a set of aligned sequences. (Default: OFF)\n"); - printf(" -D, --DNA Input alignment are DNA sequences.\n"); - printf(" -P, --protein Input alignment are protein sequences.\n"); - - printf("\n"); - printf("CORRECTION MODEL FOR COMPUTING DISTANCE MATRIX (Default: NO Correction):\n"); - printf(" -j, --jukes Use Jukes-Cantor correction for computing distance matrix.\n"); - printf(" -k, --kimura Use Kimura correction for distance matrix.\n"); - - printf("\n"); - printf("OUTPUT OPTIONS:\n"); - printf(" -O, --stdout Output tree to STDOUT.\n"); - printf(" -m, --matrixout= Output distance matrix to specified file.\n"); - printf(" -n, --ntrees= Output n trees. (Default: 1)\n"); - printf(" -e, --expblen Exponential notation for branch lengths. (Default: OFF)\n"); - printf(" -E, --expdist Exponential notation in distance output. (Default: OFF)\n"); - - printf("\n"); - printf("EXAMPLES:\n"); - printf(" Compute tree by supplying distance matrix via stdin:\n"); - printf(" clearcut --distance < distances.txt > treefile.tre\n"); - printf("\n"); - printf(" Compute tree by supplying an alignment of DNA sequences from a file:\n"); - printf(" clearcut --alignment --DNA --in=alignment.txt --out=treefile.tre\n"); - - return; -} - - - diff --git a/cmdargs.cpp b/cmdargs.cpp new file mode 100644 index 0000000..416ced8 --- /dev/null +++ b/cmdargs.cpp @@ -0,0 +1,526 @@ +/* + * cmdargs.c + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + +#include +#include +#include +#include + + +#ifdef USE_GNU +#include +#else +#include "getopt_long.h" +#endif /* USE_GNU*/ + + +#include "clearcut.h" +#include "cmdargs.h" + + +/* + * NJ_handle_args() - + * + */ +NJ_ARGS * +NJ_handle_args(int argc, + char *argv[]) { + + static NJ_ARGS nj_args; + int option_index, c; + + struct option NJ_long_options[] = { + + /* These options don't set a flag */ + {"in", required_argument, NULL, 'i'}, + {"out", required_argument, NULL, 'o'}, + {"seed", required_argument, NULL, 's'}, + {"matrixout", required_argument, NULL, 'm'}, + {"ntrees", required_argument, NULL, 'n'}, + + /* These options set a flag */ + {"verbose", no_argument, &(nj_args.verbose_flag), 1}, + {"quiet", no_argument, &(nj_args.quiet_flag), 1}, + {"distance", no_argument, &(nj_args.input_mode), NJ_INPUT_MODE_DISTANCE}, + {"alignment", no_argument, &(nj_args.input_mode), NJ_INPUT_MODE_ALIGNED_SEQUENCES}, + {"help", no_argument, &(nj_args.help), 1}, + {"version", no_argument, &(nj_args.version), 1}, + {"norandom", no_argument, &(nj_args.norandom), 1}, + {"shuffle", no_argument, &(nj_args.shuffle), 1}, + {"stdin", no_argument, &(nj_args.stdin_flag), 1}, + {"stdout", no_argument, &(nj_args.stdout_flag), 1}, + {"dna", no_argument, &(nj_args.dna_flag), 1}, + {"DNA", no_argument, &(nj_args.dna_flag), 1}, + {"protein", no_argument, &(nj_args.protein_flag), 1}, + {"neighbor", no_argument, &(nj_args.neighbor), 1}, + {"expblen", no_argument, &(nj_args.expblen), 1}, + {"expdist", no_argument, &(nj_args.expdist), 1}, + + {"jukes", no_argument, &(nj_args.jukes_flag), 1}, + {"kimura", no_argument, &(nj_args.kimura_flag), 1}, + + {0, 0, 0, 0} + + }; + + /* initializes options to their default */ + nj_args.infilename = NULL; + nj_args.outfilename = NULL; + nj_args.matrixout = NULL; + nj_args.seed = time(0); + nj_args.verbose_flag = 0; + nj_args.quiet_flag = 0; + nj_args.input_mode = NJ_INPUT_MODE_DISTANCE; + nj_args.help = 0; + nj_args.version = 0; + nj_args.norandom = 0; + nj_args.shuffle = 0; + nj_args.stdin_flag = 0; + nj_args.stdout_flag = 0; + nj_args.dna_flag = 0; + nj_args.protein_flag = 0; + nj_args.correction_model = NJ_MODEL_NONE; + nj_args.jukes_flag = 0; + nj_args.kimura_flag = 0; + nj_args.neighbor = 0; + nj_args.ntrees = 1; + nj_args.expblen = 0; + nj_args.expdist = 0; + + while(1) { + + c = getopt_long(argc, + argv, + "i:o:s:m:n:vqduahVSIOrDPjkNeE", + NJ_long_options, + &option_index); + if(c == -1) { + break; + } + + switch(c) { + + case 0: + if(NJ_long_options[option_index].flag) { + break; + } + + printf("option %s", NJ_long_options[option_index].name); + if(optarg) { + printf(" with arg %s", optarg); + } + printf("\n"); + break; + + case 'i': + nj_args.infilename = optarg; + break; + + case 'o': + nj_args.outfilename = optarg; + break; + + case 's': + nj_args.seed = atoi(optarg); + break; + + case 'm': + nj_args.matrixout = optarg; + break; + + case 'n': + nj_args.ntrees = atoi(optarg); + break; + + case 'v': + nj_args.verbose_flag = 1; + break; + + case 'q': + nj_args.quiet_flag = 1; + break; + + case 'd': + nj_args.input_mode = NJ_INPUT_MODE_DISTANCE; + break; + + case 'a': + nj_args.input_mode = NJ_INPUT_MODE_ALIGNED_SEQUENCES; + break; + + case 'h': + nj_args.help = 1; + break; + + case 'V': + nj_args.version = 1; + break; + + case 'S': + nj_args.shuffle = 1; + break; + + case 'I': + nj_args.stdin_flag = 1; + break; + + case 'O': + nj_args.stdin_flag = 1; + break; + + case 'r': + nj_args.norandom = 1; + break; + + case 'D': + nj_args.dna_flag = 1; + break; + + case 'P': + nj_args.protein_flag = 1; + break; + + case 'j': + nj_args.jukes_flag = 1; + break; + + case 'k': + nj_args.kimura_flag = 1; + break; + + case 'N': + nj_args.neighbor = 1; + break; + + case 'e': + nj_args.expblen = 1; + break; + + case 'E': + nj_args.expdist = 1; + break; + + default: + NJ_usage(); + exit(-1); + } + } + + if(optind < argc) { + fprintf(stderr, "Clearcut: Unknown command-line argument:\n --> %s\n", argv[optind]); + NJ_usage(); + exit(-1); + } + + if(nj_args.version) { + printf("Clearcut Version: %s\n", NJ_VERSION); + exit(0); + } + + if(nj_args.help) { + NJ_usage(); + exit(0); + } + + /* if stdin & explicit filename are specified for input */ + if(nj_args.stdin_flag) { + if(nj_args.infilename) { + fprintf(stderr, "Clearcut: Ambiguous input source specified. Specify input filename OR stdin.\n"); + NJ_usage(); + exit(-1); + } + } + + /* if stdout & explicit filename are specified for output */ + if(nj_args.stdout_flag) { + if(nj_args.outfilename) { + fprintf(stderr, "Clearcut: Ambiguous output specified. Specify output filename OR stdout.\n"); + NJ_usage(); + exit(-1); + } + } + + /* if user did not specify stdin or filename, default to stdin */ + if(!nj_args.stdin_flag) { + if(!nj_args.infilename) { + + fprintf(stderr, "Clearcut: No input file specified. Using stdin.\n"); + nj_args.stdin_flag = 1; + } + } + + /* if user did not specify stdout or filename, default to stdout */ + if(!nj_args.stdout_flag) { + if(!nj_args.outfilename) { + + fprintf(stderr, "Clearcut: No output file specified. Using stdout.\n"); + nj_args.stdout_flag = 1; + } + } + + /* User must specify distance matrix or alignment */ + if(nj_args.input_mode == NJ_INPUT_MODE_UNKNOWN) { + fprintf(stderr, "Clearcut: Must specify input type (--distance | --alignment)\n"); + NJ_usage(); + exit(-1); + } + + /* do not allow protein or DNA options for distance matrix input */ + if(nj_args.input_mode == NJ_INPUT_MODE_DISTANCE) { + if(nj_args.dna_flag || nj_args.protein_flag) { + fprintf(stderr, "Clearcut: Ambiguous arguments. (--protein | --DNA) do not apply to distance \n"); + NJ_usage(); + exit(-1); + } + } + + /* make sure different filenames were specified for input and output */ + if(!nj_args.stdin_flag && !nj_args.stdout_flag) { + + if(!strcmp(nj_args.infilename, nj_args.outfilename)) { + fprintf(stderr, "Clearcut: Input filename and output filename must be unique.\n"); + NJ_usage(); + exit(-1); + } + } + + /* make sure that user specifies DNA or Protein if dealing with alignment input */ + if(nj_args.input_mode == NJ_INPUT_MODE_ALIGNED_SEQUENCES) { + if(!nj_args.dna_flag && !nj_args.protein_flag) { + fprintf(stderr, "Clearcut: Must specify protein or DNA for alignment input.\n"); + NJ_usage(); + exit(-1); + } + } + + /* make sure that user does not specify both protein and DNA when dealing with alignment input */ + if(nj_args.input_mode == NJ_INPUT_MODE_ALIGNED_SEQUENCES) { + if(nj_args.dna_flag && nj_args.protein_flag) { + fprintf(stderr, "Clearcut: Specifying protein and DNA sequences are mutually exclusive options\n"); + NJ_usage(); + exit(-1); + } + } + + /* make sure verbose and quiet were not specified together */ + if(nj_args.verbose_flag && nj_args.quiet_flag) { + fprintf(stderr, "Clearcut: Verbose and Quiet mode are mutually exclusive.\n"); + NJ_usage(); + exit(-1); + } + + /* make sure that a correction model was specified only when providing an alignment */ + if(nj_args.input_mode == NJ_INPUT_MODE_DISTANCE) { + if(nj_args.jukes_flag || nj_args.kimura_flag) { + fprintf(stderr, "Clearcut: Only specify correction model for alignment input.\n"); + NJ_usage(); + exit(-1); + } + } else { + if(nj_args.jukes_flag && nj_args.kimura_flag) { + fprintf(stderr, "Clearcut: Only specify one correction model\n"); + NJ_usage(); + exit(-1); + } else { + if(nj_args.jukes_flag && !nj_args.kimura_flag) { + nj_args.correction_model = NJ_MODEL_JUKES; + } else if(nj_args.kimura_flag && !nj_args.jukes_flag) { + nj_args.correction_model = NJ_MODEL_KIMURA; + } else { + nj_args.correction_model = NJ_MODEL_NONE; /* DEFAULT */ + } + } + } + + /* make sure that the number of output trees is reasonable */ + if(nj_args.ntrees <= 0) { + fprintf(stderr, "Clearcut: Number of output trees must be a positive integer.\n"); + NJ_usage(); + exit(-1); + } + + /* + * make sure that if exponential distances are specified, + * we are dealing with alignment input + */ + if(nj_args.expdist && nj_args.input_mode != NJ_INPUT_MODE_ALIGNED_SEQUENCES) { + fprintf(stderr, "Clearcut: Exponential notation for distance matrix output requires that input be an alignment\n"); + NJ_usage(); + exit(-1); + } + + return(&nj_args); +} + + + + + +/* + * NJ_print_args() - + * + */ +void +NJ_print_args(NJ_ARGS *nj_args) { + + char input_mode[32]; + + switch (nj_args->input_mode) { + case NJ_INPUT_MODE_DISTANCE: + sprintf(input_mode, "Distance Matrix"); + break; + case NJ_INPUT_MODE_UNALIGNED_SEQUENCES: + sprintf(input_mode, "Unaligned Sequences"); + break; + case NJ_INPUT_MODE_ALIGNED_SEQUENCES: + sprintf(input_mode, "Aligned Sequences"); + break; + default: + sprintf(input_mode, "UNKNOWN"); + break; + } + + printf("\n*** Command Line Arguments ***\n"); + + printf("Input Mode: %s\n", input_mode); + + if(nj_args->stdin_flag) { + printf("Input from STDIN\n"); + } else { + printf("Input File: %s\n", nj_args->infilename); + } + + if(nj_args->stdout_flag) { + printf("Output from STDOUT\n"); + } else { + printf("Output File: %s\n", nj_args->outfilename); + } + + if(nj_args->input_mode != NJ_INPUT_MODE_DISTANCE) { + if(nj_args->aligned_flag) { + printf("Input Sequences Aligned: YES\n"); + } else { + printf("Input Sequences Aligned: NO\n"); + } + } + + if(nj_args->verbose_flag) { + printf("Verbose Mode: ON\n"); + } else { + printf("Verbose Mode: OFF\n"); + } + + if(nj_args->quiet_flag) { + printf("Quiet Mode: ON\n"); + } else { + printf("Quiet Mode: OFF\n"); + } + + if(nj_args->seed) { + printf("Random Seed: %d\n", nj_args->seed); + } + + printf("\n*******\n"); + + return; +} + + + + +/* + * NJ_usage() - + * + * Print a usage message + * + */ +void +NJ_usage(void) { + + printf("Usage: clearcut --in= --out= [options]...\n"); + printf("GENERAL OPTIONS:\n"); + printf(" -h, --help Display this information.\n"); + printf(" -V, --version Print the version of this program.\n"); + printf(" -v, --verbose More output. (Default: OFF)\n"); + printf(" -q, --quiet Silent operation. (Default: ON)\n"); + printf(" -s, --seed= Explicitly set the PRNG seed to a specific value.\n"); + printf(" -r, --norandom Attempt joins deterministically. (Default: OFF)\n"); + printf(" -S, --shuffle Randomly shuffle the distance matrix. (Default: OFF)\n"); + printf(" -N, --neighbor Use traditional Neighbor-Joining algorithm. (Default: OFF)\n"); + + printf("\n"); + printf("INPUT OPTIONS:\n"); + printf(" -I, --stdin Read input from STDIN.\n"); + printf(" -d, --distance Input file is a distance matrix. (Default: ON)\n"); + printf(" -a, --alignment Input file is a set of aligned sequences. (Default: OFF)\n"); + printf(" -D, --DNA Input alignment are DNA sequences.\n"); + printf(" -P, --protein Input alignment are protein sequences.\n"); + + printf("\n"); + printf("CORRECTION MODEL FOR COMPUTING DISTANCE MATRIX (Default: NO Correction):\n"); + printf(" -j, --jukes Use Jukes-Cantor correction for computing distance matrix.\n"); + printf(" -k, --kimura Use Kimura correction for distance matrix.\n"); + + printf("\n"); + printf("OUTPUT OPTIONS:\n"); + printf(" -O, --stdout Output tree to STDOUT.\n"); + printf(" -m, --matrixout= Output distance matrix to specified file.\n"); + printf(" -n, --ntrees= Output n trees. (Default: 1)\n"); + printf(" -e, --expblen Exponential notation for branch lengths. (Default: OFF)\n"); + printf(" -E, --expdist Exponential notation in distance output. (Default: OFF)\n"); + + printf("\n"); + printf("EXAMPLES:\n"); + printf(" Compute tree by supplying distance matrix via stdin:\n"); + printf(" clearcut --distance < distances.txt > treefile.tre\n"); + printf("\n"); + printf(" Compute tree by supplying an alignment of DNA sequences from a file:\n"); + printf(" clearcut --alignment --DNA --in=alignment.txt --out=treefile.tre\n"); + + return; +} + + + diff --git a/distclearcut.c b/distclearcut.c deleted file mode 100644 index b688aa6..0000000 --- a/distclearcut.c +++ /dev/null @@ -1,668 +0,0 @@ -/* - * dist.c - * - * $Id$ - * - * - ***************************************************************************** - * - * Copyright (c) 2004, Luke Sheneman - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * + Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * + Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * + The names of its contributors may not be used to endorse or promote - * products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - ***************************************************************************** - * - * Compute a distance matrix given a set of sequences - * - ***************************************************************************** - * - * AUTHOR: - * - * Luke Sheneman - * sheneman@cs.uidaho.edu - * - */ - -#include -#include -#include -#include -#include - -#include "common.h" -#include "dayhoff.h" -#include "fasta.h" -#include "dist.h" - - - - -/* - * NJ_build_distance_matrix() - - * - * Given a filename for an alignment, read the alignment - * into memory and then compute the distance matrix - * using the appropriate correction model - */ -DMAT * -NJ_build_distance_matrix(NJ_ARGS *nj_args) { - - DMAT *dmat; - NJ_alignment *alignment; - - /* Read an alignment in FASTA format */ - alignment = - NJ_read_fasta(nj_args); - - if(!alignment) { - return(NULL); - } - - /* - * Given a global multiple sequence alignment (MSA) and - * a specified distance correction model, compute a - * corrected distance matrix - * - * From proteins, we may want to allow users to specify - * a substitution matrix (feature) - */ - dmat = - NJ_compute_dmat(nj_args, - alignment); - - // NJ_print_taxanames(dmat); - - if(!dmat) { - fprintf(stderr, "Clearcut: Error computing distance matrix\n"); - } - - /* now free the memory associated with the alignment */ - NJ_free_alignment(alignment); - - return(dmat); -} - - - - - -/* - * NJ_compute_dmat() - - * - * Given an alignment and a correction model, compute the - * distance matrix and return it - * - */ -DMAT * -NJ_compute_dmat(NJ_ARGS *nj_args, - NJ_alignment *alignment) { - - DMAT *dmat; - long int i; - - - /* allocate distance matrix here */ - dmat = (DMAT *)calloc(1, sizeof(DMAT)); - if(!dmat) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); - return(NULL); - } - - dmat->ntaxa = alignment->nseq; - dmat->size = alignment->nseq; - - /* allocate memory to hold the taxa names */ - dmat->taxaname = (char **)calloc(alignment->nseq, sizeof(char *)); - if(!dmat->taxaname) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); - return(NULL); - } - - /* copy sequence titles */ - for(i=0;inseq;i++) { - dmat->taxaname[i] = (char *)calloc(strlen(alignment->titles[i])+1, sizeof(char)); - if(!dmat->taxaname[i]) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); - return(NULL); - } - - strncpy(dmat->taxaname[i], alignment->titles[i], strlen(alignment->titles[i])); - } - - /* allocate val matrix in dmat */ - dmat->val = (float *)calloc(dmat->ntaxa*dmat->ntaxa, sizeof(float)); - if(!dmat->val) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); - return(NULL); - } - - /* now lets allocate space for the r and r2 columns */ - dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float)); - dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float)); - - /* track some memory addresses */ - dmat->rhandle = dmat->r; - dmat->r2handle = dmat->r2; - dmat->valhandle = dmat->val; - - /* apply model correction to matrix */ - switch(nj_args->correction_model) { - - case NJ_MODEL_JUKES: - - if(nj_args->dna_flag) { - NJ_DNA_jc_correction(dmat, alignment); - } else if(nj_args->protein_flag) { - NJ_PROTEIN_jc_correction(dmat, alignment); - } else { - fprintf(stderr, "Clearcut: Need to know sequence type for Jukes-Cantor model correction.\n"); - return(NULL); - } - - break; - - case NJ_MODEL_KIMURA: - - if(nj_args->dna_flag) { - NJ_DNA_k2p_correction(dmat, alignment); - } else if(nj_args->protein_flag) { - NJ_PROTEIN_kimura_correction(dmat, alignment); - } else { - fprintf(stderr, "Clearcut: Need to know sequence type for Kimura model correction.\n"); - return(NULL); - } - - break; - - case NJ_MODEL_NONE: - - NJ_no_correction(dmat, alignment); - - break; - - default: - fprintf(stderr, "Clearcut: Invalid distance correction model.\n"); - return(NULL); - } - - return(dmat); -} - - - - - -/* - * NJ_no_correction() - - * - * Compute the distance matrix without correction - * (straight percent ID) - * - * Resolve ambiguities in sequence data by skipping - * those nucleotides/residues - * - */ -void -NJ_no_correction(DMAT *dmat, - NJ_alignment *alignment) { - - long int i, j; - float pdiff; - - /* compute pairwise percent identity */ - for(i=0;isize;i++) { - for(j=i+1;jsize;j++) { - pdiff = 1.0 - NJ_pw_percentid(alignment, i, j); - dmat->val[NJ_MAP(i, j, dmat->size)] = pdiff; - } - } - - return; -} - - - - -/* - * NJ_DNA_jc_correction() - - * - * Compute the distance matrix with jukes-cantor correction - * and assign high distance if sequence divergence exceeds - * 0.75 - * - * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.), - * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132. - * New York: Academic Press - * - */ -void -NJ_DNA_jc_correction(DMAT *dmat, - NJ_alignment *alignment) { - - long int i, j; - long int k; - float d, cutoff, dist; - long int residues; - - cutoff = 0.75; - - for(i=0;isize;i++) { - for(j=i+1;jsize;j++) { - - k = NJ_pw_differences(alignment, i, j, &residues); - d = 1.0 - NJ_pw_percentid(alignment, i, j); - - if(d > cutoff) { - dist = NJ_BIGDIST; - } else { - dist = (-0.75) * log(1.0 - (4.0/3.0)*d); - } - - if(fabs(dist) < FLT_EPSILON) { - dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0; - } else { - dmat->val[NJ_MAP(i, j, dmat->size)] = dist; - } - } - } - - - - return; -} - - - - - - -/* - * NJ_PROTEIN_jc_correction() - - * - * This function performs modified jukes/cantor correction on - * a protein alignment - * - * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.), - * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132. - * New York: Academic Press - * - */ -void -NJ_PROTEIN_jc_correction(DMAT *dmat, - NJ_alignment *alignment) { - - long int i, j; - long int residues; - long int diff; - float dist, x; - - - for(i=0;isize;i++) { - for(j=i+1;jsize;j++) { - - diff = NJ_pw_differences(alignment, i, j, &residues); - - if(!diff || !residues) { - dist = 0.0; - } else { - - dist = (float)diff/(float)residues; - x = ((20.0/19.0)*dist); - - if(NJ_FLT_GT(x, 1.0)) { - dist = NJ_BIGDIST; - } else { - dist = -(19.0/20.0) * log(1.0 - x); - } - } - - dmat->val[NJ_MAP(i, j, dmat->size)] = dist; - } - } - - return; -} - - - - - - -/* - * NJ_DNA_k2p_correction() - - * - * Correct a distance matrix using k2p correction using - * cutoffs to avoid problems with logarithms. - * - * dist = -0.5ln(1-2P-Q) - 0.25ln(1-2Q) - * - * But due to the logarithms, this is only valid when - * - * (2P+Q <= 1) && - * (2Q <= 1) - * - * So assign arbitary distances when these constraints are - * not strictly followed. - * - * Kimura, M. (1980), A simple method for estimating evolutionary - * rates of base substitutions through comparative studies of - * nucleotide sequences. J. Mol. Evol., 16, 111-120 - * - */ -void -NJ_DNA_k2p_correction(DMAT *dmat, - NJ_alignment *alignment) { - - long int i, j; - float P; /* proportion of transitions */ - float Q; /* proportion of transversions */ - long int nucleotides; - long int transitions, transversions; - float dist; - float log_x = 0.0; /* the params for the first log */ - float log_y = 0.0; /* the params for the second log */ - - int blowup; /* a flag to specify if we have a log blowup */ - - - for(i=0;isize;i++) { - for(j=i+1;jsize;j++) { - - blowup = 0; - - /* count the number of transitions and transversions */ - NJ_DNA_count_tt(alignment, i, j, &transitions, &transversions, &nucleotides); - - if(!nucleotides) { /* sequences have no non-ambiguous overlap in alignment */ - P = 0.0; - Q = 0.0; - } else { - P = (float)transitions / (float)nucleotides; - Q = (float)transversions / (float)nucleotides; - } - - /* the first log blows up if 2*P+Q = 1.0 */ - if(NJ_FLT_EQ((2.0 * P + Q), 1.0)) { - blowup = 1; - } else { - if( NJ_FLT_LT(1.0 - 2.0*P - Q, 0.0) ) { - blowup = 1; - } else { - log_x = log(1.0 - 2.0*P - Q); - } - } - - /* the second log blows up if 2*Q >= 1.0 */ - if( NJ_FLT_EQ((2.0 * Q), 1.0) || - NJ_FLT_GT((2.0 * Q), 1.0) ) { - blowup = 1; - } else { - log_y = log(1.0 - 2.0*Q); - } - - /* if our logarithms blow up, we just set the distance to the max */ - if(blowup) { - dist = NJ_BIGDIST; - } else { - dist = (-0.5)*log_x - 0.25*log_y; - } - - if(fabs(dist) < FLT_EPSILON) { - dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0; - } else { - dmat->val[NJ_MAP(i, j, dmat->size)] = dist; - } - } - } - - return; -} - - - - -/* - * NJ_PROTEIN_kimura_correction() - - * - * Perform Kimura correction for distances derived from protein - * alignments. - * - * Kimura, M. (1983), The Neutral Theory of Molecular Evolution. - * p. 75., Cambridge University Press, Cambridge, England - * - */ -void -NJ_PROTEIN_kimura_correction(DMAT *dmat, - NJ_alignment *alignment) { - - long int i, j; - long int residues; - long int diff; - float dist; - - - printf("NJ_PROTEIN_kimura_correction()\n"); - - for(i=0;isize;i++) { - for(j=i+1;jsize;j++) { - diff = NJ_pw_differences(alignment, i, j, &residues); - - if(!diff || !residues) { - dist = 0.0; - } else { - dist = (float)diff/(float)residues; - } - - if(NJ_FLT_LT(dist, 0.75)) { - if(NJ_FLT_GT(dist, 0.0) ) { - dist = -log(1.0 - dist - (dist * dist/5.0) ); - } - } else { - if(NJ_FLT_GT(dist, 0.93) ) { - dist = 10.0; - } else { - dist = (float)NJ_dayhoff[ (int)((dist*1000.0)-750.0) ] / 100.0 ; - } - } - - dmat->val[NJ_MAP(i, j, dmat->size)] = dist; - } - } - - return; -} - - - - - -/* - * NJ_DNA_count_tt() - - * - * Count the number of transitions and transversions - * between two aligned DNA sequences - * - * This routine automatically skips ambiguities when - * counting transitions and transversions. - * - */ -void -NJ_DNA_count_tt(NJ_alignment *alignment, - long int x, - long int y, - long int *transitions, - long int *transversions, - long int *residues) { - - long int tmp_transitions = 0; - long int tmp_transversions = 0; - long int tmp_residues = 0; - char a, b; - long int i; - - for(i=0;ilength;i++) { - - a = toupper(alignment->data[x*alignment->length+i]); - b = toupper(alignment->data[y*alignment->length+i]); - - if( (a == 'A' && b == 'T') || - (a == 'T' && b == 'A') || - (a == 'A' && b == 'C') || - (a == 'C' && b == 'A') || - (a == 'T' && b == 'G') || - (a == 'G' && b == 'T') || - (a == 'C' && b == 'G') || - (a == 'G' && b == 'C') ) { - tmp_transversions++; - } - - if( (a == 'C' && b == 'T') || - (a == 'T' && b == 'C') || - (a == 'G' && b == 'A') || - (a == 'A' && b == 'G') ) { - tmp_transitions++; - } - - /* count the number of residues */ - if(a != NJ_AMBIGUITY_CHAR && - b != NJ_AMBIGUITY_CHAR ) { - tmp_residues++; - } - - } - - *transitions = tmp_transitions; - *transversions = tmp_transversions; - - if(residues) { - *residues = tmp_residues; - } - - return; -} - - - - - -/* - * NJ_pw_percentid() - - * - * Given an alignment and a specification - * for two rows, compute the pairwise - * percent identity between the two - * - */ -float -NJ_pw_percentid(NJ_alignment *alignment, - long int x, - long int y) { - - float pid; - long int i; - long int residues; - long int same; - char c1, c2; - - residues = 0; - same = 0; - for(i=0;ilength;i++) { - - c1 = alignment->data[x*alignment->length+i]; - c2 = alignment->data[y*alignment->length+i]; - - if( c1 != NJ_AMBIGUITY_CHAR || - c2 != NJ_AMBIGUITY_CHAR ) { - - residues++; - - if(c1 == c2) { - same++; - } - } - - } - - pid = (float)same/(float)residues; - - return(pid); -} - - - -/* - * NJ_pw_differences() - - * - * Given an alignment and a specification - * for two rows in the alignment, compute the - * number of differences between the two sequences - * - * With respect to ambiguity codes, we will want to - * disregard those sites entirely in our count. - * - */ -long int -NJ_pw_differences(NJ_alignment *alignment, - long int x, - long int y, - long int *residues) { - - long int i; - long int diff; - char c1, c2; - long int tmp_residues; - - diff = 0; - tmp_residues = 0; - for(i=0;ilength;i++) { - - c1 = alignment->data[x*alignment->length+i]; - c2 = alignment->data[y*alignment->length+i]; - - if( c1 != NJ_AMBIGUITY_CHAR || - c2 != NJ_AMBIGUITY_CHAR ) { - - tmp_residues++; - - if(c1 != c2) { - diff++; - } - } - - } - - *residues = tmp_residues; - - return(diff); -} - - - - - - diff --git a/distclearcut.cpp b/distclearcut.cpp new file mode 100644 index 0000000..a4ac777 --- /dev/null +++ b/distclearcut.cpp @@ -0,0 +1,668 @@ +/* + * dist.c + * + * $Id$ + * + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * Compute a distance matrix given a set of sequences + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + +#include +#include +#include +#include +#include + +#include "common.h" +#include "dayhoff.h" +#include "fasta.h" +#include "distclearcut.h" + + + + +/* + * NJ_build_distance_matrix() - + * + * Given a filename for an alignment, read the alignment + * into memory and then compute the distance matrix + * using the appropriate correction model + */ +DMAT * +NJ_build_distance_matrix(NJ_ARGS *nj_args) { + + DMAT *dmat; + NJ_alignment *alignment; + + /* Read an alignment in FASTA format */ + alignment = + NJ_read_fasta(nj_args); + + if(!alignment) { + return(NULL); + } + + /* + * Given a global multiple sequence alignment (MSA) and + * a specified distance correction model, compute a + * corrected distance matrix + * + * From proteins, we may want to allow users to specify + * a substitution matrix (feature) + */ + dmat = + NJ_compute_dmat(nj_args, + alignment); + + // NJ_print_taxanames(dmat); + + if(!dmat) { + fprintf(stderr, "Clearcut: Error computing distance matrix\n"); + } + + /* now free the memory associated with the alignment */ + NJ_free_alignment(alignment); + + return(dmat); +} + + + + + +/* + * NJ_compute_dmat() - + * + * Given an alignment and a correction model, compute the + * distance matrix and return it + * + */ +DMAT * +NJ_compute_dmat(NJ_ARGS *nj_args, + NJ_alignment *alignment) { + + DMAT *dmat; + long int i; + + + /* allocate distance matrix here */ + dmat = (DMAT *)calloc(1, sizeof(DMAT)); + if(!dmat) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + dmat->ntaxa = alignment->nseq; + dmat->size = alignment->nseq; + + /* allocate memory to hold the taxa names */ + dmat->taxaname = (char **)calloc(alignment->nseq, sizeof(char *)); + if(!dmat->taxaname) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + /* copy sequence titles */ + for(i=0;inseq;i++) { + dmat->taxaname[i] = (char *)calloc(strlen(alignment->titles[i])+1, sizeof(char)); + if(!dmat->taxaname[i]) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + strncpy(dmat->taxaname[i], alignment->titles[i], strlen(alignment->titles[i])); + } + + /* allocate val matrix in dmat */ + dmat->val = (float *)calloc(dmat->ntaxa*dmat->ntaxa, sizeof(float)); + if(!dmat->val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + /* now lets allocate space for the r and r2 columns */ + dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float)); + dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float)); + + /* track some memory addresses */ + dmat->rhandle = dmat->r; + dmat->r2handle = dmat->r2; + dmat->valhandle = dmat->val; + + /* apply model correction to matrix */ + switch(nj_args->correction_model) { + + case NJ_MODEL_JUKES: + + if(nj_args->dna_flag) { + NJ_DNA_jc_correction(dmat, alignment); + } else if(nj_args->protein_flag) { + NJ_PROTEIN_jc_correction(dmat, alignment); + } else { + fprintf(stderr, "Clearcut: Need to know sequence type for Jukes-Cantor model correction.\n"); + return(NULL); + } + + break; + + case NJ_MODEL_KIMURA: + + if(nj_args->dna_flag) { + NJ_DNA_k2p_correction(dmat, alignment); + } else if(nj_args->protein_flag) { + NJ_PROTEIN_kimura_correction(dmat, alignment); + } else { + fprintf(stderr, "Clearcut: Need to know sequence type for Kimura model correction.\n"); + return(NULL); + } + + break; + + case NJ_MODEL_NONE: + + NJ_no_correction(dmat, alignment); + + break; + + default: + fprintf(stderr, "Clearcut: Invalid distance correction model.\n"); + return(NULL); + } + + return(dmat); +} + + + + + +/* + * NJ_no_correction() - + * + * Compute the distance matrix without correction + * (straight percent ID) + * + * Resolve ambiguities in sequence data by skipping + * those nucleotides/residues + * + */ +void +NJ_no_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + float pdiff; + + /* compute pairwise percent identity */ + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + pdiff = 1.0 - NJ_pw_percentid(alignment, i, j); + dmat->val[NJ_MAP(i, j, dmat->size)] = pdiff; + } + } + + return; +} + + + + +/* + * NJ_DNA_jc_correction() - + * + * Compute the distance matrix with jukes-cantor correction + * and assign high distance if sequence divergence exceeds + * 0.75 + * + * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.), + * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132. + * New York: Academic Press + * + */ +void +NJ_DNA_jc_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + long int k; + float d, cutoff, dist; + long int residues; + + cutoff = 0.75; + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + k = NJ_pw_differences(alignment, i, j, &residues); + d = 1.0 - NJ_pw_percentid(alignment, i, j); + + if(d > cutoff) { + dist = NJ_BIGDIST; + } else { + dist = (-0.75) * log(1.0 - (4.0/3.0)*d); + } + + if(fabs(dist) < FLT_EPSILON) { + dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0; + } else { + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + } + + + + return; +} + + + + + + +/* + * NJ_PROTEIN_jc_correction() - + * + * This function performs modified jukes/cantor correction on + * a protein alignment + * + * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.), + * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132. + * New York: Academic Press + * + */ +void +NJ_PROTEIN_jc_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + long int residues; + long int diff; + float dist, x; + + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + diff = NJ_pw_differences(alignment, i, j, &residues); + + if(!diff || !residues) { + dist = 0.0; + } else { + + dist = (float)diff/(float)residues; + x = ((20.0/19.0)*dist); + + if(NJ_FLT_GT(x, 1.0)) { + dist = NJ_BIGDIST; + } else { + dist = -(19.0/20.0) * log(1.0 - x); + } + } + + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + + return; +} + + + + + + +/* + * NJ_DNA_k2p_correction() - + * + * Correct a distance matrix using k2p correction using + * cutoffs to avoid problems with logarithms. + * + * dist = -0.5ln(1-2P-Q) - 0.25ln(1-2Q) + * + * But due to the logarithms, this is only valid when + * + * (2P+Q <= 1) && + * (2Q <= 1) + * + * So assign arbitary distances when these constraints are + * not strictly followed. + * + * Kimura, M. (1980), A simple method for estimating evolutionary + * rates of base substitutions through comparative studies of + * nucleotide sequences. J. Mol. Evol., 16, 111-120 + * + */ +void +NJ_DNA_k2p_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + float P; /* proportion of transitions */ + float Q; /* proportion of transversions */ + long int nucleotides; + long int transitions, transversions; + float dist; + float log_x = 0.0; /* the params for the first log */ + float log_y = 0.0; /* the params for the second log */ + + int blowup; /* a flag to specify if we have a log blowup */ + + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + blowup = 0; + + /* count the number of transitions and transversions */ + NJ_DNA_count_tt(alignment, i, j, &transitions, &transversions, &nucleotides); + + if(!nucleotides) { /* sequences have no non-ambiguous overlap in alignment */ + P = 0.0; + Q = 0.0; + } else { + P = (float)transitions / (float)nucleotides; + Q = (float)transversions / (float)nucleotides; + } + + /* the first log blows up if 2*P+Q = 1.0 */ + if(NJ_FLT_EQ((2.0 * P + Q), 1.0)) { + blowup = 1; + } else { + if( NJ_FLT_LT(1.0 - 2.0*P - Q, 0.0) ) { + blowup = 1; + } else { + log_x = log(1.0 - 2.0*P - Q); + } + } + + /* the second log blows up if 2*Q >= 1.0 */ + if( NJ_FLT_EQ((2.0 * Q), 1.0) || + NJ_FLT_GT((2.0 * Q), 1.0) ) { + blowup = 1; + } else { + log_y = log(1.0 - 2.0*Q); + } + + /* if our logarithms blow up, we just set the distance to the max */ + if(blowup) { + dist = NJ_BIGDIST; + } else { + dist = (-0.5)*log_x - 0.25*log_y; + } + + if(fabs(dist) < FLT_EPSILON) { + dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0; + } else { + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + } + + return; +} + + + + +/* + * NJ_PROTEIN_kimura_correction() - + * + * Perform Kimura correction for distances derived from protein + * alignments. + * + * Kimura, M. (1983), The Neutral Theory of Molecular Evolution. + * p. 75., Cambridge University Press, Cambridge, England + * + */ +void +NJ_PROTEIN_kimura_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + long int residues; + long int diff; + float dist; + + + printf("NJ_PROTEIN_kimura_correction()\n"); + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + diff = NJ_pw_differences(alignment, i, j, &residues); + + if(!diff || !residues) { + dist = 0.0; + } else { + dist = (float)diff/(float)residues; + } + + if(NJ_FLT_LT(dist, 0.75)) { + if(NJ_FLT_GT(dist, 0.0) ) { + dist = -log(1.0 - dist - (dist * dist/5.0) ); + } + } else { + if(NJ_FLT_GT(dist, 0.93) ) { + dist = 10.0; + } else { + dist = (float)NJ_dayhoff[ (int)((dist*1000.0)-750.0) ] / 100.0 ; + } + } + + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + + return; +} + + + + + +/* + * NJ_DNA_count_tt() - + * + * Count the number of transitions and transversions + * between two aligned DNA sequences + * + * This routine automatically skips ambiguities when + * counting transitions and transversions. + * + */ +void +NJ_DNA_count_tt(NJ_alignment *alignment, + long int x, + long int y, + long int *transitions, + long int *transversions, + long int *residues) { + + long int tmp_transitions = 0; + long int tmp_transversions = 0; + long int tmp_residues = 0; + char a, b; + long int i; + + for(i=0;ilength;i++) { + + a = toupper(alignment->data[x*alignment->length+i]); + b = toupper(alignment->data[y*alignment->length+i]); + + if( (a == 'A' && b == 'T') || + (a == 'T' && b == 'A') || + (a == 'A' && b == 'C') || + (a == 'C' && b == 'A') || + (a == 'T' && b == 'G') || + (a == 'G' && b == 'T') || + (a == 'C' && b == 'G') || + (a == 'G' && b == 'C') ) { + tmp_transversions++; + } + + if( (a == 'C' && b == 'T') || + (a == 'T' && b == 'C') || + (a == 'G' && b == 'A') || + (a == 'A' && b == 'G') ) { + tmp_transitions++; + } + + /* count the number of residues */ + if(a != NJ_AMBIGUITY_CHAR && + b != NJ_AMBIGUITY_CHAR ) { + tmp_residues++; + } + + } + + *transitions = tmp_transitions; + *transversions = tmp_transversions; + + if(residues) { + *residues = tmp_residues; + } + + return; +} + + + + + +/* + * NJ_pw_percentid() - + * + * Given an alignment and a specification + * for two rows, compute the pairwise + * percent identity between the two + * + */ +float +NJ_pw_percentid(NJ_alignment *alignment, + long int x, + long int y) { + + float pid; + long int i; + long int residues; + long int same; + char c1, c2; + + residues = 0; + same = 0; + for(i=0;ilength;i++) { + + c1 = alignment->data[x*alignment->length+i]; + c2 = alignment->data[y*alignment->length+i]; + + if( c1 != NJ_AMBIGUITY_CHAR || + c2 != NJ_AMBIGUITY_CHAR ) { + + residues++; + + if(c1 == c2) { + same++; + } + } + + } + + pid = (float)same/(float)residues; + + return(pid); +} + + + +/* + * NJ_pw_differences() - + * + * Given an alignment and a specification + * for two rows in the alignment, compute the + * number of differences between the two sequences + * + * With respect to ambiguity codes, we will want to + * disregard those sites entirely in our count. + * + */ +long int +NJ_pw_differences(NJ_alignment *alignment, + long int x, + long int y, + long int *residues) { + + long int i; + long int diff; + char c1, c2; + long int tmp_residues; + + diff = 0; + tmp_residues = 0; + for(i=0;ilength;i++) { + + c1 = alignment->data[x*alignment->length+i]; + c2 = alignment->data[y*alignment->length+i]; + + if( c1 != NJ_AMBIGUITY_CHAR || + c2 != NJ_AMBIGUITY_CHAR ) { + + tmp_residues++; + + if(c1 != c2) { + diff++; + } + } + + } + + *residues = tmp_residues; + + return(diff); +} + + + + + + diff --git a/dmat.c b/dmat.c deleted file mode 100644 index 1d00cff..0000000 --- a/dmat.c +++ /dev/null @@ -1,869 +0,0 @@ -/* - * dmat.c - * - * $Id$ - * - ***************************************************************************** - * - * Copyright (c) 2004, Luke Sheneman - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * + Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * + Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * + The names of its contributors may not be used to endorse or promote - * products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - ***************************************************************************** - * - * Distance matrix parser - * - ***************************************************************************** - * - * AUTHOR: - * - * Luke Sheneman - * sheneman@cs.uidaho.edu - * - */ - - - -#include -#include -#include -#include -#include - - - -#include "common.h" -#include "clearcut.h" - -#include "dmat.h" - - - - - -/* - * - * NJ_is_alpha() - determine if character is an alphabetic character - * - * INPUT: - * ------ - * c -- character to test - * - * RETURN: - * ------- - * int -- 1 if character is alphabetic (A-Z || a-z) - * 0 if character is NOT alphabetic - * - */ -static inline -int -NJ_is_alpha(char c) { - - if( (c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') ) { - return(1); - } else { - return(0); - } -} - - - -/* - * - * NJ_is_whitespace() - determine if character is a whitespace character - * - * INPUT: - * ------ - * c -- character to test - * - * RETURN: - * ------- - * int -- 1 if character is whitespace (space, tab, CR, LF) - * 0 if character is NOT whitespace - * - */ -static inline -int -NJ_is_whitespace(char c) { - - if( c == ' ' || /* space */ - c == '\n' || /* newline */ - c == '\r' || /* carriage-return */ - c == '\v' || /* vertical tab */ - c == '\f' || /* form feed */ - c == '\t' ) { /* horizontal tab */ - return(1); - } else { - return(0); - } - -} - - - - -/* - * - * NJ_is_number() - determine if character is a number - * - * INPUT: - * ------ - * c -- character to test - * - * RETURN: - * ------- - * int -- 1 if character is a number (0-9) - * 0 if character is NOT a number - * - */ -static inline -int -NJ_is_number(char c) { - if(c >= '0' && c <= '9') { - return(1); - } else { - return(0); - } -} - - - -/* - * NJ_is_distance() - check if string is a properly formatted distance value - * - */ -static inline -int -NJ_is_distance(char *token) { - - int i; - char c; - int exponent_state; - int expsign_state; - int dpoint_state; - - /* if token is NULL return failure */ - if(!token) { - return(0); - } - - exponent_state = 0; - expsign_state = 0; - dpoint_state = 0; - - /* The first character must be a number, a decimal point or a sign */ - c = token[0]; - if(!NJ_is_number(c) && - c != '.' && - c != '-' && - c != '+' ) { - - goto BAD; - } - - /* - * if the first character is not a number, and string is only one - * character long, then we return failure. - */ - if(strlen(token) == 1) { - if(!NJ_is_number(c)) { - goto BAD; - } - } - - for(i=0;i0 && !exponent_state) { - if(c == '-' || - c == '+') { - goto BAD; - } - } - - /* if we are in the exponent state, and we've already seen a sign */ - if(exponent_state && expsign_state) { - if(c == '-' || - c == '+') { - goto BAD; - } - } - - /* if we are in the exponent state and we see a decimal point */ - if(exponent_state) { - if(c == '.') { - goto BAD; - } - } - - /* if we are in the exponent state and see another e or E */ - if(exponent_state) { - if(c == 'e' || - c == 'E') { - goto BAD; - } - } - - /* if we are dpoint_state and see another decimal point */ - if(dpoint_state) { - if(c == '.') { - goto BAD; - } - } - - - /* enter the exponent state if we need to */ - if(!exponent_state) { - if(c == 'e' || - c == 'E') { - exponent_state = 1; - } - } - - /* enter the expsign_state if we need to */ - if(exponent_state && !expsign_state) { - if(c == '-' || - c == '+') { - expsign_state = 1; - } - } - - /* if not in dpoint state and we see a dpoint */ - if(!dpoint_state) { - if(c == '.') { - dpoint_state = 1; - } - } - - } - - /* the token must end in a number char */ - if(!NJ_is_number(token[strlen(token)-1])) { - goto BAD; - } - - /* token is a valid numerical distance */ - return(1); - - BAD: - - /* token is invalid distance format */ - return(0); -} - - - - -/* - * NJ_is_label() - - * - * Simply, if token is not a valid number, then it is a name - * - */ -static inline -int -NJ_is_label(char *token) { - if(NJ_is_distance(token)) { - return(0); - } else { - return(1); - } -} - - - -/* - * NJ_get_token() - get a token from an input stream - * - */ -static inline -int -NJ_get_token(FILE *fp, - NJ_DIST_TOKEN *token) { - - char c; - int index; - - c = fgetc(fp); - if(feof(fp)) { - token->type = NJ_EOF_STATE; - return(token->type); - } - - if(NJ_is_whitespace(c)) { - token->buf[0] = c; - token->buf[1] = '\0'; - token->type = NJ_WS_STATE; - - return NJ_WS_STATE; - } - - index = 0; - while(!NJ_is_whitespace(c)) { - - /* reallocate our buffer if necessary */ - if(index >= token->bufsize) { - token->bufsize *= 2; - token->buf = (char *)realloc(token->buf, token->bufsize*sizeof(char)); - if(!token->buf) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_get_token()\n"); - exit(-1); - } - } - - token->buf[index++] = c; - - c = fgetc(fp); - if(feof(fp)) { - token->type = NJ_EOF_STATE; - break; - } - } - - token->buf[index] = '\0'; - - if(token->type != NJ_EOF_STATE) { - - if(NJ_is_distance(token->buf)) { - token->type = NJ_FLOAT_STATE; - } else { - token->type = NJ_NAME_STATE; - } - - } - - return(token->type); -} - - - - - - -/* - * NJ_parse_distance_matrix() -- Takes a filename and returns a distance matrix - * - * - * INPUT: - * ------ - * nj_args -- a pointer to a structure containing the command-line arguments - * - * OUTPUT: - * ------- - * -- NULL (failure) - * -- A pointer to a populated distance matrix - * - * DESCRIPTION: - * ------------ - * This function implements a simple state machine to parse a distance matrix - * in approximate PHYLIP format. This function auto-detects whether the - * distance matrix is in upper, lower, or fully-symmetric format and handles - * it accordingly. For full/symmetric matrices, values must be symmetric - * around the diagonal, which is required to be zeroes. Names and values must - * be separated by whitespace (space, tab, newlines, etc.). Taxon labels can - * include numbers, but must start with non-numerical symbols. - * - * - * *** UPPER FORMAT EXAMPLE *** - * - * 4 - * seq1 0.2 0.3 0.1 - * seq2 0.2 0.3 - * seq3 0.1 - * seq4 - * - * *** LOWER FORMAT EXAMPLE *** - * - * 4 - * seq1 - * seq2 0.3 - * seq3 0.2 0.4 - * seq4 0.3 0.1 0.3 - * - * *** SYMMETRIC (FULL) EXAMPLE *** - * - * 4 - * seq1 0.0 0.3 0.5 0.3 - * seq2 0.3 0.0 0.1 0.2 - * seq3 0.5 0.1 0.0 0.9 - * seq4 0.3 0.2 0.9 0.0 - * - * Values in the distance matrix can be positive or negative, integers or - * real values. Values can also be parsed in exponential notation form. - * - */ -DMAT * -NJ_parse_distance_matrix(NJ_ARGS *nj_args) { - - DMAT *dmat = NULL; - FILE *fp = NULL; - NJ_DIST_TOKEN *token = NULL; - - int state, dmat_type; - int row; - int fltcnt; - int x, y, i; - int numvalread; - int expectedvalues = -1; - float val; - int first_state = 0; - - - /* allocate our distance matrix and token structure */ - dmat = (DMAT *)calloc(1, sizeof(DMAT)); - token = (NJ_DIST_TOKEN *)calloc(1, sizeof(NJ_DIST_TOKEN)); - if(token) { - token->bufsize = NJ_INITIAL_BUFSIZE; - token->buf = (char *)calloc(token->bufsize, sizeof(char)); - } - if(!dmat || !token || !token->buf) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); - goto XIT_BAD; - } - - /* open distance matrix file here */ - if(nj_args->stdin_flag) { - fp = stdin; - } else { - fp = fopen(nj_args->infilename, "r"); - if(!fp) { - fprintf(stderr, "Clearcut: Could not open distance matrix: %s\n", nj_args->infilename); - perror("Clearcut"); - goto XIT_BAD; - } - } - - /* get the number of taxa in this file */ - fscanf(fp, "%ld\n", &dmat->ntaxa); - if(dmat->ntaxa < 2) { - fprintf(stderr, "Clearcut: Invalid number of taxa in distance matrix\n"); - - goto XIT_BAD; - } - - /* set our initial working size according to the # of taxa */ - dmat->size = dmat->ntaxa; - - /* allocate space for the distance matrix values here */ - dmat->val = - (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float)); - if(!dmat->val) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); - goto XIT_BAD; - } - - /* taxa names */ - dmat->taxaname = (char **)calloc(dmat->ntaxa, sizeof(char *)); - if(!dmat->taxaname) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); - goto XIT_BAD; - } - - /* set the initial state of our state machine */ - dmat_type = NJ_PARSE_UNKNOWN; - row = -1; - fltcnt = 0; - numvalread = 0; - - - /* read the input one character at a time to drive simple state machine */ - state = NJ_get_token(fp, token); - while(state != NJ_EOF_STATE) { - - switch(state) { - - case NJ_NAME_STATE: - - if(first_state == 0) { - first_state = 1; - } - - row++; - - if(row > 0 && dmat_type == NJ_PARSE_UNKNOWN) { - - if(fltcnt == dmat->ntaxa) { - - dmat_type = NJ_PARSE_SYMMETRIC; - expectedvalues = dmat->ntaxa * dmat->ntaxa; - - } else if (fltcnt == dmat->ntaxa-1) { - - dmat_type = NJ_PARSE_UPPER; - expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2; - - /* shift everything in first row by one char */ - for(i=dmat->ntaxa-2;i>=0;i--) { - dmat->val[i+1] = dmat->val[i]; - } - - } else if (fltcnt == 0) { - - dmat_type = NJ_PARSE_LOWER; - expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2; - - } else { - goto XIT_BAD; - } - } - - if(row >= dmat->ntaxa) { - goto XIT_BAD; - } - - /* allocate space for this taxon label */ - dmat->taxaname[row] = (char *)calloc(strlen(token->buf)+1, sizeof(char)); - if(!dmat->taxaname[row]) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); - goto XIT_BAD; - } - - strcpy(dmat->taxaname[row], token->buf); - - fltcnt = 0; - - break; - - - case NJ_FLOAT_STATE: - - if(first_state == 0) { - goto XIT_BAD; - } - - val = atof(token->buf); - if(errno) { - fprintf(stderr, "Clearcut: Distance value out-of-range.\n"); - goto XIT_BAD; - } - - x = row; - y = fltcnt; - - switch(dmat_type) { - - case NJ_PARSE_UNKNOWN: - - dmat->val[NJ_MAP(x, y, dmat->size)] = val; - - break; - - case NJ_PARSE_SYMMETRIC: - - if(fltcnt >= dmat->ntaxa) { - fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n"); - goto XIT_BAD; - } - - if(x < y) { - dmat->val[NJ_MAP(x, y, dmat->size)] = val; - } else if(x > y) { - if(!NJ_FLT_EQ(val, dmat->val[NJ_MAP(y, x, dmat->size)])) { - fprintf(stderr, "Clearcut: Full matrices must be symmetric.\n"); - goto XIT_BAD; - } - } else { - if(!NJ_FLT_EQ(val, 0.0)) { - fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n"); - goto XIT_BAD; - - } - } - - break; - - case NJ_PARSE_UPPER: - - if(fltcnt > dmat->ntaxa-row) { - fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n"); - goto XIT_BAD; - } - - dmat->val[NJ_MAP(x, x+y+1, dmat->size)] = val; - - break; - - case NJ_PARSE_LOWER: - - if(fltcnt > row-1) { - fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n"); - goto XIT_BAD; - } - - dmat->val[NJ_MAP(y, x, dmat->size)] = val; - - break; - - default: - - goto XIT_BAD; - - break; - } - - fltcnt++; - numvalread++; - - break; - - case NJ_WS_STATE: - - break; - - case NJ_EOF_STATE: - - if(first_state == 0) { - goto XIT_BAD; - } - - break; - - default: - - fprintf(stderr, "Clearcut: Unknown state in distance matrix parser.\n"); - break; - - } - - /* get next token from stream */ - state = NJ_get_token(fp, token); - } - - - /* - * At the end, if we have not read the number of values that we predicted - * we would need, then there was a problem and we need to punt. - */ - if(numvalread != expectedvalues) { - fprintf(stderr, "Clearcut: Incorrect number of values in the distance matrix.\n"); - goto XIT_BAD; - } - - /* special check to make sure first value read is 0.0 */ - if(dmat_type == NJ_PARSE_SYMMETRIC) { - if(!NJ_FLT_EQ(dmat->val[NJ_MAP(0, 0, dmat->size)], 0.0)) { - fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n"); - goto XIT_BAD; - } - } - - - /* now lets allocate space for the r and r2 columns */ - dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float)); - dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float)); - if(!dmat->r || !dmat->r2) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); - goto XIT_BAD; - } - - /* track some memory addresses */ - dmat->rhandle = dmat->r; - dmat->r2handle = dmat->r2; - dmat->valhandle = dmat->val; - - /* close matrix file here */ - if(!nj_args->stdin_flag) { - fclose(fp); - } - - if(token) { - if(token->buf) { - free(token->buf); - } - free(token); - } - - return(dmat); - - - - /* clean up our partial progress */ - XIT_BAD: - - if(fp) { - fprintf(stderr, "Clearcut: Syntax error in distance matrix at offset %ld.\n", ftell(fp)); - } - - /* close matrix file here */ - if(!nj_args->stdin_flag) { - if(fp) { - fclose(fp); - } - } - - /* if we have a valid dmat (partial or complete), we need to free it */ - if(dmat) { - NJ_free_dmat(dmat); - } - - if(token) { - if(token->buf) { - free(token->buf); - } - free(token); - } - - return(NULL); -} - - - - - - - -/* - * NJ_output_matrix() - Output a distance matrix to the specified file - * - * - * INPUTS: - * ------- - * nj_args -- a pointer to a data structure holding the command-line args - * dmat -- a pointer to a distance matrix - * - * - * RETURNS: - * -------- - * NOTHING - * - * - * DESCRIPTION: - * ------------ - * If the appropriate flag was specified in the command-line, this function - * now outputs the parsed or computed distance matrix to a file. This - * can be useful if generating a distance matrix was the primary goal of - * running the program, or if one wanted to debug and/or verify the - * correctness of the program. - * - * Currently this function outputs full/symmetric matrices only. - * - */ -void -NJ_output_matrix(NJ_ARGS *nj_args, - DMAT *dmat) { - - FILE *fp = NULL; - long int i, j; - - - - /* if we haven't specieid matrixout, return immediately */ - if(!nj_args->matrixout) { - return; - } - - /* open the specified matrix file for writing */ - fp = fopen(nj_args->matrixout, "w"); - if(!fp) { - fprintf(stderr, "Clearcut: Could not open matrix file %s for output.\n", nj_args->matrixout); - return; - } - - /* output the number of taxa in the matrix */ - fprintf(fp, " %ld\n", dmat->size); - - fprintf(fp, "%s\n", dmat->taxaname[0]); // print the first taxon name outside of the main loop - - for(i=1;isize;i++) { - - /* output taxaname */ - fprintf(fp, "%s\t", dmat->taxaname[i]); - - for(j=0;jexpdist) { /* exponential notation (or not) */ - fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]); - } else { - fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]); - } - } - - fprintf(fp, "\n"); - } - -#ifdef FULL_SYMMETRIC_MATRIX - - /* output the number of taxa in the matrix */ - fprintf(fp, " %ld\n", dmat->size); - for(i=0;isize;i++) { - - /* output taxaname */ - fprintf(fp, "%s\t", dmat->taxaname[i]); - - for(j=0;jsize;j++) { - if(i>j) { - if(nj_args->expdist) { /* exponential notation (or not) */ - fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]); - } else { - fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]); - } - } else if(iexpdist) { /* exponential notation (or not) */ - fprintf(fp, "%e ", dmat->val[NJ_MAP(i,j,dmat->size)]); - } else { - fprintf(fp, "%f ", dmat->val[NJ_MAP(i,j,dmat->size)]); - } - } else { - if(nj_args->expdist) { /* exponential notation (or not) */ - fprintf(fp, "%e ", 0.0); - } else { - fprintf(fp, "%f ", 0.0); - } - } - } - - fprintf(fp, "\n"); - } - -#endif // FULL_SYMMETRIC_MATRIX - - /* close the file here */ - if(fp) { - fclose(fp); - } - - return; -} - - - - - diff --git a/dmat.cpp b/dmat.cpp new file mode 100644 index 0000000..1d00cff --- /dev/null +++ b/dmat.cpp @@ -0,0 +1,869 @@ +/* + * dmat.c + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * Distance matrix parser + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + + +#include +#include +#include +#include +#include + + + +#include "common.h" +#include "clearcut.h" + +#include "dmat.h" + + + + + +/* + * + * NJ_is_alpha() - determine if character is an alphabetic character + * + * INPUT: + * ------ + * c -- character to test + * + * RETURN: + * ------- + * int -- 1 if character is alphabetic (A-Z || a-z) + * 0 if character is NOT alphabetic + * + */ +static inline +int +NJ_is_alpha(char c) { + + if( (c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') ) { + return(1); + } else { + return(0); + } +} + + + +/* + * + * NJ_is_whitespace() - determine if character is a whitespace character + * + * INPUT: + * ------ + * c -- character to test + * + * RETURN: + * ------- + * int -- 1 if character is whitespace (space, tab, CR, LF) + * 0 if character is NOT whitespace + * + */ +static inline +int +NJ_is_whitespace(char c) { + + if( c == ' ' || /* space */ + c == '\n' || /* newline */ + c == '\r' || /* carriage-return */ + c == '\v' || /* vertical tab */ + c == '\f' || /* form feed */ + c == '\t' ) { /* horizontal tab */ + return(1); + } else { + return(0); + } + +} + + + + +/* + * + * NJ_is_number() - determine if character is a number + * + * INPUT: + * ------ + * c -- character to test + * + * RETURN: + * ------- + * int -- 1 if character is a number (0-9) + * 0 if character is NOT a number + * + */ +static inline +int +NJ_is_number(char c) { + if(c >= '0' && c <= '9') { + return(1); + } else { + return(0); + } +} + + + +/* + * NJ_is_distance() - check if string is a properly formatted distance value + * + */ +static inline +int +NJ_is_distance(char *token) { + + int i; + char c; + int exponent_state; + int expsign_state; + int dpoint_state; + + /* if token is NULL return failure */ + if(!token) { + return(0); + } + + exponent_state = 0; + expsign_state = 0; + dpoint_state = 0; + + /* The first character must be a number, a decimal point or a sign */ + c = token[0]; + if(!NJ_is_number(c) && + c != '.' && + c != '-' && + c != '+' ) { + + goto BAD; + } + + /* + * if the first character is not a number, and string is only one + * character long, then we return failure. + */ + if(strlen(token) == 1) { + if(!NJ_is_number(c)) { + goto BAD; + } + } + + for(i=0;i0 && !exponent_state) { + if(c == '-' || + c == '+') { + goto BAD; + } + } + + /* if we are in the exponent state, and we've already seen a sign */ + if(exponent_state && expsign_state) { + if(c == '-' || + c == '+') { + goto BAD; + } + } + + /* if we are in the exponent state and we see a decimal point */ + if(exponent_state) { + if(c == '.') { + goto BAD; + } + } + + /* if we are in the exponent state and see another e or E */ + if(exponent_state) { + if(c == 'e' || + c == 'E') { + goto BAD; + } + } + + /* if we are dpoint_state and see another decimal point */ + if(dpoint_state) { + if(c == '.') { + goto BAD; + } + } + + + /* enter the exponent state if we need to */ + if(!exponent_state) { + if(c == 'e' || + c == 'E') { + exponent_state = 1; + } + } + + /* enter the expsign_state if we need to */ + if(exponent_state && !expsign_state) { + if(c == '-' || + c == '+') { + expsign_state = 1; + } + } + + /* if not in dpoint state and we see a dpoint */ + if(!dpoint_state) { + if(c == '.') { + dpoint_state = 1; + } + } + + } + + /* the token must end in a number char */ + if(!NJ_is_number(token[strlen(token)-1])) { + goto BAD; + } + + /* token is a valid numerical distance */ + return(1); + + BAD: + + /* token is invalid distance format */ + return(0); +} + + + + +/* + * NJ_is_label() - + * + * Simply, if token is not a valid number, then it is a name + * + */ +static inline +int +NJ_is_label(char *token) { + if(NJ_is_distance(token)) { + return(0); + } else { + return(1); + } +} + + + +/* + * NJ_get_token() - get a token from an input stream + * + */ +static inline +int +NJ_get_token(FILE *fp, + NJ_DIST_TOKEN *token) { + + char c; + int index; + + c = fgetc(fp); + if(feof(fp)) { + token->type = NJ_EOF_STATE; + return(token->type); + } + + if(NJ_is_whitespace(c)) { + token->buf[0] = c; + token->buf[1] = '\0'; + token->type = NJ_WS_STATE; + + return NJ_WS_STATE; + } + + index = 0; + while(!NJ_is_whitespace(c)) { + + /* reallocate our buffer if necessary */ + if(index >= token->bufsize) { + token->bufsize *= 2; + token->buf = (char *)realloc(token->buf, token->bufsize*sizeof(char)); + if(!token->buf) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_get_token()\n"); + exit(-1); + } + } + + token->buf[index++] = c; + + c = fgetc(fp); + if(feof(fp)) { + token->type = NJ_EOF_STATE; + break; + } + } + + token->buf[index] = '\0'; + + if(token->type != NJ_EOF_STATE) { + + if(NJ_is_distance(token->buf)) { + token->type = NJ_FLOAT_STATE; + } else { + token->type = NJ_NAME_STATE; + } + + } + + return(token->type); +} + + + + + + +/* + * NJ_parse_distance_matrix() -- Takes a filename and returns a distance matrix + * + * + * INPUT: + * ------ + * nj_args -- a pointer to a structure containing the command-line arguments + * + * OUTPUT: + * ------- + * -- NULL (failure) + * -- A pointer to a populated distance matrix + * + * DESCRIPTION: + * ------------ + * This function implements a simple state machine to parse a distance matrix + * in approximate PHYLIP format. This function auto-detects whether the + * distance matrix is in upper, lower, or fully-symmetric format and handles + * it accordingly. For full/symmetric matrices, values must be symmetric + * around the diagonal, which is required to be zeroes. Names and values must + * be separated by whitespace (space, tab, newlines, etc.). Taxon labels can + * include numbers, but must start with non-numerical symbols. + * + * + * *** UPPER FORMAT EXAMPLE *** + * + * 4 + * seq1 0.2 0.3 0.1 + * seq2 0.2 0.3 + * seq3 0.1 + * seq4 + * + * *** LOWER FORMAT EXAMPLE *** + * + * 4 + * seq1 + * seq2 0.3 + * seq3 0.2 0.4 + * seq4 0.3 0.1 0.3 + * + * *** SYMMETRIC (FULL) EXAMPLE *** + * + * 4 + * seq1 0.0 0.3 0.5 0.3 + * seq2 0.3 0.0 0.1 0.2 + * seq3 0.5 0.1 0.0 0.9 + * seq4 0.3 0.2 0.9 0.0 + * + * Values in the distance matrix can be positive or negative, integers or + * real values. Values can also be parsed in exponential notation form. + * + */ +DMAT * +NJ_parse_distance_matrix(NJ_ARGS *nj_args) { + + DMAT *dmat = NULL; + FILE *fp = NULL; + NJ_DIST_TOKEN *token = NULL; + + int state, dmat_type; + int row; + int fltcnt; + int x, y, i; + int numvalread; + int expectedvalues = -1; + float val; + int first_state = 0; + + + /* allocate our distance matrix and token structure */ + dmat = (DMAT *)calloc(1, sizeof(DMAT)); + token = (NJ_DIST_TOKEN *)calloc(1, sizeof(NJ_DIST_TOKEN)); + if(token) { + token->bufsize = NJ_INITIAL_BUFSIZE; + token->buf = (char *)calloc(token->bufsize, sizeof(char)); + } + if(!dmat || !token || !token->buf) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); + goto XIT_BAD; + } + + /* open distance matrix file here */ + if(nj_args->stdin_flag) { + fp = stdin; + } else { + fp = fopen(nj_args->infilename, "r"); + if(!fp) { + fprintf(stderr, "Clearcut: Could not open distance matrix: %s\n", nj_args->infilename); + perror("Clearcut"); + goto XIT_BAD; + } + } + + /* get the number of taxa in this file */ + fscanf(fp, "%ld\n", &dmat->ntaxa); + if(dmat->ntaxa < 2) { + fprintf(stderr, "Clearcut: Invalid number of taxa in distance matrix\n"); + + goto XIT_BAD; + } + + /* set our initial working size according to the # of taxa */ + dmat->size = dmat->ntaxa; + + /* allocate space for the distance matrix values here */ + dmat->val = + (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float)); + if(!dmat->val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); + goto XIT_BAD; + } + + /* taxa names */ + dmat->taxaname = (char **)calloc(dmat->ntaxa, sizeof(char *)); + if(!dmat->taxaname) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); + goto XIT_BAD; + } + + /* set the initial state of our state machine */ + dmat_type = NJ_PARSE_UNKNOWN; + row = -1; + fltcnt = 0; + numvalread = 0; + + + /* read the input one character at a time to drive simple state machine */ + state = NJ_get_token(fp, token); + while(state != NJ_EOF_STATE) { + + switch(state) { + + case NJ_NAME_STATE: + + if(first_state == 0) { + first_state = 1; + } + + row++; + + if(row > 0 && dmat_type == NJ_PARSE_UNKNOWN) { + + if(fltcnt == dmat->ntaxa) { + + dmat_type = NJ_PARSE_SYMMETRIC; + expectedvalues = dmat->ntaxa * dmat->ntaxa; + + } else if (fltcnt == dmat->ntaxa-1) { + + dmat_type = NJ_PARSE_UPPER; + expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2; + + /* shift everything in first row by one char */ + for(i=dmat->ntaxa-2;i>=0;i--) { + dmat->val[i+1] = dmat->val[i]; + } + + } else if (fltcnt == 0) { + + dmat_type = NJ_PARSE_LOWER; + expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2; + + } else { + goto XIT_BAD; + } + } + + if(row >= dmat->ntaxa) { + goto XIT_BAD; + } + + /* allocate space for this taxon label */ + dmat->taxaname[row] = (char *)calloc(strlen(token->buf)+1, sizeof(char)); + if(!dmat->taxaname[row]) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); + goto XIT_BAD; + } + + strcpy(dmat->taxaname[row], token->buf); + + fltcnt = 0; + + break; + + + case NJ_FLOAT_STATE: + + if(first_state == 0) { + goto XIT_BAD; + } + + val = atof(token->buf); + if(errno) { + fprintf(stderr, "Clearcut: Distance value out-of-range.\n"); + goto XIT_BAD; + } + + x = row; + y = fltcnt; + + switch(dmat_type) { + + case NJ_PARSE_UNKNOWN: + + dmat->val[NJ_MAP(x, y, dmat->size)] = val; + + break; + + case NJ_PARSE_SYMMETRIC: + + if(fltcnt >= dmat->ntaxa) { + fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n"); + goto XIT_BAD; + } + + if(x < y) { + dmat->val[NJ_MAP(x, y, dmat->size)] = val; + } else if(x > y) { + if(!NJ_FLT_EQ(val, dmat->val[NJ_MAP(y, x, dmat->size)])) { + fprintf(stderr, "Clearcut: Full matrices must be symmetric.\n"); + goto XIT_BAD; + } + } else { + if(!NJ_FLT_EQ(val, 0.0)) { + fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n"); + goto XIT_BAD; + + } + } + + break; + + case NJ_PARSE_UPPER: + + if(fltcnt > dmat->ntaxa-row) { + fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n"); + goto XIT_BAD; + } + + dmat->val[NJ_MAP(x, x+y+1, dmat->size)] = val; + + break; + + case NJ_PARSE_LOWER: + + if(fltcnt > row-1) { + fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n"); + goto XIT_BAD; + } + + dmat->val[NJ_MAP(y, x, dmat->size)] = val; + + break; + + default: + + goto XIT_BAD; + + break; + } + + fltcnt++; + numvalread++; + + break; + + case NJ_WS_STATE: + + break; + + case NJ_EOF_STATE: + + if(first_state == 0) { + goto XIT_BAD; + } + + break; + + default: + + fprintf(stderr, "Clearcut: Unknown state in distance matrix parser.\n"); + break; + + } + + /* get next token from stream */ + state = NJ_get_token(fp, token); + } + + + /* + * At the end, if we have not read the number of values that we predicted + * we would need, then there was a problem and we need to punt. + */ + if(numvalread != expectedvalues) { + fprintf(stderr, "Clearcut: Incorrect number of values in the distance matrix.\n"); + goto XIT_BAD; + } + + /* special check to make sure first value read is 0.0 */ + if(dmat_type == NJ_PARSE_SYMMETRIC) { + if(!NJ_FLT_EQ(dmat->val[NJ_MAP(0, 0, dmat->size)], 0.0)) { + fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n"); + goto XIT_BAD; + } + } + + + /* now lets allocate space for the r and r2 columns */ + dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float)); + dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float)); + if(!dmat->r || !dmat->r2) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); + goto XIT_BAD; + } + + /* track some memory addresses */ + dmat->rhandle = dmat->r; + dmat->r2handle = dmat->r2; + dmat->valhandle = dmat->val; + + /* close matrix file here */ + if(!nj_args->stdin_flag) { + fclose(fp); + } + + if(token) { + if(token->buf) { + free(token->buf); + } + free(token); + } + + return(dmat); + + + + /* clean up our partial progress */ + XIT_BAD: + + if(fp) { + fprintf(stderr, "Clearcut: Syntax error in distance matrix at offset %ld.\n", ftell(fp)); + } + + /* close matrix file here */ + if(!nj_args->stdin_flag) { + if(fp) { + fclose(fp); + } + } + + /* if we have a valid dmat (partial or complete), we need to free it */ + if(dmat) { + NJ_free_dmat(dmat); + } + + if(token) { + if(token->buf) { + free(token->buf); + } + free(token); + } + + return(NULL); +} + + + + + + + +/* + * NJ_output_matrix() - Output a distance matrix to the specified file + * + * + * INPUTS: + * ------- + * nj_args -- a pointer to a data structure holding the command-line args + * dmat -- a pointer to a distance matrix + * + * + * RETURNS: + * -------- + * NOTHING + * + * + * DESCRIPTION: + * ------------ + * If the appropriate flag was specified in the command-line, this function + * now outputs the parsed or computed distance matrix to a file. This + * can be useful if generating a distance matrix was the primary goal of + * running the program, or if one wanted to debug and/or verify the + * correctness of the program. + * + * Currently this function outputs full/symmetric matrices only. + * + */ +void +NJ_output_matrix(NJ_ARGS *nj_args, + DMAT *dmat) { + + FILE *fp = NULL; + long int i, j; + + + + /* if we haven't specieid matrixout, return immediately */ + if(!nj_args->matrixout) { + return; + } + + /* open the specified matrix file for writing */ + fp = fopen(nj_args->matrixout, "w"); + if(!fp) { + fprintf(stderr, "Clearcut: Could not open matrix file %s for output.\n", nj_args->matrixout); + return; + } + + /* output the number of taxa in the matrix */ + fprintf(fp, " %ld\n", dmat->size); + + fprintf(fp, "%s\n", dmat->taxaname[0]); // print the first taxon name outside of the main loop + + for(i=1;isize;i++) { + + /* output taxaname */ + fprintf(fp, "%s\t", dmat->taxaname[i]); + + for(j=0;jexpdist) { /* exponential notation (or not) */ + fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]); + } else { + fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]); + } + } + + fprintf(fp, "\n"); + } + +#ifdef FULL_SYMMETRIC_MATRIX + + /* output the number of taxa in the matrix */ + fprintf(fp, " %ld\n", dmat->size); + for(i=0;isize;i++) { + + /* output taxaname */ + fprintf(fp, "%s\t", dmat->taxaname[i]); + + for(j=0;jsize;j++) { + if(i>j) { + if(nj_args->expdist) { /* exponential notation (or not) */ + fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]); + } else { + fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]); + } + } else if(iexpdist) { /* exponential notation (or not) */ + fprintf(fp, "%e ", dmat->val[NJ_MAP(i,j,dmat->size)]); + } else { + fprintf(fp, "%f ", dmat->val[NJ_MAP(i,j,dmat->size)]); + } + } else { + if(nj_args->expdist) { /* exponential notation (or not) */ + fprintf(fp, "%e ", 0.0); + } else { + fprintf(fp, "%f ", 0.0); + } + } + } + + fprintf(fp, "\n"); + } + +#endif // FULL_SYMMETRIC_MATRIX + + /* close the file here */ + if(fp) { + fclose(fp); + } + + return; +} + + + + + diff --git a/fasta.c b/fasta.c deleted file mode 100644 index 1dc17fc..0000000 --- a/fasta.c +++ /dev/null @@ -1,755 +0,0 @@ -/* - * fasta.c - * - * $Id$ - * - ***************************************************************************** - * - * Copyright (c) 2004, Luke Sheneman - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * + Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * + Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * + The names of its contributors may not be used to endorse or promote - * products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - ***************************************************************************** - * - * Functions for parsing FASTA formatted alignment files - * - ***************************************************************************** - * - * AUTHOR: - * - * Luke Sheneman - * sheneman@cs.uidaho.edu - * - */ - -#include -#include -#include -#include - -#include "clearcut.h" -#include "common.h" -#include "fasta.h" - - -#define NJ_NUM_DNA_AMBIGUITY_SYMS 14 -static const char NJ_dna_ambiguity_syms[NJ_NUM_DNA_AMBIGUITY_SYMS] = -{ - 'M', 'R', 'W', 'S', 'Y', 'K', - 'V', 'H', 'D', 'B', 'X', 'N', - '-', '.' -}; - - -#define NJ_NUM_PROTEIN_AMBIGUITY_SYMS 6 -static const char NJ_protein_ambiguity_syms[NJ_NUM_PROTEIN_AMBIGUITY_SYMS] = -{ - 'X', 'B', 'Z', '*', '-', '.' -}; - -#define NJ_NUM_DNA_SYMS 5 -static const char NJ_dna_syms[NJ_NUM_DNA_SYMS] = -{ - 'A', 'G', 'C', 'T', 'U' -}; - - -#define NJ_NUM_PROTEIN_SYMS 20 -static const char NJ_protein_syms[NJ_NUM_PROTEIN_SYMS] = -{ - 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', - 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V' -}; - - - - - -/* - * NJ_is_whitespace() - Check to see if character is whitespace - * - * INPUTS: - * ------- - * c -- character to check - * - * RETURNS: - * -------- - * int -- 0 if not whitespace - * 1 if whitespace - */ -static inline -int -NJ_is_whitespace(char c) { - if( c == ' ' || /* space */ - c == '\n' || /* newline */ - c == '\r' || /* carriage-return */ - c == '\v' || /* vertical tab */ - c == '\f' || /* form feed */ - c == '\t' ) { /* horizontal tab */ - return(1); - } else { - return(0); - } -} - - - - - -/* - * NJ_is_dna() - - * - * Determines if the given symbol is DNA - * - * RETURNS: 1 if DNA - * 0 if not DNA - * - */ -static inline -int -NJ_is_dna(char c) { - - int i; - char up_c; - - up_c = toupper(c); - - for(i=0;i sequence1 - * ATAGATATAGATTAGAATAT----TATAGATAT----ATATAT-TTT- - * > sequence2 - * --ATAGATA---ATATATATATTTT--GTCTCATAGT---ATATGCTT - * > sequence3 - * TTATAGATA---ATATATATATTTTAAGTCTCATAGT-A-ATATGC-- - * - * This function will parse alignments for DNA or protein, and will do - * so mindful of ambiguity codes for these kinds of sequences. All - * ambiguity codes are ignored by this program for the purposes of - * computing a distance matrix from a multiple alignment. By design, - * this program does not auto-detect DNA vs. Protein, and requires that - * the user explictly specify that on the command-line. - * - * Gaps can be represented either with the '-' or '.' characters. - * - * Newlines and other whitespace are allowed to be interspersed - * throughout the sequences. - * - * Taxon labels are required to be unique, and they must start with - * an alphabetic character (not a number, etc.). The parser will read - * the first token after the > character in the description line up until the - * first whitespace and use that for the taxon label. - * - * For example, in the line "> taxon1 is homo sapien", the taxon label will be - * "taxon1" - * - */ -NJ_alignment * -NJ_read_fasta(NJ_ARGS *nj_args) { - - FILE *fp = NULL; - char *buf = NULL; - char *ptr = NULL; - NJ_alignment *alignment = NULL; - - char c; - int state; - long int index, x, seq; - long int i; - long int bufsize, nseqs = NJ_INITIAL_NSEQS; - int first_sequence_flag; - - - - - /* - * In this function, we implement a FASTA alignment parser which - * reads in an alignment character-by-character, maintaining state - * information which guides the parser. - * - * The program reads either DNA or Protein alignments. All title lines - * and sequences can be arbitrarily long. Gaps can be represented by - * "-" or "." characters. - * - * Ambiguity codes are also handled. - * - */ - - /* - * We can't handle reading fasta input unless the user explicity - * specifies the input type...just to be sure. - */ - if( (!nj_args->dna_flag && !nj_args->protein_flag) || - (nj_args->dna_flag && nj_args->protein_flag) ) { - fprintf(stderr, "Clearcut: Explicitly specify protein or DNA\n"); - goto XIT_BAD; - } - - /* open specified fasta file here */ - if(nj_args->stdin_flag) { - fp = stdin; - } else { - fp = fopen(nj_args->infilename, "r"); - if(!fp) { - fprintf(stderr, "Clearcut: Failed to open input FASTA file: %s\n", nj_args->infilename); - perror("Clearcut"); - goto XIT_BAD; - } - } - - /* allocate the initial buffer */ - bufsize = NJ_INITIAL_BUFFER_SIZE; - buf = (char *)calloc(bufsize, sizeof(char)); - - /* allocate the alignment container here */ - alignment = (NJ_alignment *)calloc(1, sizeof(NJ_alignment)); - - /* allocate initial title array */ - // printf("allocating initial title array\n"); - alignment->titles = (char **)calloc(NJ_INITIAL_NSEQS, sizeof(char *)); - - /* make sure that we successfully allocated memory */ - if(!buf || !alignment || !alignment->titles) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - - /* a flag */ - first_sequence_flag = 1; - - index = 0; /* tracks the position in buffer */ - x = 0; /* tracks the position on sequence */ - seq = 0; /* tracks the active sequence */ - - /* intitial state of state machine */ - state = NJ_FASTA_MODE_UNKNOWN; - - while(1) { - - /* get the next character */ - c = fgetc(fp); - if(feof(fp)) { - - if(state == NJ_FASTA_MODE_SEQUENCE) { - buf[index+1] = '\0'; - - /* copy buf to alignment */ - for(i=1;i<=alignment->length;i++) { - alignment->data[seq*alignment->length+i-1] = buf[i]; - } - } - - break; - } - - /* make sure our dynamic buffer is big enough */ - if(index >= bufsize) { - bufsize *= 2; - buf = (char *)realloc(buf, bufsize); - if(!buf) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - } - - switch(state) { - - case NJ_FASTA_MODE_UNKNOWN: - - if(!NJ_is_whitespace(c)) { - if(c == '>') { - state = NJ_FASTA_MODE_TITLE; - buf[0] = '>'; - } else { - goto XIT_BAD; - } - } - - break; - - case NJ_FASTA_MODE_TITLE: - - if( c == '\n' || - c == '\r' ) { - - buf[index] = '\0'; - state = NJ_FASTA_MODE_SEQUENCE; - index = 0; - x = -1; - - /* make sure we've allocated enough space for titles and sequences */ - if(seq == nseqs) { - - // printf("realloc(). seq = %d, nseqs = %d\n", seq, nseqs); - - nseqs *= 2; - - alignment->titles = (char **)realloc(alignment->titles, nseqs*sizeof(char *)); - if(!alignment->titles) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - - alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char)); - if(!alignment->data) { - fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - } - - // printf("Allocating %d bytes for title %d: %s\n", (int)strlen(buf), (int)seq, buf); - - alignment->titles[seq] = (char *)calloc(strlen(buf), sizeof(char)); - if(!alignment->titles[seq]) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - - /* lets forward to the first non-space (space/tab) character after the '>' */ - - if(first_sequence_flag) { - ptr = buf; - } else { - ptr = &buf[1]; - } - while(*ptr == '\t' || *ptr == ' ') { - ptr++; - } - sscanf(ptr, "%s", alignment->titles[seq]); /* get the first word and use as the title */ - - alignment->nseq++; - } - - buf[index++] = c; - - break; - - - case NJ_FASTA_MODE_SEQUENCE: - - if(c == '>') { - - if(first_sequence_flag) { - first_sequence_flag = 0; - - /* allocate our alignment data section here */ - alignment->length = index-1; - - nseqs = NJ_INITIAL_NSEQS; - alignment->data = (char *)calloc(alignment->length*nseqs, sizeof(char)); - if(!alignment->data) { - fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - } - - if(!first_sequence_flag) { - if(index-1 < alignment->length) { - fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); - goto XIT_BAD; - } - } - - /* re-allocate if necessary */ - /* - if(seq >= nseqs) { - nseqs *= 2; - alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char)); - if(!alignment->data) { - fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - } - */ - - /* copy buf to alignment */ - for(i=1;i<=alignment->length;i++) { - alignment->data[seq*alignment->length+i-1] = buf[i]; - } - - state = NJ_FASTA_MODE_TITLE; - index = 1; - x = 1; - - buf[0] = c; - - seq++; - - } else { - - if(NJ_is_whitespace(c)) { - break; - } - - if(!first_sequence_flag) { - if(index-1 >= alignment->length) { - fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); - goto XIT_BAD; - } - } - - - /* - * Here we check to make sure that the symbol read is appropriate - * for the type of data the user specified. (dna or protein). - * We also handle ambiguity codes by converting them to a specific - * assigned ambiguity code character. Ambiguity codes are ignored - * when computing distances - */ - - if(nj_args->dna_flag) { - if(NJ_is_dna(c)) { - buf[index++] = toupper(c); - } else { - if(NJ_is_dna_ambiguity(c)) { - buf[index++] = NJ_AMBIGUITY_CHAR; - } else { - fprintf(stderr, "Clearcut: Unknown symbol '%c' in nucleotide sequence %ld.\n", c, seq); - goto XIT_BAD; - } - } - } else if(nj_args->protein_flag) { - if(NJ_is_protein(c)) { - buf[index++] = toupper(c); - } else { - if(NJ_is_protein_ambiguity(c)) { - buf[index++] = NJ_AMBIGUITY_CHAR; - } else { - fprintf(stderr, "Clearcut: Unknown symbol '%c' in protein sequence %ld.\n", c, seq); - goto XIT_BAD; - } - } - } - - } - - break; - - default: - goto XIT_BAD; - - break; - - } - } - - if(index-1 != alignment->length) { - fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); - goto XIT_BAD; - } - - /* check for duplicate taxon labels */ - if(!NJ_taxaname_unique(alignment)) { - goto XIT_BAD; - } - - return(alignment); - - - XIT_BAD: - - if(fp) { - fprintf(stderr, "Clearcut: Fatal error parsing FASTA file at file offset %ld.\n", ftell(fp)); - } - - if(buf) { - free(buf); - } - - NJ_free_alignment(alignment); - - return(NULL); -} - - - - -/* - * NJ_print_alignment() - Print multiple sequence alignment (for debugging) - * - * INPUTS: - * ------- - * alignment -- A pointer to the alignment - * - * RETURNS: - * -------- - * NONE - * - */ -void -NJ_print_alignment(NJ_alignment *alignment) { - - long int i, j; - - printf("nseq = %ld, length = %ld\n", alignment->nseq, alignment->length); - - for(i=0;inseq;i++) { - - printf("> %s\n", alignment->titles[i]); - - for(j=0;jlength;j++) { - printf("%c", alignment->data[i*alignment->length+j]); - } - - printf("\n"); - } - - return; -} - - - - - - - -/* - * - * NJ_free_alignment() - Free all of the memory allocated for the - * multiple sequence alignment - * - * INPUTS: - * ------- - * alignment -- A pointer to the multiple sequence alignment - * - * RETURNS: - * -------- - * NONE - * - */ -void -NJ_free_alignment(NJ_alignment *alignment) { - - long int i; - - if(alignment) { - - /* free the allocated titles */ - if(alignment->titles) { - for(i=0;inseq;i++) { - - if(alignment->titles[i]) { - free(alignment->titles[i]); - } - } - - free(alignment->titles); - } - - /* free the alignment data */ - if(alignment->data) { - free(alignment->data); - } - - /* free the alignment itself */ - free(alignment); - } - - return; -} - - - - -/* - * NJ_taxaname_unique() - Check to see if taxanames are unique in alignment - * - * INPUTS: - * ------- - * alignment -- a pointer to a multiple sequence alignment - * - * OUTPUTS: - * -------- - * int -- 0 if all taxanames in alignment are unique - * 1 if all taxanames in alignment are NOT unique - * - * - * DESCRIPTION: - * ------------ - * - * Check to see if the taxanames in the alignment are unique. It - * will be impossible to make sense of the final tree if the taxon - * labels are not unqiue. - * - */ -int -NJ_taxaname_unique(NJ_alignment *alignment) { - - long int i, j; - - for(i=0;inseq;i++) { - for(j=i+1;jnseq;j++) { - if(!strcmp(alignment->titles[i], - alignment->titles[j])) { - fprintf(stderr, "Clearcut: Taxa %ld and %ld (%s) do not have unique taxon labels.\n", - i, j, alignment->titles[i]); - return(0); - } - } - } - - return(1); -} - - -void -NJ_print_titles(NJ_alignment *alignment) { - - int i; - - for(i=0;inseq;i++) { - printf("%d: %s\n", i, alignment->titles[i]); - } - - return; -} diff --git a/fasta.cpp b/fasta.cpp new file mode 100644 index 0000000..1dc17fc --- /dev/null +++ b/fasta.cpp @@ -0,0 +1,755 @@ +/* + * fasta.c + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * Functions for parsing FASTA formatted alignment files + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + +#include +#include +#include +#include + +#include "clearcut.h" +#include "common.h" +#include "fasta.h" + + +#define NJ_NUM_DNA_AMBIGUITY_SYMS 14 +static const char NJ_dna_ambiguity_syms[NJ_NUM_DNA_AMBIGUITY_SYMS] = +{ + 'M', 'R', 'W', 'S', 'Y', 'K', + 'V', 'H', 'D', 'B', 'X', 'N', + '-', '.' +}; + + +#define NJ_NUM_PROTEIN_AMBIGUITY_SYMS 6 +static const char NJ_protein_ambiguity_syms[NJ_NUM_PROTEIN_AMBIGUITY_SYMS] = +{ + 'X', 'B', 'Z', '*', '-', '.' +}; + +#define NJ_NUM_DNA_SYMS 5 +static const char NJ_dna_syms[NJ_NUM_DNA_SYMS] = +{ + 'A', 'G', 'C', 'T', 'U' +}; + + +#define NJ_NUM_PROTEIN_SYMS 20 +static const char NJ_protein_syms[NJ_NUM_PROTEIN_SYMS] = +{ + 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', + 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V' +}; + + + + + +/* + * NJ_is_whitespace() - Check to see if character is whitespace + * + * INPUTS: + * ------- + * c -- character to check + * + * RETURNS: + * -------- + * int -- 0 if not whitespace + * 1 if whitespace + */ +static inline +int +NJ_is_whitespace(char c) { + if( c == ' ' || /* space */ + c == '\n' || /* newline */ + c == '\r' || /* carriage-return */ + c == '\v' || /* vertical tab */ + c == '\f' || /* form feed */ + c == '\t' ) { /* horizontal tab */ + return(1); + } else { + return(0); + } +} + + + + + +/* + * NJ_is_dna() - + * + * Determines if the given symbol is DNA + * + * RETURNS: 1 if DNA + * 0 if not DNA + * + */ +static inline +int +NJ_is_dna(char c) { + + int i; + char up_c; + + up_c = toupper(c); + + for(i=0;i sequence1 + * ATAGATATAGATTAGAATAT----TATAGATAT----ATATAT-TTT- + * > sequence2 + * --ATAGATA---ATATATATATTTT--GTCTCATAGT---ATATGCTT + * > sequence3 + * TTATAGATA---ATATATATATTTTAAGTCTCATAGT-A-ATATGC-- + * + * This function will parse alignments for DNA or protein, and will do + * so mindful of ambiguity codes for these kinds of sequences. All + * ambiguity codes are ignored by this program for the purposes of + * computing a distance matrix from a multiple alignment. By design, + * this program does not auto-detect DNA vs. Protein, and requires that + * the user explictly specify that on the command-line. + * + * Gaps can be represented either with the '-' or '.' characters. + * + * Newlines and other whitespace are allowed to be interspersed + * throughout the sequences. + * + * Taxon labels are required to be unique, and they must start with + * an alphabetic character (not a number, etc.). The parser will read + * the first token after the > character in the description line up until the + * first whitespace and use that for the taxon label. + * + * For example, in the line "> taxon1 is homo sapien", the taxon label will be + * "taxon1" + * + */ +NJ_alignment * +NJ_read_fasta(NJ_ARGS *nj_args) { + + FILE *fp = NULL; + char *buf = NULL; + char *ptr = NULL; + NJ_alignment *alignment = NULL; + + char c; + int state; + long int index, x, seq; + long int i; + long int bufsize, nseqs = NJ_INITIAL_NSEQS; + int first_sequence_flag; + + + + + /* + * In this function, we implement a FASTA alignment parser which + * reads in an alignment character-by-character, maintaining state + * information which guides the parser. + * + * The program reads either DNA or Protein alignments. All title lines + * and sequences can be arbitrarily long. Gaps can be represented by + * "-" or "." characters. + * + * Ambiguity codes are also handled. + * + */ + + /* + * We can't handle reading fasta input unless the user explicity + * specifies the input type...just to be sure. + */ + if( (!nj_args->dna_flag && !nj_args->protein_flag) || + (nj_args->dna_flag && nj_args->protein_flag) ) { + fprintf(stderr, "Clearcut: Explicitly specify protein or DNA\n"); + goto XIT_BAD; + } + + /* open specified fasta file here */ + if(nj_args->stdin_flag) { + fp = stdin; + } else { + fp = fopen(nj_args->infilename, "r"); + if(!fp) { + fprintf(stderr, "Clearcut: Failed to open input FASTA file: %s\n", nj_args->infilename); + perror("Clearcut"); + goto XIT_BAD; + } + } + + /* allocate the initial buffer */ + bufsize = NJ_INITIAL_BUFFER_SIZE; + buf = (char *)calloc(bufsize, sizeof(char)); + + /* allocate the alignment container here */ + alignment = (NJ_alignment *)calloc(1, sizeof(NJ_alignment)); + + /* allocate initial title array */ + // printf("allocating initial title array\n"); + alignment->titles = (char **)calloc(NJ_INITIAL_NSEQS, sizeof(char *)); + + /* make sure that we successfully allocated memory */ + if(!buf || !alignment || !alignment->titles) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + + /* a flag */ + first_sequence_flag = 1; + + index = 0; /* tracks the position in buffer */ + x = 0; /* tracks the position on sequence */ + seq = 0; /* tracks the active sequence */ + + /* intitial state of state machine */ + state = NJ_FASTA_MODE_UNKNOWN; + + while(1) { + + /* get the next character */ + c = fgetc(fp); + if(feof(fp)) { + + if(state == NJ_FASTA_MODE_SEQUENCE) { + buf[index+1] = '\0'; + + /* copy buf to alignment */ + for(i=1;i<=alignment->length;i++) { + alignment->data[seq*alignment->length+i-1] = buf[i]; + } + } + + break; + } + + /* make sure our dynamic buffer is big enough */ + if(index >= bufsize) { + bufsize *= 2; + buf = (char *)realloc(buf, bufsize); + if(!buf) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + } + + switch(state) { + + case NJ_FASTA_MODE_UNKNOWN: + + if(!NJ_is_whitespace(c)) { + if(c == '>') { + state = NJ_FASTA_MODE_TITLE; + buf[0] = '>'; + } else { + goto XIT_BAD; + } + } + + break; + + case NJ_FASTA_MODE_TITLE: + + if( c == '\n' || + c == '\r' ) { + + buf[index] = '\0'; + state = NJ_FASTA_MODE_SEQUENCE; + index = 0; + x = -1; + + /* make sure we've allocated enough space for titles and sequences */ + if(seq == nseqs) { + + // printf("realloc(). seq = %d, nseqs = %d\n", seq, nseqs); + + nseqs *= 2; + + alignment->titles = (char **)realloc(alignment->titles, nseqs*sizeof(char *)); + if(!alignment->titles) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + + alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char)); + if(!alignment->data) { + fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + } + + // printf("Allocating %d bytes for title %d: %s\n", (int)strlen(buf), (int)seq, buf); + + alignment->titles[seq] = (char *)calloc(strlen(buf), sizeof(char)); + if(!alignment->titles[seq]) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + + /* lets forward to the first non-space (space/tab) character after the '>' */ + + if(first_sequence_flag) { + ptr = buf; + } else { + ptr = &buf[1]; + } + while(*ptr == '\t' || *ptr == ' ') { + ptr++; + } + sscanf(ptr, "%s", alignment->titles[seq]); /* get the first word and use as the title */ + + alignment->nseq++; + } + + buf[index++] = c; + + break; + + + case NJ_FASTA_MODE_SEQUENCE: + + if(c == '>') { + + if(first_sequence_flag) { + first_sequence_flag = 0; + + /* allocate our alignment data section here */ + alignment->length = index-1; + + nseqs = NJ_INITIAL_NSEQS; + alignment->data = (char *)calloc(alignment->length*nseqs, sizeof(char)); + if(!alignment->data) { + fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + } + + if(!first_sequence_flag) { + if(index-1 < alignment->length) { + fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); + goto XIT_BAD; + } + } + + /* re-allocate if necessary */ + /* + if(seq >= nseqs) { + nseqs *= 2; + alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char)); + if(!alignment->data) { + fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + } + */ + + /* copy buf to alignment */ + for(i=1;i<=alignment->length;i++) { + alignment->data[seq*alignment->length+i-1] = buf[i]; + } + + state = NJ_FASTA_MODE_TITLE; + index = 1; + x = 1; + + buf[0] = c; + + seq++; + + } else { + + if(NJ_is_whitespace(c)) { + break; + } + + if(!first_sequence_flag) { + if(index-1 >= alignment->length) { + fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); + goto XIT_BAD; + } + } + + + /* + * Here we check to make sure that the symbol read is appropriate + * for the type of data the user specified. (dna or protein). + * We also handle ambiguity codes by converting them to a specific + * assigned ambiguity code character. Ambiguity codes are ignored + * when computing distances + */ + + if(nj_args->dna_flag) { + if(NJ_is_dna(c)) { + buf[index++] = toupper(c); + } else { + if(NJ_is_dna_ambiguity(c)) { + buf[index++] = NJ_AMBIGUITY_CHAR; + } else { + fprintf(stderr, "Clearcut: Unknown symbol '%c' in nucleotide sequence %ld.\n", c, seq); + goto XIT_BAD; + } + } + } else if(nj_args->protein_flag) { + if(NJ_is_protein(c)) { + buf[index++] = toupper(c); + } else { + if(NJ_is_protein_ambiguity(c)) { + buf[index++] = NJ_AMBIGUITY_CHAR; + } else { + fprintf(stderr, "Clearcut: Unknown symbol '%c' in protein sequence %ld.\n", c, seq); + goto XIT_BAD; + } + } + } + + } + + break; + + default: + goto XIT_BAD; + + break; + + } + } + + if(index-1 != alignment->length) { + fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); + goto XIT_BAD; + } + + /* check for duplicate taxon labels */ + if(!NJ_taxaname_unique(alignment)) { + goto XIT_BAD; + } + + return(alignment); + + + XIT_BAD: + + if(fp) { + fprintf(stderr, "Clearcut: Fatal error parsing FASTA file at file offset %ld.\n", ftell(fp)); + } + + if(buf) { + free(buf); + } + + NJ_free_alignment(alignment); + + return(NULL); +} + + + + +/* + * NJ_print_alignment() - Print multiple sequence alignment (for debugging) + * + * INPUTS: + * ------- + * alignment -- A pointer to the alignment + * + * RETURNS: + * -------- + * NONE + * + */ +void +NJ_print_alignment(NJ_alignment *alignment) { + + long int i, j; + + printf("nseq = %ld, length = %ld\n", alignment->nseq, alignment->length); + + for(i=0;inseq;i++) { + + printf("> %s\n", alignment->titles[i]); + + for(j=0;jlength;j++) { + printf("%c", alignment->data[i*alignment->length+j]); + } + + printf("\n"); + } + + return; +} + + + + + + + +/* + * + * NJ_free_alignment() - Free all of the memory allocated for the + * multiple sequence alignment + * + * INPUTS: + * ------- + * alignment -- A pointer to the multiple sequence alignment + * + * RETURNS: + * -------- + * NONE + * + */ +void +NJ_free_alignment(NJ_alignment *alignment) { + + long int i; + + if(alignment) { + + /* free the allocated titles */ + if(alignment->titles) { + for(i=0;inseq;i++) { + + if(alignment->titles[i]) { + free(alignment->titles[i]); + } + } + + free(alignment->titles); + } + + /* free the alignment data */ + if(alignment->data) { + free(alignment->data); + } + + /* free the alignment itself */ + free(alignment); + } + + return; +} + + + + +/* + * NJ_taxaname_unique() - Check to see if taxanames are unique in alignment + * + * INPUTS: + * ------- + * alignment -- a pointer to a multiple sequence alignment + * + * OUTPUTS: + * -------- + * int -- 0 if all taxanames in alignment are unique + * 1 if all taxanames in alignment are NOT unique + * + * + * DESCRIPTION: + * ------------ + * + * Check to see if the taxanames in the alignment are unique. It + * will be impossible to make sense of the final tree if the taxon + * labels are not unqiue. + * + */ +int +NJ_taxaname_unique(NJ_alignment *alignment) { + + long int i, j; + + for(i=0;inseq;i++) { + for(j=i+1;jnseq;j++) { + if(!strcmp(alignment->titles[i], + alignment->titles[j])) { + fprintf(stderr, "Clearcut: Taxa %ld and %ld (%s) do not have unique taxon labels.\n", + i, j, alignment->titles[i]); + return(0); + } + } + } + + return(1); +} + + +void +NJ_print_titles(NJ_alignment *alignment) { + + int i; + + for(i=0;inseq;i++) { + printf("%d: %s\n", i, alignment->titles[i]); + } + + return; +} diff --git a/getopt_long.c b/getopt_long.c deleted file mode 100644 index 1756a2e..0000000 --- a/getopt_long.c +++ /dev/null @@ -1,2096 +0,0 @@ -/* - This getopt_long() is compatible with GNU's, however, added original - extention (short 1 byte option). - - - Copyright (c) 2004 Koji Arai - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation files - (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - - Compilation for Test: - - GNU: - cc -DUSE_GNU -DDEBUG getopt_long.c -o test_getopt_long_gnu - - not GNU: - cc -I. -DDEBUG getopt_long.c -o test_getopt_long - - ./test_getopt_long - ./test_getopt_long_gnu - - BUGS: - * not implemented any features for getopt() and getopt_long(). -*/ - -#include -#include -#include - -#if DEBUG -static int -puts_argv(char **argv) -{ - int i; - - for (i = 0; argv[i]; i++) { - if (i) printf(" "); - - printf("%s", argv[i]); - } - printf("\n"); - - return 0; -} -#endif - -#ifndef USE_GNU -#include -#include "getopt_long.h" - -char *optarg; -int optind; - -int opterr; -int optopt; - -/* - return value 0: no option (include '-') - 1: short option like '-x' - 2: long option like '--xxx' and just '--' -*/ -static int -is_option(char *arg) -{ - if (arg[0] == '-') { - switch (arg[1]) { - case 0: /* just "-" */ - return 0; - case '-': /* long option (include just "--")*/ - return 2; - default: /* short option */ - return 1; - } - } - return 0; -} - -static int -insert_argv(char **argv, int src, int dest) -{ - int i; - char *tmp = argv[src]; - - if (src > dest) { - for (i = src; i > dest; i--) - argv[i] = argv[i-1]; - } - if (src < dest) { - for (i = src; i < dest; i++) - argv[i] = argv[i+1]; - } - - argv[dest] = tmp; - - return 0; -} - -static int -search_longopt(char *arg, struct option *longopts) -{ - int i, found = -1; - int len; - for (len = 0; arg[len] && arg[len] != '='; len++) - ; - - for (i = 0; longopts[i].name; i++) { - if (strncmp(arg, longopts[i].name, len) == 0) { - if (found != -1) - return -1; /* found some candidate */ - found = i; - } - } - return found; -} - -/* - * implemented my extention feature. - * optional 1 byte argument with [...] - * e.g.) shortopts = "a[0123]b" - * accepts "-a0 -a1b" (same as "-a0 -a1 -b") - */ -static int -has_argument_short(char *arg, const char *shortopts) -{ - int i; - int open_bracket = 0; - for (i = 0; shortopts[i]; i++) { - switch (shortopts[i]) { - case '[': - open_bracket++; - continue; - case ']': - if (open_bracket <= 0) { - fprintf(stderr, "getopt_long() -- unbalanced bracket in short options"); - return -1; - } - open_bracket--; - continue; - } - if (open_bracket) continue; - if (*arg != shortopts[i]) continue; - - switch (shortopts[i+1]) { - case ':': - if (shortopts[i+2] != ':') { - if (arg[1]) - return 1; /* following string is argument */ - else - return 2; /* next argv is argument */ - } - else { - /* '::' means optional argument (GNU extention) */ - if (arg[1]) - return 1; - else - return 0; /* no argument */ - } - case '[': - if (arg[1] == '\0') - return 0; /* no argument */ - /* my extention */ - for (i++; shortopts[i] && shortopts[i] != ']'; i++) { - if (arg[1] == shortopts[i]) - return 3; /* has 1 byte argument */ - } - if (!shortopts[i]) { - fprintf(stderr, "getopt_long() -- unbalanced bracket in short options"); - return -1; - } - break; - default: - return 0; /* no argument */ - } - } - /* Invalid option */ - return -1; -} - -static int -has_argument_long(char *arg, struct option *longopts) -{ - int i; - - i = search_longopt(arg, longopts); - if (i == -1) { - /* Invalid option */ - return -1; - } - else { - int len = strlen(arg); - char *p = strchr(arg, '='); - if (p) { - len = p - arg; - } - - switch (longopts[i].has_arg) { - case no_argument: - return 0; - case required_argument: - if (arg[len] == '=') - return 1; - else - return 2; - case optional_argument: - if (arg[len] == '=') - return 1; - else - return 0; - default: - assert(0); - } - } -} - -/* - -1: no option - 0: no argument - 1: has argument in this argv - 2: has argument in next argv - 3: has 1 byte argument in this argv -*/ -static int -has_argument(char *arg, - const char *shortopts, - struct option *longopts) -{ - int i, n; - - switch (is_option(arg)) { - case 0: /* no option */ - return -1; - case 1: - /* short option */ - n = -1; - for (i = 1; arg[i]; i++) { - n = has_argument_short(arg+i, shortopts); - if (n == 0 && arg[i+1]) continue; - if (n == 3 && arg[i+2]) { i++; continue; } - break; - } - return n; - case 2: - /* long option */ - return has_argument_long(arg+2, longopts); - break; - default: - assert(0); - } -} - -int -getopt_long(int argc, char **argv, - const char *shortopts, - struct option *longopts, - int *indexptr) -{ - char *opt; - int i; - static int shortoptind; - static int no_optind = 0; - - if (optind == 0) { /* skip first argument (command name) */ - optind++; - no_optind = 0; - shortoptind = 0; - } - - optarg = 0; - - if (no_optind && !shortoptind) { - while (!is_option(argv[no_optind])) - insert_argv(argv, no_optind, optind-1); - - if (has_argument(argv[no_optind], shortopts, longopts) == 2) - no_optind += 2; - else - no_optind++; - - if (argv[optind] && strcmp(argv[optind], "--") == 0) { - while (!is_option(argv[no_optind])) - insert_argv(argv, no_optind, optind); - optind = no_optind; - no_optind = 0; - } - } - - if (optind >= argc) - goto end_of_option; - - retry: - /* - puts_argv(&argv[optind]); - */ - opt = argv[optind]; - - if (shortoptind == 0 && is_option(opt) == 1) { - shortoptind++; - } - - if (shortoptind) { - /* short option */ - char *p = &opt[shortoptind]; - - if (*p == '\0') - assert(0); - - switch (has_argument_short(p, shortopts)) { - case 0: - /* no argument */ - optarg = 0; - - shortoptind++; - if (opt[shortoptind] == '\0') - optind++, shortoptind = 0; - return *p; - case 1: - /* following character is argument */ - optind++, shortoptind = 0; - optarg = &p[1]; - return *p; - case 2: - /* next argv is argument */ - optind++, shortoptind = 0; - optarg = argv[optind++]; - return *p; - case 3: - /* has 1 byte argument */ - optarg = &p[1]; - if (p[2] == 0) - optind++, shortoptind = 0; - else - shortoptind += 2; - return *p; - default: - /* Invalid option */ - if (opterr) - fprintf(stderr, - "%s: invalid option -- %c\n", - argv[0], - *p); - - optind++, shortoptind = 0; - optopt = *p; - return '?'; - } - } - else if (opt[0] == '-' && opt[1] == '-') { - /* long option */ - - if (opt[2] == '\0') { - /* end of command line switch */ - optind++; - return -1; - } - - opt += 2; - - i = search_longopt(opt, longopts); - if (i == -1) { - optind++; - optopt = 0; - return '?'; - } - else { - int len = strlen(opt); - char *p = strchr(opt, '='); - if (p) { - len = p - opt; - } - - switch (longopts[i].has_arg) { - case no_argument: - break; - case required_argument: - if (opt[len] == '=') - optarg = opt + len + 1; - else { - optind++; - optarg = argv[optind]; - if (optarg == 0) { - if (opterr) - fprintf(stderr, - "%s: option `--%s' requires an argument\n", - argv[0], - opt); - - optopt = 0; - return '?'; /* no argument */ - } - } - break; - case optional_argument: - if (opt[len] == '=') - optarg = opt + len + 1; - else { - optarg = 0; - } - break; - default: - break; - } - - *indexptr = i; - optind++; - if (longopts[i].flag) { - *longopts[i].flag = longopts[i].val; - return 0; - } - else { - return longopts[i].val; - } - } - - optind++; - optopt = 0; - return '?'; - } - - /* not option */ - if (no_optind == 0) - no_optind = optind; - - for (i = optind; argv[i]; i++) { - if (is_option(argv[i])) { - optind = i; - goto retry; - } - } - - end_of_option: - if (no_optind) { - optind = no_optind; - no_optind = 0; - } - - return -1; -} -#endif /* USE_GNU */ - -#if DEBUG - -#include -#include -#include - -#if USE_GNU -#include /* use GNU getopt_long() */ -#endif - -static int verbose_flag; -static int option_index; -int argc; -char *argv[50]; -char **p; -int c; -static struct option long_options[] = { - {"verbose", no_argument, &verbose_flag, 1}, - {"brief", no_argument, &verbose_flag, 0}, - {"add", required_argument, 0, 'a'}, - {"append", no_argument, 0, 0}, - {"delete", required_argument, 0, 0}, - {"create", optional_argument, 0, 0}, - {"change", optional_argument, 0, 0}, - {0, 0, 0, 0} -}; - -int -call_getopt_long(int argc, char **argv, - const char *shortopts, - struct option *longopts, - int *indexptr) -{ - int c; - c = getopt_long(argc, argv, shortopts, longopts, indexptr); - puts_argv(argv); - printf("ret=%d(%c) option_index=%d ", c, c, option_index); - printf("optind=%d optarg=[%s] opterr=%d optopt=%d(%c)\n", - optind, optarg, opterr, optopt, optopt); - if (c == 0) { - struct option *opt; - opt = &longopts[*indexptr]; - printf("long option: --%s has_arg=%d\n", opt->name, opt->has_arg); - if (opt->flag) - printf(" flag=[%8p] val=%d\n", opt->flag, *opt->flag); - } - - return c; -} - -void -test1() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "-a"; - argc++; *p++ = "-bcd"; - argc++; *p++ = "-d"; - argc++; *p++ = "-e"; - argc++; *p++ = "-f"; - argc++; *p++ = "-g"; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 'a'); - assert(option_index == 0); - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 'b'); - assert(option_index == 0); - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 'c'); - assert(option_index == 0); - assert(optind == 3); - assert(optarg == &argv[2][3]); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 'd'); - assert(option_index == 0); - assert(optind == 5); - assert(optarg == argv[4]); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == '?'); - assert(option_index == 0); - assert(optind == 6); - assert(optarg == 0); - assert(optopt == 'f'); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == '?'); - assert(option_index == 0); - assert(optind == 7); - assert(optarg == 0); - assert(optopt == 'g'); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == -1); - assert(option_index == 0); - assert(optind == 7); - assert(optarg == 0); - assert(optopt == 'g'); /* no changed */ -} - -void -test2() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "--verbose"; - argc++; *p++ = "--brief"; - argc++; *p++ = "--add"; - argc++; *p++ = "add_argument"; - argc++; *p++ = "--add=add_argument"; - argc++; *p++ = "--append"; - argc++; *p++ = "--delete=del_argument"; - argc++; *p++ = "--create=cre_argument"; - argc++; *p++ = "--create"; - argc++; *p++ = "files..."; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 0); - assert(option_index == 0); - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 'g'); /* no changed */ - assert(strcmp(long_options[option_index].name, "verbose") == 0); - assert(*long_options[option_index].flag == 1); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 0); - assert(option_index == 1); - assert(optind == 3); - assert(optarg == 0); - assert(optopt == 'g'); /* no changed */ - assert(strcmp(long_options[option_index].name, "brief") == 0); - assert(*long_options[option_index].flag == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 'a'); - assert(option_index == 2); - assert(optind == 5); - assert(optarg == argv[4]); - assert(optopt == 'g'); /* no changed */ - assert(strcmp(long_options[option_index].name, "add") == 0); - assert(long_options[option_index].flag == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 'a'); - assert(option_index == 2); - assert(optind == 6); - assert(optarg == argv[5]+6); - assert(optopt == 'g'); /* no changed */ - assert(strcmp(long_options[option_index].name, "add") == 0); - assert(long_options[option_index].flag == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 0); - assert(option_index == 3); - assert(optind == 7); - assert(optarg == 0); - assert(optopt == 'g'); /* no changed */ - assert(strcmp(long_options[option_index].name, "append") == 0); - assert(long_options[option_index].flag == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 0); - assert(option_index == 4); - assert(optind == 8); - assert(optarg == argv[7]+9); - assert(optopt == 'g'); /* no changed */ - assert(strcmp(long_options[option_index].name, "delete") == 0); - assert(long_options[option_index].flag == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 0); - assert(option_index == 5); - assert(optind == 9); - assert(optarg == argv[8]+9); - assert(optopt == 'g'); /* no changed */ - assert(strcmp(long_options[option_index].name, "create") == 0); - assert(long_options[option_index].flag == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == 0); - assert(option_index == 5); - assert(optind == 10); - assert(optarg == 0); - assert(optopt == 'g'); /* no changed */ - assert(strcmp(long_options[option_index].name, "create") == 0); - assert(long_options[option_index].flag == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == -1); - assert(option_index == 5); /* no changed */ - assert(optind == 10); - assert(optarg == 0); - assert(optopt == 'g'); /* no changed */ - assert(strcmp(argv[optind], "files...") == 0); - -} - -void -test3() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "--delete"; /* required argument has no argument */ - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == '?'); - assert(option_index == 5); /* no changed */ - assert(optind == 2); /* changed */ - assert(optarg == 0); - assert(optopt == 0); /* changed */ - assert(argv[optind] == 0); - - /* */ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "--file"; /* not option */ - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - assert(c == '?'); - assert(option_index == 5); /* no changed */ - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - assert(argv[optind] == 0); -} - -void -test4() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "-a"; - argc++; *p++ = "a1"; - argc++; *p++ = "a2"; - argc++; *p++ = "-b"; - argc++; *p++ = "b"; - argc++; *p++ = "-efg"; /* some options in a argument */ - argc++; *p++ = "g"; - argc++; *p++ = "-c"; - argc++; *p++ = "c"; - argc++; *p++ = "d"; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "a1") == 0); - assert(strcmp(*p++, "a2") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(strcmp(*p++, "b") == 0); - assert(strcmp(*p++, "-efg") == 0); - assert(strcmp(*p++, "g") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'a'); - assert(option_index == 5); /* no changed */ - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "a1") == 0); - assert(strcmp(*p++, "a2") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(strcmp(*p++, "b") == 0); - assert(strcmp(*p++, "-efg") == 0); - assert(strcmp(*p++, "g") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'b'); - assert(option_index == 5); /* no changed */ - assert(optind == 5); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(strcmp(*p++, "a1") == 0); - assert(strcmp(*p++, "a2") == 0); - assert(strcmp(*p++, "b") == 0); - assert(strcmp(*p++, "-efg") == 0); - assert(strcmp(*p++, "g") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'e'); - assert(option_index == 5); /* no changed */ - assert(optind == 6); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(strcmp(*p++, "a1") == 0); - assert(strcmp(*p++, "a2") == 0); - assert(strcmp(*p++, "b") == 0); - assert(strcmp(*p++, "-efg") == 0); - assert(strcmp(*p++, "g") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'f'); - assert(option_index == 5); /* no changed */ - assert(optind == 6); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(strcmp(*p++, "a1") == 0); - assert(strcmp(*p++, "a2") == 0); - assert(strcmp(*p++, "b") == 0); - assert(strcmp(*p++, "-efg") == 0); - assert(strcmp(*p++, "g") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'g'); - assert(option_index == 5); /* no changed */ - assert(optind == 8); - assert(optarg == argv[7]); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(strcmp(*p++, "-efg") == 0); - assert(strcmp(*p++, "g") == 0); - assert(strcmp(*p++, "a1") == 0); - assert(strcmp(*p++, "a2") == 0); - assert(strcmp(*p++, "b") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'c'); - assert(option_index == 5); /* no changed */ - assert(optind == 10); - assert(optarg == argv[9]); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(strcmp(*p++, "-efg") == 0); - assert(strcmp(*p++, "g") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "a1") == 0); - assert(strcmp(*p++, "a2") == 0); - assert(strcmp(*p++, "b") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 5); /* no changed */ - assert(optind == 7); - assert(optarg == 0); - assert(optopt == 0); - -} - -void -test5() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "-a"; - argc++; *p++ = "-"; - argc++; *p++ = "-b"; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(*p == 0); - - assert(c == 'a'); - assert(option_index == 5); /* no changed */ - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(*p == 0); - - assert(c == 'b'); - assert(option_index == 5); /* no changed */ - assert(optind == 4); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(strcmp(*p++, "-") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 5); /* no changed */ - assert(optind == 3); - assert(optarg == 0); - assert(optopt == 0); -} - -void -test6() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "-a"; - argc++; *p++ = "-"; - argc++; *p++ = "-"; - argc++; *p++ = "-b"; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(*p == 0); - - assert(c == 'a'); - assert(option_index == 5); /* no changed */ - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(*p == 0); - - assert(c == 'b'); - assert(option_index == 5); /* no changed */ - assert(optind == 5); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-b") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 5); /* no changed */ - assert(optind == 3); - assert(optarg == 0); - assert(optopt == 0); -} - -void -test7() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "-a"; - argc++; *p++ = "-"; - argc++; *p++ = "-"; - argc++; *p++ = "-c"; - argc++; *p++ = "c"; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(*p == 0); - - assert(c == 'a'); - assert(option_index == 5); /* no changed */ - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(*p == 0); - - assert(c == 'c'); - assert(option_index == 5); /* no changed */ - assert(optind == 6); - assert(optarg == argv[5]); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 5); /* no changed */ - assert(optind == 4); - assert(optarg == 0); - assert(optopt == 0); -} - -void -test8() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "-a"; - argc++; *p++ = "-c"; - argc++; *p++ = "c"; - argc++; *p++ = "--"; - argc++; *p++ = "-d"; - argc++; *p++ = "d"; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'a'); - assert(option_index == 5); /* no changed */ - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'c'); - assert(option_index == 5); /* no changed */ - assert(optind == 4); - assert(optarg == argv[3]); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 5); /* no changed */ - assert(optind == 5); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'd'); - assert(option_index == 5); /* no changed */ - assert(optind == 7); - assert(optarg == argv[6]); - assert(optopt == 0); -} - -void -test9() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "-a"; - argc++; *p++ = "-"; - argc++; *p++ = "-"; - argc++; *p++ = "-c"; - argc++; *p++ = "c"; - argc++; *p++ = "--"; - argc++; *p++ = "-d"; - argc++; *p++ = "d"; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'a'); - assert(option_index == 5); /* no changed */ - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'c'); - assert(option_index == 5); /* no changed */ - assert(optind == 6); - assert(optarg == argv[5]); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 5); /* no changed */ - assert(optind == 5); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "c") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == 'd'); - assert(option_index == 5); /* no changed */ - assert(optind == 9); - assert(optarg == argv[8]); - assert(optopt == 0); -} - -void -test10() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "-a"; - argc++; *p++ = "-cc"; - argc++; *p++ = "-d"; - argc++; *p++ = "d"; - argc++; *p++ = "-c"; /* no argument */ - argc++; *p++ = "-d"; /* at last */ - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-cc") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'a'); - assert(option_index == 5); /* no changed */ - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-cc") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'c'); - assert(option_index == 5); /* no changed */ - assert(optind == 3); - assert(optarg == argv[2]+2); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-cc") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'd'); - assert(option_index == 5); /* no changed */ - assert(optind == 4); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-cc") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'c'); - assert(option_index == 5); /* no changed */ - assert(optind == 6); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-cc") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'd'); - assert(option_index == 5); /* no changed */ - assert(optind == 7); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-cc") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "-c") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 5); /* no changed */ - assert(optind == 6); - assert(optarg == 0); - assert(optopt == 0); -} - -void -test11() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "--verbose"; - argc++; *p++ = "--create=c"; - argc++; *p++ = "--change"; - argc++; *p++ = "d"; - argc++; *p++ = "--create"; /* no argument */ - argc++; *p++ = "--change"; /* at last */ - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 0); - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 5); - assert(optind == 3); - assert(optarg == argv[2]+9); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 6); - assert(optind == 4); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 5); - assert(optind == 6); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 6); - assert(optind == 7); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 6); - assert(optind == 6); - assert(optarg == 0); - assert(optopt == 0); -} - -void -test12() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "--verbose"; - argc++; *p++ = "--create=c"; - argc++; *p++ = "files..."; - argc++; *p++ = "--delete"; /* required argument */ - argc++; *p++ = "d"; - argc++; *p++ = "--create"; /* no argument */ - argc++; *p++ = "--change"; /* at last */ - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(strcmp(*p++, "--delete") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 0); - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(strcmp(*p++, "--delete") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 5); - assert(optind == 3); - assert(optarg == argv[2]+9); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(strcmp(*p++, "--delete") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 4); - assert(optind == 6); - assert(optarg == argv[5]); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "--delete") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 5); - assert(optind == 7); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "--delete") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 6); - assert(optind == 8); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "--delete") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--create") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 6); - assert(optind == 7); - assert(optarg == 0); - assert(optopt == 0); -} - -void -test13() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "--verbose"; - argc++; *p++ = "--create=c"; - argc++; *p++ = "files..."; - argc++; *p++ = "--delete"; - argc++; *p++ = "d"; - argc++; *p++ = "--"; /* option terminator */ - argc++; *p++ = "--change"; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(strcmp(*p++, "--delete") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 0); - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(strcmp(*p++, "--delete") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 5); - assert(optind == 3); - assert(optarg == argv[2]+9); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(strcmp(*p++, "--delete") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == 0); - assert(option_index == 4); - assert(optind == 6); - assert(optarg == argv[5]); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "--verbose") == 0); - assert(strcmp(*p++, "--create=c") == 0); - assert(strcmp(*p++, "--delete") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "--") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(strcmp(*p++, "--change") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 4); - assert(optind == 6); - assert(optarg == 0); - assert(optopt == 0); - -} - -void -test14() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "-o5"; - argc++; *p++ = "files..."; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "o[567]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-o5") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(*p == 0); - - assert(c == 'o'); - assert(option_index == 4); /* no changed */ - assert(optind == 2); - assert(optarg == argv[1]+2); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-o5") == 0); - assert(strcmp(*p++, "files...") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 4); /* no changed */ - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - -} - -void -test15() -{ - optind = 0; - argc = 0; - p = argv; - - argc++; *p++ = "command_name"; - argc++; *p++ = "-a"; - argc++; *p++ = "-ccd"; - argc++; *p++ = "-ce"; - argc++; *p++ = "-d"; - argc++; *p++ = "d"; - argc++; *p++ = "-cdd"; - argc++; *p++ = "-d"; - *p = 0; - - /*************************/ - c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-ccd") == 0); - assert(strcmp(*p++, "-ce") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-cdd") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'a'); - assert(option_index == 4); /* no changed */ - assert(optind == 2); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-ccd") == 0); - assert(strcmp(*p++, "-ce") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-cdd") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'c'); - assert(option_index == 4); /* no changed */ - assert(optind == 2); - assert(optarg == argv[2]+2); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-ccd") == 0); - assert(strcmp(*p++, "-ce") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-cdd") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'd'); - assert(option_index == 4); /* no changed */ - assert(optind == 3); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-ccd") == 0); - assert(strcmp(*p++, "-ce") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-cdd") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'c'); - assert(option_index == 4); /* no changed */ - assert(optind == 4); - assert(optarg == argv[3]+2); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-ccd") == 0); - assert(strcmp(*p++, "-ce") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-cdd") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'd'); - assert(option_index == 4); /* no changed */ - assert(optind == 5); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-ccd") == 0); - assert(strcmp(*p++, "-ce") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-cdd") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'c'); - assert(option_index == 4); /* no changed */ - assert(optind == 6); - assert(optarg == argv[6]+2); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-ccd") == 0); - assert(strcmp(*p++, "-ce") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-cdd") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'd'); - assert(option_index == 4); /* no changed */ - assert(optind == 7); - assert(optarg == 0); - assert(optopt == 0); - - /*************************/ - c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-ccd") == 0); - assert(strcmp(*p++, "-ce") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "-cdd") == 0); - assert(strcmp(*p++, "d") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(*p == 0); - - assert(c == 'd'); - assert(option_index == 4); /* no changed */ - assert(optind == 8); - assert(optarg == 0); - assert(optopt == 0); - - - /*************************/ - c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); - - p = argv; - assert(strcmp(*p++, "command_name") == 0); - assert(strcmp(*p++, "-a") == 0); - assert(strcmp(*p++, "-ccd") == 0); - assert(strcmp(*p++, "-ce") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "-cdd") == 0); - assert(strcmp(*p++, "-d") == 0); - assert(strcmp(*p++, "d") == 0); - assert(*p == 0); - - assert(c == -1); - assert(option_index == 4); /* no changed */ - assert(optind == 7); - assert(optarg == 0); - assert(optopt == 0); - - -} - -int -main() -{ - opterr = 0; - optopt = 0; - - test1(); - test2(); - test3(); - test4(); - test5(); - test6(); - test7(); - test8(); - test9(); - test10(); - test11(); - test12(); - test13(); -#ifndef USE_GNU - test14(); - test15(); -#endif - - return 0; -} -#endif diff --git a/getopt_long.cpp b/getopt_long.cpp new file mode 100644 index 0000000..c113881 --- /dev/null +++ b/getopt_long.cpp @@ -0,0 +1,2095 @@ +/* + This getopt_long() is compatible with GNU's, however, added original + extention (short 1 byte option). + + + Copyright (c) 2004 Koji Arai + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + + Compilation for Test: + + GNU: + cc -DUSE_GNU -DDEBUG getopt_long.c -o test_getopt_long_gnu + + not GNU: + cc -I. -DDEBUG getopt_long.c -o test_getopt_long + + ./test_getopt_long + ./test_getopt_long_gnu + + BUGS: + * not implemented any features for getopt() and getopt_long(). +*/ + +#include +#include +#include + +#if DEBUG +static int +puts_argv(char **argv) +{ + int i; + + for (i = 0; argv[i]; i++) { + if (i) printf(" "); + + printf("%s", argv[i]); + } + printf("\n"); + + return 0; +} +#endif + +#ifndef USE_GNU +#include +#include "getopt_long.h" + +char *optarg; +int optind; + +int opterr; +int optopt; + +/* + return value 0: no option (include '-') + 1: short option like '-x' + 2: long option like '--xxx' and just '--' +*/ +static int +is_option(char *arg) +{ + if (arg[0] == '-') { + switch (arg[1]) { + case 0: /* just "-" */ + return 0; + case '-': /* long option (include just "--")*/ + return 2; + default: /* short option */ + return 1; + } + } + return 0; +} + +static int +insert_argv(char **argv, int src, int dest) +{ + int i; + char *tmp = argv[src]; + + if (src > dest) { + for (i = src; i > dest; i--) + argv[i] = argv[i-1]; + } + if (src < dest) { + for (i = src; i < dest; i++) + argv[i] = argv[i+1]; + } + + argv[dest] = tmp; + + return 0; +} + +static int +search_longopt(char *arg, struct option *longopts) +{ + int i, found = -1; + int len; + for (len = 0; arg[len] && arg[len] != '='; len++) + ; + + for (i = 0; longopts[i].name; i++) { + if (strncmp(arg, longopts[i].name, len) == 0) { + if (found != -1) + return -1; /* found some candidate */ + found = i; + } + } + return found; +} + +/* + * implemented my extention feature. + * optional 1 byte argument with [...] + * e.g.) shortopts = "a[0123]b" + * accepts "-a0 -a1b" (same as "-a0 -a1 -b") + */ +static int +has_argument_short(char *arg, const char *shortopts) +{ + int i; + int open_bracket = 0; + for (i = 0; shortopts[i]; i++) { + switch (shortopts[i]) { + case '[': + open_bracket++; + continue; + case ']': + if (open_bracket <= 0) { + fprintf(stderr, "getopt_long() -- unbalanced bracket in short options"); + return -1; + } + open_bracket--; + continue; + } + if (open_bracket) continue; + if (*arg != shortopts[i]) continue; + + switch (shortopts[i+1]) { + case ':': + if (shortopts[i+2] != ':') { + if (arg[1]) + return 1; /* following string is argument */ + else + return 2; /* next argv is argument */ + } + else { + /* '::' means optional argument (GNU extention) */ + if (arg[1]) + return 1; + else + return 0; /* no argument */ + } + case '[': + if (arg[1] == '\0') + return 0; /* no argument */ + /* my extention */ + for (i++; shortopts[i] && shortopts[i] != ']'; i++) { + if (arg[1] == shortopts[i]) + return 3; /* has 1 byte argument */ + } + if (!shortopts[i]) { + fprintf(stderr, "getopt_long() -- unbalanced bracket in short options"); + return -1; + } + break; + default: + return 0; /* no argument */ + } + } + /* Invalid option */ + return -1; +} + +static int +has_argument_long(char *arg, struct option *longopts) +{ + int i; + + i = search_longopt(arg, longopts); + if (i == -1) { + /* Invalid option */ + return -1; + } + else { + int len = strlen(arg); + char *p = strchr(arg, '='); + if (p) { + len = p - arg; + } + + switch (longopts[i].has_arg) { + case no_argument: + return 0; + case required_argument: + if (arg[len] == '=') + return 1; + else + return 2; + case optional_argument: + if (arg[len] == '=') + return 1; + else + return 0; + default: + assert(0); + } + } +} + +/* + -1: no option + 0: no argument + 1: has argument in this argv + 2: has argument in next argv + 3: has 1 byte argument in this argv +*/ +static int +has_argument(char *arg, + const char *shortopts, + struct option *longopts) +{ + int i, n; + + switch (is_option(arg)) { + case 0: /* no option */ + return -1; + case 1: + /* short option */ + n = -1; + for (i = 1; arg[i]; i++) { + n = has_argument_short(arg+i, shortopts); + if (n == 0 && arg[i+1]) continue; + if (n == 3 && arg[i+2]) { i++; continue; } + break; + } + return n; + case 2: + /* long option */ + return has_argument_long(arg+2, longopts); + break; + default: + assert(0); + } +} + +int +getopt_long(int argc, char **argv, + const char *shortopts, + struct option *longopts, + int *indexptr) +{ + char *opt; + int i; + static int shortoptind; + static int no_optind = 0; + + if (optind == 0) { /* skip first argument (command name) */ + optind++; + no_optind = 0; + shortoptind = 0; + } + + optarg = 0; + + if (no_optind && !shortoptind) { + while (!is_option(argv[no_optind])) + insert_argv(argv, no_optind, optind-1); + + if (has_argument(argv[no_optind], shortopts, longopts) == 2) + no_optind += 2; + else + no_optind++; + + if (argv[optind] && strcmp(argv[optind], "--") == 0) { + while (!is_option(argv[no_optind])) + insert_argv(argv, no_optind, optind); + optind = no_optind; + no_optind = 0; + } + } + + if (optind >= argc) + goto end_of_option; + + retry: + /* + puts_argv(&argv[optind]); + */ + opt = argv[optind]; + if (shortoptind == 0 && is_option(opt) == 1) { + shortoptind++; + } + + if (shortoptind) { + /* short option */ + char *p = &opt[shortoptind]; + + if (*p == '\0') + assert(0); + + switch (has_argument_short(p, shortopts)) { + case 0: + /* no argument */ + optarg = 0; + + shortoptind++; + if (opt[shortoptind] == '\0') + optind++, shortoptind = 0; + return *p; + case 1: + /* following character is argument */ + optind++, shortoptind = 0; + optarg = &p[1]; + return *p; + case 2: + /* next argv is argument */ + optind++, shortoptind = 0; + optarg = argv[optind++]; + return *p; + case 3: + /* has 1 byte argument */ + optarg = &p[1]; + if (p[2] == 0) + optind++, shortoptind = 0; + else + shortoptind += 2; + return *p; + default: + /* Invalid option */ + if (opterr) + fprintf(stderr, + "%s: invalid option -- %c\n", + argv[0], + *p); + + optind++, shortoptind = 0; + optopt = *p; + return '?'; + } + } + else if (opt[0] == '-' && opt[1] == '-') { + /* long option */ + + if (opt[2] == '\0') { + /* end of command line switch */ + optind++; + return -1; + } + + opt += 2; + + i = search_longopt(opt, longopts); + if (i == -1) { + optind++; + optopt = 0; + return '?'; + } + else { + int len = strlen(opt); + char *p = strchr(opt, '='); + if (p) { + len = p - opt; + } + + switch (longopts[i].has_arg) { + case no_argument: + break; + case required_argument: + if (opt[len] == '=') + optarg = opt + len + 1; + else { + optind++; + optarg = argv[optind]; + if (optarg == 0) { + if (opterr) + fprintf(stderr, + "%s: option `--%s' requires an argument\n", + argv[0], + opt); + + optopt = 0; + return '?'; /* no argument */ + } + } + break; + case optional_argument: + if (opt[len] == '=') + optarg = opt + len + 1; + else { + optarg = 0; + } + break; + default: + break; + } + + *indexptr = i; + optind++; + if (longopts[i].flag) { + *longopts[i].flag = longopts[i].val; + return 0; + } + else { + return longopts[i].val; + } + } + + optind++; + optopt = 0; + return '?'; + } + + /* not option */ + if (no_optind == 0) + no_optind = optind; + + for (i = optind; argv[i]; i++) { + if (is_option(argv[i])) { + optind = i; + goto retry; + } + } + + end_of_option: + if (no_optind) { + optind = no_optind; + no_optind = 0; + } + + return -1; +} +#endif /* USE_GNU */ + +#if DEBUG + +#include +#include +#include + +#if USE_GNU +#include /* use GNU getopt_long() */ +#endif + +static int verbose_flag; +static int option_index; +int argc; +char *argv[50]; +char **p; +int c; +static struct option long_options[] = { + {"verbose", no_argument, &verbose_flag, 1}, + {"brief", no_argument, &verbose_flag, 0}, + {"add", required_argument, 0, 'a'}, + {"append", no_argument, 0, 0}, + {"delete", required_argument, 0, 0}, + {"create", optional_argument, 0, 0}, + {"change", optional_argument, 0, 0}, + {0, 0, 0, 0} +}; + +int +call_getopt_long(int argc, char **argv, + const char *shortopts, + struct option *longopts, + int *indexptr) +{ + int c; + c = getopt_long(argc, argv, shortopts, longopts, indexptr); + puts_argv(argv); + printf("ret=%d(%c) option_index=%d ", c, c, option_index); + printf("optind=%d optarg=[%s] opterr=%d optopt=%d(%c)\n", + optind, optarg, opterr, optopt, optopt); + if (c == 0) { + struct option *opt; + opt = &longopts[*indexptr]; + printf("long option: --%s has_arg=%d\n", opt->name, opt->has_arg); + if (opt->flag) + printf(" flag=[%8p] val=%d\n", opt->flag, *opt->flag); + } + + return c; +} + +void +test1() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-bcd"; + argc++; *p++ = "-d"; + argc++; *p++ = "-e"; + argc++; *p++ = "-f"; + argc++; *p++ = "-g"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'a'); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'b'); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'c'); + assert(option_index == 0); + assert(optind == 3); + assert(optarg == &argv[2][3]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'd'); + assert(option_index == 0); + assert(optind == 5); + assert(optarg == argv[4]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == '?'); + assert(option_index == 0); + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 'f'); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == '?'); + assert(option_index == 0); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 'g'); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == -1); + assert(option_index == 0); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ +} + +void +test2() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--verbose"; + argc++; *p++ = "--brief"; + argc++; *p++ = "--add"; + argc++; *p++ = "add_argument"; + argc++; *p++ = "--add=add_argument"; + argc++; *p++ = "--append"; + argc++; *p++ = "--delete=del_argument"; + argc++; *p++ = "--create=cre_argument"; + argc++; *p++ = "--create"; + argc++; *p++ = "files..."; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "verbose") == 0); + assert(*long_options[option_index].flag == 1); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 1); + assert(optind == 3); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "brief") == 0); + assert(*long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'a'); + assert(option_index == 2); + assert(optind == 5); + assert(optarg == argv[4]); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "add") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'a'); + assert(option_index == 2); + assert(optind == 6); + assert(optarg == argv[5]+6); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "add") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 3); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "append") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 4); + assert(optind == 8); + assert(optarg == argv[7]+9); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "delete") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 9); + assert(optarg == argv[8]+9); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "create") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 10); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "create") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 10); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(argv[optind], "files...") == 0); + +} + +void +test3() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--delete"; /* required argument has no argument */ + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == '?'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); /* changed */ + assert(optarg == 0); + assert(optopt == 0); /* changed */ + assert(argv[optind] == 0); + + /* */ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--file"; /* not option */ + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == '?'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + assert(argv[optind] == 0); +} + +void +test4() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "a1"; + argc++; *p++ = "a2"; + argc++; *p++ = "-b"; + argc++; *p++ = "b"; + argc++; *p++ = "-efg"; /* some options in a argument */ + argc++; *p++ = "g"; + argc++; *p++ = "-c"; + argc++; *p++ = "c"; + argc++; *p++ = "d"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'b'); + assert(option_index == 5); /* no changed */ + assert(optind == 5); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'e'); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'f'); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'g'); + assert(option_index == 5); /* no changed */ + assert(optind == 8); + assert(optarg == argv[7]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 10); + assert(optarg == argv[9]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + +} + +void +test5() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-"; + argc++; *p++ = "-b"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(*p == 0); + + assert(c == 'b'); + assert(option_index == 5); /* no changed */ + assert(optind == 4); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "-") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 3); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test6() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-"; + argc++; *p++ = "-"; + argc++; *p++ = "-b"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(*p == 0); + + assert(c == 'b'); + assert(option_index == 5); /* no changed */ + assert(optind == 5); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 3); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test7() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-"; + argc++; *p++ = "-"; + argc++; *p++ = "-c"; + argc++; *p++ = "c"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == argv[5]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 4); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test8() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-c"; + argc++; *p++ = "c"; + argc++; *p++ = "--"; + argc++; *p++ = "-d"; + argc++; *p++ = "d"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 4); + assert(optarg == argv[3]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 5); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 5); /* no changed */ + assert(optind == 7); + assert(optarg == argv[6]); + assert(optopt == 0); +} + +void +test9() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-"; + argc++; *p++ = "-"; + argc++; *p++ = "-c"; + argc++; *p++ = "c"; + argc++; *p++ = "--"; + argc++; *p++ = "-d"; + argc++; *p++ = "d"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == argv[5]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 5); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 5); /* no changed */ + assert(optind == 9); + assert(optarg == argv[8]); + assert(optopt == 0); +} + +void +test10() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-cc"; + argc++; *p++ = "-d"; + argc++; *p++ = "d"; + argc++; *p++ = "-c"; /* no argument */ + argc++; *p++ = "-d"; /* at last */ + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 3); + assert(optarg == argv[2]+2); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 5); /* no changed */ + assert(optind == 4); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 5); /* no changed */ + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test11() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--verbose"; + argc++; *p++ = "--create=c"; + argc++; *p++ = "--change"; + argc++; *p++ = "d"; + argc++; *p++ = "--create"; /* no argument */ + argc++; *p++ = "--change"; /* at last */ + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 3); + assert(optarg == argv[2]+9); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 6); + assert(optind == 4); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 6); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 6); + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test12() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--verbose"; + argc++; *p++ = "--create=c"; + argc++; *p++ = "files..."; + argc++; *p++ = "--delete"; /* required argument */ + argc++; *p++ = "d"; + argc++; *p++ = "--create"; /* no argument */ + argc++; *p++ = "--change"; /* at last */ + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 3); + assert(optarg == argv[2]+9); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 4); + assert(optind == 6); + assert(optarg == argv[5]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 6); + assert(optind == 8); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 6); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test13() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--verbose"; + argc++; *p++ = "--create=c"; + argc++; *p++ = "files..."; + argc++; *p++ = "--delete"; + argc++; *p++ = "d"; + argc++; *p++ = "--"; /* option terminator */ + argc++; *p++ = "--change"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 3); + assert(optarg == argv[2]+9); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 4); + assert(optind == 6); + assert(optarg == argv[5]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 4); + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); + +} + +void +test14() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-o5"; + argc++; *p++ = "files..."; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "o[567]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-o5") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(*p == 0); + + assert(c == 'o'); + assert(option_index == 4); /* no changed */ + assert(optind == 2); + assert(optarg == argv[1]+2); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-o5") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 4); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + +} + +void +test15() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-ccd"; + argc++; *p++ = "-ce"; + argc++; *p++ = "-d"; + argc++; *p++ = "d"; + argc++; *p++ = "-cdd"; + argc++; *p++ = "-d"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 4); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 4); /* no changed */ + assert(optind == 2); + assert(optarg == argv[2]+2); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 4); /* no changed */ + assert(optind == 3); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 4); /* no changed */ + assert(optind == 4); + assert(optarg == argv[3]+2); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 4); /* no changed */ + assert(optind == 5); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 4); /* no changed */ + assert(optind == 6); + assert(optarg == argv[6]+2); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 4); /* no changed */ + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 4); /* no changed */ + assert(optind == 8); + assert(optarg == 0); + assert(optopt == 0); + + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 4); /* no changed */ + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + + +} + +int +main() +{ + opterr = 0; + optopt = 0; + + test1(); + test2(); + test3(); + test4(); + test5(); + test6(); + test7(); + test8(); + test9(); + test10(); + test11(); + test12(); + test13(); +#ifndef USE_GNU + test14(); + test15(); +#endif + + return 0; +} +#endif diff --git a/mothur.cpp b/mothur.cpp index 18d8d43..9ed0dfc 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -35,8 +35,7 @@ void ctrlc_handler ( int sig ) { /***********************************************************************/ int main(int argc, char *argv[]){ MothurOut* m = MothurOut::getInstance(); - try { - + try { signal(SIGINT, ctrlc_handler ); time_t ltime = time(NULL); /* calendar time */ @@ -150,7 +149,7 @@ int main(int argc, char *argv[]){ Engine* mothur; bool bail = 0; string input; - + if(argc>1){ input = argv[1]; diff --git a/prng.c b/prng.c deleted file mode 100644 index 212f888..0000000 --- a/prng.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - A C-program for MT19937, with initialization improved 2002/1/26. - Coded by Takuji Nishimura and Makoto Matsumoto. - - Before using, initialize the state by using init_genrand(seed) - or init_by_array(init_key, key_length). - - Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. The names of its contributors may not be used to endorse or promote - products derived from this software without specific prior written - permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - - Any feedback is very welcome. - http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html - email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) -*/ - -#include - -#include "prng.h" - -/* Period parameters */ -#define N 624 -#define M 397 -#define MATRIX_A 0x9908b0dfUL /* constant vector a */ -#define UPPER_MASK 0x80000000UL /* most significant w-r bits */ -#define LOWER_MASK 0x7fffffffUL /* least significant r bits */ - -#define NJ_RAND_MAX 0x7fffffffUL - - -static unsigned long mt[N]; /* the array for the state vector */ -static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */ - -/* initializes mt[N] with a seed */ -void init_genrand(unsigned long s) -{ - mt[0]= s & 0xffffffffUL; - for (mti=1; mti> 30)) + mti); - /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ - /* In the previous versions, MSBs of the seed affect */ - /* only MSBs of the array mt[]. */ - /* 2002/01/09 modified by Makoto Matsumoto */ - mt[mti] &= 0xffffffffUL; - /* for >32 bit machines */ - } -} - -/* initialize by an array with array-length */ -/* init_key is the array for initializing keys */ -/* key_length is its length */ -/* slight change for C++, 2004/2/26 */ -void init_by_array(unsigned long init_key[], int key_length) -{ - int i, j, k; - init_genrand(19650218UL); - i=1; j=0; - k = (N>key_length ? N : key_length); - for (; k; k--) { - mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL)) - + init_key[j] + j; /* non linear */ - mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ - i++; j++; - if (i>=N) { mt[0] = mt[N-1]; i=1; } - if (j>=key_length) j=0; - } - for (k=N-1; k; k--) { - mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL)) - - i; /* non linear */ - mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ - i++; - if (i>=N) { mt[0] = mt[N-1]; i=1; } - } - - mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */ -} - -/* generates a random number on [0,0xffffffff]-interval */ -unsigned long genrand_int32(void) -{ - unsigned long y; - static unsigned long mag01[2]={0x0UL, MATRIX_A}; - /* mag01[x] = x * MATRIX_A for x=0,1 */ - - if (mti >= N) { /* generate N words at one time */ - int kk; - - if (mti == N+1) /* if init_genrand() has not been called, */ - init_genrand(5489UL); /* a default initial seed is used */ - - for (kk=0;kk> 1) ^ mag01[y & 0x1UL]; - } - for (;kk> 1) ^ mag01[y & 0x1UL]; - } - y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK); - mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL]; - - mti = 0; - } - - y = mt[mti++]; - - /* Tempering */ - y ^= (y >> 11); - y ^= (y << 7) & 0x9d2c5680UL; - y ^= (y << 15) & 0xefc60000UL; - y ^= (y >> 18); - - return y; -} - -/* generates a random number on [0,0x7fffffff]-interval */ -long int genrand_int31(void) -{ - return (long)(genrand_int32()>>1); -} - -/* These real versions are due to Isaku Wada, 2002/01/09 added */ - -/* generates a random number on [0,1]-real-interval */ -double genrand_real1(void) -{ - return genrand_int32()*(1.0/4294967295.0); - /* divided by 2^32-1 */ -} - -/* generates a random number on [0,1)-real-interval */ -double genrand_real2(void) -{ - return genrand_int32()*(1.0/4294967296.0); - /* divided by 2^32 */ -} - -/* generates a random number on (0,1)-real-interval */ -double genrand_real3(void) -{ - return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0); - /* divided by 2^32 */ -} - -/* generates a random number on [0,1) with 53-bit resolution*/ -double genrand_res53(void) -{ - unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6; - return(a*67108864.0+b)*(1.0/9007199254740992.0); -} - - - - -/* - * NJ_genrand_int31_top() - Returns an int in the range 0..top - * - * This function attempts to remove bias in selecting random - * integers in a range. - * - */ -long int -NJ_genrand_int31_top(long int top) { - - long int overflow; - long int r; - long int retval; - - if(top <= 0) { - return(0); - } else { - overflow = (NJ_RAND_MAX / top) * top; - } - - while(1) { - r = genrand_int31(); - if(r < overflow) { - break; - } - } - - retval = r % top; - - return(retval); -} - - - - - - - - diff --git a/prng.cpp b/prng.cpp new file mode 100644 index 0000000..212f888 --- /dev/null +++ b/prng.cpp @@ -0,0 +1,221 @@ +/* + A C-program for MT19937, with initialization improved 2002/1/26. + Coded by Takuji Nishimura and Makoto Matsumoto. + + Before using, initialize the state by using init_genrand(seed) + or init_by_array(init_key, key_length). + + Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + Any feedback is very welcome. + http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html + email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) +*/ + +#include + +#include "prng.h" + +/* Period parameters */ +#define N 624 +#define M 397 +#define MATRIX_A 0x9908b0dfUL /* constant vector a */ +#define UPPER_MASK 0x80000000UL /* most significant w-r bits */ +#define LOWER_MASK 0x7fffffffUL /* least significant r bits */ + +#define NJ_RAND_MAX 0x7fffffffUL + + +static unsigned long mt[N]; /* the array for the state vector */ +static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */ + +/* initializes mt[N] with a seed */ +void init_genrand(unsigned long s) +{ + mt[0]= s & 0xffffffffUL; + for (mti=1; mti> 30)) + mti); + /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ + /* In the previous versions, MSBs of the seed affect */ + /* only MSBs of the array mt[]. */ + /* 2002/01/09 modified by Makoto Matsumoto */ + mt[mti] &= 0xffffffffUL; + /* for >32 bit machines */ + } +} + +/* initialize by an array with array-length */ +/* init_key is the array for initializing keys */ +/* key_length is its length */ +/* slight change for C++, 2004/2/26 */ +void init_by_array(unsigned long init_key[], int key_length) +{ + int i, j, k; + init_genrand(19650218UL); + i=1; j=0; + k = (N>key_length ? N : key_length); + for (; k; k--) { + mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL)) + + init_key[j] + j; /* non linear */ + mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ + i++; j++; + if (i>=N) { mt[0] = mt[N-1]; i=1; } + if (j>=key_length) j=0; + } + for (k=N-1; k; k--) { + mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL)) + - i; /* non linear */ + mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ + i++; + if (i>=N) { mt[0] = mt[N-1]; i=1; } + } + + mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */ +} + +/* generates a random number on [0,0xffffffff]-interval */ +unsigned long genrand_int32(void) +{ + unsigned long y; + static unsigned long mag01[2]={0x0UL, MATRIX_A}; + /* mag01[x] = x * MATRIX_A for x=0,1 */ + + if (mti >= N) { /* generate N words at one time */ + int kk; + + if (mti == N+1) /* if init_genrand() has not been called, */ + init_genrand(5489UL); /* a default initial seed is used */ + + for (kk=0;kk> 1) ^ mag01[y & 0x1UL]; + } + for (;kk> 1) ^ mag01[y & 0x1UL]; + } + y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK); + mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL]; + + mti = 0; + } + + y = mt[mti++]; + + /* Tempering */ + y ^= (y >> 11); + y ^= (y << 7) & 0x9d2c5680UL; + y ^= (y << 15) & 0xefc60000UL; + y ^= (y >> 18); + + return y; +} + +/* generates a random number on [0,0x7fffffff]-interval */ +long int genrand_int31(void) +{ + return (long)(genrand_int32()>>1); +} + +/* These real versions are due to Isaku Wada, 2002/01/09 added */ + +/* generates a random number on [0,1]-real-interval */ +double genrand_real1(void) +{ + return genrand_int32()*(1.0/4294967295.0); + /* divided by 2^32-1 */ +} + +/* generates a random number on [0,1)-real-interval */ +double genrand_real2(void) +{ + return genrand_int32()*(1.0/4294967296.0); + /* divided by 2^32 */ +} + +/* generates a random number on (0,1)-real-interval */ +double genrand_real3(void) +{ + return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0); + /* divided by 2^32 */ +} + +/* generates a random number on [0,1) with 53-bit resolution*/ +double genrand_res53(void) +{ + unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6; + return(a*67108864.0+b)*(1.0/9007199254740992.0); +} + + + + +/* + * NJ_genrand_int31_top() - Returns an int in the range 0..top + * + * This function attempts to remove bias in selecting random + * integers in a range. + * + */ +long int +NJ_genrand_int31_top(long int top) { + + long int overflow; + long int r; + long int retval; + + if(top <= 0) { + return(0); + } else { + overflow = (NJ_RAND_MAX / top) * top; + } + + while(1) { + r = genrand_int31(); + if(r < overflow) { + break; + } + } + + retval = r % top; + + return(retval); +} + + + + + + + + diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index d98a4cf..1b3cc93 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -421,7 +421,7 @@ int ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ outputNames.push_back(goodNameFile); ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut); - + while(!inputNames.eof()){ if (m->control_pressed) { goodNameOut.close(); inputNames.close(); remove(goodNameFile.c_str()); return 0; } @@ -449,7 +449,7 @@ int ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ } inputNames.close(); goodNameOut.close(); - + //we were unable to remove some of the bad sequences if (badSeqNames.size() != 0) { for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { diff --git a/sharedace.h b/sharedace.h index d8f69e1..94fa006 100644 --- a/sharedace.h +++ b/sharedace.h @@ -19,7 +19,7 @@ It is a child of the calculator class. */ class SharedAce : public Calculator { public: - SharedAce(int n=10) : abund(n), Calculator("sharedace", 3, false) {}; + SharedAce(int n=10) : abund(n), Calculator("sharedace", 1, false) {}; EstOutput getValues(SAbundVector*) {return data;}; EstOutput getValues(vector); private: diff --git a/sharedchao1.h b/sharedchao1.h index 243039e..e802cc3 100644 --- a/sharedchao1.h +++ b/sharedchao1.h @@ -21,7 +21,7 @@ It is a child of the calculator class. */ class SharedChao1 : public Calculator { public: - SharedChao1() : Calculator("sharedchao", 3, true) {}; + SharedChao1() : Calculator("sharedchao", 1, true) {}; EstOutput getValues(SAbundVector*) {return data;}; EstOutput getValues(vector); private: diff --git a/sharedjabund.h b/sharedjabund.h index a5fab2f..1bd4198 100644 --- a/sharedjabund.h +++ b/sharedjabund.h @@ -20,7 +20,7 @@ It is a child of the calculator class. */ class JAbund : public Calculator { public: - JAbund() : Calculator("jabund", 3, false) {}; + JAbund() : Calculator("jabund", 1, false) {}; EstOutput getValues(SAbundVector*) {return data;}; EstOutput getValues(vector); private: diff --git a/sharedjclass.h b/sharedjclass.h index d18dfee..59aea19 100644 --- a/sharedjclass.h +++ b/sharedjclass.h @@ -19,7 +19,7 @@ It is a child of the calculator class. */ class Jclass : public Calculator { public: - Jclass() : Calculator("jclass", 3, false) {}; + Jclass() : Calculator("jclass", 1, false) {}; EstOutput getValues(SAbundVector*) {return data;}; EstOutput getValues(vector); private: diff --git a/sharedjest.h b/sharedjest.h index 8eaf70f..c4111bc 100644 --- a/sharedjest.h +++ b/sharedjest.h @@ -20,7 +20,7 @@ It is a child of the calculator class. */ class Jest : public Calculator { public: - Jest() : Calculator("jest", 3, false) {}; + Jest() : Calculator("jest", 1, false) {}; EstOutput getValues(SAbundVector*) {return data;}; EstOutput getValues(vector); private: diff --git a/sharedsorabund.h b/sharedsorabund.h index 0dbf4a4..7ec112e 100644 --- a/sharedsorabund.h +++ b/sharedsorabund.h @@ -20,7 +20,7 @@ It is a child of the calculator class. */ class SorAbund : public Calculator { public: - SorAbund() : Calculator("sorabund", 3, false) {}; + SorAbund() : Calculator("sorabund", 1, false) {}; EstOutput getValues(SAbundVector*) {return data;}; EstOutput getValues(vector); private: diff --git a/sharedsorclass.h b/sharedsorclass.h index 75df657..dc6f14c 100644 --- a/sharedsorclass.h +++ b/sharedsorclass.h @@ -20,7 +20,7 @@ It is a child of the calculator class. */ class SorClass : public Calculator { public: - SorClass() : Calculator("sorclass", 3, false) {}; + SorClass() : Calculator("sorclass", 1, false) {}; EstOutput getValues(SAbundVector*) {return data;}; EstOutput getValues(vector); private: diff --git a/sharedsorest.h b/sharedsorest.h index 8e044aa..94bcc32 100644 --- a/sharedsorest.h +++ b/sharedsorest.h @@ -20,7 +20,7 @@ It is a child of the calculator class. */ class SorEst : public Calculator { public: - SorEst() : Calculator("sorest", 3, false) {}; + SorEst() : Calculator("sorest", 1, false) {}; EstOutput getValues(SAbundVector*) {return data;}; EstOutput getValues(vector); private: diff --git a/sharedthetan.h b/sharedthetan.h index 4637b97..b3693d6 100644 --- a/sharedthetan.h +++ b/sharedthetan.h @@ -20,7 +20,7 @@ It is a child of the calculator class. */ class ThetaN : public Calculator { public: - ThetaN() : Calculator("thetan", 3, false) {}; + ThetaN() : Calculator("thetan", 1, false) {}; EstOutput getValues(SAbundVector*) {return data;}; EstOutput getValues(vector); private: diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp index cda2ef0..1046c1b 100644 --- a/summarysharedcommand.cpp +++ b/summarysharedcommand.cpp @@ -234,6 +234,7 @@ int SummarySharedCommand::execute(){ outputFileHandle << "label" <<'\t' << "comparison" << '\t'; for(int i=0;igetName(); + if (sumCalculators[i]->getCols() == 3) { outputFileHandle << "\tlci\thci"; } } outputFileHandle << endl; diff --git a/whittaker.h b/whittaker.h index 540dc0b..7ffd0cc 100644 --- a/whittaker.h +++ b/whittaker.h @@ -18,7 +18,7 @@ class Whittaker : public Calculator { public: - Whittaker() : Calculator("whittaker", 3, false) {}; + Whittaker() : Calculator("whittaker", 1, false) {}; EstOutput getValues(SAbundVector*) {return data;}; EstOutput getValues(vector);