From: westcott Date: Fri, 20 Aug 2010 14:58:44 +0000 (+0000) Subject: modified precluster to use less memory. X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=51d85e945d26324e50d6406829ce006ccc9eb1c0 modified precluster to use less memory. --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index b081984..e319668 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -456,6 +456,22 @@ A7DA217A113FECD400BF472F /* weighted.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = weighted.h; sourceTree = ""; }; A7DA217B113FECD400BF472F /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = ""; }; A7DA217C113FECD400BF472F /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = ""; }; + A7DF0AD3121EBB14004A03EA /* clearcut.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = clearcut.c; sourceTree = ""; }; + A7DF0AD4121EBB14004A03EA /* clearcut.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clearcut.h; sourceTree = ""; }; + A7DF0AD5121EBB14004A03EA /* cmdargs.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = cmdargs.c; sourceTree = ""; }; + A7DF0AD6121EBB14004A03EA /* cmdargs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cmdargs.h; sourceTree = ""; }; + A7DF0AD7121EBB14004A03EA /* common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = common.h; sourceTree = ""; }; + A7DF0AD8121EBB14004A03EA /* dayhoff.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dayhoff.h; sourceTree = ""; }; + A7DF0AD9121EBB14004A03EA /* distclearcut.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = distclearcut.c; sourceTree = ""; }; + A7DF0ADA121EBB14004A03EA /* distclearcut.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = distclearcut.h; sourceTree = ""; }; + A7DF0ADB121EBB14004A03EA /* dmat.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = dmat.c; sourceTree = ""; }; + A7DF0ADC121EBB14004A03EA /* dmat.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dmat.h; sourceTree = ""; }; + A7DF0ADD121EBB14004A03EA /* fasta.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fasta.c; sourceTree = ""; }; + A7DF0ADE121EBB14004A03EA /* fasta.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fasta.h; sourceTree = ""; }; + A7DF0ADF121EBB14004A03EA /* getopt_long.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = getopt_long.c; sourceTree = ""; }; + A7DF0AE0121EBB14004A03EA /* getopt_long.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getopt_long.h; sourceTree = ""; }; + A7DF0AE1121EBB14004A03EA /* prng.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = prng.c; sourceTree = ""; }; + A7DF0AE2121EBB14004A03EA /* prng.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = prng.h; sourceTree = ""; }; A7E8338B115BBDAA00739EC4 /* parsesffcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parsesffcommand.cpp; sourceTree = ""; }; A7E8338C115BBDAA00739EC4 /* parsesffcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsesffcommand.h; sourceTree = ""; }; /* End PBXFileReference section */ @@ -475,6 +491,7 @@ A7CB593B11402EF90010EB83 /* calculators */, A7CB594A11402FB40010EB83 /* chimeras */, A7CB594511402F6E0010EB83 /* classifers */, + A7DF0AE5121EBC26004A03EA /* clearcutsource */, A7CB593E11402F110010EB83 /* commands */, A7CB594211402F430010EB83 /* containers */, A7DA2023113FECD400BF472F /* collect.cpp */, @@ -967,6 +984,29 @@ name = read; sourceTree = ""; }; + A7DF0AE5121EBC26004A03EA /* clearcutsource */ = { + isa = PBXGroup; + children = ( + A7DF0AD4121EBB14004A03EA /* clearcut.h */, + A7DF0AD3121EBB14004A03EA /* clearcut.c */, + A7DF0AD6121EBB14004A03EA /* cmdargs.h */, + A7DF0AD7121EBB14004A03EA /* common.h */, + A7DF0AD8121EBB14004A03EA /* dayhoff.h */, + A7DF0AD5121EBB14004A03EA /* cmdargs.c */, + A7DF0AD9121EBB14004A03EA /* distclearcut.c */, + A7DF0ADA121EBB14004A03EA /* distclearcut.h */, + A7DF0ADB121EBB14004A03EA /* dmat.c */, + A7DF0ADC121EBB14004A03EA /* dmat.h */, + A7DF0ADD121EBB14004A03EA /* fasta.c */, + A7DF0ADE121EBB14004A03EA /* fasta.h */, + A7DF0ADF121EBB14004A03EA /* getopt_long.c */, + A7DF0AE0121EBB14004A03EA /* getopt_long.h */, + A7DF0AE2121EBB14004A03EA /* prng.h */, + A7DF0AE1121EBB14004A03EA /* prng.c */, + ); + name = clearcutsource; + sourceTree = ""; + }; /* End PBXGroup section */ /* Begin PBXLegacyTarget section */ diff --git a/clearcut.c b/clearcut.c new file mode 100644 index 0000000..3a13ab6 --- /dev/null +++ b/clearcut.c @@ -0,0 +1,2158 @@ +/* + * clearcut.c + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * An implementation of the Relaxed Neighbor-Joining algorithm + * of Evans, J., Sheneman, L., and Foster, J. + * + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dist.h" +#include "dmat.h" +#include "fasta.h" +#include "cmdargs.h" +#include "common.h" +#include "clearcut.h" +#include "prng.h" + + + +/* + * main() - + * + * The entry point to the program. + * + */ +int +main(int argc, + char *argv[]) { + + DMAT *dmat; /* The working distance matrix */ + DMAT *dmat_backup = NULL;/* A backup distance matrix */ + NJ_TREE *tree; /* The phylogenetic tree */ + NJ_ARGS *nj_args; /* Structure for holding command-line arguments */ + long int i; + + /* some variables for tracking time */ + struct timeval tv; + unsigned long long startUs, endUs; + + + /* check and parse supplied command-line arguments */ + nj_args = NJ_handle_args(argc, argv); + if(!nj_args) { + fprintf(stderr, "Clearcut: Error processing command-line arguments.\n"); + exit(-1); + } + + /* for verbose reporting, print the random number seed to stdout */ + if(nj_args->verbose_flag) { + printf("PRNG SEED: %d\n", nj_args->seed); + } + + /* Initialize Mersenne Twister PRNG */ + init_genrand(nj_args->seed); + + + switch(nj_args->input_mode) { + + /* If the input type is a distance matrix */ + case NJ_INPUT_MODE_DISTANCE: + + /* parse the distance matrix */ + dmat = NJ_parse_distance_matrix(nj_args); + if(!dmat) { + exit(-1); + } + + break; + + /* If the input type is a multiple sequence alignment */ + case NJ_INPUT_MODE_ALIGNED_SEQUENCES: + + /* build a distance matrix from a multiple sequence alignment */ + dmat = NJ_build_distance_matrix(nj_args); + if(!dmat) { + fprintf(stderr, "Clearcut: Failed to build distance matrix from alignment.\n"); + exit(-1); + } + + break; + + default: + + fprintf(stderr, "Clearcut: Could not determine how to process input\n"); + exit(-1); + } + + /* + * Output the computed distance matrix, + * if the user specified one. + */ + if(nj_args->matrixout) { + NJ_output_matrix(nj_args, dmat); + } + + /* + * If we are going to generate multiple trees from + * the same distance matrix, we need to make a backup + * of the original distance matrix. + */ + if(nj_args->ntrees > 1) { + dmat_backup = NJ_dup_dmat(dmat); + } + + /* process n trees */ + for(i=0;intrees;i++) { + + /* + * If the user has specified matrix shuffling, we need + * to randomize the distance matrix + */ + if(nj_args->shuffle) { + NJ_shuffle_distance_matrix(dmat); + } + + /* RECORD THE PRECISE TIME OF THE START OF THE NEIGHBOR-JOINING */ + gettimeofday(&tv, NULL); + startUs = ((unsigned long long) tv.tv_sec * 1000000ULL) + + ((unsigned long long) tv.tv_usec); + + + /* + * Invoke either the Relaxed Neighbor-Joining algorithm (default) + * or the "traditional" Neighbor-Joining algorithm + */ + if(nj_args->neighbor) { + tree = NJ_neighbor_joining(nj_args, dmat); + } else { + tree = NJ_relaxed_nj(nj_args, dmat); + } + + if(!tree) { + fprintf(stderr, "Clearcut: Failed to construct tree.\n"); + exit(0); + } + + /* RECORD THE PRECISE TIME OF THE END OF THE NEIGHBOR-JOINING */ + gettimeofday(&tv, NULL); + endUs = ((unsigned long long) tv.tv_sec * 1000000ULL) + + ((unsigned long long) tv.tv_usec); + + /* print the time taken to perform the neighbor join */ + if(nj_args->verbose_flag) { + if(nj_args->neighbor) { + fprintf(stderr, "NJ tree built in %llu.%06llu secs\n", + (endUs - startUs) / 1000000ULL, + (endUs - startUs) % 1000000ULL); + } else { + fprintf(stderr, "RNJ tree built in %llu.%06llu secs\n", + (endUs - startUs) / 1000000ULL, + (endUs - startUs) % 1000000ULL); + } + } + + /* Output the neighbor joining tree here */ + NJ_output_tree(nj_args, tree, dmat, i); + + NJ_free_tree(tree); /* Free the tree */ + NJ_free_dmat(dmat); /* Free the working distance matrix */ + + /* + * If we need to do another iteration, lets re-initialize + * our working distance matrix. + */ + if(nj_args->ntrees > 1 && i<(nj_args->ntrees-1) ) { + dmat = NJ_dup_dmat(dmat_backup); + } + } + + /* Free the backup distance matrix */ + if(nj_args->ntrees > 1) { + NJ_free_dmat(dmat_backup); + } + + /* If verbosity, describe where the tree output is */ + if(nj_args->verbose_flag) { + if(nj_args->neighbor) { + printf("NJ tree(s) in %s\n", nj_args->outfilename); + } else { + printf("Relaxed NJ tree(s) in %s\n", nj_args->outfilename); + } + } + + exit(0); +} + + + + + +/* + * NJ_find_hmin() - Find minimum transformed values along horizontal + * + * + * INPUTS: + * ------- + * dmat -- The distance matrix + * a -- The index of the specific taxon in the distance matrix + * + * RETURNS: + * -------- + * -- The value of the selected minimum + * min -- Used to transport the index of the minima out + * of the function (by reference) + * hmincount -- Return the number of minima along the horizontal + * (by reference) + * + * + * DESCRIPTION: + * ------------ + * + * A fast, inline function to find the smallest transformed value + * along the "horizontal" portion of an entry in a distance matrix. + * + * Distance matrices are stored internally as continguously-allocated + * upper-diagonal structures. With the exception of the taxa at + * row 0 of this upper-diagonal matrix, all taxa have both a horizontal + * and vertical component in the distance matrix. This function + * scans the horizonal portion of the entry in the distance matrix + * for the specified taxon and finds the minimum transformed value + * along that horizontal component. + * + * Since multiple minima can exist along the horizontal portion + * of the entry, I consider all minima and break ties + * stochastically to help avoid systematic bias. + * + * Just searching along the horizontal portion of a row is very fast + * since the data is stored linearly and contiguously in memory and + * cache locality is exploited in the distance matrix representation. + * + * Look at nj.h for more information on how the distance matrix + * is architected. + * + */ +static inline +float +NJ_find_hmin(DMAT *dmat, + long int a, + long int *min, + long int *hmincount) { + + long int i; /* index variable for looping */ + int size; /* current size of distance matrix */ + int mindex = 0; /* holds the current index to the chosen minimum */ + float curval; /* used to hold current transformed values */ + float hmin; /* the value of the transformed minimum */ + + float *ptr, *r2, *val; /* pointers used to reduce dereferencing in inner loop */ + + /* values used for stochastic selection among multiple minima */ + float p, x; + long int smallcnt; + + /* initialize the min to something large */ + hmin = (float)HUGE_VAL; + + /* setup some pointers to limit dereferencing later */ + r2 = dmat->r2; + val = dmat->val; + size = dmat->size; + + /* initialize values associated with minima tie breaking */ + p = 1.0; + smallcnt = 0; + + + ptr = &(val[NJ_MAP(a, a+1, size)]); /* at the start of the horiz. part */ + for(i=a+1;i -- The value of the selected minimum + * min -- Used to transport the index of the minima out + * of the function (by reference) + * vmincount -- The number of minima along the vertical + * return by reference. + * + * DESCRIPTION: + * ------------ + * + * A fast, inline function to find the smallest transformed value + * along the "vertical" portion of an entry in a distance matrix. + * + * Distance matrices are stored internally as continguously-allocated + * upper-diagonal matrices. With the exception of the taxa at + * row 0 of this upper-diagonal matrix, all taxa have both a horizontal + * and vertical component in the distance matrix. This function + * scans the vertical portion of the entry in the distance matrix + * for the specified taxon and finds the minimum transformed value + * along that vertical component. + * + * Since multiple minima can exist along the vertical portion + * of the entry, I consider all minima and break ties + * stochastically to help avoid systematic bias. + * + * Due to cache locality reasons, searching along the vertical + * component is going to be considerably slower than searching + * along the horizontal. + * + * Look at nj.h for more information on how the distance matrix + * is architected. + * + */ +static inline +float +NJ_find_vmin(DMAT *dmat, + long int a, + long int *min, + long int *vmincount) { + + long int i; /* index variable used for looping */ + long int size; /* track the size of the matrix */ + long int mindex = 0;/* track the index to the minimum */ + float curval; /* track value of current transformed distance */ + float vmin; /* the index to the smallest "vertical" minimum */ + + /* pointers which are used to reduce pointer dereferencing in inner loop */ + float *ptr, *r2, *val; + + /* values used in stochastically breaking ties */ + float p, x; + long int smallcnt; + + /* initialize the vertical min to something really big */ + vmin = (float)HUGE_VAL; + + /* save off some values to limit dereferencing later */ + r2 = dmat->r2; + val = dmat->val; + size = dmat->size; + + p = 1.0; + smallcnt = 0; + + /* start on the first row and work down */ + ptr = &(val[NJ_MAP(0, a, size)]); + for(i=0;isize; + val = dmat->val; + r = dmat->r+a+1; + + /* + * Loop through the rows and decrement the stored r values + * by the distances stored in the rows and columns of the distance + * matrix which are being removed post-join. + * + * We do the rows altogether in order to benefit from cache locality. + */ + ptrx = &(val[NJ_MAP(a, a+1, size)]); + ptry = &(val[NJ_MAP(b, b+1, size)]); + + for(i=a+1;ib) { + *r -= *(ptry++); + } + + r++; + } + + /* Similar to the above loop, we now do the columns */ + ptrx = &(val[NJ_MAP(0, a, size)]); + ptry = &(val[NJ_MAP(0, b, size)]); + r = dmat->r; + for(i=0;isize-1) { + /* if we can't do a row here, lets do a column */ + if(a==0) { + if(b==1) { + target = 2; + } else { + target = 1; + } + } else { + target = 0; + } + } else { + target = b+1; + } + + + /* distance between a and the root of clade (a,b) */ + a2clade = + ( (dmat->val[NJ_MAP(a, b, dmat->size)]) + + (dmat->r2[a] - dmat->r2[b]) ) / 2.0; + + /* distance between b and the root of clade (a,b) */ + b2clade = + ( (dmat->val[NJ_MAP(a, b, dmat->size)]) + + (dmat->r2[b] - dmat->r2[a]) ) / 2.0; + + /* distance between the clade (a,b) and the target taxon */ + if(bval[NJ_MAP(a, target, dmat->size)] - a2clade) + + (dmat->val[NJ_MAP(b, target, dmat->size)] - b2clade) ) / 2.0; + + /* + * Check to see that distance from clade root to target + distance from + * b to clade root are equal to the distance from b to the target + */ + if(NJ_FLT_EQ(dmat->val[NJ_MAP(b, target, dmat->size)], + (clade_dist + b2clade))) { + return(1); /* join is legitimate */ + } else { + return(0); /* join is illigitimate */ + } + + } else { + + /* compute the distance from the clade root to the target */ + clade_dist = + ( (dmat->val[NJ_MAP(target, a, dmat->size)] - a2clade) + + (dmat->val[NJ_MAP(target, b, dmat->size)] - b2clade) ) / 2.0; + + /* + * Check to see that distance from clade root to target + distance from + * b to clade root are equal to the distance from b to the target + */ + if(NJ_FLT_EQ(dmat->val[NJ_MAP(target, b, dmat->size)], + (clade_dist + b2clade))) { + return(1); /* join is legitimate */ + } else { + return(0); /* join is illegitimate */ + } + } +} + + + + + + + +/* + * NJ_check() - Check to see if two taxa can be joined + + * + * INPUTS: + * ------- + * nj_args -- Pointer to the data structure holding command-line args + * dmat -- distance matrix + * a -- index into dmat for one of the rows to be joined + * b -- index into dmat for another row to be joined + * min -- the minimum value found + * additivity -- a flag (0 = not additive mode, 1 = additive mode) + * + * OUTPUTS: + * -------- + * int 1 if join is okay + * 0 if join is not okay + * + * DESCRIPTION: + * ------------ + * + * This function ultimately takes two rows and makes sure that the + * intersection of those two rows, which has a transformed distance of + * "min", is actually the smallest (or equal to the smallest) + * transformed distance for both rows (a, b). If so, it returns + * 1, else it returns 0. + * + * Basically, we want to join two rows only if the minimum + * transformed distance on either row is at the intersection of + * those two rows. + * + */ +static inline +int +NJ_check(NJ_ARGS *nj_args, + DMAT *dmat, + long int a, + long int b, + float min, + int additivity) { + + + long int i, size; + float *ptr, *val, *r2; + + + /* some aliases for speed and readability reasons */ + val = dmat->val; + r2 = dmat->r2; + size = dmat->size; + + + /* now determine if joining a, b will result in broken distances */ + if(additivity) { + if(!NJ_check_additivity(dmat, a, b)) { + return(0); + } + } + + /* scan the horizontal of row b, punt if anything < min */ + ptr = &(val[NJ_MAP(b, b+1, size)]); + for(i=b+1;inorandom) { /* if we are doing random joins, we checked this */ + ptr = val + a; + for(i=0;i reduce pointer dereferencing */ + float a2clade; /* distance from a to the new node that joins a and b */ + float b2clade; /* distance from b to the new node that joins a and b */ + float cval; /* stores distance information during loop */ + float *vptr; /* pointer to elements in first row of dist matrix */ + float *ptra; /* pointer to elements in row a of distance matrix */ + float *ptrb; /* pointer to elements in row b of distance matrix */ + + float *val, *r, *r2; /* simply used to limit pointer dereferencing */ + + + /* We must assume that a < b */ + if(a >= b) { + fprintf(stderr, "Clearcut: (aval; + r = dmat->r; + r2 = dmat->r2; + size = dmat->size; + + /* compute the distance from the clade components (a, b) to the new node */ + a2clade = + ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[a] - dmat->r2[b]) ) / 2.0; + b2clade = + ( (val[NJ_MAP(a, b, size)]) + (dmat->r2[b] - dmat->r2[a]) ) / 2.0; + + + r[a] = 0.0; /* we are removing row a, so clear dist. in r */ + + /* + * Fill the horizontal part of the "a" row and finish computing r and r2 + * we handle the horizontal component first to maximize cache locality + */ + ptra = &(val[NJ_MAP(a, a+1, size)]); /* start ptra at the horiz. of a */ + ptrb = &(val[NJ_MAP(a+1, b, size)]); /* start ptrb at comparable place */ + for(i=a+1;ir = r+1; + + + /* + * Collapse r2 here by copying contents of r2[0] into r2[b] and + * incrementing pointer to the beginning of r2 by one row + */ + r2[b] = r2[0]; + dmat->r2 = r2+1; + + /* increment dmat pointer to next row */ + dmat->val += size; + + /* decrement the total size of the distance matrix by one row */ + dmat->size--; + + return; +} + + + + + + + + + +/* + * NJ_neighbor_joining() - Perform a traditional Neighbor-Joining + * + * + * INPUTS: + * ------- + * nj_args -- A pointer to a structure containing the command-line arguments + * dmat -- A pointer to the distance matrix + * + * RETURNS: + * -------- + * NJ_TREE * -- A pointer to the Neighbor-Joining tree. + * + * DESCRIPTION: + * ------------ + * + * This function performs a traditional Neighbor-Joining operation in which + * the distance matrix is exhaustively searched for the global minimum + * transformed distance. The two nodes which intersect at the global + * minimum transformed distance are then joined and the distance + * matrix is collapsed. This process continues until there are only + * two nodes left, at which point those nodes are joined. + * + */ +NJ_TREE * +NJ_neighbor_joining(NJ_ARGS *nj_args, + DMAT *dmat) { + + + NJ_TREE *tree = NULL; + NJ_VERTEX *vertex = NULL; + + long int a, b; + float min; + + + /* initialize the r and r2 vectors */ + NJ_init_r(dmat); + + /* allocate and initialize our vertex vector used for tree construction */ + vertex = NJ_init_vertex(dmat); + if(!vertex) { + fprintf(stderr, "Clearcut: Could not initialize vertex in NJ_neighbor_joining()\n"); + return(NULL); + } + + /* we iterate until the working distance matrix has only 2 entries */ + while(vertex->nactive > 2) { + + /* + * Find the global minimum transformed distance from the distance matrix + */ + min = NJ_min_transform(dmat, &a, &b); + + /* + * Build the tree by removing nodes a and b from the vertex array + * and inserting a new internal node which joins a and b. Collapse + * the vertex array similarly to how the distance matrix and r and r2 + * are compacted. + */ + NJ_decompose(dmat, vertex, a, b, 0); + + /* decrement the r and r2 vectors by the distances corresponding to a, b */ + NJ_compute_r(dmat, a, b); + + /* compact the distance matrix and the r and r2 vectors */ + NJ_collapse(dmat, vertex, a, b); + } + + /* Properly join the last two nodes on the vertex list */ + tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST); + + /* return the computed tree to the calling function */ + return(tree); +} + + + + + + + + +/* + * NJ_relaxed_nj() - Construct a tree using the Relaxed Neighbor-Joining + * + * INPUTS: + * ------- + * nj_args -- A pointer to a data structure containing the command-line args + * dmat -- A pointer to the distance matrix + * + * RETURNS: + * -------- + * + * NJ_TREE * -- A pointer to a Relaxed Neighbor-Joining tree + * + * DESCRIPTION: + * ------------ + * + * This function implements the Relaxed Neighbor-Joining algorithm of + * Evans, J., Sheneman, L., and Foster, J. + * + * Relaxed Neighbor-Joining works by choosing a local minimum transformed + * distance when determining when to join two nodes. (Traditional + * Neighbor-Joining chooses a global minimum transformed distance). + * + * The algorithm shares the property with traditional NJ that if the + * input distances are additive (self-consistent), then the algorithm + * will manage to construct the true tree consistent with the additive + * distances. Additivity state is tracked and every proposed join is checked + * to make sure it maintains additivity constraints. If no + * additivity-preserving join is possible in a single pass, then the distance + * matrix is non-additive, and additivity checking is abandoned. + * + * The algorithm will either attempt joins randomly, or it will perform joins + * in a particular order. The default behavior is to perform joins randomly, + * but this can be switched off with a command-line switch. + * + * For randomized joins, all attempts are made to alleviate systematic bias + * for the choice of rows to joins. All tie breaking is done in a way which + * is virtually free of bias. + * + * To perform randomized joins, a random permutation is constructed which + * specifies the order in which to attempt joins. I iterate through the + * random permutation, and for each row in the random permutation, I find + * the minimum transformed distance for that row. If there are multiple + * minima, I break ties evenly. For the row which intersects our + * randomly chosen row at the chosen minimum, if we are are still in + * additivity mode, I check to see if joining the two rows will break + * our additivity constraints. If not, I check to see if there exists + * a transformed distance which is smaller than the minimum found on the + * original row. If there is, then we proceed through the random permutation + * trying additional rows in the random order specified in the permutation. + * If there is no smaller minimum transformed distance on either of the + * two rows, then we join them, collapse the distance matrix, and compute + * a new random permutation. + * + * If the entire random permutation is traversed and no joins are possible + * due to additivity constraints, then the distance matrix is not + * additive, and additivity constraint-checking is disabled. + * + */ +NJ_TREE * +NJ_relaxed_nj(NJ_ARGS *nj_args, + DMAT *dmat) { + + + NJ_TREE *tree; + NJ_VERTEX *vertex; + long int a, b, t, bh, bv, i; + float hmin, vmin, hvmin; + float p, q, x; + int join_flag; + int additivity_mode; + long int hmincount, vmincount; + long int *permutation = NULL; + + + + /* initialize the r and r2 vectors */ + NJ_init_r(dmat); + + additivity_mode = 1; + + /* allocate the permutation vector, if we are in randomize mode */ + if(!nj_args->norandom) { + permutation = (long int *)calloc(dmat->size, sizeof(long int)); + if(!permutation) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_relaxed_nj()\n"); + return(NULL); + } + } + + /* allocate and initialize our vertex vector used for tree construction */ + vertex = NJ_init_vertex(dmat); + + /* loop until there are only 2 nodes left to join */ + while(vertex->nactive > 2) { + + switch(nj_args->norandom) { + + /* RANDOMIZED JOINS */ + case 0: + + join_flag = 0; + + NJ_permute(permutation, dmat->size-1); + for(i=0;isize-1 && (vertex->nactive>2) ;i++) { + + a = permutation[i]; + + /* find min trans dist along horiz. of row a */ + hmin = NJ_find_hmin(dmat, a, &bh, &hmincount); + if(a) { + /* find min trans dist along vert. of row a */ + vmin = NJ_find_vmin(dmat, a, &bv, &vmincount); + } else { + vmin = hmin; + bv = bh; + vmincount = 0; + } + + if(NJ_FLT_EQ(hmin, vmin)) { + + /* + * The minima along the vertical and horizontal are + * the same. Compute the proportion of minima along + * the horizonal (p) and the proportion of minima + * along the vertical (q). + * + * If the same minima exist along the horizonal and + * vertical, we break the tie in a way which is + * non-biased. That is, we break the tie based on the + * proportion of horiz. minima versus vertical minima. + * + */ + p = (float)hmincount / ((float)hmincount + (float)vmincount); + q = 1.0 - p; + x = genrand_real2(); + + if(x < p) { + hvmin = hmin; + b = bh; + } else { + hvmin = vmin; + b = bv; + } + } else if(NJ_FLT_LT(hmin, vmin) ) { + hvmin = hmin; + b = bh; + } else { + hvmin = vmin; + b = bv; + } + + if(NJ_check(nj_args, dmat, a, b, hvmin, additivity_mode)) { + + /* swap a and b, if necessary, to make sure a < b */ + if(b < a) { + t = a; + a = b; + b = t; + } + + join_flag = 1; + + /* join taxa from rows a and b */ + NJ_decompose(dmat, vertex, a, b, 0); + + /* collapse matrix */ + NJ_compute_r(dmat, a, b); + NJ_collapse(dmat, vertex, a, b); + + NJ_permute(permutation, dmat->size-1); + } + } + + /* turn off additivity if go through an entire cycle without joining */ + if(!join_flag) { + additivity_mode = 0; + } + + break; + + + + /* DETERMINISTIC JOINS */ + case 1: + + join_flag = 0; + + for(a=0;asize-1 && (vertex->nactive > 2) ;) { + + /* find the min along the horizontal of row a */ + hmin = NJ_find_hmin(dmat, a, &b, &hmincount); + + if(NJ_check(nj_args, dmat, a, b, hmin, additivity_mode)) { + + join_flag = 1; + + /* join taxa from rows a and b */ + NJ_decompose(dmat, vertex, a, b, 0); + + /* collapse matrix */ + NJ_compute_r(dmat, a, b); + NJ_collapse(dmat, vertex, a, b); + + if(a) { + a--; + } + + } else { + a++; + } + } + + /* turn off additivity if go through an entire cycle without joining */ + if(!join_flag) { + additivity_mode = 0; + } + + break; + } + + } /* WHILE */ + + /* Join the last two nodes on the vertex list */ + tree = NJ_decompose(dmat, vertex, 0, 1, NJ_LAST); + + if(nj_args->verbose_flag) { + if(additivity_mode) { + printf("Tree is additive\n"); + } else { + printf("Tree is not additive\n"); + } + } + + if(vertex) { + NJ_free_vertex(vertex); + } + + if(!nj_args->norandom && permutation) { + free(permutation); + } + + return(tree); +} + + + + + + +/* + * NJ_print_distance_matrix() - + * + * Print a distance matrix + * + */ +void +NJ_print_distance_matrix(DMAT *dmat) { + + long int i, j; + + printf("ntaxa: %ld\n", dmat->ntaxa); + printf(" size: %ld\n", dmat->size); + + for(i=0;isize;i++) { + + for(j=0;jsize;j++) { + if(j>i) { + printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)]); + } else { + printf(" -"); + } + } + + + if(dmat->r && dmat->r2) { + printf("\t\t%0.4f", dmat->r[i]); + printf("\t%0.4f", dmat->r2[i]); + + + + printf("\n"); + + for(j=0;jsize;j++) { + if(j>i) { + printf(" %0.4f", dmat->val[NJ_MAP(i, j, dmat->size)] - (dmat->r2[i] + dmat->r2[j])); + } else { + printf(" "); + } + } + + printf("\n\n"); + } + } + + printf("\n"); + + return; +} + + + + + + + +/* + * NJ_output_tree() - + * + * A wrapper for the function that really prints the tree, + * basically to get a newline in there conveniently. :-) + * + * Print n trees, as specified in command-args + * using "count" variable from 0 to (n-1) + * + */ +void +NJ_output_tree(NJ_ARGS *nj_args, + NJ_TREE *tree, + DMAT *dmat, + long int count) { + + FILE *fp; + + if(nj_args->stdout_flag) { + fp = stdout; + } else { + + if(count == 0) { + fp = fopen(nj_args->outfilename, "w"); /* open for writing */ + } else { + fp = fopen(nj_args->outfilename, "a"); /* open for appending */ + } + + if(!fp) { + fprintf(stderr, "Clearcut: Failed to open outfile %s\n", nj_args->outfilename); + exit(-1); + } + } + + NJ_output_tree2(fp, nj_args, tree, tree, dmat); + fprintf(fp, ";\n"); + + if(!nj_args->stdout_flag) { + fclose(fp); + } + + return; +} + + + + + +/* + * NJ_output_tree2() - + * + * + */ +void +NJ_output_tree2(FILE *fp, + NJ_ARGS *nj_args, + NJ_TREE *tree, + NJ_TREE *root, + DMAT *dmat) { + + if(!tree) { + return; + } + + if(tree->taxa_index != NJ_INTERNAL_NODE) { + + if(nj_args->expblen) { + fprintf(fp, "%s:%e", + dmat->taxaname[tree->taxa_index], + tree->dist); + } else { + fprintf(fp, "%s:%f", + dmat->taxaname[tree->taxa_index], + tree->dist); + } + + } else { + + + if(tree->left && tree->right) { + fprintf(fp, "("); + } + if(tree->left) { + NJ_output_tree2(fp, nj_args, tree->left, root, dmat); + } + + if(tree->left && tree->right) { + fprintf(fp, ","); + } + if(tree->right) { + NJ_output_tree2(fp, nj_args, tree->right, root, dmat); + } + + if(tree != root->left) { + if(tree->left && tree->right) { + if(tree != root) { + if(nj_args->expblen) { + fprintf(fp, "):%e", tree->dist); + } else { + fprintf(fp, "):%f", tree->dist); + } + } else { + fprintf(fp, ")"); + } + } + } else { + fprintf(fp, ")"); + } + } + + return; +} + + + + + + + +/* + * NJ_init_r() + * + * This function computes the r column in our matrix + * + */ +void +NJ_init_r(DMAT *dmat) { + + long int i, j, size; + long int index; + float *r, *r2, *val; + long int size1; + float size2; + + r = dmat->r; + r2 = dmat->r2; + val = dmat->val; + size = dmat->size; + size1 = size-1; + size2 = (float)(size-2); + + index = 0; + for(i=0;inodes = (NJ_TREE **)calloc(dmat->ntaxa, sizeof(NJ_TREE *)); + vertex->nodes_handle = vertex->nodes; + + /* initialize our size and active variables */ + vertex->nactive = dmat->ntaxa; + vertex->size = dmat->ntaxa; + + /* initialize the nodes themselves */ + for(i=0;intaxa;i++) { + + vertex->nodes[i] = (NJ_TREE *)calloc(1, sizeof(NJ_TREE)); + + vertex->nodes[i]->left = NULL; + vertex->nodes[i]->right = NULL; + + vertex->nodes[i]->taxa_index = i; + } + + return(vertex); +} + + + + + +/* + * NJ_decompose() - + * + * This function decomposes the star by creating new internal nodes + * and joining two existing tree nodes to it + * + */ +NJ_TREE * +NJ_decompose(DMAT *dmat, + NJ_VERTEX *vertex, + long int x, + long int y, + int last_flag) { + + NJ_TREE *new_node; + float x2clade, y2clade; + + /* compute the distance from the clade components to the new node */ + if(last_flag) { + x2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)]); + } else { + x2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)])/2 + + ((dmat->r2[x] - dmat->r2[y])/2); + } + + vertex->nodes[x]->dist = x2clade; + + if(last_flag) { + y2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)]); + } else { + y2clade = + (dmat->val[NJ_MAP(x, y, dmat->size)])/2 + + ((dmat->r2[y] - dmat->r2[x])/2); + } + + vertex->nodes[y]->dist = y2clade; + + /* allocate new node to connect two sub-clades */ + new_node = (NJ_TREE *)calloc(1, sizeof(NJ_TREE)); + + new_node->left = vertex->nodes[x]; + new_node->right = vertex->nodes[y]; + new_node->taxa_index = NJ_INTERNAL_NODE; /* this is not a terminal node, no taxa index */ + + if(last_flag) { + return(new_node); + } + + vertex->nodes[x] = new_node; + vertex->nodes[y] = vertex->nodes[0]; + + vertex->nodes = &(vertex->nodes[1]); + + vertex->nactive--; + + return(new_node); +} + + + +/* + * NJ_print_vertex() - + * + * For debugging, print the contents of the vertex + * + */ +void +NJ_print_vertex(NJ_VERTEX *vertex) { + + long int i; + + printf("Number of active nodes: %ld\n", vertex->nactive); + + for(i=0;inactive;i++) { + printf("%ld ", vertex->nodes[i]->taxa_index); + } + printf("\n"); + + return; +} + + + + + + + + + +/* + * NJ_print_r() - + * + */ +void +NJ_print_r(DMAT *dmat) { + + long int i; + + printf("\n"); + for(i=0;isize;i++) { + printf("r[%ld] = %0.2f\n", i, dmat->r[i]); + } + printf("\n"); + + return; +} + + + + + +/* + * NJ_print_taxanames() - + * + * Print taxa names here + * + */ +void +NJ_print_taxanames(DMAT *dmat) { + + long int i; + + printf("Number of taxa: %ld\n", dmat->ntaxa); + + for(i=0;intaxa;i++) { + printf("%ld) %s\n", i, dmat->taxaname[i]); + } + + printf("\n"); + + return; +} + + + + +/* + * NJ_shuffle_distance_matrix() - + * + * Randomize a distance matrix here + * + */ +void +NJ_shuffle_distance_matrix(DMAT *dmat) { + + + long int *perm = NULL; + char **tmp_taxaname = NULL; + float *tmp_val = NULL; + long int i, j; + + + /* alloc the random permutation and a new matrix to hold the shuffled vals */ + perm = (long int *)calloc(dmat->size, sizeof(long int)); + tmp_taxaname = (char **)calloc(dmat->size, sizeof(char *)); + tmp_val = (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float)); + if(!tmp_taxaname || !perm || !tmp_val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_shuffle_distance_matrix()\n"); + exit(-1); + } + + /* compute a permutation which will describe how to shuffle the matrix */ + NJ_permute(perm, dmat->size); + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + if(perm[j] < perm[i]) { + tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[j], perm[i], dmat->size)]; + } else { + tmp_val[NJ_MAP(i, j, dmat->size)] = dmat->val[NJ_MAP(perm[i], perm[j], dmat->size)]; + } + + } + + tmp_taxaname[i] = dmat->taxaname[perm[i]]; + } + + /* free our random permutation */ + if(perm) { + free(perm); + } + + /* free the old value matrix */ + if(dmat->val) { + free(dmat->val); + } + + /* re-assign the value matrix pointers */ + dmat->val = tmp_val; + dmat->valhandle = dmat->val; + + /* + * Free our old taxaname with its particular ordering + * and re-assign to the new. + */ + if(dmat->taxaname) { + free(dmat->taxaname); + } + dmat->taxaname = tmp_taxaname; + + return; +} + + + +/* + * NJ_free_tree() - + * + * Free a given NJ tree + */ +void +NJ_free_tree(NJ_TREE *node) { + + if(!node) { + return; + } + + if(node->left) { + NJ_free_tree(node->left); + } + + if(node->right) { + NJ_free_tree(node->right); + } + + free(node); + + return; +} + + + + + + + + + +/* + * NJ_print_permutation() + * + * Print a permutation + * + */ +void +NJ_print_permutation(long int *perm, + long int size) { + + long int i; + + for(i=0;intaxa = src->ntaxa; + dest->size = src->size; + + /* allocate space for array of pointers to taxanames */ + dest->taxaname = (char **)calloc(dest->ntaxa, sizeof(char *)); + if(!dest->taxaname) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); + goto XIT_BAD; + } + + /* allocate space for the taxanames themselves */ + for(i=0;intaxa;i++) { + dest->taxaname[i] = (char *)calloc(strlen(src->taxaname[i])+1, sizeof(char)); + if(!dest->taxaname[i]) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); + goto XIT_BAD; + } + } + + /* allocate space for the distance values */ + dest->val = (float *)calloc(NJ_NCELLS(src->ntaxa), sizeof(float)); + if(!dest->val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_dup_dmat()\n"); + goto XIT_BAD; + } + + /* allocate space for the r and r2 vectors */ + dest->r = (float *)calloc(src->ntaxa, sizeof(float)); + dest->r2 = (float *)calloc(src->ntaxa, sizeof(float)); + + /* copy titles */ + for(i=0;intaxa;i++) { + strcpy(dest->taxaname[i], src->taxaname[i]); + } + + /* copy values */ + memcpy(dest->val, src->valhandle, NJ_NCELLS(src->ntaxa)*sizeof(float)); + + /* copy r and r2 */ + memcpy(dest->r, src->rhandle, src->ntaxa*sizeof(float)); + memcpy(dest->r2, src->r2handle, src->ntaxa*sizeof(float)); + + /* track some memory addresses */ + dest->valhandle = dest->val; + dest->rhandle = dest->r; + dest->r2handle = dest->r2; + + return(dest); + + XIT_BAD: + + /* free what we may have allocated */ + NJ_free_dmat(dest); + + return(NULL); +} + + + + +/* + * NJ_free_dmat() - + */ +void +NJ_free_dmat(DMAT *dmat) { + + long int i; + + if(dmat) { + + if(dmat->taxaname) { + + for(i=0;intaxa;i++) { + if(dmat->taxaname[i]) { + free(dmat->taxaname[i]); + } + } + + free(dmat->taxaname); + } + + if(dmat->valhandle) { + free(dmat->valhandle); + } + + if(dmat->rhandle) { + free(dmat->rhandle); + } + + if(dmat->r2handle) { + free(dmat->r2handle); + } + + free(dmat); + } + + return; +} + + + + + +/* + * NJ_free_vertex() - + * + * Free the vertex data structure + * + */ +void +NJ_free_vertex(NJ_VERTEX *vertex) { + + if(vertex) { + if(vertex->nodes_handle) { + free(vertex->nodes_handle); + } + free(vertex); + } + + return; +} + + + + + + + + + +/* + * + * NJ_min_transform() - Find the smallest transformed value to identify + * which nodes to join. + * + * INPUTS: + * ------- + * dmat -- The distance matrix + * + * RETURNS: + * -------- + * -- The minimimum transformed distance + * ret_i -- The row of the smallest transformed distance (by reference) + * ret_j -- The col of the smallest transformed distance (by reference) + * + * + * DESCRIPTION: + * ------------ + * + * Used only with traditional Neighbor-Joining, this function checks the entire + * working distance matrix and identifies the smallest transformed distance. + * This requires traversing the entire diagonal matrix, which is itself a + * O(N^2) operation. + * + */ +float +NJ_min_transform(DMAT *dmat, + long int *ret_i, + long int *ret_j) { + + long int i, j; /* indices used for looping */ + long int tmp_i = 0;/* to limit pointer dereferencing */ + long int tmp_j = 0;/* to limit pointer dereferencing */ + float smallest; /* track the smallest trans. dist */ + float curval; /* the current trans. dist in loop */ + + float *ptr; /* pointer into distance matrix */ + float *r2; /* pointer to r2 matrix for computing transformed dists */ + + smallest = (float)HUGE_VAL; + + /* track these here to limit pointer dereferencing in inner loop */ + ptr = dmat->val; + r2 = dmat->r2; + + /* for every row */ + for(i=0;isize;i++) { + ptr++; /* skip diagonal */ + for(j=i+1;jsize;j++) { /* for every column */ + + /* find transformed distance in matrix at i, j */ + curval = *(ptr++) - (r2[i] + r2[j]); + + /* if the transformed distanance is less than the known minimum */ + if(curval < smallest) { + + smallest = curval; + tmp_i = i; + tmp_j = j; + } + } + } + + /* pass back (by reference) the coords of the min. transformed distance */ + *ret_i = tmp_i; + *ret_j = tmp_j; + + return(smallest); /* return the min transformed distance */ +} + + + + + diff --git a/clearcut.h b/clearcut.h new file mode 100644 index 0000000..2710a8d --- /dev/null +++ b/clearcut.h @@ -0,0 +1,281 @@ +/* + * clearcut.h + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + +#ifndef _INC_CLEARCUT_H_ +#define _INC_CLEARCUT_H_ 1 + +#include "common.h" +#include "cmdargs.h" + + +#define NJ_VERSION "1.0.9" + + +#define NJ_INTERNAL_NODE -1 +#define NJ_LAST 101 + +#define NJ_INPUT_MODE_UNKNOWN 0 +#define NJ_INPUT_MODE_DISTANCE 100 +#define NJ_INPUT_MODE_UNALIGNED_SEQUENCES 101 +#define NJ_INPUT_MODE_ALIGNED_SEQUENCES 102 + +#define NJ_MODEL_NONE 100 +#define NJ_MODEL_JUKES 101 +#define NJ_MODEL_KIMURA 102 + + + + +/* + * DMAT - Distance Matrix + * + * This is arguably the most important structure in the + * program. This is the distance matrix, and it is used + * by many functions throughout the application. + * + * The matrix is architected as a contiguously allocated + * upper-diagonal matrix of floats which include the + * diagonal. + * + * Example: + * + * 0 1 2 3 4 5 + * 0 0.0 1.0 0.3 0.2 0.1 0.3 + * 1 0.0 0.3 0.2 0.1 0.8 + * 2 0.0 0.1 0.3 0.5 + * 3 0.0 0.2 0.1 + * 4 0.0 0.2 + * 5 0.0 + * + * The distance matrix shrinks with every join operation, + * so I track the original and working size of the matrix + * inside the matrix. + * + * One fast optimization to shrink the distance matrix + * involves incrementing the "val" pointer. Thus, in + * addition to tracking the pointer to the distances, + * I also track the original pointer to that I can + * free the memory associated with the working distance + * matrix. + * + * This also applies to the r and r2 vectors which are + * used to compute the transformed distances in the + * matrix. + * + */ + +typedef struct _STRUCT_DMAT { + + long int ntaxa; /* the original size of the distance matrix */ + long int size; /* the current/effective size of the distance matrix */ + + char **taxaname; /* a pointer to an array of taxa name strings */ + + float *val; /* the distances */ + float *valhandle; /* to track the orig. pointer to free memory */ + + float *r, *r2; /* r and r2 vectors (used to compute transformed dists) */ + float *rhandle, *r2handle; /* track orig. pointers to free memory */ + +} DMAT; + + + +/* + * NJ_TREE - The Tree Data Structure + * + * + * The tree is represented internally as a rooted + * binary tree. Each internal node has a left and a right child. + * + * Additionally, I track the distance between the current node + * and that node's parent (i.e. the branch length). + * + * Finally, I track the index of the taxa for leaf nodes. + * + */ +typedef struct _STRUCT_NJ_TREE { + + struct _STRUCT_NJ_TREE *left; /* left child */ + struct _STRUCT_NJ_TREE *right; /* right child */ + + float dist; /* branch length. i.e. dist from node to parent */ + + long int taxa_index; /* for terminal nodes, track the taxon index */ + +} NJ_TREE; + + + +/* + * NJ_VERTEX + * + * This structure is used for building trees. It is a vector + * which, represents the center of the star when building the RNJ/NJ + * tree through star-decomposition. + * + * It contains a vector of tree (node) pointers. These pointers + * get joined together by a new internal node, and the new internal + * node is placed back into the vector of nodes (which is now smaller). + * + * To keep this vector in sync. with the shrinking matrix, parts of + * the vector are shuffled around, and so a pointer to the originally + * allocated vector is stored such that it can be freed from memory + * later. + * + * The original and working sizes of the vector are also tracked. + * + */ +typedef struct _STRUCT_NJ_VERTEX { + + NJ_TREE **nodes; + NJ_TREE **nodes_handle; /* original memory handle for freeing */ + + long int nactive; /* number of active nodes in the list */ + long int size; /* the total size of the vertex */ + +} NJ_VERTEX; + + + + + + +/* some function prototypes */ + +/* core function for performing Relaxed Neighbor Joining */ +NJ_TREE * +NJ_relaxed_nj(NJ_ARGS *nj_args, DMAT *dmat); + +/* function for performing traditional Neighbor-Joining */ +NJ_TREE * +NJ_neighbor_joining(NJ_ARGS *nj_args, DMAT *dmat); + +/* print the distance matrix (for debugging) */ +void +NJ_print_distance_matrix(DMAT *dmat); + +/* output the computed tree to stdout or to the specified file */ +void +NJ_output_tree(NJ_ARGS *nj_args, + NJ_TREE *tree, + DMAT *dmat, + long int count); + +/* the recursive function for outputting trees */ +void +NJ_output_tree2(FILE *fp, + NJ_ARGS *nj_args, + NJ_TREE *tree, + NJ_TREE *root, + DMAT *dmat); + +/* initialize vertex */ +NJ_VERTEX * +NJ_init_vertex(DMAT *dmat); + +/* used to decompose the star topology and build the tree */ +NJ_TREE * +NJ_decompose(DMAT *dmat, + NJ_VERTEX *vertex, + long int x, + long int y, + int last_flag); + +/* print the vertex vector (for debugging) */ +void +NJ_print_vertex(NJ_VERTEX *vertex); + +/* print taxa names (for debugging) */ +void +NJ_print_taxanames(DMAT *dmat); + +/* initialize r-vector prior to RNJ/NJ */ +void +NJ_init_r(DMAT *dmat); + +/* print the r-vector (for debugging) */ +void +NJ_print_r(DMAT *dmat); + +/* shuffle the distance matrix, usually after reading in input */ +void +NJ_shuffle_distance_matrix(DMAT *dmat); + +/* free memory from the tree */ +void +NJ_free_tree(NJ_TREE *node); + +/* print permutations (for debugging) */ +void +NJ_print_permutation(long int *perm, + long int size); + +/* duplicate a distance matrix for multiple iterations */ +DMAT * +NJ_dup_dmat(DMAT *src); + +/* free the distance matrix */ +void +NJ_free_dmat(DMAT *dmat); + +/* free the vertex vector */ +void +NJ_free_vertex(NJ_VERTEX *vertex); + +/* for computing the global minimum transformed distance in traditional NJ */ +float +NJ_min_transform(DMAT *dmat, + long int *ret_i, + long int *ret_j); + +#endif /* _INC_CLEARCUT_H_ */ + + + + + + diff --git a/cmdargs.c b/cmdargs.c new file mode 100644 index 0000000..156b1da --- /dev/null +++ b/cmdargs.c @@ -0,0 +1,526 @@ +/* + * cmdargs.c + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + +#include +#include +#include +#include + + +#ifdef USE_GNU +#include +#else +#include "getopt_long.h" +#endif /* USE_GNU*/ + + +#include "clearcut.h" +#include "cmdargs.h" + + +/* + * NJ_handle_args() - + * + */ +NJ_ARGS * +NJ_handle_args(int argc, + char *argv[]) { + + static NJ_ARGS nj_args; + int option_index, c; + + struct option NJ_long_options[] = { + + /* These options don't set a flag */ + {"in", required_argument, NULL, 'i'}, + {"out", required_argument, NULL, 'o'}, + {"seed", required_argument, NULL, 's'}, + {"matrixout", required_argument, NULL, 'm'}, + {"ntrees", required_argument, NULL, 'n'}, + + /* These options set a flag */ + {"verbose", no_argument, &(nj_args.verbose_flag), 1}, + {"quiet", no_argument, &(nj_args.quiet_flag), 1}, + {"distance", no_argument, &(nj_args.input_mode), NJ_INPUT_MODE_DISTANCE}, + {"alignment", no_argument, &(nj_args.input_mode), NJ_INPUT_MODE_ALIGNED_SEQUENCES}, + {"help", no_argument, &(nj_args.help), 1}, + {"version", no_argument, &(nj_args.version), 1}, + {"norandom", no_argument, &(nj_args.norandom), 1}, + {"shuffle", no_argument, &(nj_args.shuffle), 1}, + {"stdin", no_argument, &(nj_args.stdin_flag), 1}, + {"stdout", no_argument, &(nj_args.stdout_flag), 1}, + {"dna", no_argument, &(nj_args.dna_flag), 1}, + {"DNA", no_argument, &(nj_args.dna_flag), 1}, + {"protein", no_argument, &(nj_args.protein_flag), 1}, + {"neighbor", no_argument, &(nj_args.neighbor), 1}, + {"expblen", no_argument, &(nj_args.expblen), 1}, + {"expdist", no_argument, &(nj_args.expdist), 1}, + + {"jukes", no_argument, &(nj_args.jukes_flag), 1}, + {"kimura", no_argument, &(nj_args.kimura_flag), 1}, + + {0, 0, 0, 0} + + }; + + /* initializes options to their default */ + nj_args.infilename = NULL; + nj_args.outfilename = NULL; + nj_args.matrixout = NULL; + nj_args.seed = time(0); + nj_args.verbose_flag = 0; + nj_args.quiet_flag = 0; + nj_args.input_mode = NJ_INPUT_MODE_DISTANCE; + nj_args.help = 0; + nj_args.version = 0; + nj_args.norandom = 0; + nj_args.shuffle = 0; + nj_args.stdin_flag = 0; + nj_args.stdout_flag = 0; + nj_args.dna_flag = 0; + nj_args.protein_flag = 0; + nj_args.correction_model = NJ_MODEL_NONE; + nj_args.jukes_flag = 0; + nj_args.kimura_flag = 0; + nj_args.neighbor = 0; + nj_args.ntrees = 1; + nj_args.expblen = 0; + nj_args.expdist = 0; + + while(1) { + c = getopt_long(argc, + argv, + "i:o:s:m:n:vqduahVSIOrDPjkNeE", + NJ_long_options, + &option_index); + + if(c == -1) { + break; + } + + switch(c) { + + case 0: + if(NJ_long_options[option_index].flag) { + break; + } + + printf("option %s", NJ_long_options[option_index].name); + if(optarg) { + printf(" with arg %s", optarg); + } + printf("\n"); + break; + + case 'i': + nj_args.infilename = optarg; + break; + + case 'o': + nj_args.outfilename = optarg; + break; + + case 's': + nj_args.seed = atoi(optarg); + break; + + case 'm': + nj_args.matrixout = optarg; + break; + + case 'n': + nj_args.ntrees = atoi(optarg); + break; + + case 'v': + nj_args.verbose_flag = 1; + break; + + case 'q': + nj_args.quiet_flag = 1; + break; + + case 'd': + nj_args.input_mode = NJ_INPUT_MODE_DISTANCE; + break; + + case 'a': + nj_args.input_mode = NJ_INPUT_MODE_ALIGNED_SEQUENCES; + break; + + case 'h': + nj_args.help = 1; + break; + + case 'V': + nj_args.version = 1; + break; + + case 'S': + nj_args.shuffle = 1; + break; + + case 'I': + nj_args.stdin_flag = 1; + break; + + case 'O': + nj_args.stdin_flag = 1; + break; + + case 'r': + nj_args.norandom = 1; + break; + + case 'D': + nj_args.dna_flag = 1; + break; + + case 'P': + nj_args.protein_flag = 1; + break; + + case 'j': + nj_args.jukes_flag = 1; + break; + + case 'k': + nj_args.kimura_flag = 1; + break; + + case 'N': + nj_args.neighbor = 1; + break; + + case 'e': + nj_args.expblen = 1; + break; + + case 'E': + nj_args.expdist = 1; + break; + + default: + NJ_usage(); + exit(-1); + } + } + + if(optind < argc) { + fprintf(stderr, "Clearcut: Unknown command-line argument:\n --> %s\n", argv[optind]); + NJ_usage(); + exit(-1); + } + + if(nj_args.version) { + printf("Clearcut Version: %s\n", NJ_VERSION); + exit(0); + } + + if(nj_args.help) { + NJ_usage(); + exit(0); + } + + /* if stdin & explicit filename are specified for input */ + if(nj_args.stdin_flag) { + if(nj_args.infilename) { + fprintf(stderr, "Clearcut: Ambiguous input source specified. Specify input filename OR stdin.\n"); + NJ_usage(); + exit(-1); + } + } + + /* if stdout & explicit filename are specified for output */ + if(nj_args.stdout_flag) { + if(nj_args.outfilename) { + fprintf(stderr, "Clearcut: Ambiguous output specified. Specify output filename OR stdout.\n"); + NJ_usage(); + exit(-1); + } + } + + /* if user did not specify stdin or filename, default to stdin */ + if(!nj_args.stdin_flag) { + if(!nj_args.infilename) { + + fprintf(stderr, "Clearcut: No input file specified. Using stdin.\n"); + nj_args.stdin_flag = 1; + } + } + + /* if user did not specify stdout or filename, default to stdout */ + if(!nj_args.stdout_flag) { + if(!nj_args.outfilename) { + + fprintf(stderr, "Clearcut: No output file specified. Using stdout.\n"); + nj_args.stdout_flag = 1; + } + } + + /* User must specify distance matrix or alignment */ + if(nj_args.input_mode == NJ_INPUT_MODE_UNKNOWN) { + fprintf(stderr, "Clearcut: Must specify input type (--distance | --alignment)\n"); + NJ_usage(); + exit(-1); + } + + /* do not allow protein or DNA options for distance matrix input */ + if(nj_args.input_mode == NJ_INPUT_MODE_DISTANCE) { + if(nj_args.dna_flag || nj_args.protein_flag) { + fprintf(stderr, "Clearcut: Ambiguous arguments. (--protein | --DNA) do not apply to distance \n"); + NJ_usage(); + exit(-1); + } + } + + /* make sure different filenames were specified for input and output */ + if(!nj_args.stdin_flag && !nj_args.stdout_flag) { + + if(!strcmp(nj_args.infilename, nj_args.outfilename)) { + fprintf(stderr, "Clearcut: Input filename and output filename must be unique.\n"); + NJ_usage(); + exit(-1); + } + } + + /* make sure that user specifies DNA or Protein if dealing with alignment input */ + if(nj_args.input_mode == NJ_INPUT_MODE_ALIGNED_SEQUENCES) { + if(!nj_args.dna_flag && !nj_args.protein_flag) { + fprintf(stderr, "Clearcut: Must specify protein or DNA for alignment input.\n"); + NJ_usage(); + exit(-1); + } + } + + /* make sure that user does not specify both protein and DNA when dealing with alignment input */ + if(nj_args.input_mode == NJ_INPUT_MODE_ALIGNED_SEQUENCES) { + if(nj_args.dna_flag && nj_args.protein_flag) { + fprintf(stderr, "Clearcut: Specifying protein and DNA sequences are mutually exclusive options\n"); + NJ_usage(); + exit(-1); + } + } + + /* make sure verbose and quiet were not specified together */ + if(nj_args.verbose_flag && nj_args.quiet_flag) { + fprintf(stderr, "Clearcut: Verbose and Quiet mode are mutually exclusive.\n"); + NJ_usage(); + exit(-1); + } + + /* make sure that a correction model was specified only when providing an alignment */ + if(nj_args.input_mode == NJ_INPUT_MODE_DISTANCE) { + if(nj_args.jukes_flag || nj_args.kimura_flag) { + fprintf(stderr, "Clearcut: Only specify correction model for alignment input.\n"); + NJ_usage(); + exit(-1); + } + } else { + if(nj_args.jukes_flag && nj_args.kimura_flag) { + fprintf(stderr, "Clearcut: Only specify one correction model\n"); + NJ_usage(); + exit(-1); + } else { + if(nj_args.jukes_flag && !nj_args.kimura_flag) { + nj_args.correction_model = NJ_MODEL_JUKES; + } else if(nj_args.kimura_flag && !nj_args.jukes_flag) { + nj_args.correction_model = NJ_MODEL_KIMURA; + } else { + nj_args.correction_model = NJ_MODEL_NONE; /* DEFAULT */ + } + } + } + + /* make sure that the number of output trees is reasonable */ + if(nj_args.ntrees <= 0) { + fprintf(stderr, "Clearcut: Number of output trees must be a positive integer.\n"); + NJ_usage(); + exit(-1); + } + + /* + * make sure that if exponential distances are specified, + * we are dealing with alignment input + */ + if(nj_args.expdist && nj_args.input_mode != NJ_INPUT_MODE_ALIGNED_SEQUENCES) { + fprintf(stderr, "Clearcut: Exponential notation for distance matrix output requires that input be an alignment\n"); + NJ_usage(); + exit(-1); + } + + return(&nj_args); +} + + + + + +/* + * NJ_print_args() - + * + */ +void +NJ_print_args(NJ_ARGS *nj_args) { + + char input_mode[32]; + + switch (nj_args->input_mode) { + case NJ_INPUT_MODE_DISTANCE: + sprintf(input_mode, "Distance Matrix"); + break; + case NJ_INPUT_MODE_UNALIGNED_SEQUENCES: + sprintf(input_mode, "Unaligned Sequences"); + break; + case NJ_INPUT_MODE_ALIGNED_SEQUENCES: + sprintf(input_mode, "Aligned Sequences"); + break; + default: + sprintf(input_mode, "UNKNOWN"); + break; + } + + printf("\n*** Command Line Arguments ***\n"); + + printf("Input Mode: %s\n", input_mode); + + if(nj_args->stdin_flag) { + printf("Input from STDIN\n"); + } else { + printf("Input File: %s\n", nj_args->infilename); + } + + if(nj_args->stdout_flag) { + printf("Output from STDOUT\n"); + } else { + printf("Output File: %s\n", nj_args->outfilename); + } + + if(nj_args->input_mode != NJ_INPUT_MODE_DISTANCE) { + if(nj_args->aligned_flag) { + printf("Input Sequences Aligned: YES\n"); + } else { + printf("Input Sequences Aligned: NO\n"); + } + } + + if(nj_args->verbose_flag) { + printf("Verbose Mode: ON\n"); + } else { + printf("Verbose Mode: OFF\n"); + } + + if(nj_args->quiet_flag) { + printf("Quiet Mode: ON\n"); + } else { + printf("Quiet Mode: OFF\n"); + } + + if(nj_args->seed) { + printf("Random Seed: %d\n", nj_args->seed); + } + + printf("\n*******\n"); + + return; +} + + + + +/* + * NJ_usage() - + * + * Print a usage message + * + */ +void +NJ_usage(void) { + + printf("Usage: clearcut --in= --out= [options]...\n"); + printf("GENERAL OPTIONS:\n"); + printf(" -h, --help Display this information.\n"); + printf(" -V, --version Print the version of this program.\n"); + printf(" -v, --verbose More output. (Default: OFF)\n"); + printf(" -q, --quiet Silent operation. (Default: ON)\n"); + printf(" -s, --seed= Explicitly set the PRNG seed to a specific value.\n"); + printf(" -r, --norandom Attempt joins deterministically. (Default: OFF)\n"); + printf(" -S, --shuffle Randomly shuffle the distance matrix. (Default: OFF)\n"); + printf(" -N, --neighbor Use traditional Neighbor-Joining algorithm. (Default: OFF)\n"); + + printf("\n"); + printf("INPUT OPTIONS:\n"); + printf(" -I, --stdin Read input from STDIN.\n"); + printf(" -d, --distance Input file is a distance matrix. (Default: ON)\n"); + printf(" -a, --alignment Input file is a set of aligned sequences. (Default: OFF)\n"); + printf(" -D, --DNA Input alignment are DNA sequences.\n"); + printf(" -P, --protein Input alignment are protein sequences.\n"); + + printf("\n"); + printf("CORRECTION MODEL FOR COMPUTING DISTANCE MATRIX (Default: NO Correction):\n"); + printf(" -j, --jukes Use Jukes-Cantor correction for computing distance matrix.\n"); + printf(" -k, --kimura Use Kimura correction for distance matrix.\n"); + + printf("\n"); + printf("OUTPUT OPTIONS:\n"); + printf(" -O, --stdout Output tree to STDOUT.\n"); + printf(" -m, --matrixout= Output distance matrix to specified file.\n"); + printf(" -n, --ntrees= Output n trees. (Default: 1)\n"); + printf(" -e, --expblen Exponential notation for branch lengths. (Default: OFF)\n"); + printf(" -E, --expdist Exponential notation in distance output. (Default: OFF)\n"); + + printf("\n"); + printf("EXAMPLES:\n"); + printf(" Compute tree by supplying distance matrix via stdin:\n"); + printf(" clearcut --distance < distances.txt > treefile.tre\n"); + printf("\n"); + printf(" Compute tree by supplying an alignment of DNA sequences from a file:\n"); + printf(" clearcut --alignment --DNA --in=alignment.txt --out=treefile.tre\n"); + + return; +} + + + diff --git a/cmdargs.h b/cmdargs.h new file mode 100644 index 0000000..d44a330 --- /dev/null +++ b/cmdargs.h @@ -0,0 +1,113 @@ +/* + * njdist.h + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + +#ifndef _INC_NJ_CMDARGS_H_ +#define _INC_NJ_CMDARGS_H_ 1 + +#include "clearcut.h" + + +/* some datatypes */ +typedef struct _STRUCT_NJ_ARGS { + + char *infilename; /* the name of the input file */ + char *outfilename; /* the name of the output tree */ + char *matrixout; /* the name of the distance matrix output file */ + + int input_mode; + int aligned_flag; + + int verbose_flag; + int quiet_flag; + + int stdin_flag; + int stdout_flag; + + int help; + int version; + + int norandom; + int shuffle; + + int dna_flag; + int protein_flag; + + int seed; + + /* correction models for distance */ + int correction_model; + int jukes_flag; + int kimura_flag; + + /* flag for using traditional neighbor-joining */ + int neighbor; + + /* number of trees to output */ + int ntrees; + + /* exponential notation output */ + int expblen; /* exp notation for tree branch lengths */ + int expdist; /* exp notation for distances in matrix output */ + +} NJ_ARGS; + + + +/* some function prototypes */ + +NJ_ARGS * +NJ_handle_args(int argc, + char *argv[]); + +void +NJ_print_args(NJ_ARGS *nj_args); + +void +NJ_usage(void); + + +#endif /* _INC_NJ_CMDARGS_H_ */ + diff --git a/common.h b/common.h new file mode 100644 index 0000000..573d78c --- /dev/null +++ b/common.h @@ -0,0 +1,141 @@ +/* + * common.h + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * A header file filled with common definitions and simple inline functions + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + +#ifndef _INC_NJ_COMMON_H_ +#define _INC_NJ_COMMON_H_ 1 + +#include +#include + + +#define NJ_AMBIGUITY_CHAR 63 /* ? character */ + + +/* + * this macro defines the number of cells in the diagonal matrix + * based on the number of taxa involved + * + */ +#define NJ_NCELLS(a) ( ((a)*(a+1))/2 ) + + + + +/* + * NJ_MAP() - + * + * Thus function maps i, j coordinates to the correct offset into + * the distance matrix + * + */ +static inline +long int +NJ_MAP(long int i, + long int j, + long int ntaxa) { + + return((i*(2*ntaxa-i-1))/2 + j); +} + + +static inline +int +NJ_FLT_EQ(float x, + float y) { + + if(fabs(x - y) y) { + return(1); + } else { + return(0); + } + } +} + + + + +#endif /* _INC_NJ_COMMON_H_ */ + + + diff --git a/dayhoff.h b/dayhoff.h new file mode 100644 index 0000000..44be916 --- /dev/null +++ b/dayhoff.h @@ -0,0 +1,100 @@ +/* + * dayhoff.h + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + +#ifndef _INC_NJ_DAYHOFF_H_ +#define _INC_NJ_DAYHOFF_H_ 1 + +/* + * As sequence divergence increases, we need to correct for multiple hits + * when using Kimura distance correction method for amino acid sequences. + * + * This matrix of values represents the estimated "Accepted Point Mutations" + * or PAMs for a range of amino acid sequence divergence, starting at 75% + * up through 93% (in 0.1% increments). + * + * This model is derived from Dayhoff (1978). + * + * This Dayhoff matrix and the shortcut methods for dealing with Kimura + * correction at high sequence divergence (> 75%) are derived from similar + * work in Clustal W: + * + * Thompson, J.D., Higgins, D.G., Gibson, T.J., "CLUSTAL W: + * improving the sensitivity of progressive multiple sequence + * alignment through sequence weighting, position-specific gap + * penalties and weight matrix choice.", + * Nucleic Acids Research, 22:4673-4680, 1994 + * + */ + + +int NJ_dayhoff[]={ + 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, + 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, + 223, 224, 226, 227, 228, 229, 230, 231, 232, 233, + 234, 236, 237, 238, 239, 240, 241, 243, 244, 245, + 246, 248, 249, 250, 252, 253, 254, 255, 257, 258, + 260, 261, 262, 264, 265, 267, 268, 270, 271, 273, + 274, 276, 277, 279, 281, 282, 284, 285, 287, 289, + 291, 292, 294, 296, 298, 299, 301, 303, 305, 307, + 309, 311, 313, 315, 317, 319, 321, 323, 325, 328, + 330, 332, 335, 337, 339, 342, 344, 347, 349, 352, + 354, 357, 360, 362, 365, 368, 371, 374, 377, 380, + 383, 386, 389, 393, 396, 399, 403, 407, 410, 414, + 418, 422, 426, 430, 434, 438, 442, 447, 451, 456, + 461, 466, 471, 476, 482, 487, 493, 498, 504, 511, + 517, 524, 531, 538, 545, 553, 560, 569, 577, 586, + 595, 605, 615, 626, 637, 649, 661, 675, 688, 703, + 719, 736, 754, 775, 796, 819, 845, 874, 907, 945, + 988 +}; + + + +#endif /* _INC_NJ_DAYHOFF_H_ */ + + + diff --git a/distclearcut.c b/distclearcut.c new file mode 100644 index 0000000..b688aa6 --- /dev/null +++ b/distclearcut.c @@ -0,0 +1,668 @@ +/* + * dist.c + * + * $Id$ + * + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * Compute a distance matrix given a set of sequences + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + +#include +#include +#include +#include +#include + +#include "common.h" +#include "dayhoff.h" +#include "fasta.h" +#include "dist.h" + + + + +/* + * NJ_build_distance_matrix() - + * + * Given a filename for an alignment, read the alignment + * into memory and then compute the distance matrix + * using the appropriate correction model + */ +DMAT * +NJ_build_distance_matrix(NJ_ARGS *nj_args) { + + DMAT *dmat; + NJ_alignment *alignment; + + /* Read an alignment in FASTA format */ + alignment = + NJ_read_fasta(nj_args); + + if(!alignment) { + return(NULL); + } + + /* + * Given a global multiple sequence alignment (MSA) and + * a specified distance correction model, compute a + * corrected distance matrix + * + * From proteins, we may want to allow users to specify + * a substitution matrix (feature) + */ + dmat = + NJ_compute_dmat(nj_args, + alignment); + + // NJ_print_taxanames(dmat); + + if(!dmat) { + fprintf(stderr, "Clearcut: Error computing distance matrix\n"); + } + + /* now free the memory associated with the alignment */ + NJ_free_alignment(alignment); + + return(dmat); +} + + + + + +/* + * NJ_compute_dmat() - + * + * Given an alignment and a correction model, compute the + * distance matrix and return it + * + */ +DMAT * +NJ_compute_dmat(NJ_ARGS *nj_args, + NJ_alignment *alignment) { + + DMAT *dmat; + long int i; + + + /* allocate distance matrix here */ + dmat = (DMAT *)calloc(1, sizeof(DMAT)); + if(!dmat) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + dmat->ntaxa = alignment->nseq; + dmat->size = alignment->nseq; + + /* allocate memory to hold the taxa names */ + dmat->taxaname = (char **)calloc(alignment->nseq, sizeof(char *)); + if(!dmat->taxaname) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + /* copy sequence titles */ + for(i=0;inseq;i++) { + dmat->taxaname[i] = (char *)calloc(strlen(alignment->titles[i])+1, sizeof(char)); + if(!dmat->taxaname[i]) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + strncpy(dmat->taxaname[i], alignment->titles[i], strlen(alignment->titles[i])); + } + + /* allocate val matrix in dmat */ + dmat->val = (float *)calloc(dmat->ntaxa*dmat->ntaxa, sizeof(float)); + if(!dmat->val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + /* now lets allocate space for the r and r2 columns */ + dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float)); + dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float)); + + /* track some memory addresses */ + dmat->rhandle = dmat->r; + dmat->r2handle = dmat->r2; + dmat->valhandle = dmat->val; + + /* apply model correction to matrix */ + switch(nj_args->correction_model) { + + case NJ_MODEL_JUKES: + + if(nj_args->dna_flag) { + NJ_DNA_jc_correction(dmat, alignment); + } else if(nj_args->protein_flag) { + NJ_PROTEIN_jc_correction(dmat, alignment); + } else { + fprintf(stderr, "Clearcut: Need to know sequence type for Jukes-Cantor model correction.\n"); + return(NULL); + } + + break; + + case NJ_MODEL_KIMURA: + + if(nj_args->dna_flag) { + NJ_DNA_k2p_correction(dmat, alignment); + } else if(nj_args->protein_flag) { + NJ_PROTEIN_kimura_correction(dmat, alignment); + } else { + fprintf(stderr, "Clearcut: Need to know sequence type for Kimura model correction.\n"); + return(NULL); + } + + break; + + case NJ_MODEL_NONE: + + NJ_no_correction(dmat, alignment); + + break; + + default: + fprintf(stderr, "Clearcut: Invalid distance correction model.\n"); + return(NULL); + } + + return(dmat); +} + + + + + +/* + * NJ_no_correction() - + * + * Compute the distance matrix without correction + * (straight percent ID) + * + * Resolve ambiguities in sequence data by skipping + * those nucleotides/residues + * + */ +void +NJ_no_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + float pdiff; + + /* compute pairwise percent identity */ + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + pdiff = 1.0 - NJ_pw_percentid(alignment, i, j); + dmat->val[NJ_MAP(i, j, dmat->size)] = pdiff; + } + } + + return; +} + + + + +/* + * NJ_DNA_jc_correction() - + * + * Compute the distance matrix with jukes-cantor correction + * and assign high distance if sequence divergence exceeds + * 0.75 + * + * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.), + * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132. + * New York: Academic Press + * + */ +void +NJ_DNA_jc_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + long int k; + float d, cutoff, dist; + long int residues; + + cutoff = 0.75; + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + k = NJ_pw_differences(alignment, i, j, &residues); + d = 1.0 - NJ_pw_percentid(alignment, i, j); + + if(d > cutoff) { + dist = NJ_BIGDIST; + } else { + dist = (-0.75) * log(1.0 - (4.0/3.0)*d); + } + + if(fabs(dist) < FLT_EPSILON) { + dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0; + } else { + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + } + + + + return; +} + + + + + + +/* + * NJ_PROTEIN_jc_correction() - + * + * This function performs modified jukes/cantor correction on + * a protein alignment + * + * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.), + * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132. + * New York: Academic Press + * + */ +void +NJ_PROTEIN_jc_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + long int residues; + long int diff; + float dist, x; + + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + diff = NJ_pw_differences(alignment, i, j, &residues); + + if(!diff || !residues) { + dist = 0.0; + } else { + + dist = (float)diff/(float)residues; + x = ((20.0/19.0)*dist); + + if(NJ_FLT_GT(x, 1.0)) { + dist = NJ_BIGDIST; + } else { + dist = -(19.0/20.0) * log(1.0 - x); + } + } + + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + + return; +} + + + + + + +/* + * NJ_DNA_k2p_correction() - + * + * Correct a distance matrix using k2p correction using + * cutoffs to avoid problems with logarithms. + * + * dist = -0.5ln(1-2P-Q) - 0.25ln(1-2Q) + * + * But due to the logarithms, this is only valid when + * + * (2P+Q <= 1) && + * (2Q <= 1) + * + * So assign arbitary distances when these constraints are + * not strictly followed. + * + * Kimura, M. (1980), A simple method for estimating evolutionary + * rates of base substitutions through comparative studies of + * nucleotide sequences. J. Mol. Evol., 16, 111-120 + * + */ +void +NJ_DNA_k2p_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + float P; /* proportion of transitions */ + float Q; /* proportion of transversions */ + long int nucleotides; + long int transitions, transversions; + float dist; + float log_x = 0.0; /* the params for the first log */ + float log_y = 0.0; /* the params for the second log */ + + int blowup; /* a flag to specify if we have a log blowup */ + + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + blowup = 0; + + /* count the number of transitions and transversions */ + NJ_DNA_count_tt(alignment, i, j, &transitions, &transversions, &nucleotides); + + if(!nucleotides) { /* sequences have no non-ambiguous overlap in alignment */ + P = 0.0; + Q = 0.0; + } else { + P = (float)transitions / (float)nucleotides; + Q = (float)transversions / (float)nucleotides; + } + + /* the first log blows up if 2*P+Q = 1.0 */ + if(NJ_FLT_EQ((2.0 * P + Q), 1.0)) { + blowup = 1; + } else { + if( NJ_FLT_LT(1.0 - 2.0*P - Q, 0.0) ) { + blowup = 1; + } else { + log_x = log(1.0 - 2.0*P - Q); + } + } + + /* the second log blows up if 2*Q >= 1.0 */ + if( NJ_FLT_EQ((2.0 * Q), 1.0) || + NJ_FLT_GT((2.0 * Q), 1.0) ) { + blowup = 1; + } else { + log_y = log(1.0 - 2.0*Q); + } + + /* if our logarithms blow up, we just set the distance to the max */ + if(blowup) { + dist = NJ_BIGDIST; + } else { + dist = (-0.5)*log_x - 0.25*log_y; + } + + if(fabs(dist) < FLT_EPSILON) { + dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0; + } else { + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + } + + return; +} + + + + +/* + * NJ_PROTEIN_kimura_correction() - + * + * Perform Kimura correction for distances derived from protein + * alignments. + * + * Kimura, M. (1983), The Neutral Theory of Molecular Evolution. + * p. 75., Cambridge University Press, Cambridge, England + * + */ +void +NJ_PROTEIN_kimura_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + long int residues; + long int diff; + float dist; + + + printf("NJ_PROTEIN_kimura_correction()\n"); + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + diff = NJ_pw_differences(alignment, i, j, &residues); + + if(!diff || !residues) { + dist = 0.0; + } else { + dist = (float)diff/(float)residues; + } + + if(NJ_FLT_LT(dist, 0.75)) { + if(NJ_FLT_GT(dist, 0.0) ) { + dist = -log(1.0 - dist - (dist * dist/5.0) ); + } + } else { + if(NJ_FLT_GT(dist, 0.93) ) { + dist = 10.0; + } else { + dist = (float)NJ_dayhoff[ (int)((dist*1000.0)-750.0) ] / 100.0 ; + } + } + + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + + return; +} + + + + + +/* + * NJ_DNA_count_tt() - + * + * Count the number of transitions and transversions + * between two aligned DNA sequences + * + * This routine automatically skips ambiguities when + * counting transitions and transversions. + * + */ +void +NJ_DNA_count_tt(NJ_alignment *alignment, + long int x, + long int y, + long int *transitions, + long int *transversions, + long int *residues) { + + long int tmp_transitions = 0; + long int tmp_transversions = 0; + long int tmp_residues = 0; + char a, b; + long int i; + + for(i=0;ilength;i++) { + + a = toupper(alignment->data[x*alignment->length+i]); + b = toupper(alignment->data[y*alignment->length+i]); + + if( (a == 'A' && b == 'T') || + (a == 'T' && b == 'A') || + (a == 'A' && b == 'C') || + (a == 'C' && b == 'A') || + (a == 'T' && b == 'G') || + (a == 'G' && b == 'T') || + (a == 'C' && b == 'G') || + (a == 'G' && b == 'C') ) { + tmp_transversions++; + } + + if( (a == 'C' && b == 'T') || + (a == 'T' && b == 'C') || + (a == 'G' && b == 'A') || + (a == 'A' && b == 'G') ) { + tmp_transitions++; + } + + /* count the number of residues */ + if(a != NJ_AMBIGUITY_CHAR && + b != NJ_AMBIGUITY_CHAR ) { + tmp_residues++; + } + + } + + *transitions = tmp_transitions; + *transversions = tmp_transversions; + + if(residues) { + *residues = tmp_residues; + } + + return; +} + + + + + +/* + * NJ_pw_percentid() - + * + * Given an alignment and a specification + * for two rows, compute the pairwise + * percent identity between the two + * + */ +float +NJ_pw_percentid(NJ_alignment *alignment, + long int x, + long int y) { + + float pid; + long int i; + long int residues; + long int same; + char c1, c2; + + residues = 0; + same = 0; + for(i=0;ilength;i++) { + + c1 = alignment->data[x*alignment->length+i]; + c2 = alignment->data[y*alignment->length+i]; + + if( c1 != NJ_AMBIGUITY_CHAR || + c2 != NJ_AMBIGUITY_CHAR ) { + + residues++; + + if(c1 == c2) { + same++; + } + } + + } + + pid = (float)same/(float)residues; + + return(pid); +} + + + +/* + * NJ_pw_differences() - + * + * Given an alignment and a specification + * for two rows in the alignment, compute the + * number of differences between the two sequences + * + * With respect to ambiguity codes, we will want to + * disregard those sites entirely in our count. + * + */ +long int +NJ_pw_differences(NJ_alignment *alignment, + long int x, + long int y, + long int *residues) { + + long int i; + long int diff; + char c1, c2; + long int tmp_residues; + + diff = 0; + tmp_residues = 0; + for(i=0;ilength;i++) { + + c1 = alignment->data[x*alignment->length+i]; + c2 = alignment->data[y*alignment->length+i]; + + if( c1 != NJ_AMBIGUITY_CHAR || + c2 != NJ_AMBIGUITY_CHAR ) { + + tmp_residues++; + + if(c1 != c2) { + diff++; + } + } + + } + + *residues = tmp_residues; + + return(diff); +} + + + + + + diff --git a/distclearcut.h b/distclearcut.h new file mode 100644 index 0000000..25a6541 --- /dev/null +++ b/distclearcut.h @@ -0,0 +1,125 @@ +/* + * dist.h + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * Compute a distance matrix given a set of sequences + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + +#ifndef _INC_DIST_H_ +#define _INC_DIST_H_ 1 + +#include "fasta.h" +#include "clearcut.h" + + + +/* + * An arbitrarily large distance to represent distances + * which are too great to accurately correct. + */ +#define NJ_BIGDIST 10.0 + + + +/* some function prototypes */ +DMAT * +NJ_build_distance_matrix(NJ_ARGS *nj_args); + +DMAT * +NJ_compute_dmat(NJ_ARGS *nj_args, + NJ_alignment *alignment); + + +float +NJ_pw_percentid(NJ_alignment *alignment, + long int x, + long int y); + +long int +NJ_pw_differences(NJ_alignment *alignment, + long int x, + long int y, + long int *residues); + +void +NJ_no_correction(DMAT *dmat, + NJ_alignment *alignment); + +void +NJ_DNA_jc_correction(DMAT *dmat, + NJ_alignment *alignment); + +void +NJ_PROTEIN_jc_correction(DMAT *dmat, + NJ_alignment *alignment); + +void +NJ_DNA_k2p_correction(DMAT *dmat, + NJ_alignment *alignment); + +void +NJ_PROTEIN_kimura_correction(DMAT *dmat, + NJ_alignment *alignment); + +void +NJ_DNA_count_tt(NJ_alignment *alignment, + long int x, + long int y, + long int *transitions, + long int *transversions, + long int *residues); + + +#endif /* _INC_DIST_H_ */ + + + + + + + + + diff --git a/dmat.c b/dmat.c new file mode 100644 index 0000000..1d00cff --- /dev/null +++ b/dmat.c @@ -0,0 +1,869 @@ +/* + * dmat.c + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * Distance matrix parser + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + + +#include +#include +#include +#include +#include + + + +#include "common.h" +#include "clearcut.h" + +#include "dmat.h" + + + + + +/* + * + * NJ_is_alpha() - determine if character is an alphabetic character + * + * INPUT: + * ------ + * c -- character to test + * + * RETURN: + * ------- + * int -- 1 if character is alphabetic (A-Z || a-z) + * 0 if character is NOT alphabetic + * + */ +static inline +int +NJ_is_alpha(char c) { + + if( (c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') ) { + return(1); + } else { + return(0); + } +} + + + +/* + * + * NJ_is_whitespace() - determine if character is a whitespace character + * + * INPUT: + * ------ + * c -- character to test + * + * RETURN: + * ------- + * int -- 1 if character is whitespace (space, tab, CR, LF) + * 0 if character is NOT whitespace + * + */ +static inline +int +NJ_is_whitespace(char c) { + + if( c == ' ' || /* space */ + c == '\n' || /* newline */ + c == '\r' || /* carriage-return */ + c == '\v' || /* vertical tab */ + c == '\f' || /* form feed */ + c == '\t' ) { /* horizontal tab */ + return(1); + } else { + return(0); + } + +} + + + + +/* + * + * NJ_is_number() - determine if character is a number + * + * INPUT: + * ------ + * c -- character to test + * + * RETURN: + * ------- + * int -- 1 if character is a number (0-9) + * 0 if character is NOT a number + * + */ +static inline +int +NJ_is_number(char c) { + if(c >= '0' && c <= '9') { + return(1); + } else { + return(0); + } +} + + + +/* + * NJ_is_distance() - check if string is a properly formatted distance value + * + */ +static inline +int +NJ_is_distance(char *token) { + + int i; + char c; + int exponent_state; + int expsign_state; + int dpoint_state; + + /* if token is NULL return failure */ + if(!token) { + return(0); + } + + exponent_state = 0; + expsign_state = 0; + dpoint_state = 0; + + /* The first character must be a number, a decimal point or a sign */ + c = token[0]; + if(!NJ_is_number(c) && + c != '.' && + c != '-' && + c != '+' ) { + + goto BAD; + } + + /* + * if the first character is not a number, and string is only one + * character long, then we return failure. + */ + if(strlen(token) == 1) { + if(!NJ_is_number(c)) { + goto BAD; + } + } + + for(i=0;i0 && !exponent_state) { + if(c == '-' || + c == '+') { + goto BAD; + } + } + + /* if we are in the exponent state, and we've already seen a sign */ + if(exponent_state && expsign_state) { + if(c == '-' || + c == '+') { + goto BAD; + } + } + + /* if we are in the exponent state and we see a decimal point */ + if(exponent_state) { + if(c == '.') { + goto BAD; + } + } + + /* if we are in the exponent state and see another e or E */ + if(exponent_state) { + if(c == 'e' || + c == 'E') { + goto BAD; + } + } + + /* if we are dpoint_state and see another decimal point */ + if(dpoint_state) { + if(c == '.') { + goto BAD; + } + } + + + /* enter the exponent state if we need to */ + if(!exponent_state) { + if(c == 'e' || + c == 'E') { + exponent_state = 1; + } + } + + /* enter the expsign_state if we need to */ + if(exponent_state && !expsign_state) { + if(c == '-' || + c == '+') { + expsign_state = 1; + } + } + + /* if not in dpoint state and we see a dpoint */ + if(!dpoint_state) { + if(c == '.') { + dpoint_state = 1; + } + } + + } + + /* the token must end in a number char */ + if(!NJ_is_number(token[strlen(token)-1])) { + goto BAD; + } + + /* token is a valid numerical distance */ + return(1); + + BAD: + + /* token is invalid distance format */ + return(0); +} + + + + +/* + * NJ_is_label() - + * + * Simply, if token is not a valid number, then it is a name + * + */ +static inline +int +NJ_is_label(char *token) { + if(NJ_is_distance(token)) { + return(0); + } else { + return(1); + } +} + + + +/* + * NJ_get_token() - get a token from an input stream + * + */ +static inline +int +NJ_get_token(FILE *fp, + NJ_DIST_TOKEN *token) { + + char c; + int index; + + c = fgetc(fp); + if(feof(fp)) { + token->type = NJ_EOF_STATE; + return(token->type); + } + + if(NJ_is_whitespace(c)) { + token->buf[0] = c; + token->buf[1] = '\0'; + token->type = NJ_WS_STATE; + + return NJ_WS_STATE; + } + + index = 0; + while(!NJ_is_whitespace(c)) { + + /* reallocate our buffer if necessary */ + if(index >= token->bufsize) { + token->bufsize *= 2; + token->buf = (char *)realloc(token->buf, token->bufsize*sizeof(char)); + if(!token->buf) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_get_token()\n"); + exit(-1); + } + } + + token->buf[index++] = c; + + c = fgetc(fp); + if(feof(fp)) { + token->type = NJ_EOF_STATE; + break; + } + } + + token->buf[index] = '\0'; + + if(token->type != NJ_EOF_STATE) { + + if(NJ_is_distance(token->buf)) { + token->type = NJ_FLOAT_STATE; + } else { + token->type = NJ_NAME_STATE; + } + + } + + return(token->type); +} + + + + + + +/* + * NJ_parse_distance_matrix() -- Takes a filename and returns a distance matrix + * + * + * INPUT: + * ------ + * nj_args -- a pointer to a structure containing the command-line arguments + * + * OUTPUT: + * ------- + * -- NULL (failure) + * -- A pointer to a populated distance matrix + * + * DESCRIPTION: + * ------------ + * This function implements a simple state machine to parse a distance matrix + * in approximate PHYLIP format. This function auto-detects whether the + * distance matrix is in upper, lower, or fully-symmetric format and handles + * it accordingly. For full/symmetric matrices, values must be symmetric + * around the diagonal, which is required to be zeroes. Names and values must + * be separated by whitespace (space, tab, newlines, etc.). Taxon labels can + * include numbers, but must start with non-numerical symbols. + * + * + * *** UPPER FORMAT EXAMPLE *** + * + * 4 + * seq1 0.2 0.3 0.1 + * seq2 0.2 0.3 + * seq3 0.1 + * seq4 + * + * *** LOWER FORMAT EXAMPLE *** + * + * 4 + * seq1 + * seq2 0.3 + * seq3 0.2 0.4 + * seq4 0.3 0.1 0.3 + * + * *** SYMMETRIC (FULL) EXAMPLE *** + * + * 4 + * seq1 0.0 0.3 0.5 0.3 + * seq2 0.3 0.0 0.1 0.2 + * seq3 0.5 0.1 0.0 0.9 + * seq4 0.3 0.2 0.9 0.0 + * + * Values in the distance matrix can be positive or negative, integers or + * real values. Values can also be parsed in exponential notation form. + * + */ +DMAT * +NJ_parse_distance_matrix(NJ_ARGS *nj_args) { + + DMAT *dmat = NULL; + FILE *fp = NULL; + NJ_DIST_TOKEN *token = NULL; + + int state, dmat_type; + int row; + int fltcnt; + int x, y, i; + int numvalread; + int expectedvalues = -1; + float val; + int first_state = 0; + + + /* allocate our distance matrix and token structure */ + dmat = (DMAT *)calloc(1, sizeof(DMAT)); + token = (NJ_DIST_TOKEN *)calloc(1, sizeof(NJ_DIST_TOKEN)); + if(token) { + token->bufsize = NJ_INITIAL_BUFSIZE; + token->buf = (char *)calloc(token->bufsize, sizeof(char)); + } + if(!dmat || !token || !token->buf) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); + goto XIT_BAD; + } + + /* open distance matrix file here */ + if(nj_args->stdin_flag) { + fp = stdin; + } else { + fp = fopen(nj_args->infilename, "r"); + if(!fp) { + fprintf(stderr, "Clearcut: Could not open distance matrix: %s\n", nj_args->infilename); + perror("Clearcut"); + goto XIT_BAD; + } + } + + /* get the number of taxa in this file */ + fscanf(fp, "%ld\n", &dmat->ntaxa); + if(dmat->ntaxa < 2) { + fprintf(stderr, "Clearcut: Invalid number of taxa in distance matrix\n"); + + goto XIT_BAD; + } + + /* set our initial working size according to the # of taxa */ + dmat->size = dmat->ntaxa; + + /* allocate space for the distance matrix values here */ + dmat->val = + (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float)); + if(!dmat->val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); + goto XIT_BAD; + } + + /* taxa names */ + dmat->taxaname = (char **)calloc(dmat->ntaxa, sizeof(char *)); + if(!dmat->taxaname) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); + goto XIT_BAD; + } + + /* set the initial state of our state machine */ + dmat_type = NJ_PARSE_UNKNOWN; + row = -1; + fltcnt = 0; + numvalread = 0; + + + /* read the input one character at a time to drive simple state machine */ + state = NJ_get_token(fp, token); + while(state != NJ_EOF_STATE) { + + switch(state) { + + case NJ_NAME_STATE: + + if(first_state == 0) { + first_state = 1; + } + + row++; + + if(row > 0 && dmat_type == NJ_PARSE_UNKNOWN) { + + if(fltcnt == dmat->ntaxa) { + + dmat_type = NJ_PARSE_SYMMETRIC; + expectedvalues = dmat->ntaxa * dmat->ntaxa; + + } else if (fltcnt == dmat->ntaxa-1) { + + dmat_type = NJ_PARSE_UPPER; + expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2; + + /* shift everything in first row by one char */ + for(i=dmat->ntaxa-2;i>=0;i--) { + dmat->val[i+1] = dmat->val[i]; + } + + } else if (fltcnt == 0) { + + dmat_type = NJ_PARSE_LOWER; + expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2; + + } else { + goto XIT_BAD; + } + } + + if(row >= dmat->ntaxa) { + goto XIT_BAD; + } + + /* allocate space for this taxon label */ + dmat->taxaname[row] = (char *)calloc(strlen(token->buf)+1, sizeof(char)); + if(!dmat->taxaname[row]) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); + goto XIT_BAD; + } + + strcpy(dmat->taxaname[row], token->buf); + + fltcnt = 0; + + break; + + + case NJ_FLOAT_STATE: + + if(first_state == 0) { + goto XIT_BAD; + } + + val = atof(token->buf); + if(errno) { + fprintf(stderr, "Clearcut: Distance value out-of-range.\n"); + goto XIT_BAD; + } + + x = row; + y = fltcnt; + + switch(dmat_type) { + + case NJ_PARSE_UNKNOWN: + + dmat->val[NJ_MAP(x, y, dmat->size)] = val; + + break; + + case NJ_PARSE_SYMMETRIC: + + if(fltcnt >= dmat->ntaxa) { + fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n"); + goto XIT_BAD; + } + + if(x < y) { + dmat->val[NJ_MAP(x, y, dmat->size)] = val; + } else if(x > y) { + if(!NJ_FLT_EQ(val, dmat->val[NJ_MAP(y, x, dmat->size)])) { + fprintf(stderr, "Clearcut: Full matrices must be symmetric.\n"); + goto XIT_BAD; + } + } else { + if(!NJ_FLT_EQ(val, 0.0)) { + fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n"); + goto XIT_BAD; + + } + } + + break; + + case NJ_PARSE_UPPER: + + if(fltcnt > dmat->ntaxa-row) { + fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n"); + goto XIT_BAD; + } + + dmat->val[NJ_MAP(x, x+y+1, dmat->size)] = val; + + break; + + case NJ_PARSE_LOWER: + + if(fltcnt > row-1) { + fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n"); + goto XIT_BAD; + } + + dmat->val[NJ_MAP(y, x, dmat->size)] = val; + + break; + + default: + + goto XIT_BAD; + + break; + } + + fltcnt++; + numvalread++; + + break; + + case NJ_WS_STATE: + + break; + + case NJ_EOF_STATE: + + if(first_state == 0) { + goto XIT_BAD; + } + + break; + + default: + + fprintf(stderr, "Clearcut: Unknown state in distance matrix parser.\n"); + break; + + } + + /* get next token from stream */ + state = NJ_get_token(fp, token); + } + + + /* + * At the end, if we have not read the number of values that we predicted + * we would need, then there was a problem and we need to punt. + */ + if(numvalread != expectedvalues) { + fprintf(stderr, "Clearcut: Incorrect number of values in the distance matrix.\n"); + goto XIT_BAD; + } + + /* special check to make sure first value read is 0.0 */ + if(dmat_type == NJ_PARSE_SYMMETRIC) { + if(!NJ_FLT_EQ(dmat->val[NJ_MAP(0, 0, dmat->size)], 0.0)) { + fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n"); + goto XIT_BAD; + } + } + + + /* now lets allocate space for the r and r2 columns */ + dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float)); + dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float)); + if(!dmat->r || !dmat->r2) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n"); + goto XIT_BAD; + } + + /* track some memory addresses */ + dmat->rhandle = dmat->r; + dmat->r2handle = dmat->r2; + dmat->valhandle = dmat->val; + + /* close matrix file here */ + if(!nj_args->stdin_flag) { + fclose(fp); + } + + if(token) { + if(token->buf) { + free(token->buf); + } + free(token); + } + + return(dmat); + + + + /* clean up our partial progress */ + XIT_BAD: + + if(fp) { + fprintf(stderr, "Clearcut: Syntax error in distance matrix at offset %ld.\n", ftell(fp)); + } + + /* close matrix file here */ + if(!nj_args->stdin_flag) { + if(fp) { + fclose(fp); + } + } + + /* if we have a valid dmat (partial or complete), we need to free it */ + if(dmat) { + NJ_free_dmat(dmat); + } + + if(token) { + if(token->buf) { + free(token->buf); + } + free(token); + } + + return(NULL); +} + + + + + + + +/* + * NJ_output_matrix() - Output a distance matrix to the specified file + * + * + * INPUTS: + * ------- + * nj_args -- a pointer to a data structure holding the command-line args + * dmat -- a pointer to a distance matrix + * + * + * RETURNS: + * -------- + * NOTHING + * + * + * DESCRIPTION: + * ------------ + * If the appropriate flag was specified in the command-line, this function + * now outputs the parsed or computed distance matrix to a file. This + * can be useful if generating a distance matrix was the primary goal of + * running the program, or if one wanted to debug and/or verify the + * correctness of the program. + * + * Currently this function outputs full/symmetric matrices only. + * + */ +void +NJ_output_matrix(NJ_ARGS *nj_args, + DMAT *dmat) { + + FILE *fp = NULL; + long int i, j; + + + + /* if we haven't specieid matrixout, return immediately */ + if(!nj_args->matrixout) { + return; + } + + /* open the specified matrix file for writing */ + fp = fopen(nj_args->matrixout, "w"); + if(!fp) { + fprintf(stderr, "Clearcut: Could not open matrix file %s for output.\n", nj_args->matrixout); + return; + } + + /* output the number of taxa in the matrix */ + fprintf(fp, " %ld\n", dmat->size); + + fprintf(fp, "%s\n", dmat->taxaname[0]); // print the first taxon name outside of the main loop + + for(i=1;isize;i++) { + + /* output taxaname */ + fprintf(fp, "%s\t", dmat->taxaname[i]); + + for(j=0;jexpdist) { /* exponential notation (or not) */ + fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]); + } else { + fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]); + } + } + + fprintf(fp, "\n"); + } + +#ifdef FULL_SYMMETRIC_MATRIX + + /* output the number of taxa in the matrix */ + fprintf(fp, " %ld\n", dmat->size); + for(i=0;isize;i++) { + + /* output taxaname */ + fprintf(fp, "%s\t", dmat->taxaname[i]); + + for(j=0;jsize;j++) { + if(i>j) { + if(nj_args->expdist) { /* exponential notation (or not) */ + fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]); + } else { + fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]); + } + } else if(iexpdist) { /* exponential notation (or not) */ + fprintf(fp, "%e ", dmat->val[NJ_MAP(i,j,dmat->size)]); + } else { + fprintf(fp, "%f ", dmat->val[NJ_MAP(i,j,dmat->size)]); + } + } else { + if(nj_args->expdist) { /* exponential notation (or not) */ + fprintf(fp, "%e ", 0.0); + } else { + fprintf(fp, "%f ", 0.0); + } + } + } + + fprintf(fp, "\n"); + } + +#endif // FULL_SYMMETRIC_MATRIX + + /* close the file here */ + if(fp) { + fclose(fp); + } + + return; +} + + + + + diff --git a/dmat.h b/dmat.h new file mode 100644 index 0000000..1c2be55 --- /dev/null +++ b/dmat.h @@ -0,0 +1,91 @@ +/* + * dmat.h + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * Distance matrix parser header file + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + */ + + +#ifndef _INC_DMAT_H_ +#define _INC_DMAT_H_ 1 + +#include "clearcut.h" + + +#define NJ_INITIAL_BUFSIZE 32 + +#define NJ_NAME_STATE 100 +#define NJ_FLOAT_STATE 101 +#define NJ_WS_STATE 102 +#define NJ_EOF_STATE 103 + +#define NJ_PARSE_SYMMETRIC 100 +#define NJ_PARSE_LOWER 101 +#define NJ_PARSE_UPPER 102 +#define NJ_PARSE_UNKNOWN 103 + + +/* some data structures */ +typedef struct _NJ_DIST_TOKEN_STRUCT { + + char *buf; + long int bufsize; + int type; + +} NJ_DIST_TOKEN; + + + +/* some function prototypes */ + +DMAT * +NJ_parse_distance_matrix(NJ_ARGS *nj_args); + +void +NJ_output_matrix(NJ_ARGS *nj_args, + DMAT *dmat); + + +#endif /* _INC_DMAT_H_ */ + diff --git a/fasta.c b/fasta.c new file mode 100644 index 0000000..1dc17fc --- /dev/null +++ b/fasta.c @@ -0,0 +1,755 @@ +/* + * fasta.c + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * Functions for parsing FASTA formatted alignment files + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + +#include +#include +#include +#include + +#include "clearcut.h" +#include "common.h" +#include "fasta.h" + + +#define NJ_NUM_DNA_AMBIGUITY_SYMS 14 +static const char NJ_dna_ambiguity_syms[NJ_NUM_DNA_AMBIGUITY_SYMS] = +{ + 'M', 'R', 'W', 'S', 'Y', 'K', + 'V', 'H', 'D', 'B', 'X', 'N', + '-', '.' +}; + + +#define NJ_NUM_PROTEIN_AMBIGUITY_SYMS 6 +static const char NJ_protein_ambiguity_syms[NJ_NUM_PROTEIN_AMBIGUITY_SYMS] = +{ + 'X', 'B', 'Z', '*', '-', '.' +}; + +#define NJ_NUM_DNA_SYMS 5 +static const char NJ_dna_syms[NJ_NUM_DNA_SYMS] = +{ + 'A', 'G', 'C', 'T', 'U' +}; + + +#define NJ_NUM_PROTEIN_SYMS 20 +static const char NJ_protein_syms[NJ_NUM_PROTEIN_SYMS] = +{ + 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', + 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V' +}; + + + + + +/* + * NJ_is_whitespace() - Check to see if character is whitespace + * + * INPUTS: + * ------- + * c -- character to check + * + * RETURNS: + * -------- + * int -- 0 if not whitespace + * 1 if whitespace + */ +static inline +int +NJ_is_whitespace(char c) { + if( c == ' ' || /* space */ + c == '\n' || /* newline */ + c == '\r' || /* carriage-return */ + c == '\v' || /* vertical tab */ + c == '\f' || /* form feed */ + c == '\t' ) { /* horizontal tab */ + return(1); + } else { + return(0); + } +} + + + + + +/* + * NJ_is_dna() - + * + * Determines if the given symbol is DNA + * + * RETURNS: 1 if DNA + * 0 if not DNA + * + */ +static inline +int +NJ_is_dna(char c) { + + int i; + char up_c; + + up_c = toupper(c); + + for(i=0;i sequence1 + * ATAGATATAGATTAGAATAT----TATAGATAT----ATATAT-TTT- + * > sequence2 + * --ATAGATA---ATATATATATTTT--GTCTCATAGT---ATATGCTT + * > sequence3 + * TTATAGATA---ATATATATATTTTAAGTCTCATAGT-A-ATATGC-- + * + * This function will parse alignments for DNA or protein, and will do + * so mindful of ambiguity codes for these kinds of sequences. All + * ambiguity codes are ignored by this program for the purposes of + * computing a distance matrix from a multiple alignment. By design, + * this program does not auto-detect DNA vs. Protein, and requires that + * the user explictly specify that on the command-line. + * + * Gaps can be represented either with the '-' or '.' characters. + * + * Newlines and other whitespace are allowed to be interspersed + * throughout the sequences. + * + * Taxon labels are required to be unique, and they must start with + * an alphabetic character (not a number, etc.). The parser will read + * the first token after the > character in the description line up until the + * first whitespace and use that for the taxon label. + * + * For example, in the line "> taxon1 is homo sapien", the taxon label will be + * "taxon1" + * + */ +NJ_alignment * +NJ_read_fasta(NJ_ARGS *nj_args) { + + FILE *fp = NULL; + char *buf = NULL; + char *ptr = NULL; + NJ_alignment *alignment = NULL; + + char c; + int state; + long int index, x, seq; + long int i; + long int bufsize, nseqs = NJ_INITIAL_NSEQS; + int first_sequence_flag; + + + + + /* + * In this function, we implement a FASTA alignment parser which + * reads in an alignment character-by-character, maintaining state + * information which guides the parser. + * + * The program reads either DNA or Protein alignments. All title lines + * and sequences can be arbitrarily long. Gaps can be represented by + * "-" or "." characters. + * + * Ambiguity codes are also handled. + * + */ + + /* + * We can't handle reading fasta input unless the user explicity + * specifies the input type...just to be sure. + */ + if( (!nj_args->dna_flag && !nj_args->protein_flag) || + (nj_args->dna_flag && nj_args->protein_flag) ) { + fprintf(stderr, "Clearcut: Explicitly specify protein or DNA\n"); + goto XIT_BAD; + } + + /* open specified fasta file here */ + if(nj_args->stdin_flag) { + fp = stdin; + } else { + fp = fopen(nj_args->infilename, "r"); + if(!fp) { + fprintf(stderr, "Clearcut: Failed to open input FASTA file: %s\n", nj_args->infilename); + perror("Clearcut"); + goto XIT_BAD; + } + } + + /* allocate the initial buffer */ + bufsize = NJ_INITIAL_BUFFER_SIZE; + buf = (char *)calloc(bufsize, sizeof(char)); + + /* allocate the alignment container here */ + alignment = (NJ_alignment *)calloc(1, sizeof(NJ_alignment)); + + /* allocate initial title array */ + // printf("allocating initial title array\n"); + alignment->titles = (char **)calloc(NJ_INITIAL_NSEQS, sizeof(char *)); + + /* make sure that we successfully allocated memory */ + if(!buf || !alignment || !alignment->titles) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + + /* a flag */ + first_sequence_flag = 1; + + index = 0; /* tracks the position in buffer */ + x = 0; /* tracks the position on sequence */ + seq = 0; /* tracks the active sequence */ + + /* intitial state of state machine */ + state = NJ_FASTA_MODE_UNKNOWN; + + while(1) { + + /* get the next character */ + c = fgetc(fp); + if(feof(fp)) { + + if(state == NJ_FASTA_MODE_SEQUENCE) { + buf[index+1] = '\0'; + + /* copy buf to alignment */ + for(i=1;i<=alignment->length;i++) { + alignment->data[seq*alignment->length+i-1] = buf[i]; + } + } + + break; + } + + /* make sure our dynamic buffer is big enough */ + if(index >= bufsize) { + bufsize *= 2; + buf = (char *)realloc(buf, bufsize); + if(!buf) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + } + + switch(state) { + + case NJ_FASTA_MODE_UNKNOWN: + + if(!NJ_is_whitespace(c)) { + if(c == '>') { + state = NJ_FASTA_MODE_TITLE; + buf[0] = '>'; + } else { + goto XIT_BAD; + } + } + + break; + + case NJ_FASTA_MODE_TITLE: + + if( c == '\n' || + c == '\r' ) { + + buf[index] = '\0'; + state = NJ_FASTA_MODE_SEQUENCE; + index = 0; + x = -1; + + /* make sure we've allocated enough space for titles and sequences */ + if(seq == nseqs) { + + // printf("realloc(). seq = %d, nseqs = %d\n", seq, nseqs); + + nseqs *= 2; + + alignment->titles = (char **)realloc(alignment->titles, nseqs*sizeof(char *)); + if(!alignment->titles) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + + alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char)); + if(!alignment->data) { + fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + } + + // printf("Allocating %d bytes for title %d: %s\n", (int)strlen(buf), (int)seq, buf); + + alignment->titles[seq] = (char *)calloc(strlen(buf), sizeof(char)); + if(!alignment->titles[seq]) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + + /* lets forward to the first non-space (space/tab) character after the '>' */ + + if(first_sequence_flag) { + ptr = buf; + } else { + ptr = &buf[1]; + } + while(*ptr == '\t' || *ptr == ' ') { + ptr++; + } + sscanf(ptr, "%s", alignment->titles[seq]); /* get the first word and use as the title */ + + alignment->nseq++; + } + + buf[index++] = c; + + break; + + + case NJ_FASTA_MODE_SEQUENCE: + + if(c == '>') { + + if(first_sequence_flag) { + first_sequence_flag = 0; + + /* allocate our alignment data section here */ + alignment->length = index-1; + + nseqs = NJ_INITIAL_NSEQS; + alignment->data = (char *)calloc(alignment->length*nseqs, sizeof(char)); + if(!alignment->data) { + fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + } + + if(!first_sequence_flag) { + if(index-1 < alignment->length) { + fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); + goto XIT_BAD; + } + } + + /* re-allocate if necessary */ + /* + if(seq >= nseqs) { + nseqs *= 2; + alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char)); + if(!alignment->data) { + fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); + goto XIT_BAD; + } + } + */ + + /* copy buf to alignment */ + for(i=1;i<=alignment->length;i++) { + alignment->data[seq*alignment->length+i-1] = buf[i]; + } + + state = NJ_FASTA_MODE_TITLE; + index = 1; + x = 1; + + buf[0] = c; + + seq++; + + } else { + + if(NJ_is_whitespace(c)) { + break; + } + + if(!first_sequence_flag) { + if(index-1 >= alignment->length) { + fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); + goto XIT_BAD; + } + } + + + /* + * Here we check to make sure that the symbol read is appropriate + * for the type of data the user specified. (dna or protein). + * We also handle ambiguity codes by converting them to a specific + * assigned ambiguity code character. Ambiguity codes are ignored + * when computing distances + */ + + if(nj_args->dna_flag) { + if(NJ_is_dna(c)) { + buf[index++] = toupper(c); + } else { + if(NJ_is_dna_ambiguity(c)) { + buf[index++] = NJ_AMBIGUITY_CHAR; + } else { + fprintf(stderr, "Clearcut: Unknown symbol '%c' in nucleotide sequence %ld.\n", c, seq); + goto XIT_BAD; + } + } + } else if(nj_args->protein_flag) { + if(NJ_is_protein(c)) { + buf[index++] = toupper(c); + } else { + if(NJ_is_protein_ambiguity(c)) { + buf[index++] = NJ_AMBIGUITY_CHAR; + } else { + fprintf(stderr, "Clearcut: Unknown symbol '%c' in protein sequence %ld.\n", c, seq); + goto XIT_BAD; + } + } + } + + } + + break; + + default: + goto XIT_BAD; + + break; + + } + } + + if(index-1 != alignment->length) { + fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); + goto XIT_BAD; + } + + /* check for duplicate taxon labels */ + if(!NJ_taxaname_unique(alignment)) { + goto XIT_BAD; + } + + return(alignment); + + + XIT_BAD: + + if(fp) { + fprintf(stderr, "Clearcut: Fatal error parsing FASTA file at file offset %ld.\n", ftell(fp)); + } + + if(buf) { + free(buf); + } + + NJ_free_alignment(alignment); + + return(NULL); +} + + + + +/* + * NJ_print_alignment() - Print multiple sequence alignment (for debugging) + * + * INPUTS: + * ------- + * alignment -- A pointer to the alignment + * + * RETURNS: + * -------- + * NONE + * + */ +void +NJ_print_alignment(NJ_alignment *alignment) { + + long int i, j; + + printf("nseq = %ld, length = %ld\n", alignment->nseq, alignment->length); + + for(i=0;inseq;i++) { + + printf("> %s\n", alignment->titles[i]); + + for(j=0;jlength;j++) { + printf("%c", alignment->data[i*alignment->length+j]); + } + + printf("\n"); + } + + return; +} + + + + + + + +/* + * + * NJ_free_alignment() - Free all of the memory allocated for the + * multiple sequence alignment + * + * INPUTS: + * ------- + * alignment -- A pointer to the multiple sequence alignment + * + * RETURNS: + * -------- + * NONE + * + */ +void +NJ_free_alignment(NJ_alignment *alignment) { + + long int i; + + if(alignment) { + + /* free the allocated titles */ + if(alignment->titles) { + for(i=0;inseq;i++) { + + if(alignment->titles[i]) { + free(alignment->titles[i]); + } + } + + free(alignment->titles); + } + + /* free the alignment data */ + if(alignment->data) { + free(alignment->data); + } + + /* free the alignment itself */ + free(alignment); + } + + return; +} + + + + +/* + * NJ_taxaname_unique() - Check to see if taxanames are unique in alignment + * + * INPUTS: + * ------- + * alignment -- a pointer to a multiple sequence alignment + * + * OUTPUTS: + * -------- + * int -- 0 if all taxanames in alignment are unique + * 1 if all taxanames in alignment are NOT unique + * + * + * DESCRIPTION: + * ------------ + * + * Check to see if the taxanames in the alignment are unique. It + * will be impossible to make sense of the final tree if the taxon + * labels are not unqiue. + * + */ +int +NJ_taxaname_unique(NJ_alignment *alignment) { + + long int i, j; + + for(i=0;inseq;i++) { + for(j=i+1;jnseq;j++) { + if(!strcmp(alignment->titles[i], + alignment->titles[j])) { + fprintf(stderr, "Clearcut: Taxa %ld and %ld (%s) do not have unique taxon labels.\n", + i, j, alignment->titles[i]); + return(0); + } + } + } + + return(1); +} + + +void +NJ_print_titles(NJ_alignment *alignment) { + + int i; + + for(i=0;inseq;i++) { + printf("%d: %s\n", i, alignment->titles[i]); + } + + return; +} diff --git a/fasta.h b/fasta.h new file mode 100644 index 0000000..29ea288 --- /dev/null +++ b/fasta.h @@ -0,0 +1,89 @@ +/* + * fasta.h + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + +#ifndef _INC_NJ_FASTA_H_ +#define _INC_NJ_FASTA_H_ 1 + +#include "clearcut.h" + +#define NJ_INITIAL_BUFFER_SIZE 512 +#define NJ_INITIAL_NSEQS 64 + +#define NJ_FASTA_MODE_TITLE 100 +#define NJ_FASTA_MODE_SEQUENCE 101 +#define NJ_FASTA_MODE_NEWLINE 102 +#define NJ_FASTA_MODE_UNKNOWN 103 + + +typedef struct _STRUCT_NJ_ALIGNMENT { + + long int nseq; + long int length; + + char **titles; + + char *data; + +} NJ_alignment; + + +NJ_alignment * +NJ_read_fasta(NJ_ARGS *nj_args); + +void +NJ_print_alignment(NJ_alignment *alignment); + +void +NJ_free_alignment(NJ_alignment *alignment); + +int +NJ_taxaname_unique(NJ_alignment *alignment); + +#endif /* _INC_NJ_FASTA_H_ */ + + + + diff --git a/getopt_long.c b/getopt_long.c new file mode 100644 index 0000000..1756a2e --- /dev/null +++ b/getopt_long.c @@ -0,0 +1,2096 @@ +/* + This getopt_long() is compatible with GNU's, however, added original + extention (short 1 byte option). + + + Copyright (c) 2004 Koji Arai + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + + Compilation for Test: + + GNU: + cc -DUSE_GNU -DDEBUG getopt_long.c -o test_getopt_long_gnu + + not GNU: + cc -I. -DDEBUG getopt_long.c -o test_getopt_long + + ./test_getopt_long + ./test_getopt_long_gnu + + BUGS: + * not implemented any features for getopt() and getopt_long(). +*/ + +#include +#include +#include + +#if DEBUG +static int +puts_argv(char **argv) +{ + int i; + + for (i = 0; argv[i]; i++) { + if (i) printf(" "); + + printf("%s", argv[i]); + } + printf("\n"); + + return 0; +} +#endif + +#ifndef USE_GNU +#include +#include "getopt_long.h" + +char *optarg; +int optind; + +int opterr; +int optopt; + +/* + return value 0: no option (include '-') + 1: short option like '-x' + 2: long option like '--xxx' and just '--' +*/ +static int +is_option(char *arg) +{ + if (arg[0] == '-') { + switch (arg[1]) { + case 0: /* just "-" */ + return 0; + case '-': /* long option (include just "--")*/ + return 2; + default: /* short option */ + return 1; + } + } + return 0; +} + +static int +insert_argv(char **argv, int src, int dest) +{ + int i; + char *tmp = argv[src]; + + if (src > dest) { + for (i = src; i > dest; i--) + argv[i] = argv[i-1]; + } + if (src < dest) { + for (i = src; i < dest; i++) + argv[i] = argv[i+1]; + } + + argv[dest] = tmp; + + return 0; +} + +static int +search_longopt(char *arg, struct option *longopts) +{ + int i, found = -1; + int len; + for (len = 0; arg[len] && arg[len] != '='; len++) + ; + + for (i = 0; longopts[i].name; i++) { + if (strncmp(arg, longopts[i].name, len) == 0) { + if (found != -1) + return -1; /* found some candidate */ + found = i; + } + } + return found; +} + +/* + * implemented my extention feature. + * optional 1 byte argument with [...] + * e.g.) shortopts = "a[0123]b" + * accepts "-a0 -a1b" (same as "-a0 -a1 -b") + */ +static int +has_argument_short(char *arg, const char *shortopts) +{ + int i; + int open_bracket = 0; + for (i = 0; shortopts[i]; i++) { + switch (shortopts[i]) { + case '[': + open_bracket++; + continue; + case ']': + if (open_bracket <= 0) { + fprintf(stderr, "getopt_long() -- unbalanced bracket in short options"); + return -1; + } + open_bracket--; + continue; + } + if (open_bracket) continue; + if (*arg != shortopts[i]) continue; + + switch (shortopts[i+1]) { + case ':': + if (shortopts[i+2] != ':') { + if (arg[1]) + return 1; /* following string is argument */ + else + return 2; /* next argv is argument */ + } + else { + /* '::' means optional argument (GNU extention) */ + if (arg[1]) + return 1; + else + return 0; /* no argument */ + } + case '[': + if (arg[1] == '\0') + return 0; /* no argument */ + /* my extention */ + for (i++; shortopts[i] && shortopts[i] != ']'; i++) { + if (arg[1] == shortopts[i]) + return 3; /* has 1 byte argument */ + } + if (!shortopts[i]) { + fprintf(stderr, "getopt_long() -- unbalanced bracket in short options"); + return -1; + } + break; + default: + return 0; /* no argument */ + } + } + /* Invalid option */ + return -1; +} + +static int +has_argument_long(char *arg, struct option *longopts) +{ + int i; + + i = search_longopt(arg, longopts); + if (i == -1) { + /* Invalid option */ + return -1; + } + else { + int len = strlen(arg); + char *p = strchr(arg, '='); + if (p) { + len = p - arg; + } + + switch (longopts[i].has_arg) { + case no_argument: + return 0; + case required_argument: + if (arg[len] == '=') + return 1; + else + return 2; + case optional_argument: + if (arg[len] == '=') + return 1; + else + return 0; + default: + assert(0); + } + } +} + +/* + -1: no option + 0: no argument + 1: has argument in this argv + 2: has argument in next argv + 3: has 1 byte argument in this argv +*/ +static int +has_argument(char *arg, + const char *shortopts, + struct option *longopts) +{ + int i, n; + + switch (is_option(arg)) { + case 0: /* no option */ + return -1; + case 1: + /* short option */ + n = -1; + for (i = 1; arg[i]; i++) { + n = has_argument_short(arg+i, shortopts); + if (n == 0 && arg[i+1]) continue; + if (n == 3 && arg[i+2]) { i++; continue; } + break; + } + return n; + case 2: + /* long option */ + return has_argument_long(arg+2, longopts); + break; + default: + assert(0); + } +} + +int +getopt_long(int argc, char **argv, + const char *shortopts, + struct option *longopts, + int *indexptr) +{ + char *opt; + int i; + static int shortoptind; + static int no_optind = 0; + + if (optind == 0) { /* skip first argument (command name) */ + optind++; + no_optind = 0; + shortoptind = 0; + } + + optarg = 0; + + if (no_optind && !shortoptind) { + while (!is_option(argv[no_optind])) + insert_argv(argv, no_optind, optind-1); + + if (has_argument(argv[no_optind], shortopts, longopts) == 2) + no_optind += 2; + else + no_optind++; + + if (argv[optind] && strcmp(argv[optind], "--") == 0) { + while (!is_option(argv[no_optind])) + insert_argv(argv, no_optind, optind); + optind = no_optind; + no_optind = 0; + } + } + + if (optind >= argc) + goto end_of_option; + + retry: + /* + puts_argv(&argv[optind]); + */ + opt = argv[optind]; + + if (shortoptind == 0 && is_option(opt) == 1) { + shortoptind++; + } + + if (shortoptind) { + /* short option */ + char *p = &opt[shortoptind]; + + if (*p == '\0') + assert(0); + + switch (has_argument_short(p, shortopts)) { + case 0: + /* no argument */ + optarg = 0; + + shortoptind++; + if (opt[shortoptind] == '\0') + optind++, shortoptind = 0; + return *p; + case 1: + /* following character is argument */ + optind++, shortoptind = 0; + optarg = &p[1]; + return *p; + case 2: + /* next argv is argument */ + optind++, shortoptind = 0; + optarg = argv[optind++]; + return *p; + case 3: + /* has 1 byte argument */ + optarg = &p[1]; + if (p[2] == 0) + optind++, shortoptind = 0; + else + shortoptind += 2; + return *p; + default: + /* Invalid option */ + if (opterr) + fprintf(stderr, + "%s: invalid option -- %c\n", + argv[0], + *p); + + optind++, shortoptind = 0; + optopt = *p; + return '?'; + } + } + else if (opt[0] == '-' && opt[1] == '-') { + /* long option */ + + if (opt[2] == '\0') { + /* end of command line switch */ + optind++; + return -1; + } + + opt += 2; + + i = search_longopt(opt, longopts); + if (i == -1) { + optind++; + optopt = 0; + return '?'; + } + else { + int len = strlen(opt); + char *p = strchr(opt, '='); + if (p) { + len = p - opt; + } + + switch (longopts[i].has_arg) { + case no_argument: + break; + case required_argument: + if (opt[len] == '=') + optarg = opt + len + 1; + else { + optind++; + optarg = argv[optind]; + if (optarg == 0) { + if (opterr) + fprintf(stderr, + "%s: option `--%s' requires an argument\n", + argv[0], + opt); + + optopt = 0; + return '?'; /* no argument */ + } + } + break; + case optional_argument: + if (opt[len] == '=') + optarg = opt + len + 1; + else { + optarg = 0; + } + break; + default: + break; + } + + *indexptr = i; + optind++; + if (longopts[i].flag) { + *longopts[i].flag = longopts[i].val; + return 0; + } + else { + return longopts[i].val; + } + } + + optind++; + optopt = 0; + return '?'; + } + + /* not option */ + if (no_optind == 0) + no_optind = optind; + + for (i = optind; argv[i]; i++) { + if (is_option(argv[i])) { + optind = i; + goto retry; + } + } + + end_of_option: + if (no_optind) { + optind = no_optind; + no_optind = 0; + } + + return -1; +} +#endif /* USE_GNU */ + +#if DEBUG + +#include +#include +#include + +#if USE_GNU +#include /* use GNU getopt_long() */ +#endif + +static int verbose_flag; +static int option_index; +int argc; +char *argv[50]; +char **p; +int c; +static struct option long_options[] = { + {"verbose", no_argument, &verbose_flag, 1}, + {"brief", no_argument, &verbose_flag, 0}, + {"add", required_argument, 0, 'a'}, + {"append", no_argument, 0, 0}, + {"delete", required_argument, 0, 0}, + {"create", optional_argument, 0, 0}, + {"change", optional_argument, 0, 0}, + {0, 0, 0, 0} +}; + +int +call_getopt_long(int argc, char **argv, + const char *shortopts, + struct option *longopts, + int *indexptr) +{ + int c; + c = getopt_long(argc, argv, shortopts, longopts, indexptr); + puts_argv(argv); + printf("ret=%d(%c) option_index=%d ", c, c, option_index); + printf("optind=%d optarg=[%s] opterr=%d optopt=%d(%c)\n", + optind, optarg, opterr, optopt, optopt); + if (c == 0) { + struct option *opt; + opt = &longopts[*indexptr]; + printf("long option: --%s has_arg=%d\n", opt->name, opt->has_arg); + if (opt->flag) + printf(" flag=[%8p] val=%d\n", opt->flag, *opt->flag); + } + + return c; +} + +void +test1() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-bcd"; + argc++; *p++ = "-d"; + argc++; *p++ = "-e"; + argc++; *p++ = "-f"; + argc++; *p++ = "-g"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'a'); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'b'); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'c'); + assert(option_index == 0); + assert(optind == 3); + assert(optarg == &argv[2][3]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'd'); + assert(option_index == 0); + assert(optind == 5); + assert(optarg == argv[4]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == '?'); + assert(option_index == 0); + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 'f'); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == '?'); + assert(option_index == 0); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 'g'); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == -1); + assert(option_index == 0); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ +} + +void +test2() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--verbose"; + argc++; *p++ = "--brief"; + argc++; *p++ = "--add"; + argc++; *p++ = "add_argument"; + argc++; *p++ = "--add=add_argument"; + argc++; *p++ = "--append"; + argc++; *p++ = "--delete=del_argument"; + argc++; *p++ = "--create=cre_argument"; + argc++; *p++ = "--create"; + argc++; *p++ = "files..."; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "verbose") == 0); + assert(*long_options[option_index].flag == 1); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 1); + assert(optind == 3); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "brief") == 0); + assert(*long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'a'); + assert(option_index == 2); + assert(optind == 5); + assert(optarg == argv[4]); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "add") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 'a'); + assert(option_index == 2); + assert(optind == 6); + assert(optarg == argv[5]+6); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "add") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 3); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "append") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 4); + assert(optind == 8); + assert(optarg == argv[7]+9); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "delete") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 9); + assert(optarg == argv[8]+9); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "create") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 10); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(long_options[option_index].name, "create") == 0); + assert(long_options[option_index].flag == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 10); + assert(optarg == 0); + assert(optopt == 'g'); /* no changed */ + assert(strcmp(argv[optind], "files...") == 0); + +} + +void +test3() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--delete"; /* required argument has no argument */ + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == '?'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); /* changed */ + assert(optarg == 0); + assert(optopt == 0); /* changed */ + assert(argv[optind] == 0); + + /* */ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--file"; /* not option */ + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + assert(c == '?'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + assert(argv[optind] == 0); +} + +void +test4() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "a1"; + argc++; *p++ = "a2"; + argc++; *p++ = "-b"; + argc++; *p++ = "b"; + argc++; *p++ = "-efg"; /* some options in a argument */ + argc++; *p++ = "g"; + argc++; *p++ = "-c"; + argc++; *p++ = "c"; + argc++; *p++ = "d"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'b'); + assert(option_index == 5); /* no changed */ + assert(optind == 5); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'e'); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'f'); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'g'); + assert(option_index == 5); /* no changed */ + assert(optind == 8); + assert(optarg == argv[7]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 10); + assert(optarg == argv[9]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:efg:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "-efg") == 0); + assert(strcmp(*p++, "g") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "a1") == 0); + assert(strcmp(*p++, "a2") == 0); + assert(strcmp(*p++, "b") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + +} + +void +test5() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-"; + argc++; *p++ = "-b"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(*p == 0); + + assert(c == 'b'); + assert(option_index == 5); /* no changed */ + assert(optind == 4); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "-") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 3); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test6() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-"; + argc++; *p++ = "-"; + argc++; *p++ = "-b"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(*p == 0); + + assert(c == 'b'); + assert(option_index == 5); /* no changed */ + assert(optind == 5); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-b") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 3); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test7() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-"; + argc++; *p++ = "-"; + argc++; *p++ = "-c"; + argc++; *p++ = "c"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == argv[5]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 4); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test8() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-c"; + argc++; *p++ = "c"; + argc++; *p++ = "--"; + argc++; *p++ = "-d"; + argc++; *p++ = "d"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 4); + assert(optarg == argv[3]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 5); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 5); /* no changed */ + assert(optind == 7); + assert(optarg == argv[6]); + assert(optopt == 0); +} + +void +test9() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-"; + argc++; *p++ = "-"; + argc++; *p++ = "-c"; + argc++; *p++ = "c"; + argc++; *p++ = "--"; + argc++; *p++ = "-d"; + argc++; *p++ = "d"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == argv[5]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 5); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc:d:", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "c") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 5); /* no changed */ + assert(optind == 9); + assert(optarg == argv[8]); + assert(optopt == 0); +} + +void +test10() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-cc"; + argc++; *p++ = "-d"; + argc++; *p++ = "d"; + argc++; *p++ = "-c"; /* no argument */ + argc++; *p++ = "-d"; /* at last */ + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 5); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 3); + assert(optarg == argv[2]+2); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 5); /* no changed */ + assert(optind == 4); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 5); /* no changed */ + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-cc") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "-c") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 5); /* no changed */ + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test11() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--verbose"; + argc++; *p++ = "--create=c"; + argc++; *p++ = "--change"; + argc++; *p++ = "d"; + argc++; *p++ = "--create"; /* no argument */ + argc++; *p++ = "--change"; /* at last */ + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 3); + assert(optarg == argv[2]+9); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 6); + assert(optind == 4); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 6); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 6); + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test12() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--verbose"; + argc++; *p++ = "--create=c"; + argc++; *p++ = "files..."; + argc++; *p++ = "--delete"; /* required argument */ + argc++; *p++ = "d"; + argc++; *p++ = "--create"; /* no argument */ + argc++; *p++ = "--change"; /* at last */ + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 3); + assert(optarg == argv[2]+9); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 4); + assert(optind == 6); + assert(optarg == argv[5]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 6); + assert(optind == 8); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--create") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 6); + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); +} + +void +test13() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "--verbose"; + argc++; *p++ = "--create=c"; + argc++; *p++ = "files..."; + argc++; *p++ = "--delete"; + argc++; *p++ = "d"; + argc++; *p++ = "--"; /* option terminator */ + argc++; *p++ = "--change"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 0); + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 5); + assert(optind == 3); + assert(optarg == argv[2]+9); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == 0); + assert(option_index == 4); + assert(optind == 6); + assert(optarg == argv[5]); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc::d::", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "--verbose") == 0); + assert(strcmp(*p++, "--create=c") == 0); + assert(strcmp(*p++, "--delete") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "--") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(strcmp(*p++, "--change") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 4); + assert(optind == 6); + assert(optarg == 0); + assert(optopt == 0); + +} + +void +test14() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-o5"; + argc++; *p++ = "files..."; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "o[567]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-o5") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(*p == 0); + + assert(c == 'o'); + assert(option_index == 4); /* no changed */ + assert(optind == 2); + assert(optarg == argv[1]+2); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-o5") == 0); + assert(strcmp(*p++, "files...") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 4); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + +} + +void +test15() +{ + optind = 0; + argc = 0; + p = argv; + + argc++; *p++ = "command_name"; + argc++; *p++ = "-a"; + argc++; *p++ = "-ccd"; + argc++; *p++ = "-ce"; + argc++; *p++ = "-d"; + argc++; *p++ = "d"; + argc++; *p++ = "-cdd"; + argc++; *p++ = "-d"; + *p = 0; + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'a'); + assert(option_index == 4); /* no changed */ + assert(optind == 2); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 4); /* no changed */ + assert(optind == 2); + assert(optarg == argv[2]+2); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 4); /* no changed */ + assert(optind == 3); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 4); /* no changed */ + assert(optind == 4); + assert(optarg == argv[3]+2); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 4); /* no changed */ + assert(optind == 5); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'c'); + assert(option_index == 4); /* no changed */ + assert(optind == 6); + assert(optarg == argv[6]+2); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 4); /* no changed */ + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "d") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(*p == 0); + + assert(c == 'd'); + assert(option_index == 4); /* no changed */ + assert(optind == 8); + assert(optarg == 0); + assert(optopt == 0); + + + /*************************/ + c = call_getopt_long(argc, argv, "abc[cde]d[fgh]", long_options, &option_index); + + p = argv; + assert(strcmp(*p++, "command_name") == 0); + assert(strcmp(*p++, "-a") == 0); + assert(strcmp(*p++, "-ccd") == 0); + assert(strcmp(*p++, "-ce") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "-cdd") == 0); + assert(strcmp(*p++, "-d") == 0); + assert(strcmp(*p++, "d") == 0); + assert(*p == 0); + + assert(c == -1); + assert(option_index == 4); /* no changed */ + assert(optind == 7); + assert(optarg == 0); + assert(optopt == 0); + + +} + +int +main() +{ + opterr = 0; + optopt = 0; + + test1(); + test2(); + test3(); + test4(); + test5(); + test6(); + test7(); + test8(); + test9(); + test10(); + test11(); + test12(); + test13(); +#ifndef USE_GNU + test14(); + test15(); +#endif + + return 0; +} +#endif diff --git a/getopt_long.h b/getopt_long.h new file mode 100644 index 0000000..a1a9add --- /dev/null +++ b/getopt_long.h @@ -0,0 +1,52 @@ +/* + This getopt_long() is compatible with GNU's, however, added original + extention (short 1 byte option). + + + Copyright (c) 2004 Koji Arai + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef _GETOPT_H + +struct option { + const char *name; + int has_arg; + + /* values of has_arg */ +#define no_argument 0 +#define required_argument 1 +#define optional_argument 2 + + int *flag; + int val; +}; + +extern char *optarg; +extern int optind; + +int getopt_long(int argc, char **argv, + const char *shortopts, + struct option *longopts, + int *indexptr); + +#endif /* _GETOPT_H */ diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 215b5bd..0275648 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -123,59 +123,66 @@ int PreClusterCommand::execute(){ if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0; } if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0; } - //clear sizes since you only needed this info to build the alignSeqs seqPNode structs -// sizes.clear(); - + string fileroot = outputDir + getRootName(getSimpleName(fastafile)); + string newFastaFile = fileroot + "precluster" + getExtension(fastafile); + string newNamesFile = fileroot + "precluster.names"; + ofstream outFasta; + ofstream outNames; + + openOutputFile(newFastaFile, outFasta); + openOutputFile(newNamesFile, outNames); + //sort seqs by number of identical seqs - sort(alignSeqs.begin(), alignSeqs.end(), comparePriority); + alignSeqs.sort(comparePriority); int count = 0; + int i = 0; //think about running through twice... - for (int i = 0; i < numSeqs; i++) { - - //are you active - // itActive = active.find(alignSeqs[i].seq.getName()); - - if (alignSeqs[i].active) { //this sequence has not been merged yet - - //try to merge it with all smaller seqs - for (int j = i+1; j < numSeqs; j++) { + list::iterator itList; + list::iterator itList2; + for (itList = alignSeqs.begin(); itList != alignSeqs.end(); itList++) { + //try to merge it with all smaller seqs + for (itList2 = alignSeqs.begin(); itList2 != alignSeqs.end(); itList2++) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { outFasta.close(); outNames.close(); remove(newFastaFile.c_str()); remove(newNamesFile.c_str()); return 0; } - if (alignSeqs[j].active) { //this sequence has not been merged yet - //are you within "diff" bases - int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned()); + if ((*itList).seq.getName() != (*itList2).seq.getName()) { //you don't want to merge with yourself + //are you within "diff" bases + int mismatch = calcMisMatches((*itList).seq.getAligned(), (*itList2).seq.getAligned()); - if (mismatch <= diffs) { - //merge - alignSeqs[i].names += ',' + alignSeqs[j].names; - alignSeqs[i].numIdentical += alignSeqs[j].numIdentical; + if (mismatch <= diffs) { + //merge + (*itList).names += ',' + (*itList2).names; + (*itList).numIdentical += (*itList2).numIdentical; - alignSeqs[j].active = 0; - alignSeqs[j].numIdentical = 0; - count++; - } - }//end if j active - }//end if i != j + alignSeqs.erase(itList2); itList2--; + count++; + } + } + } + + //ouptut this sequence + printData(outFasta, outNames, (*itList)); + + //remove sequence + alignSeqs.erase(itList); itList--; - //remove from active list - alignSeqs[i].active = 0; - }//end if active i - if(i % 100 == 0) { cout << i << '\t' << numSeqs - count << '\t' << count << endl; } + i++; + + if(i % 100 == 0) { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); } } + + if(i % 100 != 0) { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); } - string fileroot = outputDir + getRootName(getSimpleName(fastafile)); - - string newFastaFile = fileroot + "precluster" + getExtension(fastafile); - string newNamesFile = fileroot + "precluster.names"; + outFasta.close(); + outNames.close(); - if (m->control_pressed) { return 0; } + if (m->control_pressed) { remove(newFastaFile.c_str()); remove(newNamesFile.c_str()); return 0; } + - m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine(); + m->mothurOut("Total number of sequences before precluster was " + toString(numSeqs) + "."); m->mothurOutEndLine(); m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); - printData(newFastaFile, newNamesFile); if (m->control_pressed) { remove(newFastaFile.c_str()); remove(newNamesFile.c_str()); return 0; } @@ -233,12 +240,12 @@ int PreClusterCommand::readFASTA(){ else{ seqPNode tempNode(itSize->second, seq, names[seq.getName()]); alignSeqs.push_back(tempNode); - if (seq.getAligned().length() > length) { length = alignSeqs[0].seq.getAligned().length(); } + if (seq.getAligned().length() > length) { length = seq.getAligned().length(); } } }else { //no names file, you are identical to yourself seqPNode tempNode(1, seq, seq.getName()); alignSeqs.push_back(tempNode); - if (seq.getAligned().length() > length) { length = alignSeqs[0].seq.getAligned().length(); } + if (seq.getAligned().length() > length) { length = seq.getAligned().length(); } } } } @@ -275,32 +282,16 @@ int PreClusterCommand::calcMisMatches(string seq1, string seq2){ /**************************************************************************************************/ -void PreClusterCommand::printData(string newfasta, string newname){ +void PreClusterCommand::printData(ofstream& outFasta, ofstream& outNames, seqPNode thisSeq){ try { - ofstream outFasta; - ofstream outNames; - - openOutputFile(newfasta, outFasta); - openOutputFile(newname, outNames); - - - for (int i = 0; i < alignSeqs.size(); i++) { - if (alignSeqs[i].numIdentical != 0) { - alignSeqs[i].seq.printSequence(outFasta); - outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; - } - } - - outFasta.close(); - outNames.close(); - + thisSeq.seq.printSequence(outFasta); + outNames << thisSeq.seq.getName() << '\t' << thisSeq.names << endl; } catch(exception& e) { m->errorOut(e, "PreClusterCommand", "printData"); exit(1); } } - /**************************************************************************************************/ void PreClusterCommand::readNameFile(){ diff --git a/preclustercommand.h b/preclustercommand.h index de6a572..a26fda4 100644 --- a/preclustercommand.h +++ b/preclustercommand.h @@ -20,9 +20,8 @@ struct seqPNode { int numIdentical; Sequence seq; string names; - bool active; seqPNode() {} - seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm), active(1) {} + seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm) {} ~seqPNode() {} }; /************************************************************/ @@ -39,7 +38,7 @@ private: int diffs, length; bool abort; string fastafile, namefile, outputDir; - vector alignSeqs; //maps the number of identical seqs to a sequence + list alignSeqs; //maps the number of identical seqs to a sequence map names; //represents the names file first column maps to second column map sizes; //this map a seq name to the number of identical seqs in the names file map::iterator itSize; @@ -49,7 +48,7 @@ private: void readNameFile(); //int readNamesFASTA(); int calcMisMatches(string, string); - void printData(string, string); //fasta filename, names file name + void printData(ofstream&, ofstream&, seqPNode); //fasta filename, names file name }; /************************************************************/ diff --git a/prng.c b/prng.c new file mode 100644 index 0000000..212f888 --- /dev/null +++ b/prng.c @@ -0,0 +1,221 @@ +/* + A C-program for MT19937, with initialization improved 2002/1/26. + Coded by Takuji Nishimura and Makoto Matsumoto. + + Before using, initialize the state by using init_genrand(seed) + or init_by_array(init_key, key_length). + + Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + Any feedback is very welcome. + http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html + email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) +*/ + +#include + +#include "prng.h" + +/* Period parameters */ +#define N 624 +#define M 397 +#define MATRIX_A 0x9908b0dfUL /* constant vector a */ +#define UPPER_MASK 0x80000000UL /* most significant w-r bits */ +#define LOWER_MASK 0x7fffffffUL /* least significant r bits */ + +#define NJ_RAND_MAX 0x7fffffffUL + + +static unsigned long mt[N]; /* the array for the state vector */ +static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */ + +/* initializes mt[N] with a seed */ +void init_genrand(unsigned long s) +{ + mt[0]= s & 0xffffffffUL; + for (mti=1; mti> 30)) + mti); + /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ + /* In the previous versions, MSBs of the seed affect */ + /* only MSBs of the array mt[]. */ + /* 2002/01/09 modified by Makoto Matsumoto */ + mt[mti] &= 0xffffffffUL; + /* for >32 bit machines */ + } +} + +/* initialize by an array with array-length */ +/* init_key is the array for initializing keys */ +/* key_length is its length */ +/* slight change for C++, 2004/2/26 */ +void init_by_array(unsigned long init_key[], int key_length) +{ + int i, j, k; + init_genrand(19650218UL); + i=1; j=0; + k = (N>key_length ? N : key_length); + for (; k; k--) { + mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL)) + + init_key[j] + j; /* non linear */ + mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ + i++; j++; + if (i>=N) { mt[0] = mt[N-1]; i=1; } + if (j>=key_length) j=0; + } + for (k=N-1; k; k--) { + mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL)) + - i; /* non linear */ + mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ + i++; + if (i>=N) { mt[0] = mt[N-1]; i=1; } + } + + mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */ +} + +/* generates a random number on [0,0xffffffff]-interval */ +unsigned long genrand_int32(void) +{ + unsigned long y; + static unsigned long mag01[2]={0x0UL, MATRIX_A}; + /* mag01[x] = x * MATRIX_A for x=0,1 */ + + if (mti >= N) { /* generate N words at one time */ + int kk; + + if (mti == N+1) /* if init_genrand() has not been called, */ + init_genrand(5489UL); /* a default initial seed is used */ + + for (kk=0;kk> 1) ^ mag01[y & 0x1UL]; + } + for (;kk> 1) ^ mag01[y & 0x1UL]; + } + y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK); + mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL]; + + mti = 0; + } + + y = mt[mti++]; + + /* Tempering */ + y ^= (y >> 11); + y ^= (y << 7) & 0x9d2c5680UL; + y ^= (y << 15) & 0xefc60000UL; + y ^= (y >> 18); + + return y; +} + +/* generates a random number on [0,0x7fffffff]-interval */ +long int genrand_int31(void) +{ + return (long)(genrand_int32()>>1); +} + +/* These real versions are due to Isaku Wada, 2002/01/09 added */ + +/* generates a random number on [0,1]-real-interval */ +double genrand_real1(void) +{ + return genrand_int32()*(1.0/4294967295.0); + /* divided by 2^32-1 */ +} + +/* generates a random number on [0,1)-real-interval */ +double genrand_real2(void) +{ + return genrand_int32()*(1.0/4294967296.0); + /* divided by 2^32 */ +} + +/* generates a random number on (0,1)-real-interval */ +double genrand_real3(void) +{ + return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0); + /* divided by 2^32 */ +} + +/* generates a random number on [0,1) with 53-bit resolution*/ +double genrand_res53(void) +{ + unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6; + return(a*67108864.0+b)*(1.0/9007199254740992.0); +} + + + + +/* + * NJ_genrand_int31_top() - Returns an int in the range 0..top + * + * This function attempts to remove bias in selecting random + * integers in a range. + * + */ +long int +NJ_genrand_int31_top(long int top) { + + long int overflow; + long int r; + long int retval; + + if(top <= 0) { + return(0); + } else { + overflow = (NJ_RAND_MAX / top) * top; + } + + while(1) { + r = genrand_int31(); + if(r < overflow) { + break; + } + } + + retval = r % top; + + return(retval); +} + + + + + + + + diff --git a/prng.h b/prng.h new file mode 100644 index 0000000..64f8ef3 --- /dev/null +++ b/prng.h @@ -0,0 +1,90 @@ +/* + * prng.h + * + * $Id$ + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * Some function prototypes for the Mersenne Twister PRNG + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + + +#ifndef _INC_PRNG_H_ +#define _INC_PRNG_H_ 1 + +#define NJ_RAND_MAX 0x7fffffffUL + + +/* some function prototypes */ +void +init_genrand(unsigned long s); + +void +init_by_array(unsigned long init_key[], + int key_length); + +unsigned long +genrand_int32(void); + +long int +genrand_int31(void); + +double +genrand_real1(void); + +double +genrand_real2(void); + +double +genrand_real3(void); + +double +genrand_res53(void); + +long int +NJ_genrand_int31_top(long int top); + +#endif /* _INC_PRNG_H_ */ + + + +