6 *****************************************************************************
8 * Copyright (c) 2004, Luke Sheneman
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
15 * + Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * + Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in
19 * the documentation and/or other materials provided with the
21 * + The names of its contributors may not be used to endorse or promote
22 * products derived from this software without specific prior
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
37 *****************************************************************************
39 * Distance matrix parser
41 *****************************************************************************
46 * sheneman@cs.uidaho.edu
71 * NJ_is_alpha() - determine if character is an alphabetic character
75 * c -- character to test
79 * int -- 1 if character is alphabetic (A-Z || a-z)
80 * 0 if character is NOT alphabetic
87 if( (c >= 'A' && c <= 'Z') ||
88 (c >= 'a' && c <= 'z') ) {
99 * NJ_is_whitespace() - determine if character is a whitespace character
103 * c -- character to test
107 * int -- 1 if character is whitespace (space, tab, CR, LF)
108 * 0 if character is NOT whitespace
113 NJ_is_whitespace(char c) {
115 if( c == ' ' || /* space */
116 c == '\n' || /* newline */
117 c == '\r' || /* carriage-return */
118 c == '\v' || /* vertical tab */
119 c == '\f' || /* form feed */
120 c == '\t' ) { /* horizontal tab */
133 * NJ_is_number() - determine if character is a number
137 * c -- character to test
141 * int -- 1 if character is a number (0-9)
142 * 0 if character is NOT a number
147 NJ_is_number(char c) {
148 if(c >= '0' && c <= '9') {
158 * NJ_is_distance() - check if string is a properly formatted distance value
163 NJ_is_distance(char *token) {
171 /* if token is NULL return failure */
180 /* The first character must be a number, a decimal point or a sign */
182 if(!NJ_is_number(c) &&
191 * if the first character is not a number, and string is only one
192 * character long, then we return failure.
194 if(strlen(token) == 1) {
195 if(!NJ_is_number(c)) {
200 for(i=0;i<strlen(token);i++) {
204 /* make sure that all chars in dist string are in list of valid chars */
205 if(!NJ_is_number(c) &&
215 /* not the first char and we are not in exponent state but read (+,-) */
216 if(i>0 && !exponent_state) {
223 /* if we are in the exponent state, and we've already seen a sign */
224 if(exponent_state && expsign_state) {
231 /* if we are in the exponent state and we see a decimal point */
238 /* if we are in the exponent state and see another e or E */
246 /* if we are dpoint_state and see another decimal point */
254 /* enter the exponent state if we need to */
255 if(!exponent_state) {
262 /* enter the expsign_state if we need to */
263 if(exponent_state && !expsign_state) {
270 /* if not in dpoint state and we see a dpoint */
279 /* the token must end in a number char */
280 if(!NJ_is_number(token[strlen(token)-1])) {
284 /* token is a valid numerical distance */
289 /* token is invalid distance format */
299 * Simply, if token is not a valid number, then it is a name
304 NJ_is_label(char *token) {
305 if(NJ_is_distance(token)) {
315 * NJ_get_token() - get a token from an input stream
320 NJ_get_token(FILE *fp,
321 NJ_DIST_TOKEN *token) {
328 token->type = NJ_EOF_STATE;
332 if(NJ_is_whitespace(c)) {
334 token->buf[1] = '\0';
335 token->type = NJ_WS_STATE;
341 while(!NJ_is_whitespace(c)) {
343 /* reallocate our buffer if necessary */
344 if(index >= token->bufsize) {
346 token->buf = (char *)realloc(token->buf, token->bufsize*sizeof(char));
348 fprintf(stderr, "Clearcut: Memory allocation error in NJ_get_token()\n");
353 token->buf[index++] = c;
357 token->type = NJ_EOF_STATE;
362 token->buf[index] = '\0';
364 if(token->type != NJ_EOF_STATE) {
366 if(NJ_is_distance(token->buf)) {
367 token->type = NJ_FLOAT_STATE;
369 token->type = NJ_NAME_STATE;
383 * NJ_parse_distance_matrix() -- Takes a filename and returns a distance matrix
388 * nj_args -- a pointer to a structure containing the command-line arguments
392 * <DMAT *> -- NULL (failure)
393 * -- A pointer to a populated distance matrix
397 * This function implements a simple state machine to parse a distance matrix
398 * in approximate PHYLIP format. This function auto-detects whether the
399 * distance matrix is in upper, lower, or fully-symmetric format and handles
400 * it accordingly. For full/symmetric matrices, values must be symmetric
401 * around the diagonal, which is required to be zeroes. Names and values must
402 * be separated by whitespace (space, tab, newlines, etc.). Taxon labels can
403 * include numbers, but must start with non-numerical symbols.
406 * *** UPPER FORMAT EXAMPLE ***
414 * *** LOWER FORMAT EXAMPLE ***
422 * *** SYMMETRIC (FULL) EXAMPLE ***
425 * seq1 0.0 0.3 0.5 0.3
426 * seq2 0.3 0.0 0.1 0.2
427 * seq3 0.5 0.1 0.0 0.9
428 * seq4 0.3 0.2 0.9 0.0
430 * Values in the distance matrix can be positive or negative, integers or
431 * real values. Values can also be parsed in exponential notation form.
435 NJ_parse_distance_matrix(NJ_ARGS *nj_args) {
439 NJ_DIST_TOKEN *token = NULL;
441 int state, dmat_type;
446 int expectedvalues = -1;
451 /* allocate our distance matrix and token structure */
452 dmat = (DMAT *)calloc(1, sizeof(DMAT));
453 token = (NJ_DIST_TOKEN *)calloc(1, sizeof(NJ_DIST_TOKEN));
455 token->bufsize = NJ_INITIAL_BUFSIZE;
456 token->buf = (char *)calloc(token->bufsize, sizeof(char));
458 if(!dmat || !token || !token->buf) {
459 fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
463 /* open distance matrix file here */
464 if(nj_args->stdin_flag) {
467 fp = fopen(nj_args->infilename, "r");
469 fprintf(stderr, "Clearcut: Could not open distance matrix: %s\n", nj_args->infilename);
475 /* get the number of taxa in this file */
476 fscanf(fp, "%ld\n", &dmat->ntaxa);
477 if(dmat->ntaxa < 2) {
478 fprintf(stderr, "Clearcut: Invalid number of taxa in distance matrix\n");
483 /* set our initial working size according to the # of taxa */
484 dmat->size = dmat->ntaxa;
486 /* allocate space for the distance matrix values here */
488 (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float));
490 fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
495 dmat->taxaname = (char **)calloc(dmat->ntaxa, sizeof(char *));
496 if(!dmat->taxaname) {
497 fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
501 /* set the initial state of our state machine */
502 dmat_type = NJ_PARSE_UNKNOWN;
508 /* read the input one character at a time to drive simple state machine */
509 state = NJ_get_token(fp, token);
510 while(state != NJ_EOF_STATE) {
516 if(first_state == 0) {
522 if(row > 0 && dmat_type == NJ_PARSE_UNKNOWN) {
524 if(fltcnt == dmat->ntaxa) {
526 dmat_type = NJ_PARSE_SYMMETRIC;
527 expectedvalues = dmat->ntaxa * dmat->ntaxa;
529 } else if (fltcnt == dmat->ntaxa-1) {
531 dmat_type = NJ_PARSE_UPPER;
532 expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2;
534 /* shift everything in first row by one char */
535 for(i=dmat->ntaxa-2;i>=0;i--) {
536 dmat->val[i+1] = dmat->val[i];
539 } else if (fltcnt == 0) {
541 dmat_type = NJ_PARSE_LOWER;
542 expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2;
549 if(row >= dmat->ntaxa) {
553 /* allocate space for this taxon label */
554 dmat->taxaname[row] = (char *)calloc(strlen(token->buf)+1, sizeof(char));
555 if(!dmat->taxaname[row]) {
556 fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
560 strcpy(dmat->taxaname[row], token->buf);
569 if(first_state == 0) {
573 val = atof(token->buf);
575 fprintf(stderr, "Clearcut: Distance value out-of-range.\n");
584 case NJ_PARSE_UNKNOWN:
586 dmat->val[NJ_MAP(x, y, dmat->size)] = val;
590 case NJ_PARSE_SYMMETRIC:
592 if(fltcnt >= dmat->ntaxa) {
593 fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n");
598 dmat->val[NJ_MAP(x, y, dmat->size)] = val;
600 if(!NJ_FLT_EQ(val, dmat->val[NJ_MAP(y, x, dmat->size)])) {
601 fprintf(stderr, "Clearcut: Full matrices must be symmetric.\n");
605 if(!NJ_FLT_EQ(val, 0.0)) {
606 fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n");
616 if(fltcnt > dmat->ntaxa-row) {
617 fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n");
621 dmat->val[NJ_MAP(x, x+y+1, dmat->size)] = val;
628 fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n");
632 dmat->val[NJ_MAP(y, x, dmat->size)] = val;
654 if(first_state == 0) {
662 fprintf(stderr, "Clearcut: Unknown state in distance matrix parser.\n");
667 /* get next token from stream */
668 state = NJ_get_token(fp, token);
673 * At the end, if we have not read the number of values that we predicted
674 * we would need, then there was a problem and we need to punt.
676 if(numvalread != expectedvalues) {
677 fprintf(stderr, "Clearcut: Incorrect number of values in the distance matrix.\n");
681 /* special check to make sure first value read is 0.0 */
682 if(dmat_type == NJ_PARSE_SYMMETRIC) {
683 if(!NJ_FLT_EQ(dmat->val[NJ_MAP(0, 0, dmat->size)], 0.0)) {
684 fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n");
690 /* now lets allocate space for the r and r2 columns */
691 dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float));
692 dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float));
693 if(!dmat->r || !dmat->r2) {
694 fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
698 /* track some memory addresses */
699 dmat->rhandle = dmat->r;
700 dmat->r2handle = dmat->r2;
701 dmat->valhandle = dmat->val;
703 /* close matrix file here */
704 if(!nj_args->stdin_flag) {
719 /* clean up our partial progress */
723 fprintf(stderr, "Clearcut: Syntax error in distance matrix at offset %ld.\n", ftell(fp));
726 /* close matrix file here */
727 if(!nj_args->stdin_flag) {
733 /* if we have a valid dmat (partial or complete), we need to free it */
755 * NJ_output_matrix() - Output a distance matrix to the specified file
760 * nj_args -- a pointer to a data structure holding the command-line args
761 * dmat -- a pointer to a distance matrix
771 * If the appropriate flag was specified in the command-line, this function
772 * now outputs the parsed or computed distance matrix to a file. This
773 * can be useful if generating a distance matrix was the primary goal of
774 * running the program, or if one wanted to debug and/or verify the
775 * correctness of the program.
777 * Currently this function outputs full/symmetric matrices only.
781 NJ_output_matrix(NJ_ARGS *nj_args,
789 /* if we haven't specieid matrixout, return immediately */
790 if(!nj_args->matrixout) {
794 /* open the specified matrix file for writing */
795 fp = fopen(nj_args->matrixout, "w");
797 fprintf(stderr, "Clearcut: Could not open matrix file %s for output.\n", nj_args->matrixout);
801 /* output the number of taxa in the matrix */
802 fprintf(fp, " %ld\n", dmat->size);
804 fprintf(fp, "%s\n", dmat->taxaname[0]); // print the first taxon name outside of the main loop
806 for(i=1;i<dmat->size;i++) {
808 /* output taxaname */
809 fprintf(fp, "%s\t", dmat->taxaname[i]);
812 if(nj_args->expdist) { /* exponential notation (or not) */
813 fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]);
815 fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]);
822 #ifdef FULL_SYMMETRIC_MATRIX
824 /* output the number of taxa in the matrix */
825 fprintf(fp, " %ld\n", dmat->size);
826 for(i=0;i<dmat->size;i++) {
828 /* output taxaname */
829 fprintf(fp, "%s\t", dmat->taxaname[i]);
831 for(j=0;j<dmat->size;j++) {
833 if(nj_args->expdist) { /* exponential notation (or not) */
834 fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]);
836 fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]);
839 if(nj_args->expdist) { /* exponential notation (or not) */
840 fprintf(fp, "%e ", dmat->val[NJ_MAP(i,j,dmat->size)]);
842 fprintf(fp, "%f ", dmat->val[NJ_MAP(i,j,dmat->size)]);
845 if(nj_args->expdist) { /* exponential notation (or not) */
846 fprintf(fp, "%e ", 0.0);
848 fprintf(fp, "%f ", 0.0);
856 #endif // FULL_SYMMETRIC_MATRIX
858 /* close the file here */