+++ /dev/null
-/*
- * dmat.c
- *
- * $Id$
- *
- *****************************************************************************
- *
- * Copyright (c) 2004, Luke Sheneman
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * + Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * + Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * + The names of its contributors may not be used to endorse or promote
- * products derived from this software without specific prior
- * written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *****************************************************************************
- *
- * Distance matrix parser
- *
- *****************************************************************************
- *
- * AUTHOR:
- *
- * Luke Sheneman
- * sheneman@cs.uidaho.edu
- *
- */
-
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <float.h>
-#include <errno.h>
-#include <string.h>
-
-
-
-#include "common.h"
-#include "clearcut.h"
-
-#include "dmat.h"
-
-
-
-
-
-/*
- *
- * NJ_is_alpha() - determine if character is an alphabetic character
- *
- * INPUT:
- * ------
- * c -- character to test
- *
- * RETURN:
- * -------
- * int -- 1 if character is alphabetic (A-Z || a-z)
- * 0 if character is NOT alphabetic
- *
- */
-static inline
-int
-NJ_is_alpha(char c) {
-
- if( (c >= 'A' && c <= 'Z') ||
- (c >= 'a' && c <= 'z') ) {
- return(1);
- } else {
- return(0);
- }
-}
-
-
-
-/*
- *
- * NJ_is_whitespace() - determine if character is a whitespace character
- *
- * INPUT:
- * ------
- * c -- character to test
- *
- * RETURN:
- * -------
- * int -- 1 if character is whitespace (space, tab, CR, LF)
- * 0 if character is NOT whitespace
- *
- */
-static inline
-int
-NJ_is_whitespace(char c) {
-
- if( c == ' ' || /* space */
- c == '\n' || /* newline */
- c == '\r' || /* carriage-return */
- c == '\v' || /* vertical tab */
- c == '\f' || /* form feed */
- c == '\t' ) { /* horizontal tab */
- return(1);
- } else {
- return(0);
- }
-
-}
-
-
-
-
-/*
- *
- * NJ_is_number() - determine if character is a number
- *
- * INPUT:
- * ------
- * c -- character to test
- *
- * RETURN:
- * -------
- * int -- 1 if character is a number (0-9)
- * 0 if character is NOT a number
- *
- */
-static inline
-int
-NJ_is_number(char c) {
- if(c >= '0' && c <= '9') {
- return(1);
- } else {
- return(0);
- }
-}
-
-
-
-/*
- * NJ_is_distance() - check if string is a properly formatted distance value
- *
- */
-static inline
-int
-NJ_is_distance(char *token) {
-
- int i;
- char c;
- int exponent_state;
- int expsign_state;
- int dpoint_state;
-
- /* if token is NULL return failure */
- if(!token) {
- return(0);
- }
-
- exponent_state = 0;
- expsign_state = 0;
- dpoint_state = 0;
-
- /* The first character must be a number, a decimal point or a sign */
- c = token[0];
- if(!NJ_is_number(c) &&
- c != '.' &&
- c != '-' &&
- c != '+' ) {
-
- goto BAD;
- }
-
- /*
- * if the first character is not a number, and string is only one
- * character long, then we return failure.
- */
- if(strlen(token) == 1) {
- if(!NJ_is_number(c)) {
- goto BAD;
- }
- }
-
- for(i=0;i<strlen(token);i++) {
-
- c = token[i];
-
- /* make sure that all chars in dist string are in list of valid chars */
- if(!NJ_is_number(c) &&
- c != '.' &&
- c != '-' &&
- c != '+' &&
- c != 'e' &&
- c != 'E' ) {
-
- goto BAD;
- }
-
- /* not the first char and we are not in exponent state but read (+,-) */
- if(i>0 && !exponent_state) {
- if(c == '-' ||
- c == '+') {
- goto BAD;
- }
- }
-
- /* if we are in the exponent state, and we've already seen a sign */
- if(exponent_state && expsign_state) {
- if(c == '-' ||
- c == '+') {
- goto BAD;
- }
- }
-
- /* if we are in the exponent state and we see a decimal point */
- if(exponent_state) {
- if(c == '.') {
- goto BAD;
- }
- }
-
- /* if we are in the exponent state and see another e or E */
- if(exponent_state) {
- if(c == 'e' ||
- c == 'E') {
- goto BAD;
- }
- }
-
- /* if we are dpoint_state and see another decimal point */
- if(dpoint_state) {
- if(c == '.') {
- goto BAD;
- }
- }
-
-
- /* enter the exponent state if we need to */
- if(!exponent_state) {
- if(c == 'e' ||
- c == 'E') {
- exponent_state = 1;
- }
- }
-
- /* enter the expsign_state if we need to */
- if(exponent_state && !expsign_state) {
- if(c == '-' ||
- c == '+') {
- expsign_state = 1;
- }
- }
-
- /* if not in dpoint state and we see a dpoint */
- if(!dpoint_state) {
- if(c == '.') {
- dpoint_state = 1;
- }
- }
-
- }
-
- /* the token must end in a number char */
- if(!NJ_is_number(token[strlen(token)-1])) {
- goto BAD;
- }
-
- /* token is a valid numerical distance */
- return(1);
-
- BAD:
-
- /* token is invalid distance format */
- return(0);
-}
-
-
-
-
-/*
- * NJ_is_label() -
- *
- * Simply, if token is not a valid number, then it is a name
- *
- */
-static inline
-int
-NJ_is_label(char *token) {
- if(NJ_is_distance(token)) {
- return(0);
- } else {
- return(1);
- }
-}
-
-
-
-/*
- * NJ_get_token() - get a token from an input stream
- *
- */
-static inline
-int
-NJ_get_token(FILE *fp,
- NJ_DIST_TOKEN *token) {
-
- char c;
- int index;
-
- c = fgetc(fp);
- if(feof(fp)) {
- token->type = NJ_EOF_STATE;
- return(token->type);
- }
-
- if(NJ_is_whitespace(c)) {
- token->buf[0] = c;
- token->buf[1] = '\0';
- token->type = NJ_WS_STATE;
-
- return NJ_WS_STATE;
- }
-
- index = 0;
- while(!NJ_is_whitespace(c)) {
-
- /* reallocate our buffer if necessary */
- if(index >= token->bufsize) {
- token->bufsize *= 2;
- token->buf = (char *)realloc(token->buf, token->bufsize*sizeof(char));
- if(!token->buf) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_get_token()\n");
- exit(-1);
- }
- }
-
- token->buf[index++] = c;
-
- c = fgetc(fp);
- if(feof(fp)) {
- token->type = NJ_EOF_STATE;
- break;
- }
- }
-
- token->buf[index] = '\0';
-
- if(token->type != NJ_EOF_STATE) {
-
- if(NJ_is_distance(token->buf)) {
- token->type = NJ_FLOAT_STATE;
- } else {
- token->type = NJ_NAME_STATE;
- }
-
- }
-
- return(token->type);
-}
-
-
-
-
-
-
-/*
- * NJ_parse_distance_matrix() -- Takes a filename and returns a distance matrix
- *
- *
- * INPUT:
- * ------
- * nj_args -- a pointer to a structure containing the command-line arguments
- *
- * OUTPUT:
- * -------
- * <DMAT *> -- NULL (failure)
- * -- A pointer to a populated distance matrix
- *
- * DESCRIPTION:
- * ------------
- * This function implements a simple state machine to parse a distance matrix
- * in approximate PHYLIP format. This function auto-detects whether the
- * distance matrix is in upper, lower, or fully-symmetric format and handles
- * it accordingly. For full/symmetric matrices, values must be symmetric
- * around the diagonal, which is required to be zeroes. Names and values must
- * be separated by whitespace (space, tab, newlines, etc.). Taxon labels can
- * include numbers, but must start with non-numerical symbols.
- *
- *
- * *** UPPER FORMAT EXAMPLE ***
- *
- * 4
- * seq1 0.2 0.3 0.1
- * seq2 0.2 0.3
- * seq3 0.1
- * seq4
- *
- * *** LOWER FORMAT EXAMPLE ***
- *
- * 4
- * seq1
- * seq2 0.3
- * seq3 0.2 0.4
- * seq4 0.3 0.1 0.3
- *
- * *** SYMMETRIC (FULL) EXAMPLE ***
- *
- * 4
- * seq1 0.0 0.3 0.5 0.3
- * seq2 0.3 0.0 0.1 0.2
- * seq3 0.5 0.1 0.0 0.9
- * seq4 0.3 0.2 0.9 0.0
- *
- * Values in the distance matrix can be positive or negative, integers or
- * real values. Values can also be parsed in exponential notation form.
- *
- */
-DMAT *
-NJ_parse_distance_matrix(NJ_ARGS *nj_args) {
-
- DMAT *dmat = NULL;
- FILE *fp = NULL;
- NJ_DIST_TOKEN *token = NULL;
-
- int state, dmat_type;
- int row;
- int fltcnt;
- int x, y, i;
- int numvalread;
- int expectedvalues = -1;
- float val;
- int first_state = 0;
-
-
- /* allocate our distance matrix and token structure */
- dmat = (DMAT *)calloc(1, sizeof(DMAT));
- token = (NJ_DIST_TOKEN *)calloc(1, sizeof(NJ_DIST_TOKEN));
- if(token) {
- token->bufsize = NJ_INITIAL_BUFSIZE;
- token->buf = (char *)calloc(token->bufsize, sizeof(char));
- }
- if(!dmat || !token || !token->buf) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
- goto XIT_BAD;
- }
-
- /* open distance matrix file here */
- if(nj_args->stdin_flag) {
- fp = stdin;
- } else {
- fp = fopen(nj_args->infilename, "r");
- if(fp==NULL) {
- fprintf(stderr, "Clearcut: Could not open distance matrix: %s\n", nj_args->infilename);
- perror("Clearcut");
- goto XIT_BAD;
- }
- }
-
- /* get the number of taxa in this file */
- fscanf(fp, "%ld\n", &dmat->ntaxa);
- if(dmat->ntaxa < 2) {
- fprintf(stderr, "Clearcut: Invalid number of taxa in distance matrix\n");
-
- goto XIT_BAD;
- }
-
- /* set our initial working size according to the # of taxa */
- dmat->size = dmat->ntaxa;
-
- /* allocate space for the distance matrix values here */
- dmat->val =
- (float *)calloc(NJ_NCELLS(dmat->ntaxa), sizeof(float));
- if(!dmat->val) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
- goto XIT_BAD;
- }
-
- /* taxa names */
- dmat->taxaname = (char **)calloc(dmat->ntaxa, sizeof(char *));
- if(!dmat->taxaname) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
- goto XIT_BAD;
- }
-
- /* set the initial state of our state machine */
- dmat_type = NJ_PARSE_UNKNOWN;
- row = -1;
- fltcnt = 0;
- numvalread = 0;
-
-
- /* read the input one character at a time to drive simple state machine */
- state = NJ_get_token(fp, token);
- while(state != NJ_EOF_STATE) {
-
- switch(state) {
-
- case NJ_NAME_STATE:
-
- if(first_state == 0) {
- first_state = 1;
- }
-
- row++;
-
- if(row > 0 && dmat_type == NJ_PARSE_UNKNOWN) {
-
- if(fltcnt == dmat->ntaxa) {
-
- dmat_type = NJ_PARSE_SYMMETRIC;
- expectedvalues = dmat->ntaxa * dmat->ntaxa;
-
- } else if (fltcnt == dmat->ntaxa-1) {
-
- dmat_type = NJ_PARSE_UPPER;
- expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2;
-
- /* shift everything in first row by one char */
- for(i=dmat->ntaxa-2;i>=0;i--) {
- dmat->val[i+1] = dmat->val[i];
- }
-
- } else if (fltcnt == 0) {
-
- dmat_type = NJ_PARSE_LOWER;
- expectedvalues = ((dmat->ntaxa) * (dmat->ntaxa-1)) / 2;
-
- } else {
- goto XIT_BAD;
- }
- }
-
- if(row >= dmat->ntaxa) {
- goto XIT_BAD;
- }
-
- /* allocate space for this taxon label */
- dmat->taxaname[row] = (char *)calloc(strlen(token->buf)+1, sizeof(char));
- if(!dmat->taxaname[row]) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
- goto XIT_BAD;
- }
-
- strcpy(dmat->taxaname[row], token->buf);
-
- fltcnt = 0;
-
- break;
-
-
- case NJ_FLOAT_STATE:
-
- if(first_state == 0) {
- goto XIT_BAD;
- }
-
- val = atof(token->buf);
- if(errno) {
- fprintf(stderr, "Clearcut: Distance value out-of-range.\n");
- goto XIT_BAD;
- }
-
- x = row;
- y = fltcnt;
-
- switch(dmat_type) {
-
- case NJ_PARSE_UNKNOWN:
-
- dmat->val[NJ_MAP(x, y, dmat->size)] = val;
-
- break;
-
- case NJ_PARSE_SYMMETRIC:
-
- if(fltcnt >= dmat->ntaxa) {
- fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n");
- goto XIT_BAD;
- }
-
- if(x < y) {
- dmat->val[NJ_MAP(x, y, dmat->size)] = val;
- } else if(x > y) {
- if(!NJ_FLT_EQ(val, dmat->val[NJ_MAP(y, x, dmat->size)])) {
- fprintf(stderr, "Clearcut: Full matrices must be symmetric.\n");
- goto XIT_BAD;
- }
- } else {
- if(!NJ_FLT_EQ(val, 0.0)) {
- fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n");
- goto XIT_BAD;
-
- }
- }
-
- break;
-
- case NJ_PARSE_UPPER:
-
- if(fltcnt > dmat->ntaxa-row) {
- fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n");
- goto XIT_BAD;
- }
-
- dmat->val[NJ_MAP(x, x+y+1, dmat->size)] = val;
-
- break;
-
- case NJ_PARSE_LOWER:
-
- if(fltcnt > row-1) {
- fprintf(stderr, "Clearcut: Incorrect number of distance values on row.\n");
- goto XIT_BAD;
- }
-
- dmat->val[NJ_MAP(y, x, dmat->size)] = val;
-
- break;
-
- default:
-
- goto XIT_BAD;
-
- break;
- }
-
- fltcnt++;
- numvalread++;
-
- break;
-
- case NJ_WS_STATE:
-
- break;
-
- case NJ_EOF_STATE:
-
- if(first_state == 0) {
- goto XIT_BAD;
- }
-
- break;
-
- default:
-
- fprintf(stderr, "Clearcut: Unknown state in distance matrix parser.\n");
- break;
-
- }
-
- /* get next token from stream */
- state = NJ_get_token(fp, token);
- }
-
-
- /*
- * At the end, if we have not read the number of values that we predicted
- * we would need, then there was a problem and we need to punt.
- */
- if(numvalread != expectedvalues) {
- fprintf(stderr, "Clearcut: Incorrect number of values in the distance matrix.\n");
- goto XIT_BAD;
- }
-
- /* special check to make sure first value read is 0.0 */
- if(dmat_type == NJ_PARSE_SYMMETRIC) {
- if(!NJ_FLT_EQ(dmat->val[NJ_MAP(0, 0, dmat->size)], 0.0)) {
- fprintf(stderr, "Clearcut: Values along the diagonal in a symmetric matrix must be zero.\n");
- goto XIT_BAD;
- }
- }
-
-
- /* now lets allocate space for the r and r2 columns */
- dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float));
- dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float));
- if(!dmat->r || !dmat->r2) {
- fprintf(stderr, "Clearcut: Memory allocation error in NJ_parse_distance_matrix()\n");
- goto XIT_BAD;
- }
-
- /* track some memory addresses */
- dmat->rhandle = dmat->r;
- dmat->r2handle = dmat->r2;
- dmat->valhandle = dmat->val;
-
- /* close matrix file here */
- if(!nj_args->stdin_flag) {
- fclose(fp);
- }
-
- if(token) {
- if(token->buf) {
- free(token->buf);
- }
- free(token);
- }
-
- return(dmat);
-
-
-
- /* clean up our partial progress */
- XIT_BAD:
-
- if(fp) {
- fprintf(stderr, "Clearcut: Syntax error in distance matrix at offset %ld.\n", ftell(fp));
- }
-
- /* close matrix file here */
- if(!nj_args->stdin_flag) {
- if(fp) {
- fclose(fp);
- }
- }
-
- /* if we have a valid dmat (partial or complete), we need to free it */
- if(dmat) {
- NJ_free_dmat(dmat);
- }
-
- if(token) {
- if(token->buf) {
- free(token->buf);
- }
- free(token);
- }
-
- return(NULL);
-}
-
-
-
-
-
-
-
-/*
- * NJ_output_matrix() - Output a distance matrix to the specified file
- *
- *
- * INPUTS:
- * -------
- * nj_args -- a pointer to a data structure holding the command-line args
- * dmat -- a pointer to a distance matrix
- *
- *
- * RETURNS:
- * --------
- * NOTHING
- *
- *
- * DESCRIPTION:
- * ------------
- * If the appropriate flag was specified in the command-line, this function
- * now outputs the parsed or computed distance matrix to a file. This
- * can be useful if generating a distance matrix was the primary goal of
- * running the program, or if one wanted to debug and/or verify the
- * correctness of the program.
- *
- * Currently this function outputs full/symmetric matrices only.
- *
- */
-void
-NJ_output_matrix(NJ_ARGS *nj_args,
- DMAT *dmat) {
-
- FILE *fp = NULL;
- long int i, j;
-
-
-
- /* if we haven't specieid matrixout, return immediately */
- if(!nj_args->matrixout) {
- return;
- }
-
- /* open the specified matrix file for writing */
- fp = fopen(nj_args->matrixout, "w");
- if(!fp) {
- fprintf(stderr, "Clearcut: Could not open matrix file %s for output.\n", nj_args->matrixout);
- return;
- }
-
- /* output the number of taxa in the matrix */
- fprintf(fp, " %ld\n", dmat->size);
-
- fprintf(fp, "%s\n", dmat->taxaname[0]); // print the first taxon name outside of the main loop
-
- for(i=1;i<dmat->size;i++) {
-
- /* output taxaname */
- fprintf(fp, "%s\t", dmat->taxaname[i]);
-
- for(j=0;j<i;j++) {
- if(nj_args->expdist) { /* exponential notation (or not) */
- fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]);
- } else {
- fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]);
- }
- }
-
- fprintf(fp, "\n");
- }
-
-#ifdef FULL_SYMMETRIC_MATRIX
-
- /* output the number of taxa in the matrix */
- fprintf(fp, " %ld\n", dmat->size);
- for(i=0;i<dmat->size;i++) {
-
- /* output taxaname */
- fprintf(fp, "%s\t", dmat->taxaname[i]);
-
- for(j=0;j<dmat->size;j++) {
- if(i>j) {
- if(nj_args->expdist) { /* exponential notation (or not) */
- fprintf(fp, "%e ", dmat->val[NJ_MAP(j,i,dmat->size)]);
- } else {
- fprintf(fp, "%f ", dmat->val[NJ_MAP(j,i,dmat->size)]);
- }
- } else if(i<j) {
- if(nj_args->expdist) { /* exponential notation (or not) */
- fprintf(fp, "%e ", dmat->val[NJ_MAP(i,j,dmat->size)]);
- } else {
- fprintf(fp, "%f ", dmat->val[NJ_MAP(i,j,dmat->size)]);
- }
- } else {
- if(nj_args->expdist) { /* exponential notation (or not) */
- fprintf(fp, "%e ", 0.0);
- } else {
- fprintf(fp, "%f ", 0.0);
- }
- }
- }
-
- fprintf(fp, "\n");
- }
-
-#endif // FULL_SYMMETRIC_MATRIX
-
- /* close the file here */
- if(fp) {
- fclose(fp);
- }
-
- return;
-}
-
-
-
-
-