+/*
+ * fasta.c
+ *
+ * $Id$
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * Functions for parsing FASTA formatted alignment files
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "clearcut.h"
+#include "common.h"
+#include "fasta.h"
+
+
+#define NJ_NUM_DNA_AMBIGUITY_SYMS 14
+static const char NJ_dna_ambiguity_syms[NJ_NUM_DNA_AMBIGUITY_SYMS] =
+{
+ 'M', 'R', 'W', 'S', 'Y', 'K',
+ 'V', 'H', 'D', 'B', 'X', 'N',
+ '-', '.'
+};
+
+
+#define NJ_NUM_PROTEIN_AMBIGUITY_SYMS 6
+static const char NJ_protein_ambiguity_syms[NJ_NUM_PROTEIN_AMBIGUITY_SYMS] =
+{
+ 'X', 'B', 'Z', '*', '-', '.'
+};
+
+#define NJ_NUM_DNA_SYMS 5
+static const char NJ_dna_syms[NJ_NUM_DNA_SYMS] =
+{
+ 'A', 'G', 'C', 'T', 'U'
+};
+
+
+#define NJ_NUM_PROTEIN_SYMS 20
+static const char NJ_protein_syms[NJ_NUM_PROTEIN_SYMS] =
+{
+ 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
+ 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'
+};
+
+
+
+
+
+/*
+ * NJ_is_whitespace() - Check to see if character is whitespace
+ *
+ * INPUTS:
+ * -------
+ * c -- character to check
+ *
+ * RETURNS:
+ * --------
+ * int -- 0 if not whitespace
+ * 1 if whitespace
+ */
+static inline
+int
+NJ_is_whitespace(char c) {
+ if( c == ' ' || /* space */
+ c == '\n' || /* newline */
+ c == '\r' || /* carriage-return */
+ c == '\v' || /* vertical tab */
+ c == '\f' || /* form feed */
+ c == '\t' ) { /* horizontal tab */
+ return(1);
+ } else {
+ return(0);
+ }
+}
+
+
+
+
+
+/*
+ * NJ_is_dna() -
+ *
+ * Determines if the given symbol is DNA
+ *
+ * RETURNS: 1 if DNA
+ * 0 if not DNA
+ *
+ */
+static inline
+int
+NJ_is_dna(char c) {
+
+ int i;
+ char up_c;
+
+ up_c = toupper(c);
+
+ for(i=0;i<NJ_NUM_DNA_SYMS;i++) {
+ if(up_c == NJ_dna_syms[i]) {
+ return(1);
+ }
+ }
+
+ return(0);
+}
+
+
+
+
+
+/*
+ * NJ_is_dna_ambiguity() -
+ *
+ * Determines if the given symbol is a
+ * DNA ambiguity code
+ *
+ * RETURNS: 1 if DNA Ambiguity Code
+ * 0 if not DNA Ambiguity Code
+ *
+ */
+static inline
+int
+NJ_is_dna_ambiguity(char c) {
+
+ int i;
+ char up_c;
+
+ up_c = toupper(c);
+
+ for(i=0;i<NJ_NUM_DNA_AMBIGUITY_SYMS;i++) {
+ if(up_c == NJ_dna_ambiguity_syms[i]) {
+ return(1);
+ }
+ }
+
+ return(0);
+}
+
+
+
+/*
+ * NJ_is_protein() -
+ *
+ * Determines if supplied symbol is amino acid symbol
+ *
+ * RETURNS: 1 if amino acid
+ * 0 if not amino acid
+ *
+ */
+static inline
+int
+NJ_is_protein(char c) {
+
+ int i;
+ char up_c;
+
+ up_c = toupper(c);
+
+ for(i=0;i<NJ_NUM_PROTEIN_SYMS;i++) {
+ if(up_c == NJ_protein_syms[i]) {
+ return(1);
+ }
+ }
+
+ return(0);
+}
+
+
+
+
+/*
+ * NJ_is_protein_ambiguity() -
+ *
+ * Determines if supplied symbol is amino acid ambiguity symbol
+ *
+ * RETURNS: 1 if amino acid ambiguity symbol
+ * 0 if not amino acid ambiguity symbol
+ *
+ */
+static inline
+int
+NJ_is_protein_ambiguity(char c) {
+
+ int i;
+ char up_c;
+
+ up_c = toupper(c);
+
+ for(i=0;i<NJ_NUM_PROTEIN_AMBIGUITY_SYMS;i++) {
+ if(up_c == NJ_protein_ambiguity_syms[i]) {
+ return(1);
+ }
+ }
+
+ return(0);
+}
+
+
+
+
+
+
+/*
+ * NJ_read_fasta() - A function for inputing FASTA sequences into an alignment
+ * data structure
+ *
+ *
+ * INPUTS:
+ * -------
+ * nj_args -- A pointer to a data structure containing command-line args
+ *
+ * RETURNS:
+ * --------
+ * NJ_alignment * -- A pointer to a multiple sequence alignment
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ *
+ * This function implements a state machine parser for parsing FASTA-formatted
+ * multiple sequence alignment files.
+ *
+ * Example Input:
+ *
+ * > sequence1
+ * ATAGATATAGATTAGAATAT----TATAGATAT----ATATAT-TTT-
+ * > sequence2
+ * --ATAGATA---ATATATATATTTT--GTCTCATAGT---ATATGCTT
+ * > sequence3
+ * TTATAGATA---ATATATATATTTTAAGTCTCATAGT-A-ATATGC--
+ *
+ * This function will parse alignments for DNA or protein, and will do
+ * so mindful of ambiguity codes for these kinds of sequences. All
+ * ambiguity codes are ignored by this program for the purposes of
+ * computing a distance matrix from a multiple alignment. By design,
+ * this program does not auto-detect DNA vs. Protein, and requires that
+ * the user explictly specify that on the command-line.
+ *
+ * Gaps can be represented either with the '-' or '.' characters.
+ *
+ * Newlines and other whitespace are allowed to be interspersed
+ * throughout the sequences.
+ *
+ * Taxon labels are required to be unique, and they must start with
+ * an alphabetic character (not a number, etc.). The parser will read
+ * the first token after the > character in the description line up until the
+ * first whitespace and use that for the taxon label.
+ *
+ * For example, in the line "> taxon1 is homo sapien", the taxon label will be
+ * "taxon1"
+ *
+ */
+NJ_alignment *
+NJ_read_fasta(NJ_ARGS *nj_args) {
+
+ FILE *fp = NULL;
+ char *buf = NULL;
+ char *ptr = NULL;
+ NJ_alignment *alignment = NULL;
+
+ char c;
+ int state;
+ long int index, x, seq;
+ long int i;
+ long int bufsize, nseqs = NJ_INITIAL_NSEQS;
+ int first_sequence_flag;
+
+
+
+
+ /*
+ * In this function, we implement a FASTA alignment parser which
+ * reads in an alignment character-by-character, maintaining state
+ * information which guides the parser.
+ *
+ * The program reads either DNA or Protein alignments. All title lines
+ * and sequences can be arbitrarily long. Gaps can be represented by
+ * "-" or "." characters.
+ *
+ * Ambiguity codes are also handled.
+ *
+ */
+
+ /*
+ * We can't handle reading fasta input unless the user explicity
+ * specifies the input type...just to be sure.
+ */
+ if( (!nj_args->dna_flag && !nj_args->protein_flag) ||
+ (nj_args->dna_flag && nj_args->protein_flag) ) {
+ fprintf(stderr, "Clearcut: Explicitly specify protein or DNA\n");
+ goto XIT_BAD;
+ }
+
+ /* open specified fasta file here */
+ if(nj_args->stdin_flag) {
+ fp = stdin;
+ } else {
+ fp = fopen(nj_args->infilename, "r");
+ if(!fp) {
+ fprintf(stderr, "Clearcut: Failed to open input FASTA file: %s\n", nj_args->infilename);
+ perror("Clearcut");
+ goto XIT_BAD;
+ }
+ }
+
+ /* allocate the initial buffer */
+ bufsize = NJ_INITIAL_BUFFER_SIZE;
+ buf = (char *)calloc(bufsize, sizeof(char));
+
+ /* allocate the alignment container here */
+ alignment = (NJ_alignment *)calloc(1, sizeof(NJ_alignment));
+
+ /* allocate initial title array */
+ // printf("allocating initial title array\n");
+ alignment->titles = (char **)calloc(NJ_INITIAL_NSEQS, sizeof(char *));
+
+ /* make sure that we successfully allocated memory */
+ if(!buf || !alignment || !alignment->titles) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+
+ /* a flag */
+ first_sequence_flag = 1;
+
+ index = 0; /* tracks the position in buffer */
+ x = 0; /* tracks the position on sequence */
+ seq = 0; /* tracks the active sequence */
+
+ /* intitial state of state machine */
+ state = NJ_FASTA_MODE_UNKNOWN;
+
+ while(1) {
+
+ /* get the next character */
+ c = fgetc(fp);
+ if(feof(fp)) {
+
+ if(state == NJ_FASTA_MODE_SEQUENCE) {
+ buf[index+1] = '\0';
+
+ /* copy buf to alignment */
+ for(i=1;i<=alignment->length;i++) {
+ alignment->data[seq*alignment->length+i-1] = buf[i];
+ }
+ }
+
+ break;
+ }
+
+ /* make sure our dynamic buffer is big enough */
+ if(index >= bufsize) {
+ bufsize *= 2;
+ buf = (char *)realloc(buf, bufsize);
+ if(!buf) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+ }
+
+ switch(state) {
+
+ case NJ_FASTA_MODE_UNKNOWN:
+
+ if(!NJ_is_whitespace(c)) {
+ if(c == '>') {
+ state = NJ_FASTA_MODE_TITLE;
+ buf[0] = '>';
+ } else {
+ goto XIT_BAD;
+ }
+ }
+
+ break;
+
+ case NJ_FASTA_MODE_TITLE:
+
+ if( c == '\n' ||
+ c == '\r' ) {
+
+ buf[index] = '\0';
+ state = NJ_FASTA_MODE_SEQUENCE;
+ index = 0;
+ x = -1;
+
+ /* make sure we've allocated enough space for titles and sequences */
+ if(seq == nseqs) {
+
+ // printf("realloc(). seq = %d, nseqs = %d\n", seq, nseqs);
+
+ nseqs *= 2;
+
+ alignment->titles = (char **)realloc(alignment->titles, nseqs*sizeof(char *));
+ if(!alignment->titles) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+
+ alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
+ if(!alignment->data) {
+ fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+ }
+
+ // printf("Allocating %d bytes for title %d: %s\n", (int)strlen(buf), (int)seq, buf);
+
+ alignment->titles[seq] = (char *)calloc(strlen(buf), sizeof(char));
+ if(!alignment->titles[seq]) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+
+ /* lets forward to the first non-space (space/tab) character after the '>' */
+
+ if(first_sequence_flag) {
+ ptr = buf;
+ } else {
+ ptr = &buf[1];
+ }
+ while(*ptr == '\t' || *ptr == ' ') {
+ ptr++;
+ }
+ sscanf(ptr, "%s", alignment->titles[seq]); /* get the first word and use as the title */
+
+ alignment->nseq++;
+ }
+
+ buf[index++] = c;
+
+ break;
+
+
+ case NJ_FASTA_MODE_SEQUENCE:
+
+ if(c == '>') {
+
+ if(first_sequence_flag) {
+ first_sequence_flag = 0;
+
+ /* allocate our alignment data section here */
+ alignment->length = index-1;
+
+ nseqs = NJ_INITIAL_NSEQS;
+ alignment->data = (char *)calloc(alignment->length*nseqs, sizeof(char));
+ if(!alignment->data) {
+ fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+ }
+
+ if(!first_sequence_flag) {
+ if(index-1 < alignment->length) {
+ fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
+ goto XIT_BAD;
+ }
+ }
+
+ /* re-allocate if necessary */
+ /*
+ if(seq >= nseqs) {
+ nseqs *= 2;
+ alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
+ if(!alignment->data) {
+ fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
+ goto XIT_BAD;
+ }
+ }
+ */
+
+ /* copy buf to alignment */
+ for(i=1;i<=alignment->length;i++) {
+ alignment->data[seq*alignment->length+i-1] = buf[i];
+ }
+
+ state = NJ_FASTA_MODE_TITLE;
+ index = 1;
+ x = 1;
+
+ buf[0] = c;
+
+ seq++;
+
+ } else {
+
+ if(NJ_is_whitespace(c)) {
+ break;
+ }
+
+ if(!first_sequence_flag) {
+ if(index-1 >= alignment->length) {
+ fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
+ goto XIT_BAD;
+ }
+ }
+
+
+ /*
+ * Here we check to make sure that the symbol read is appropriate
+ * for the type of data the user specified. (dna or protein).
+ * We also handle ambiguity codes by converting them to a specific
+ * assigned ambiguity code character. Ambiguity codes are ignored
+ * when computing distances
+ */
+
+ if(nj_args->dna_flag) {
+ if(NJ_is_dna(c)) {
+ buf[index++] = toupper(c);
+ } else {
+ if(NJ_is_dna_ambiguity(c)) {
+ buf[index++] = NJ_AMBIGUITY_CHAR;
+ } else {
+ fprintf(stderr, "Clearcut: Unknown symbol '%c' in nucleotide sequence %ld.\n", c, seq);
+ goto XIT_BAD;
+ }
+ }
+ } else if(nj_args->protein_flag) {
+ if(NJ_is_protein(c)) {
+ buf[index++] = toupper(c);
+ } else {
+ if(NJ_is_protein_ambiguity(c)) {
+ buf[index++] = NJ_AMBIGUITY_CHAR;
+ } else {
+ fprintf(stderr, "Clearcut: Unknown symbol '%c' in protein sequence %ld.\n", c, seq);
+ goto XIT_BAD;
+ }
+ }
+ }
+
+ }
+
+ break;
+
+ default:
+ goto XIT_BAD;
+
+ break;
+
+ }
+ }
+
+ if(index-1 != alignment->length) {
+ fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
+ goto XIT_BAD;
+ }
+
+ /* check for duplicate taxon labels */
+ if(!NJ_taxaname_unique(alignment)) {
+ goto XIT_BAD;
+ }
+
+ return(alignment);
+
+
+ XIT_BAD:
+
+ if(fp) {
+ fprintf(stderr, "Clearcut: Fatal error parsing FASTA file at file offset %ld.\n", ftell(fp));
+ }
+
+ if(buf) {
+ free(buf);
+ }
+
+ NJ_free_alignment(alignment);
+
+ return(NULL);
+}
+
+
+
+
+/*
+ * NJ_print_alignment() - Print multiple sequence alignment (for debugging)
+ *
+ * INPUTS:
+ * -------
+ * alignment -- A pointer to the alignment
+ *
+ * RETURNS:
+ * --------
+ * NONE
+ *
+ */
+void
+NJ_print_alignment(NJ_alignment *alignment) {
+
+ long int i, j;
+
+ printf("nseq = %ld, length = %ld\n", alignment->nseq, alignment->length);
+
+ for(i=0;i<alignment->nseq;i++) {
+
+ printf("> %s\n", alignment->titles[i]);
+
+ for(j=0;j<alignment->length;j++) {
+ printf("%c", alignment->data[i*alignment->length+j]);
+ }
+
+ printf("\n");
+ }
+
+ return;
+}
+
+
+
+
+
+
+
+/*
+ *
+ * NJ_free_alignment() - Free all of the memory allocated for the
+ * multiple sequence alignment
+ *
+ * INPUTS:
+ * -------
+ * alignment -- A pointer to the multiple sequence alignment
+ *
+ * RETURNS:
+ * --------
+ * NONE
+ *
+ */
+void
+NJ_free_alignment(NJ_alignment *alignment) {
+
+ long int i;
+
+ if(alignment) {
+
+ /* free the allocated titles */
+ if(alignment->titles) {
+ for(i=0;i<alignment->nseq;i++) {
+
+ if(alignment->titles[i]) {
+ free(alignment->titles[i]);
+ }
+ }
+
+ free(alignment->titles);
+ }
+
+ /* free the alignment data */
+ if(alignment->data) {
+ free(alignment->data);
+ }
+
+ /* free the alignment itself */
+ free(alignment);
+ }
+
+ return;
+}
+
+
+
+
+/*
+ * NJ_taxaname_unique() - Check to see if taxanames are unique in alignment
+ *
+ * INPUTS:
+ * -------
+ * alignment -- a pointer to a multiple sequence alignment
+ *
+ * OUTPUTS:
+ * --------
+ * int -- 0 if all taxanames in alignment are unique
+ * 1 if all taxanames in alignment are NOT unique
+ *
+ *
+ * DESCRIPTION:
+ * ------------
+ *
+ * Check to see if the taxanames in the alignment are unique. It
+ * will be impossible to make sense of the final tree if the taxon
+ * labels are not unqiue.
+ *
+ */
+int
+NJ_taxaname_unique(NJ_alignment *alignment) {
+
+ long int i, j;
+
+ for(i=0;i<alignment->nseq;i++) {
+ for(j=i+1;j<alignment->nseq;j++) {
+ if(!strcmp(alignment->titles[i],
+ alignment->titles[j])) {
+ fprintf(stderr, "Clearcut: Taxa %ld and %ld (%s) do not have unique taxon labels.\n",
+ i, j, alignment->titles[i]);
+ return(0);
+ }
+ }
+ }
+
+ return(1);
+}
+
+
+void
+NJ_print_titles(NJ_alignment *alignment) {
+
+ int i;
+
+ for(i=0;i<alignment->nseq;i++) {
+ printf("%d: %s\n", i, alignment->titles[i]);
+ }
+
+ return;
+}