X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=fasta.cpp;fp=fasta.cpp;h=0000000000000000000000000000000000000000;hb=4a877efa127e56e81a21f53cfdbbfd3bfbe8c4ff;hp=1dc17fcfe1aa9703c1126cfc913e6155c8c6208a;hpb=a6cf29fa4dac0909c7582cb1094151d34093ee76;p=mothur.git diff --git a/fasta.cpp b/fasta.cpp deleted file mode 100644 index 1dc17fc..0000000 --- a/fasta.cpp +++ /dev/null @@ -1,755 +0,0 @@ -/* - * fasta.c - * - * $Id$ - * - ***************************************************************************** - * - * Copyright (c) 2004, Luke Sheneman - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * + Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * + Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * + The names of its contributors may not be used to endorse or promote - * products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - ***************************************************************************** - * - * Functions for parsing FASTA formatted alignment files - * - ***************************************************************************** - * - * AUTHOR: - * - * Luke Sheneman - * sheneman@cs.uidaho.edu - * - */ - -#include -#include -#include -#include - -#include "clearcut.h" -#include "common.h" -#include "fasta.h" - - -#define NJ_NUM_DNA_AMBIGUITY_SYMS 14 -static const char NJ_dna_ambiguity_syms[NJ_NUM_DNA_AMBIGUITY_SYMS] = -{ - 'M', 'R', 'W', 'S', 'Y', 'K', - 'V', 'H', 'D', 'B', 'X', 'N', - '-', '.' -}; - - -#define NJ_NUM_PROTEIN_AMBIGUITY_SYMS 6 -static const char NJ_protein_ambiguity_syms[NJ_NUM_PROTEIN_AMBIGUITY_SYMS] = -{ - 'X', 'B', 'Z', '*', '-', '.' -}; - -#define NJ_NUM_DNA_SYMS 5 -static const char NJ_dna_syms[NJ_NUM_DNA_SYMS] = -{ - 'A', 'G', 'C', 'T', 'U' -}; - - -#define NJ_NUM_PROTEIN_SYMS 20 -static const char NJ_protein_syms[NJ_NUM_PROTEIN_SYMS] = -{ - 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', - 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V' -}; - - - - - -/* - * NJ_is_whitespace() - Check to see if character is whitespace - * - * INPUTS: - * ------- - * c -- character to check - * - * RETURNS: - * -------- - * int -- 0 if not whitespace - * 1 if whitespace - */ -static inline -int -NJ_is_whitespace(char c) { - if( c == ' ' || /* space */ - c == '\n' || /* newline */ - c == '\r' || /* carriage-return */ - c == '\v' || /* vertical tab */ - c == '\f' || /* form feed */ - c == '\t' ) { /* horizontal tab */ - return(1); - } else { - return(0); - } -} - - - - - -/* - * NJ_is_dna() - - * - * Determines if the given symbol is DNA - * - * RETURNS: 1 if DNA - * 0 if not DNA - * - */ -static inline -int -NJ_is_dna(char c) { - - int i; - char up_c; - - up_c = toupper(c); - - for(i=0;i sequence1 - * ATAGATATAGATTAGAATAT----TATAGATAT----ATATAT-TTT- - * > sequence2 - * --ATAGATA---ATATATATATTTT--GTCTCATAGT---ATATGCTT - * > sequence3 - * TTATAGATA---ATATATATATTTTAAGTCTCATAGT-A-ATATGC-- - * - * This function will parse alignments for DNA or protein, and will do - * so mindful of ambiguity codes for these kinds of sequences. All - * ambiguity codes are ignored by this program for the purposes of - * computing a distance matrix from a multiple alignment. By design, - * this program does not auto-detect DNA vs. Protein, and requires that - * the user explictly specify that on the command-line. - * - * Gaps can be represented either with the '-' or '.' characters. - * - * Newlines and other whitespace are allowed to be interspersed - * throughout the sequences. - * - * Taxon labels are required to be unique, and they must start with - * an alphabetic character (not a number, etc.). The parser will read - * the first token after the > character in the description line up until the - * first whitespace and use that for the taxon label. - * - * For example, in the line "> taxon1 is homo sapien", the taxon label will be - * "taxon1" - * - */ -NJ_alignment * -NJ_read_fasta(NJ_ARGS *nj_args) { - - FILE *fp = NULL; - char *buf = NULL; - char *ptr = NULL; - NJ_alignment *alignment = NULL; - - char c; - int state; - long int index, x, seq; - long int i; - long int bufsize, nseqs = NJ_INITIAL_NSEQS; - int first_sequence_flag; - - - - - /* - * In this function, we implement a FASTA alignment parser which - * reads in an alignment character-by-character, maintaining state - * information which guides the parser. - * - * The program reads either DNA or Protein alignments. All title lines - * and sequences can be arbitrarily long. Gaps can be represented by - * "-" or "." characters. - * - * Ambiguity codes are also handled. - * - */ - - /* - * We can't handle reading fasta input unless the user explicity - * specifies the input type...just to be sure. - */ - if( (!nj_args->dna_flag && !nj_args->protein_flag) || - (nj_args->dna_flag && nj_args->protein_flag) ) { - fprintf(stderr, "Clearcut: Explicitly specify protein or DNA\n"); - goto XIT_BAD; - } - - /* open specified fasta file here */ - if(nj_args->stdin_flag) { - fp = stdin; - } else { - fp = fopen(nj_args->infilename, "r"); - if(!fp) { - fprintf(stderr, "Clearcut: Failed to open input FASTA file: %s\n", nj_args->infilename); - perror("Clearcut"); - goto XIT_BAD; - } - } - - /* allocate the initial buffer */ - bufsize = NJ_INITIAL_BUFFER_SIZE; - buf = (char *)calloc(bufsize, sizeof(char)); - - /* allocate the alignment container here */ - alignment = (NJ_alignment *)calloc(1, sizeof(NJ_alignment)); - - /* allocate initial title array */ - // printf("allocating initial title array\n"); - alignment->titles = (char **)calloc(NJ_INITIAL_NSEQS, sizeof(char *)); - - /* make sure that we successfully allocated memory */ - if(!buf || !alignment || !alignment->titles) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - - /* a flag */ - first_sequence_flag = 1; - - index = 0; /* tracks the position in buffer */ - x = 0; /* tracks the position on sequence */ - seq = 0; /* tracks the active sequence */ - - /* intitial state of state machine */ - state = NJ_FASTA_MODE_UNKNOWN; - - while(1) { - - /* get the next character */ - c = fgetc(fp); - if(feof(fp)) { - - if(state == NJ_FASTA_MODE_SEQUENCE) { - buf[index+1] = '\0'; - - /* copy buf to alignment */ - for(i=1;i<=alignment->length;i++) { - alignment->data[seq*alignment->length+i-1] = buf[i]; - } - } - - break; - } - - /* make sure our dynamic buffer is big enough */ - if(index >= bufsize) { - bufsize *= 2; - buf = (char *)realloc(buf, bufsize); - if(!buf) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - } - - switch(state) { - - case NJ_FASTA_MODE_UNKNOWN: - - if(!NJ_is_whitespace(c)) { - if(c == '>') { - state = NJ_FASTA_MODE_TITLE; - buf[0] = '>'; - } else { - goto XIT_BAD; - } - } - - break; - - case NJ_FASTA_MODE_TITLE: - - if( c == '\n' || - c == '\r' ) { - - buf[index] = '\0'; - state = NJ_FASTA_MODE_SEQUENCE; - index = 0; - x = -1; - - /* make sure we've allocated enough space for titles and sequences */ - if(seq == nseqs) { - - // printf("realloc(). seq = %d, nseqs = %d\n", seq, nseqs); - - nseqs *= 2; - - alignment->titles = (char **)realloc(alignment->titles, nseqs*sizeof(char *)); - if(!alignment->titles) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - - alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char)); - if(!alignment->data) { - fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - } - - // printf("Allocating %d bytes for title %d: %s\n", (int)strlen(buf), (int)seq, buf); - - alignment->titles[seq] = (char *)calloc(strlen(buf), sizeof(char)); - if(!alignment->titles[seq]) { - fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - - /* lets forward to the first non-space (space/tab) character after the '>' */ - - if(first_sequence_flag) { - ptr = buf; - } else { - ptr = &buf[1]; - } - while(*ptr == '\t' || *ptr == ' ') { - ptr++; - } - sscanf(ptr, "%s", alignment->titles[seq]); /* get the first word and use as the title */ - - alignment->nseq++; - } - - buf[index++] = c; - - break; - - - case NJ_FASTA_MODE_SEQUENCE: - - if(c == '>') { - - if(first_sequence_flag) { - first_sequence_flag = 0; - - /* allocate our alignment data section here */ - alignment->length = index-1; - - nseqs = NJ_INITIAL_NSEQS; - alignment->data = (char *)calloc(alignment->length*nseqs, sizeof(char)); - if(!alignment->data) { - fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - } - - if(!first_sequence_flag) { - if(index-1 < alignment->length) { - fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); - goto XIT_BAD; - } - } - - /* re-allocate if necessary */ - /* - if(seq >= nseqs) { - nseqs *= 2; - alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char)); - if(!alignment->data) { - fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n"); - goto XIT_BAD; - } - } - */ - - /* copy buf to alignment */ - for(i=1;i<=alignment->length;i++) { - alignment->data[seq*alignment->length+i-1] = buf[i]; - } - - state = NJ_FASTA_MODE_TITLE; - index = 1; - x = 1; - - buf[0] = c; - - seq++; - - } else { - - if(NJ_is_whitespace(c)) { - break; - } - - if(!first_sequence_flag) { - if(index-1 >= alignment->length) { - fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); - goto XIT_BAD; - } - } - - - /* - * Here we check to make sure that the symbol read is appropriate - * for the type of data the user specified. (dna or protein). - * We also handle ambiguity codes by converting them to a specific - * assigned ambiguity code character. Ambiguity codes are ignored - * when computing distances - */ - - if(nj_args->dna_flag) { - if(NJ_is_dna(c)) { - buf[index++] = toupper(c); - } else { - if(NJ_is_dna_ambiguity(c)) { - buf[index++] = NJ_AMBIGUITY_CHAR; - } else { - fprintf(stderr, "Clearcut: Unknown symbol '%c' in nucleotide sequence %ld.\n", c, seq); - goto XIT_BAD; - } - } - } else if(nj_args->protein_flag) { - if(NJ_is_protein(c)) { - buf[index++] = toupper(c); - } else { - if(NJ_is_protein_ambiguity(c)) { - buf[index++] = NJ_AMBIGUITY_CHAR; - } else { - fprintf(stderr, "Clearcut: Unknown symbol '%c' in protein sequence %ld.\n", c, seq); - goto XIT_BAD; - } - } - } - - } - - break; - - default: - goto XIT_BAD; - - break; - - } - } - - if(index-1 != alignment->length) { - fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq); - goto XIT_BAD; - } - - /* check for duplicate taxon labels */ - if(!NJ_taxaname_unique(alignment)) { - goto XIT_BAD; - } - - return(alignment); - - - XIT_BAD: - - if(fp) { - fprintf(stderr, "Clearcut: Fatal error parsing FASTA file at file offset %ld.\n", ftell(fp)); - } - - if(buf) { - free(buf); - } - - NJ_free_alignment(alignment); - - return(NULL); -} - - - - -/* - * NJ_print_alignment() - Print multiple sequence alignment (for debugging) - * - * INPUTS: - * ------- - * alignment -- A pointer to the alignment - * - * RETURNS: - * -------- - * NONE - * - */ -void -NJ_print_alignment(NJ_alignment *alignment) { - - long int i, j; - - printf("nseq = %ld, length = %ld\n", alignment->nseq, alignment->length); - - for(i=0;inseq;i++) { - - printf("> %s\n", alignment->titles[i]); - - for(j=0;jlength;j++) { - printf("%c", alignment->data[i*alignment->length+j]); - } - - printf("\n"); - } - - return; -} - - - - - - - -/* - * - * NJ_free_alignment() - Free all of the memory allocated for the - * multiple sequence alignment - * - * INPUTS: - * ------- - * alignment -- A pointer to the multiple sequence alignment - * - * RETURNS: - * -------- - * NONE - * - */ -void -NJ_free_alignment(NJ_alignment *alignment) { - - long int i; - - if(alignment) { - - /* free the allocated titles */ - if(alignment->titles) { - for(i=0;inseq;i++) { - - if(alignment->titles[i]) { - free(alignment->titles[i]); - } - } - - free(alignment->titles); - } - - /* free the alignment data */ - if(alignment->data) { - free(alignment->data); - } - - /* free the alignment itself */ - free(alignment); - } - - return; -} - - - - -/* - * NJ_taxaname_unique() - Check to see if taxanames are unique in alignment - * - * INPUTS: - * ------- - * alignment -- a pointer to a multiple sequence alignment - * - * OUTPUTS: - * -------- - * int -- 0 if all taxanames in alignment are unique - * 1 if all taxanames in alignment are NOT unique - * - * - * DESCRIPTION: - * ------------ - * - * Check to see if the taxanames in the alignment are unique. It - * will be impossible to make sense of the final tree if the taxon - * labels are not unqiue. - * - */ -int -NJ_taxaname_unique(NJ_alignment *alignment) { - - long int i, j; - - for(i=0;inseq;i++) { - for(j=i+1;jnseq;j++) { - if(!strcmp(alignment->titles[i], - alignment->titles[j])) { - fprintf(stderr, "Clearcut: Taxa %ld and %ld (%s) do not have unique taxon labels.\n", - i, j, alignment->titles[i]); - return(0); - } - } - } - - return(1); -} - - -void -NJ_print_titles(NJ_alignment *alignment) { - - int i; - - for(i=0;inseq;i++) { - printf("%d: %s\n", i, alignment->titles[i]); - } - - return; -}