X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=fasta.cpp;fp=fasta.cpp;h=0000000000000000000000000000000000000000;hb=4a877efa127e56e81a21f53cfdbbfd3bfbe8c4ff;hp=1dc17fcfe1aa9703c1126cfc913e6155c8c6208a;hpb=a6cf29fa4dac0909c7582cb1094151d34093ee76;p=mothur.git

diff --git a/fasta.cpp b/fasta.cpp
deleted file mode 100644
index 1dc17fc..0000000
--- a/fasta.cpp
+++ /dev/null
@@ -1,755 +0,0 @@
-/*
- * fasta.c
- *
- * $Id$
- *
- *****************************************************************************
- *
- * Copyright (c) 2004,  Luke Sheneman
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without 
- * modification, are permitted provided that the following conditions 
- * are met:
- * 
- *  + Redistributions of source code must retain the above copyright 
- *    notice, this list of conditions and the following disclaimer. 
- *  + Redistributions in binary form must reproduce the above copyright 
- *    notice, this list of conditions and the following disclaimer in 
- *    the documentation and/or other materials provided with the 
- *    distribution. 
- *  + The names of its contributors may not be used to endorse or promote 
- *    products derived  from this software without specific prior 
- *    written permission. 
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
- * POSSIBILITY OF SUCH DAMAGE.  
- *
- *****************************************************************************
- *
- * Functions for parsing FASTA formatted alignment files
- *
- *****************************************************************************
- *
- * AUTHOR:
- * 
- *   Luke Sheneman
- *   sheneman@cs.uidaho.edu
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-
-#include "clearcut.h"
-#include "common.h"
-#include "fasta.h"
-
-
-#define NJ_NUM_DNA_AMBIGUITY_SYMS 14
-static const char NJ_dna_ambiguity_syms[NJ_NUM_DNA_AMBIGUITY_SYMS] = 
-{
-  'M', 'R', 'W', 'S', 'Y', 'K',
-  'V', 'H', 'D', 'B', 'X', 'N',
-  '-', '.'
-};
-
-
-#define NJ_NUM_PROTEIN_AMBIGUITY_SYMS 6
-static const char NJ_protein_ambiguity_syms[NJ_NUM_PROTEIN_AMBIGUITY_SYMS] =
-{
-  'X', 'B', 'Z', '*', '-', '.'
-};
-
-#define NJ_NUM_DNA_SYMS 5
-static const char NJ_dna_syms[NJ_NUM_DNA_SYMS] = 
-{
-  'A', 'G', 'C', 'T', 'U'
-};
-
-
-#define NJ_NUM_PROTEIN_SYMS 20
-static const char NJ_protein_syms[NJ_NUM_PROTEIN_SYMS] = 
-{
-  'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 
-  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'
-};
-
-
-
-
-
-/*
- * NJ_is_whitespace() - Check to see if character is whitespace
- *
- * INPUTS:
- * -------
- *     c -- character to check
- *
- * RETURNS:
- * --------
- *    int -- 0 if not whitespace
- *           1 if whitespace
- */
-static inline
-int
-NJ_is_whitespace(char c) {
-  if( c == ' '  ||   /* space           */
-      c == '\n' ||   /* newline         */
-      c == '\r' ||   /* carriage-return */
-      c == '\v' ||   /* vertical tab    */
-      c == '\f' ||   /* form feed       */
-      c == '\t' ) {  /* horizontal tab  */
-    return(1);
-  } else {
-    return(0);
-  }
-}
-
-
-
-
-
-/*
- * NJ_is_dna() - 
- *
- * Determines if the given symbol is DNA
- *
- * RETURNS: 1 if DNA
- *          0 if not DNA
- *
- */
-static inline
-int
-NJ_is_dna(char c) {
-
-  int i;
-  char up_c;
-  
-  up_c = toupper(c);
-  
-  for(i=0;i<NJ_NUM_DNA_SYMS;i++) {
-    if(up_c == NJ_dna_syms[i]) {
-      return(1);
-    }
-  }
-
-  return(0);
-}
-
-
-
-
-
-/*
- * NJ_is_dna_ambiguity() - 
- *
- * Determines if the given symbol is a 
- * DNA ambiguity code
- *
- * RETURNS: 1 if DNA Ambiguity Code
- *          0 if not DNA Ambiguity Code
- *
- */
-static inline
-int
-NJ_is_dna_ambiguity(char c) {
-  
-  int i;
-  char up_c;
-  
-  up_c = toupper(c);
-  
-  for(i=0;i<NJ_NUM_DNA_AMBIGUITY_SYMS;i++) {
-    if(up_c == NJ_dna_ambiguity_syms[i]) {
-      return(1);
-    }
-  }
-  
-  return(0);
-}
-
-
-
-/*
- * NJ_is_protein() - 
- *
- * Determines if supplied symbol is amino acid symbol
- *
- * RETURNS: 1 if amino acid
- *          0 if not amino acid
- *
- */
-static inline
-int
-NJ_is_protein(char c) {
-
-  int i;
-  char up_c;
-  
-  up_c = toupper(c);
-  
-  for(i=0;i<NJ_NUM_PROTEIN_SYMS;i++) {
-    if(up_c == NJ_protein_syms[i]) {
-      return(1);
-    }
-  }
-  
-  return(0);
-}
-
-
-
-
-/*
- * NJ_is_protein_ambiguity() - 
- *
- * Determines if supplied symbol is amino acid ambiguity symbol
- *
- * RETURNS: 1 if amino acid ambiguity symbol
- *          0 if not amino acid ambiguity symbol
- *
- */
-static inline
-int 
-NJ_is_protein_ambiguity(char c) {
-
-  int i;
-  char up_c;
-  
-  up_c = toupper(c);
-
-  for(i=0;i<NJ_NUM_PROTEIN_AMBIGUITY_SYMS;i++) {
-    if(up_c == NJ_protein_ambiguity_syms[i]) {
-      return(1);
-    }
-  }
-  
-  return(0);
-}
-
-
-
-
-
-
-/*
- * NJ_read_fasta() - A function for inputing FASTA sequences into an alignment
- *                   data structure
- *
- *
- * INPUTS:
- * -------
- *   nj_args -- A pointer to a data structure containing command-line args
- *
- * RETURNS:
- * --------
- *   NJ_alignment * -- A pointer to a multiple sequence alignment
- *
- * DESCRIPTION:
- * ------------
- *
- * 
- * This function implements a state machine parser for parsing FASTA-formatted
- * multiple sequence alignment files.  
- * 
- * Example Input:
- *
- *   > sequence1
- *   ATAGATATAGATTAGAATAT----TATAGATAT----ATATAT-TTT-
- *   > sequence2 
- *   --ATAGATA---ATATATATATTTT--GTCTCATAGT---ATATGCTT
- *   > sequence3
- *   TTATAGATA---ATATATATATTTTAAGTCTCATAGT-A-ATATGC--
- * 
- * This function will parse alignments for DNA or protein, and will do
- * so mindful of ambiguity codes for these kinds of sequences.  All 
- * ambiguity codes are ignored by this program for the purposes of 
- * computing a distance matrix from a multiple alignment.  By design, 
- * this program does not auto-detect DNA vs. Protein, and requires that 
- * the user explictly specify that on the command-line.
- *
- * Gaps can be represented either with the '-' or '.' characters.
- * 
- * Newlines and other whitespace are allowed to be interspersed 
- * throughout the sequences.
- *
- * Taxon labels are required to be unique, and they must start with 
- * an alphabetic character (not a number, etc.).  The parser will read
- * the first token after the > character in the description line up until the
- * first whitespace and use that for the taxon label.
- *
- * For example, in the line "> taxon1 is homo sapien", the taxon label will be 
- * "taxon1"
- *
- */
-NJ_alignment *
-NJ_read_fasta(NJ_ARGS *nj_args) {
-
-  FILE *fp  = NULL;
-  char *buf = NULL;
-  char *ptr = NULL;
-  NJ_alignment *alignment = NULL;
-
-  char c;
-  int state;
-  long int index, x, seq;
-  long int i;
-  long int bufsize, nseqs = NJ_INITIAL_NSEQS;
-  int first_sequence_flag;
-
-
-  
-
-  /* 
-   * In this function, we implement a FASTA alignment parser which
-   * reads in an alignment character-by-character, maintaining state
-   * information which guides the parser.
-   *
-   * The program reads either DNA or Protein alignments.  All title lines
-   * and sequences can be arbitrarily long.  Gaps can be represented by 
-   * "-" or "." characters.  
-   *
-   * Ambiguity codes are also handled.
-   * 
-   */
-
-  /* 
-   * We can't handle reading fasta input unless the user explicity 
-   * specifies the input type...just to be sure.
-   */
-  if( (!nj_args->dna_flag && !nj_args->protein_flag) ||
-      (nj_args->dna_flag  &&  nj_args->protein_flag) ) {
-    fprintf(stderr, "Clearcut: Explicitly specify protein or DNA\n");
-    goto XIT_BAD;
-  }
-
-  /* open specified fasta file here */
-  if(nj_args->stdin_flag) {
-    fp = stdin;
-  } else {
-    fp = fopen(nj_args->infilename, "r");
-    if(!fp) {
-      fprintf(stderr, "Clearcut: Failed to open input FASTA file: %s\n", nj_args->infilename);
-      perror("Clearcut");
-      goto XIT_BAD;
-    }
-  }
-
-  /* allocate the initial buffer */
-  bufsize = NJ_INITIAL_BUFFER_SIZE;
-  buf = (char *)calloc(bufsize, sizeof(char));
-  
-  /* allocate the alignment container here */
-  alignment = (NJ_alignment *)calloc(1, sizeof(NJ_alignment));
-  
-  /* allocate initial title array */
-  //  printf("allocating initial title array\n");
-  alignment->titles = (char **)calloc(NJ_INITIAL_NSEQS, sizeof(char *));
-
-  /* make sure that we successfully allocated memory */
-  if(!buf || !alignment || !alignment->titles) {
-    fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
-    goto XIT_BAD;
-  }
-
-  /* a flag */
-  first_sequence_flag = 1;  
-  
-  index  = 0;  /* tracks the position in buffer     */
-  x      = 0;  /* tracks the position on sequence   */
-  seq    = 0;  /* tracks the active sequence        */
-
-  /* intitial state of state machine */
-  state  = NJ_FASTA_MODE_UNKNOWN;
-
-  while(1) {
-    
-    /* get the next character */
-    c = fgetc(fp);
-    if(feof(fp)) {
-
-      if(state == NJ_FASTA_MODE_SEQUENCE) {
-	buf[index+1] = '\0';
-
-	/* copy buf to alignment */
-	for(i=1;i<=alignment->length;i++) {
-	  alignment->data[seq*alignment->length+i-1] = buf[i];
-	}
-      }
-
-      break;
-    }
-
-    /* make sure our dynamic buffer is big enough */
-    if(index >= bufsize) {
-      bufsize *= 2;
-      buf = (char *)realloc(buf, bufsize);
-      if(!buf) {
-	fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
-	goto XIT_BAD;
-      }
-    }
-
-    switch(state) {
-      
-    case NJ_FASTA_MODE_UNKNOWN:
-      
-      if(!NJ_is_whitespace(c)) {
-	if(c == '>') {
-	  state = NJ_FASTA_MODE_TITLE;
-	  buf[0] = '>';
-	} else {
-	  goto XIT_BAD;
-	}
-      }
-
-      break;
-
-    case NJ_FASTA_MODE_TITLE:
-
-      if( c == '\n' ||
-	  c == '\r' ) {
-
-	buf[index] = '\0';
-	state = NJ_FASTA_MODE_SEQUENCE;
-	index = 0;
-	x     = -1;
-
-	/* make sure we've allocated enough space for titles and sequences */
-	if(seq == nseqs) {
-
-	  //	  printf("realloc().  seq = %d, nseqs = %d\n", seq, nseqs);
-
-	  nseqs *= 2;
-
-	  alignment->titles = (char **)realloc(alignment->titles, nseqs*sizeof(char *));
-	  if(!alignment->titles) {
-	    fprintf(stderr, "Clearcut:  Memory allocation error in NJ_read_fasta()\n");
-	    goto XIT_BAD;
-	  }
-
-	  alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
-	  if(!alignment->data) {
-	    fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
-	    goto XIT_BAD;
-	  }
-	}
-
-	// printf("Allocating %d bytes for title %d: %s\n", (int)strlen(buf), (int)seq, buf);
-
-	alignment->titles[seq] = (char *)calloc(strlen(buf), sizeof(char));
-	if(!alignment->titles[seq]) {
-	  fprintf(stderr, "Clearcut:  Memory allocation error in NJ_read_fasta()\n");
-	  goto XIT_BAD;
-	}
-
-	/* lets forward to the first non-space (space/tab) character after the '>' */
-
-	if(first_sequence_flag) {
-	  ptr = buf;
-	} else {
-	  ptr = &buf[1];
-	}
-	while(*ptr == '\t' || *ptr == ' ') {
-	  ptr++;
-	}
-	sscanf(ptr, "%s", alignment->titles[seq]);  /* get the first word and use as the title */
-
-	alignment->nseq++;
-      }
-	
-      buf[index++] = c;
-
-      break;
-
-
-    case NJ_FASTA_MODE_SEQUENCE:
-
-      if(c == '>') {
-
-	if(first_sequence_flag) {
-	  first_sequence_flag = 0;
-
-	  /* allocate our alignment data section here */
-	  alignment->length = index-1;
-
-	  nseqs = NJ_INITIAL_NSEQS;
-	  alignment->data = (char *)calloc(alignment->length*nseqs, sizeof(char));
-	  if(!alignment->data) {
-	    fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
-	    goto XIT_BAD;
-	  }
-	} 
-	
-	if(!first_sequence_flag) {
-	  if(index-1 < alignment->length) {
-	    fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
-	    goto XIT_BAD;
-	  }
-	}
-
-	/* re-allocate if necessary */
-	/*
-	if(seq >= nseqs) {
-	  nseqs *= 2;
-	  alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
-	  if(!alignment->data) {
-	    fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
-	    goto XIT_BAD;
-	  }
-	}
-	*/
-
-	/* copy buf to alignment */
-	for(i=1;i<=alignment->length;i++) {
-	  alignment->data[seq*alignment->length+i-1] = buf[i];
-	}
-	  
-	state = NJ_FASTA_MODE_TITLE;
-	index = 1;
-	x     = 1;
-	  
-	buf[0] = c;
-	
-	seq++;
-
-      } else {
-	  
-	if(NJ_is_whitespace(c)) {
-	  break;
-	}
-		
-	if(!first_sequence_flag) {
-	  if(index-1 >= alignment->length) {
-	    fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
-	    goto XIT_BAD;
-	  }
-	}
-
-
-	/* 
-	 * Here we check to make sure that the symbol read is appropriate
-	 * for the type of data the user specified.  (dna or protein).
-	 * We also handle ambiguity codes by converting them to a specific
-	 * assigned ambiguity code character.  Ambiguity codes are ignored
-	 * when computing distances
-	 */
-
-	if(nj_args->dna_flag) {
-	  if(NJ_is_dna(c)) {
-	    buf[index++] = toupper(c);
-	  } else {
-	    if(NJ_is_dna_ambiguity(c)) {
-	      buf[index++] = NJ_AMBIGUITY_CHAR;
-	    } else {
-	      fprintf(stderr, "Clearcut: Unknown symbol '%c' in nucleotide sequence %ld.\n", c, seq);
-	      goto XIT_BAD;
-	    }
-	  }
-	} else if(nj_args->protein_flag) {
-	  if(NJ_is_protein(c)) {
-	    buf[index++] = toupper(c);
-	  } else {
-	    if(NJ_is_protein_ambiguity(c)) {
-	      buf[index++] = NJ_AMBIGUITY_CHAR;
-	    } else {
-	      fprintf(stderr, "Clearcut: Unknown symbol '%c' in protein sequence %ld.\n", c, seq);
-	      goto XIT_BAD;
-	    }
-	  }
-	}
-
-      }
-
-      break;
-	
-    default:
-      goto XIT_BAD;
-	
-      break;
-
-    }
-  }
-  
-  if(index-1 != alignment->length) {
-    fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
-    goto XIT_BAD;
-  }
-  
-  /* check for duplicate taxon labels */
-  if(!NJ_taxaname_unique(alignment)) {
-    goto XIT_BAD;
-  }
-  
-  return(alignment);
-
-  
- XIT_BAD:
-
-  if(fp) {
-    fprintf(stderr, "Clearcut: Fatal error parsing FASTA file at file offset %ld.\n", ftell(fp));
-  }
-  
-  if(buf) {
-    free(buf);
-  }
-  
-  NJ_free_alignment(alignment);
-
-  return(NULL);
-}
-
-
-
-
-/*
- * NJ_print_alignment() - Print multiple sequence alignment (for debugging)
- *
- * INPUTS:
- * -------
- *    alignment -- A pointer to the alignment
- *
- * RETURNS:
- * --------
- *    NONE
- *
- */
-void
-NJ_print_alignment(NJ_alignment *alignment) {
-  
-  long int i, j;
-  
-  printf("nseq = %ld, length = %ld\n", alignment->nseq, alignment->length);
-  
-  for(i=0;i<alignment->nseq;i++) {
-    
-    printf("> %s\n", alignment->titles[i]);
-
-    for(j=0;j<alignment->length;j++) {
-      printf("%c", alignment->data[i*alignment->length+j]);
-    }
-
-    printf("\n");
-  }
-
-  return;
-}
-
-
-
-
-
-
-
-/*
- *
- * NJ_free_alignment() - Free all of the memory allocated for the
- *                       multiple sequence alignment 
- *
- * INPUTS:
- * -------
- *   alignment -- A pointer to the multiple sequence alignment
- *
- * RETURNS:
- * --------
- *    NONE
- *
- */
-void
-NJ_free_alignment(NJ_alignment *alignment) {
-  
-  long int i;
-  
-  if(alignment) {
-
-    /* free the allocated titles */
-    if(alignment->titles) {
-      for(i=0;i<alignment->nseq;i++) {
-
-	if(alignment->titles[i]) {
-	  free(alignment->titles[i]);
-	}
-      }
-      
-      free(alignment->titles);
-    }
-
-    /* free the alignment data */
-    if(alignment->data) {
-      free(alignment->data);
-    }
-
-    /* free the alignment itself */
-    free(alignment);
-  }
-
-  return;
-}
-
-
-
-
-/*
- * NJ_taxaname_unique() - Check to see if taxanames are unique in alignment
- *
- * INPUTS:
- * -------
- *  alignment -- a pointer to a multiple sequence alignment
- *
- * OUTPUTS:
- * --------
- *  int -- 0 if all taxanames in alignment are unique
- *         1 if all taxanames in alignment are NOT unique
- *
- *
- * DESCRIPTION:
- * ------------
- *
- * Check to see if the taxanames in the alignment are unique.  It
- * will be impossible to make sense of the final tree if the taxon
- * labels are not unqiue.
- *
- */
-int
-NJ_taxaname_unique(NJ_alignment *alignment) {
-  
-  long int i, j;
-  
-  for(i=0;i<alignment->nseq;i++) {
-    for(j=i+1;j<alignment->nseq;j++) {
-        if(!strcmp(alignment->titles[i], 
-		 alignment->titles[j])) {
-	fprintf(stderr, "Clearcut: Taxa %ld and %ld (%s) do not have unique taxon labels.\n", 
-		i, j, alignment->titles[i]);
-	return(0);
-      }
-    }
-  }
-  
-  return(1);
-}
-
-
-void
-NJ_print_titles(NJ_alignment *alignment) {
-
-  int i;
-
-  for(i=0;i<alignment->nseq;i++) {
-    printf("%d: %s\n", i, alignment->titles[i]);
-  }
-
-  return;
-}