6 *****************************************************************************
8 * Copyright (c) 2004, Luke Sheneman
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
15 * + Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * + Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in
19 * the documentation and/or other materials provided with the
21 * + The names of its contributors may not be used to endorse or promote
22 * products derived from this software without specific prior
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
37 *****************************************************************************
39 * Functions for parsing FASTA formatted alignment files
41 *****************************************************************************
46 * sheneman@cs.uidaho.edu
60 #define NJ_NUM_DNA_AMBIGUITY_SYMS 14
61 static const char NJ_dna_ambiguity_syms[NJ_NUM_DNA_AMBIGUITY_SYMS] =
63 'M', 'R', 'W', 'S', 'Y', 'K',
64 'V', 'H', 'D', 'B', 'X', 'N',
69 #define NJ_NUM_PROTEIN_AMBIGUITY_SYMS 6
70 static const char NJ_protein_ambiguity_syms[NJ_NUM_PROTEIN_AMBIGUITY_SYMS] =
72 'X', 'B', 'Z', '*', '-', '.'
75 #define NJ_NUM_DNA_SYMS 5
76 static const char NJ_dna_syms[NJ_NUM_DNA_SYMS] =
78 'A', 'G', 'C', 'T', 'U'
82 #define NJ_NUM_PROTEIN_SYMS 20
83 static const char NJ_protein_syms[NJ_NUM_PROTEIN_SYMS] =
85 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
86 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'
94 * NJ_is_whitespace() - Check to see if character is whitespace
98 * c -- character to check
102 * int -- 0 if not whitespace
107 NJ_is_whitespace(char c) {
108 if( c == ' ' || /* space */
109 c == '\n' || /* newline */
110 c == '\r' || /* carriage-return */
111 c == '\v' || /* vertical tab */
112 c == '\f' || /* form feed */
113 c == '\t' ) { /* horizontal tab */
127 * Determines if the given symbol is DNA
142 for(i=0;i<NJ_NUM_DNA_SYMS;i++) {
143 if(up_c == NJ_dna_syms[i]) {
156 * NJ_is_dna_ambiguity() -
158 * Determines if the given symbol is a
161 * RETURNS: 1 if DNA Ambiguity Code
162 * 0 if not DNA Ambiguity Code
167 NJ_is_dna_ambiguity(char c) {
174 for(i=0;i<NJ_NUM_DNA_AMBIGUITY_SYMS;i++) {
175 if(up_c == NJ_dna_ambiguity_syms[i]) {
188 * Determines if supplied symbol is amino acid symbol
190 * RETURNS: 1 if amino acid
191 * 0 if not amino acid
196 NJ_is_protein(char c) {
203 for(i=0;i<NJ_NUM_PROTEIN_SYMS;i++) {
204 if(up_c == NJ_protein_syms[i]) {
216 * NJ_is_protein_ambiguity() -
218 * Determines if supplied symbol is amino acid ambiguity symbol
220 * RETURNS: 1 if amino acid ambiguity symbol
221 * 0 if not amino acid ambiguity symbol
226 NJ_is_protein_ambiguity(char c) {
233 for(i=0;i<NJ_NUM_PROTEIN_AMBIGUITY_SYMS;i++) {
234 if(up_c == NJ_protein_ambiguity_syms[i]) {
248 * NJ_read_fasta() - A function for inputing FASTA sequences into an alignment
254 * nj_args -- A pointer to a data structure containing command-line args
258 * NJ_alignment * -- A pointer to a multiple sequence alignment
264 * This function implements a state machine parser for parsing FASTA-formatted
265 * multiple sequence alignment files.
270 * ATAGATATAGATTAGAATAT----TATAGATAT----ATATAT-TTT-
272 * --ATAGATA---ATATATATATTTT--GTCTCATAGT---ATATGCTT
274 * TTATAGATA---ATATATATATTTTAAGTCTCATAGT-A-ATATGC--
276 * This function will parse alignments for DNA or protein, and will do
277 * so mindful of ambiguity codes for these kinds of sequences. All
278 * ambiguity codes are ignored by this program for the purposes of
279 * computing a distance matrix from a multiple alignment. By design,
280 * this program does not auto-detect DNA vs. Protein, and requires that
281 * the user explictly specify that on the command-line.
283 * Gaps can be represented either with the '-' or '.' characters.
285 * Newlines and other whitespace are allowed to be interspersed
286 * throughout the sequences.
288 * Taxon labels are required to be unique, and they must start with
289 * an alphabetic character (not a number, etc.). The parser will read
290 * the first token after the > character in the description line up until the
291 * first whitespace and use that for the taxon label.
293 * For example, in the line "> taxon1 is homo sapien", the taxon label will be
298 NJ_read_fasta(NJ_ARGS *nj_args) {
303 NJ_alignment *alignment = NULL;
307 long int index, x, seq;
309 long int bufsize, nseqs = NJ_INITIAL_NSEQS;
310 int first_sequence_flag;
316 * In this function, we implement a FASTA alignment parser which
317 * reads in an alignment character-by-character, maintaining state
318 * information which guides the parser.
320 * The program reads either DNA or Protein alignments. All title lines
321 * and sequences can be arbitrarily long. Gaps can be represented by
322 * "-" or "." characters.
324 * Ambiguity codes are also handled.
329 * We can't handle reading fasta input unless the user explicity
330 * specifies the input type...just to be sure.
332 if( (!nj_args->dna_flag && !nj_args->protein_flag) ||
333 (nj_args->dna_flag && nj_args->protein_flag) ) {
334 fprintf(stderr, "Clearcut: Explicitly specify protein or DNA\n");
338 /* open specified fasta file here */
339 if(nj_args->stdin_flag) {
342 fp = fopen(nj_args->infilename, "r");
344 fprintf(stderr, "Clearcut: Failed to open input FASTA file: %s\n", nj_args->infilename);
350 /* allocate the initial buffer */
351 bufsize = NJ_INITIAL_BUFFER_SIZE;
352 buf = (char *)calloc(bufsize, sizeof(char));
354 /* allocate the alignment container here */
355 alignment = (NJ_alignment *)calloc(1, sizeof(NJ_alignment));
357 /* allocate initial title array */
358 // printf("allocating initial title array\n");
359 alignment->titles = (char **)calloc(NJ_INITIAL_NSEQS, sizeof(char *));
361 /* make sure that we successfully allocated memory */
362 if(!buf || !alignment || !alignment->titles) {
363 fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
368 first_sequence_flag = 1;
370 index = 0; /* tracks the position in buffer */
371 x = 0; /* tracks the position on sequence */
372 seq = 0; /* tracks the active sequence */
374 /* intitial state of state machine */
375 state = NJ_FASTA_MODE_UNKNOWN;
379 /* get the next character */
383 if(state == NJ_FASTA_MODE_SEQUENCE) {
386 /* copy buf to alignment */
387 for(i=1;i<=alignment->length;i++) {
388 alignment->data[seq*alignment->length+i-1] = buf[i];
395 /* make sure our dynamic buffer is big enough */
396 if(index >= bufsize) {
398 buf = (char *)realloc(buf, bufsize);
400 fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
407 case NJ_FASTA_MODE_UNKNOWN:
409 if(!NJ_is_whitespace(c)) {
411 state = NJ_FASTA_MODE_TITLE;
420 case NJ_FASTA_MODE_TITLE:
426 state = NJ_FASTA_MODE_SEQUENCE;
430 /* make sure we've allocated enough space for titles and sequences */
433 // printf("realloc(). seq = %d, nseqs = %d\n", seq, nseqs);
437 alignment->titles = (char **)realloc(alignment->titles, nseqs*sizeof(char *));
438 if(!alignment->titles) {
439 fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
443 alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
444 if(!alignment->data) {
445 fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
450 // printf("Allocating %d bytes for title %d: %s\n", (int)strlen(buf), (int)seq, buf);
452 alignment->titles[seq] = (char *)calloc(strlen(buf), sizeof(char));
453 if(!alignment->titles[seq]) {
454 fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
458 /* lets forward to the first non-space (space/tab) character after the '>' */
460 if(first_sequence_flag) {
465 while(*ptr == '\t' || *ptr == ' ') {
468 sscanf(ptr, "%s", alignment->titles[seq]); /* get the first word and use as the title */
478 case NJ_FASTA_MODE_SEQUENCE:
482 if(first_sequence_flag) {
483 first_sequence_flag = 0;
485 /* allocate our alignment data section here */
486 alignment->length = index-1;
488 nseqs = NJ_INITIAL_NSEQS;
489 alignment->data = (char *)calloc(alignment->length*nseqs, sizeof(char));
490 if(!alignment->data) {
491 fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
496 if(!first_sequence_flag) {
497 if(index-1 < alignment->length) {
498 fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
503 /* re-allocate if necessary */
507 alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
508 if(!alignment->data) {
509 fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
515 /* copy buf to alignment */
516 for(i=1;i<=alignment->length;i++) {
517 alignment->data[seq*alignment->length+i-1] = buf[i];
520 state = NJ_FASTA_MODE_TITLE;
530 if(NJ_is_whitespace(c)) {
534 if(!first_sequence_flag) {
535 if(index-1 >= alignment->length) {
536 fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
543 * Here we check to make sure that the symbol read is appropriate
544 * for the type of data the user specified. (dna or protein).
545 * We also handle ambiguity codes by converting them to a specific
546 * assigned ambiguity code character. Ambiguity codes are ignored
547 * when computing distances
550 if(nj_args->dna_flag) {
552 buf[index++] = toupper(c);
554 if(NJ_is_dna_ambiguity(c)) {
555 buf[index++] = NJ_AMBIGUITY_CHAR;
557 fprintf(stderr, "Clearcut: Unknown symbol '%c' in nucleotide sequence %ld.\n", c, seq);
561 } else if(nj_args->protein_flag) {
562 if(NJ_is_protein(c)) {
563 buf[index++] = toupper(c);
565 if(NJ_is_protein_ambiguity(c)) {
566 buf[index++] = NJ_AMBIGUITY_CHAR;
568 fprintf(stderr, "Clearcut: Unknown symbol '%c' in protein sequence %ld.\n", c, seq);
586 if(index-1 != alignment->length) {
587 fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
591 /* check for duplicate taxon labels */
592 if(!NJ_taxaname_unique(alignment)) {
602 fprintf(stderr, "Clearcut: Fatal error parsing FASTA file at file offset %ld.\n", ftell(fp));
609 NJ_free_alignment(alignment);
618 * NJ_print_alignment() - Print multiple sequence alignment (for debugging)
622 * alignment -- A pointer to the alignment
630 NJ_print_alignment(NJ_alignment *alignment) {
634 printf("nseq = %ld, length = %ld\n", alignment->nseq, alignment->length);
636 for(i=0;i<alignment->nseq;i++) {
638 printf("> %s\n", alignment->titles[i]);
640 for(j=0;j<alignment->length;j++) {
641 printf("%c", alignment->data[i*alignment->length+j]);
658 * NJ_free_alignment() - Free all of the memory allocated for the
659 * multiple sequence alignment
663 * alignment -- A pointer to the multiple sequence alignment
671 NJ_free_alignment(NJ_alignment *alignment) {
677 /* free the allocated titles */
678 if(alignment->titles) {
679 for(i=0;i<alignment->nseq;i++) {
681 if(alignment->titles[i]) {
682 free(alignment->titles[i]);
686 free(alignment->titles);
689 /* free the alignment data */
690 if(alignment->data) {
691 free(alignment->data);
694 /* free the alignment itself */
705 * NJ_taxaname_unique() - Check to see if taxanames are unique in alignment
709 * alignment -- a pointer to a multiple sequence alignment
713 * int -- 0 if all taxanames in alignment are unique
714 * 1 if all taxanames in alignment are NOT unique
720 * Check to see if the taxanames in the alignment are unique. It
721 * will be impossible to make sense of the final tree if the taxon
722 * labels are not unqiue.
726 NJ_taxaname_unique(NJ_alignment *alignment) {
730 for(i=0;i<alignment->nseq;i++) {
731 for(j=i+1;j<alignment->nseq;j++) {
732 if(!strcmp(alignment->titles[i],
733 alignment->titles[j])) {
734 fprintf(stderr, "Clearcut: Taxa %ld and %ld (%s) do not have unique taxon labels.\n",
735 i, j, alignment->titles[i]);
746 NJ_print_titles(NJ_alignment *alignment) {
750 for(i=0;i<alignment->nseq;i++) {
751 printf("%d: %s\n", i, alignment->titles[i]);