fasta.cpp

   1 /*
   2  * fasta.c
   3  *
   4  * $Id$
   5  *
   6  *****************************************************************************
   7  *
   8  * Copyright (c) 2004,  Luke Sheneman
   9  * All rights reserved.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  *
  15  *  + Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  *  + Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in
  19  *    the documentation and/or other materials provided with the
  20  *    distribution.
  21  *  + The names of its contributors may not be used to endorse or promote
  22  *    products derived  from this software without specific prior
  23  *    written permission.
  24  *
  25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  26  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  29  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  35  * POSSIBILITY OF SUCH DAMAGE.
  36  *
  37  *****************************************************************************
  38  *
  39  * Functions for parsing FASTA formatted alignment files
  40  *
  41  *****************************************************************************
  42  *
  43  * AUTHOR:
  44  *
  45  *   Luke Sheneman
  46  *   sheneman@cs.uidaho.edu
  47  *
  48  */
  49
  50 #include <stdio.h>
  51 #include <stdlib.h>
  52 #include <string.h>
  53 #include <ctype.h>
  54
  55 #include "clearcut.h"
  56 #include "common.h"
  57 #include "fasta.h"
  58
  59
  60 #define NJ_NUM_DNA_AMBIGUITY_SYMS 14
  61 static const char NJ_dna_ambiguity_syms[NJ_NUM_DNA_AMBIGUITY_SYMS] =
  62 {
  63   'M', 'R', 'W', 'S', 'Y', 'K',
  64   'V', 'H', 'D', 'B', 'X', 'N',
  65   '-', '.'
  66 };
  67
  68
  69 #define NJ_NUM_PROTEIN_AMBIGUITY_SYMS 6
  70 static const char NJ_protein_ambiguity_syms[NJ_NUM_PROTEIN_AMBIGUITY_SYMS] =
  71 {
  72   'X', 'B', 'Z', '*', '-', '.'
  73 };
  74
  75 #define NJ_NUM_DNA_SYMS 5
  76 static const char NJ_dna_syms[NJ_NUM_DNA_SYMS] =
  77 {
  78   'A', 'G', 'C', 'T', 'U'
  79 };
  80
  81
  82 #define NJ_NUM_PROTEIN_SYMS 20
  83 static const char NJ_protein_syms[NJ_NUM_PROTEIN_SYMS] =
  84 {
  85   'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
  86   'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'
  87 };
  88
  89
  90
  91
  92
  93 /*
  94  * NJ_is_whitespace() - Check to see if character is whitespace
  95  *
  96  * INPUTS:
  97  * -------
  98  *     c -- character to check
  99  *
 100  * RETURNS:
 101  * --------
 102  *    int -- 0 if not whitespace
 103  *           1 if whitespace
 104  */
 105 static inline
 106 int
 107 NJ_is_whitespace(char c) {
 108   if( c == ' '  ||   /* space           */
 109       c == '\n' ||   /* newline         */
 110       c == '\r' ||   /* carriage-return */
 111       c == '\v' ||   /* vertical tab    */
 112       c == '\f' ||   /* form feed       */
 113       c == '\t' ) {  /* horizontal tab  */
 114     return(1);
 115   } else {
 116     return(0);
 117   }
 118 }
 119
 120
 121
 122
 123
 124 /*
 125  * NJ_is_dna() -
 126  *
 127  * Determines if the given symbol is DNA
 128  *
 129  * RETURNS: 1 if DNA
 130  *          0 if not DNA
 131  *
 132  */
 133 static inline
 134 int
 135 NJ_is_dna(char c) {
 136
 137   int i;
 138   char up_c;
 139
 140   up_c = toupper(c);
 141
 142   for(i=0;i<NJ_NUM_DNA_SYMS;i++) {
 143     if(up_c == NJ_dna_syms[i]) {
 144       return(1);
 145     }
 146   }
 147
 148   return(0);
 149 }
 150
 151
 152
 153
 154
 155 /*
 156  * NJ_is_dna_ambiguity() -
 157  *
 158  * Determines if the given symbol is a
 159  * DNA ambiguity code
 160  *
 161  * RETURNS: 1 if DNA Ambiguity Code
 162  *          0 if not DNA Ambiguity Code
 163  *
 164  */
 165 static inline
 166 int
 167 NJ_is_dna_ambiguity(char c) {
 168
 169   int i;
 170   char up_c;
 171
 172   up_c = toupper(c);
 173
 174   for(i=0;i<NJ_NUM_DNA_AMBIGUITY_SYMS;i++) {
 175     if(up_c == NJ_dna_ambiguity_syms[i]) {
 176       return(1);
 177     }
 178   }
 179
 180   return(0);
 181 }
 182
 183
 184
 185 /*
 186  * NJ_is_protein() -
 187  *
 188  * Determines if supplied symbol is amino acid symbol
 189  *
 190  * RETURNS: 1 if amino acid
 191  *          0 if not amino acid
 192  *
 193  */
 194 static inline
 195 int
 196 NJ_is_protein(char c) {
 197
 198   int i;
 199   char up_c;
 200
 201   up_c = toupper(c);
 202
 203   for(i=0;i<NJ_NUM_PROTEIN_SYMS;i++) {
 204     if(up_c == NJ_protein_syms[i]) {
 205       return(1);
 206     }
 207   }
 208
 209   return(0);
 210 }
 211
 212
 213
 214
 215 /*
 216  * NJ_is_protein_ambiguity() -
 217  *
 218  * Determines if supplied symbol is amino acid ambiguity symbol
 219  *
 220  * RETURNS: 1 if amino acid ambiguity symbol
 221  *          0 if not amino acid ambiguity symbol
 222  *
 223  */
 224 static inline
 225 int
 226 NJ_is_protein_ambiguity(char c) {
 227
 228   int i;
 229   char up_c;
 230
 231   up_c = toupper(c);
 232
 233   for(i=0;i<NJ_NUM_PROTEIN_AMBIGUITY_SYMS;i++) {
 234     if(up_c == NJ_protein_ambiguity_syms[i]) {
 235       return(1);
 236     }
 237   }
 238
 239   return(0);
 240 }
 241
 242
 243
 244
 245
 246
 247 /*
 248  * NJ_read_fasta() - A function for inputing FASTA sequences into an alignment
 249  *                   data structure
 250  *
 251  *
 252  * INPUTS:
 253  * -------
 254  *   nj_args -- A pointer to a data structure containing command-line args
 255  *
 256  * RETURNS:
 257  * --------
 258  *   NJ_alignment * -- A pointer to a multiple sequence alignment
 259  *
 260  * DESCRIPTION:
 261  * ------------
 262  *
 263  *
 264  * This function implements a state machine parser for parsing FASTA-formatted
 265  * multiple sequence alignment files.
 266  *
 267  * Example Input:
 268  *
 269  *   > sequence1
 270  *   ATAGATATAGATTAGAATAT----TATAGATAT----ATATAT-TTT-
 271  *   > sequence2
 272  *   --ATAGATA---ATATATATATTTT--GTCTCATAGT---ATATGCTT
 273  *   > sequence3
 274  *   TTATAGATA---ATATATATATTTTAAGTCTCATAGT-A-ATATGC--
 275  *
 276  * This function will parse alignments for DNA or protein, and will do
 277  * so mindful of ambiguity codes for these kinds of sequences.  All
 278  * ambiguity codes are ignored by this program for the purposes of
 279  * computing a distance matrix from a multiple alignment.  By design,
 280  * this program does not auto-detect DNA vs. Protein, and requires that
 281  * the user explictly specify that on the command-line.
 282  *
 283  * Gaps can be represented either with the '-' or '.' characters.
 284  *
 285  * Newlines and other whitespace are allowed to be interspersed
 286  * throughout the sequences.
 287  *
 288  * Taxon labels are required to be unique, and they must start with
 289  * an alphabetic character (not a number, etc.).  The parser will read
 290  * the first token after the > character in the description line up until the
 291  * first whitespace and use that for the taxon label.
 292  *
 293  * For example, in the line "> taxon1 is homo sapien", the taxon label will be
 294  * "taxon1"
 295  *
 296  */
 297 NJ_alignment *
 298 NJ_read_fasta(NJ_ARGS *nj_args) {
 299
 300   FILE *fp  = NULL;
 301   char *buf = NULL;
 302   char *ptr = NULL;
 303   NJ_alignment *alignment = NULL;
 304
 305   char c;
 306   int state;
 307   long int index, x, seq;
 308   long int i;
 309   long int bufsize, nseqs = NJ_INITIAL_NSEQS;
 310   int first_sequence_flag;
 311
 312
 313
 314
 315   /*
 316    * In this function, we implement a FASTA alignment parser which
 317    * reads in an alignment character-by-character, maintaining state
 318    * information which guides the parser.
 319    *
 320    * The program reads either DNA or Protein alignments.  All title lines
 321    * and sequences can be arbitrarily long.  Gaps can be represented by
 322    * "-" or "." characters.
 323    *
 324    * Ambiguity codes are also handled.
 325    *
 326    */
 327
 328   /*
 329    * We can't handle reading fasta input unless the user explicity
 330    * specifies the input type...just to be sure.
 331    */
 332   if( (!nj_args->dna_flag && !nj_args->protein_flag) ||
 333       (nj_args->dna_flag  &&  nj_args->protein_flag) ) {
 334     fprintf(stderr, "Clearcut: Explicitly specify protein or DNA\n");
 335     goto XIT_BAD;
 336   }
 337
 338   /* open specified fasta file here */
 339   if(nj_args->stdin_flag) {
 340     fp = stdin;
 341   } else {
 342     fp = fopen(nj_args->infilename, "r");
 343     if(!fp) {
 344       fprintf(stderr, "Clearcut: Failed to open input FASTA file: %s\n", nj_args->infilename);
 345       perror("Clearcut");
 346       goto XIT_BAD;
 347     }
 348   }
 349
 350   /* allocate the initial buffer */
 351   bufsize = NJ_INITIAL_BUFFER_SIZE;
 352   buf = (char *)calloc(bufsize, sizeof(char));
 353
 354   /* allocate the alignment container here */
 355   alignment = (NJ_alignment *)calloc(1, sizeof(NJ_alignment));
 356
 357   /* allocate initial title array */
 358   //  printf("allocating initial title array\n");
 359   alignment->titles = (char **)calloc(NJ_INITIAL_NSEQS, sizeof(char *));
 360
 361   /* make sure that we successfully allocated memory */
 362   if(!buf || !alignment || !alignment->titles) {
 363     fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
 364     goto XIT_BAD;
 365   }
 366
 367   /* a flag */
 368   first_sequence_flag = 1;
 369
 370   index  = 0;  /* tracks the position in buffer     */
 371   x      = 0;  /* tracks the position on sequence   */
 372   seq    = 0;  /* tracks the active sequence        */
 373
 374   /* intitial state of state machine */
 375   state  = NJ_FASTA_MODE_UNKNOWN;
 376
 377   while(1) {
 378
 379     /* get the next character */
 380     c = fgetc(fp);
 381     if(feof(fp)) {
 382
 383       if(state == NJ_FASTA_MODE_SEQUENCE) {
 384         buf[index+1] = '\0';
 385
 386         /* copy buf to alignment */
 387         for(i=1;i<=alignment->length;i++) {
 388           alignment->data[seq*alignment->length+i-1] = buf[i];
 389         }
 390       }
 391
 392       break;
 393     }
 394
 395     /* make sure our dynamic buffer is big enough */
 396     if(index >= bufsize) {
 397       bufsize *= 2;
 398       buf = (char *)realloc(buf, bufsize);
 399       if(!buf) {
 400         fprintf(stderr, "Clearcut: Memory allocation error in NJ_read_fasta()\n");
 401         goto XIT_BAD;
 402       }
 403     }
 404
 405     switch(state) {
 406
 407     case NJ_FASTA_MODE_UNKNOWN:
 408
 409       if(!NJ_is_whitespace(c)) {
 410         if(c == '>') {
 411           state = NJ_FASTA_MODE_TITLE;
 412           buf[0] = '>';
 413         } else {
 414           goto XIT_BAD;
 415         }
 416       }
 417
 418       break;
 419
 420     case NJ_FASTA_MODE_TITLE:
 421
 422       if( c == '\n' ||
 423           c == '\r' ) {
 424
 425         buf[index] = '\0';
 426         state = NJ_FASTA_MODE_SEQUENCE;
 427         index = 0;
 428         x     = -1;
 429
 430         /* make sure we've allocated enough space for titles and sequences */
 431         if(seq == nseqs) {
 432
 433           //      printf("realloc().  seq = %d, nseqs = %d\n", seq, nseqs);
 434
 435           nseqs *= 2;
 436
 437           alignment->titles = (char **)realloc(alignment->titles, nseqs*sizeof(char *));
 438           if(!alignment->titles) {
 439             fprintf(stderr, "Clearcut:  Memory allocation error in NJ_read_fasta()\n");
 440             goto XIT_BAD;
 441           }
 442
 443           alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
 444           if(!alignment->data) {
 445             fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
 446             goto XIT_BAD;
 447           }
 448         }
 449
 450         // printf("Allocating %d bytes for title %d: %s\n", (int)strlen(buf), (int)seq, buf);
 451
 452         alignment->titles[seq] = (char *)calloc(strlen(buf), sizeof(char));
 453         if(!alignment->titles[seq]) {
 454           fprintf(stderr, "Clearcut:  Memory allocation error in NJ_read_fasta()\n");
 455           goto XIT_BAD;
 456         }
 457
 458         /* lets forward to the first non-space (space/tab) character after the '>' */
 459
 460         if(first_sequence_flag) {
 461           ptr = buf;
 462         } else {
 463           ptr = &buf[1];
 464         }
 465         while(*ptr == '\t' || *ptr == ' ') {
 466           ptr++;
 467         }
 468         sscanf(ptr, "%s", alignment->titles[seq]);  /* get the first word and use as the title */
 469
 470         alignment->nseq++;
 471       }
 472
 473       buf[index++] = c;
 474
 475       break;
 476
 477
 478     case NJ_FASTA_MODE_SEQUENCE:
 479
 480       if(c == '>') {
 481
 482         if(first_sequence_flag) {
 483           first_sequence_flag = 0;
 484
 485           /* allocate our alignment data section here */
 486           alignment->length = index-1;
 487
 488           nseqs = NJ_INITIAL_NSEQS;
 489           alignment->data = (char *)calloc(alignment->length*nseqs, sizeof(char));
 490           if(!alignment->data) {
 491             fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
 492             goto XIT_BAD;
 493           }
 494         }
 495
 496         if(!first_sequence_flag) {
 497           if(index-1 < alignment->length) {
 498             fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
 499             goto XIT_BAD;
 500           }
 501         }
 502
 503         /* re-allocate if necessary */
 504         /*
 505         if(seq >= nseqs) {
 506           nseqs *= 2;
 507           alignment->data = (char *)realloc(alignment->data, alignment->length*nseqs*sizeof(char));
 508           if(!alignment->data) {
 509             fprintf(stderr, "Clearcut: Allocation error in NJ_read_fasta()\n");
 510             goto XIT_BAD;
 511           }
 512         }
 513         */
 514
 515         /* copy buf to alignment */
 516         for(i=1;i<=alignment->length;i++) {
 517           alignment->data[seq*alignment->length+i-1] = buf[i];
 518         }
 519
 520         state = NJ_FASTA_MODE_TITLE;
 521         index = 1;
 522         x     = 1;
 523
 524         buf[0] = c;
 525
 526         seq++;
 527
 528       } else {
 529
 530         if(NJ_is_whitespace(c)) {
 531           break;
 532         }
 533
 534         if(!first_sequence_flag) {
 535           if(index-1 >= alignment->length) {
 536             fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
 537             goto XIT_BAD;
 538           }
 539         }
 540
 541
 542         /*
 543          * Here we check to make sure that the symbol read is appropriate
 544          * for the type of data the user specified.  (dna or protein).
 545          * We also handle ambiguity codes by converting them to a specific
 546          * assigned ambiguity code character.  Ambiguity codes are ignored
 547          * when computing distances
 548          */
 549
 550         if(nj_args->dna_flag) {
 551           if(NJ_is_dna(c)) {
 552             buf[index++] = toupper(c);
 553           } else {
 554             if(NJ_is_dna_ambiguity(c)) {
 555               buf[index++] = NJ_AMBIGUITY_CHAR;
 556             } else {
 557               fprintf(stderr, "Clearcut: Unknown symbol '%c' in nucleotide sequence %ld.\n", c, seq);
 558               goto XIT_BAD;
 559             }
 560           }
 561         } else if(nj_args->protein_flag) {
 562           if(NJ_is_protein(c)) {
 563             buf[index++] = toupper(c);
 564           } else {
 565             if(NJ_is_protein_ambiguity(c)) {
 566               buf[index++] = NJ_AMBIGUITY_CHAR;
 567             } else {
 568               fprintf(stderr, "Clearcut: Unknown symbol '%c' in protein sequence %ld.\n", c, seq);
 569               goto XIT_BAD;
 570             }
 571           }
 572         }
 573
 574       }
 575
 576       break;
 577
 578     default:
 579       goto XIT_BAD;
 580
 581       break;
 582
 583     }
 584   }
 585
 586   if(index-1 != alignment->length) {
 587     fprintf(stderr, "Clearcut: Sequences must be of uniform length in alignment at sequence %ld\n", seq);
 588     goto XIT_BAD;
 589   }
 590
 591   /* check for duplicate taxon labels */
 592   if(!NJ_taxaname_unique(alignment)) {
 593     goto XIT_BAD;
 594   }
 595
 596   return(alignment);
 597
 598
 599  XIT_BAD:
 600
 601   if(fp) {
 602     fprintf(stderr, "Clearcut: Fatal error parsing FASTA file at file offset %ld.\n", ftell(fp));
 603   }
 604
 605   if(buf) {
 606     free(buf);
 607   }
 608
 609   NJ_free_alignment(alignment);
 610
 611   return(NULL);
 612 }
 613
 614
 615
 616
 617 /*
 618  * NJ_print_alignment() - Print multiple sequence alignment (for debugging)
 619  *
 620  * INPUTS:
 621  * -------
 622  *    alignment -- A pointer to the alignment
 623  *
 624  * RETURNS:
 625  * --------
 626  *    NONE
 627  *
 628  */
 629 void
 630 NJ_print_alignment(NJ_alignment *alignment) {
 631
 632   long int i, j;
 633
 634   printf("nseq = %ld, length = %ld\n", alignment->nseq, alignment->length);
 635
 636   for(i=0;i<alignment->nseq;i++) {
 637
 638     printf("> %s\n", alignment->titles[i]);
 639
 640     for(j=0;j<alignment->length;j++) {
 641       printf("%c", alignment->data[i*alignment->length+j]);
 642     }
 643
 644     printf("\n");
 645   }
 646
 647   return;
 648 }
 649
 650
 651
 652
 653
 654
 655
 656 /*
 657  *
 658  * NJ_free_alignment() - Free all of the memory allocated for the
 659  *                       multiple sequence alignment
 660  *
 661  * INPUTS:
 662  * -------
 663  *   alignment -- A pointer to the multiple sequence alignment
 664  *
 665  * RETURNS:
 666  * --------
 667  *    NONE
 668  *
 669  */
 670 void
 671 NJ_free_alignment(NJ_alignment *alignment) {
 672
 673   long int i;
 674
 675   if(alignment) {
 676
 677     /* free the allocated titles */
 678     if(alignment->titles) {
 679       for(i=0;i<alignment->nseq;i++) {
 680
 681         if(alignment->titles[i]) {
 682           free(alignment->titles[i]);
 683         }
 684       }
 685
 686       free(alignment->titles);
 687     }
 688
 689     /* free the alignment data */
 690     if(alignment->data) {
 691       free(alignment->data);
 692     }
 693
 694     /* free the alignment itself */
 695     free(alignment);
 696   }
 697
 698   return;
 699 }
 700
 701
 702
 703
 704 /*
 705  * NJ_taxaname_unique() - Check to see if taxanames are unique in alignment
 706  *
 707  * INPUTS:
 708  * -------
 709  *  alignment -- a pointer to a multiple sequence alignment
 710  *
 711  * OUTPUTS:
 712  * --------
 713  *  int -- 0 if all taxanames in alignment are unique
 714  *         1 if all taxanames in alignment are NOT unique
 715  *
 716  *
 717  * DESCRIPTION:
 718  * ------------
 719  *
 720  * Check to see if the taxanames in the alignment are unique.  It
 721  * will be impossible to make sense of the final tree if the taxon
 722  * labels are not unqiue.
 723  *
 724  */
 725 int
 726 NJ_taxaname_unique(NJ_alignment *alignment) {
 727
 728   long int i, j;
 729
 730   for(i=0;i<alignment->nseq;i++) {
 731     for(j=i+1;j<alignment->nseq;j++) {
 732         if(!strcmp(alignment->titles[i],
 733                  alignment->titles[j])) {
 734         fprintf(stderr, "Clearcut: Taxa %ld and %ld (%s) do not have unique taxon labels.\n",
 735                 i, j, alignment->titles[i]);
 736         return(0);
 737       }
 738     }
 739   }
 740
 741   return(1);
 742 }
 743
 744
 745 void
 746 NJ_print_titles(NJ_alignment *alignment) {
 747
 748   int i;
 749
 750   for(i=0;i<alignment->nseq;i++) {
 751     printf("%d: %s\n", i, alignment->titles[i]);
 752   }
 753
 754   return;
 755 }