distclearcut.cpp

   1 /*
   2  * dist.c
   3  *
   4  * $Id$
   5  *
   6  *
   7  *****************************************************************************
   8  *
   9  * Copyright (c) 2004,  Luke Sheneman
  10  * All rights reserved.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  *
  16  *  + Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  *  + Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in
  20  *    the documentation and/or other materials provided with the
  21  *    distribution.
  22  *  + The names of its contributors may not be used to endorse or promote
  23  *    products derived  from this software without specific prior
  24  *    written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36  * POSSIBILITY OF SUCH DAMAGE.
  37  *
  38  *****************************************************************************
  39  *
  40  * Compute a distance matrix given a set of sequences
  41  *
  42  *****************************************************************************
  43  *
  44  * AUTHOR:
  45  *
  46  *   Luke Sheneman
  47  *   sheneman@cs.uidaho.edu
  48  *
  49  */
  50
  51 #include <stdio.h>
  52 #include <stdlib.h>
  53 #include <math.h>
  54 #include <string.h>
  55 #include <ctype.h>
  56
  57 #include "common.h"
  58 #include "dayhoff.h"
  59 #include "fasta.h"
  60 #include "distclearcut.h"
  61
  62
  63
  64
  65 /*
  66  * NJ_build_distance_matrix() -
  67  *
  68  * Given a filename for an alignment, read the alignment
  69  * into memory and then compute the distance matrix
  70  * using the appropriate correction model
  71  */
  72 DMAT *
  73 NJ_build_distance_matrix(NJ_ARGS *nj_args) {
  74
  75   DMAT *dmat;
  76   NJ_alignment *alignment;
  77
  78   /* Read an alignment in FASTA format */
  79   alignment =
  80     NJ_read_fasta(nj_args);
  81
  82   if(!alignment) {
  83     return(NULL);
  84   }
  85
  86   /*
  87    * Given a global multiple sequence alignment (MSA) and
  88    * a specified distance correction model, compute a
  89    * corrected distance matrix
  90    *
  91    * From proteins, we may want to allow users to specify
  92    * a substitution matrix (feature)
  93    */
  94
  95   dmat =
  96     NJ_compute_dmat(nj_args,
  97                     alignment);
  98
  99   // NJ_print_taxanames(dmat);
 100
 101   if(!dmat) {
 102     fprintf(stderr, "Clearcut: Error computing distance matrix\n");
 103   }
 104
 105   /* now free the memory associated with the alignment */
 106   NJ_free_alignment(alignment);
 107
 108   return(dmat);
 109 }
 110
 111
 112
 113
 114
 115 /*
 116  * NJ_compute_dmat() -
 117  *
 118  * Given an alignment and a correction model, compute the
 119  * distance matrix and return it
 120  *
 121  */
 122 DMAT *
 123 NJ_compute_dmat(NJ_ARGS *nj_args,
 124                 NJ_alignment *alignment) {
 125
 126   DMAT *dmat;
 127   long int i;
 128
 129
 130   /* allocate distance matrix here */
 131   dmat = (DMAT *)calloc(1, sizeof(DMAT));
 132   if(!dmat) {
 133     fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
 134     return(NULL);
 135   }
 136
 137   dmat->ntaxa = alignment->nseq;
 138   dmat->size  = alignment->nseq;
 139
 140   /* allocate memory to hold the taxa names */
 141   dmat->taxaname = (char **)calloc(alignment->nseq, sizeof(char *));
 142   if(!dmat->taxaname) {
 143     fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
 144     return(NULL);
 145   }
 146
 147   /* copy sequence titles */
 148   for(i=0;i<alignment->nseq;i++) {
 149     dmat->taxaname[i] = (char *)calloc(strlen(alignment->titles[i])+1, sizeof(char));
 150     if(!dmat->taxaname[i]) {
 151       fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
 152       return(NULL);
 153     }
 154
 155     strncpy(dmat->taxaname[i], alignment->titles[i], strlen(alignment->titles[i]));
 156   }
 157
 158   /* allocate val matrix in dmat */
 159   dmat->val = (float *)calloc(dmat->ntaxa*dmat->ntaxa, sizeof(float));
 160
 161   if(!dmat->val) {
 162     fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
 163     return(NULL);
 164   }
 165
 166   /* now lets allocate space for the r and r2 columns */
 167   dmat->r  = (float *)calloc(dmat->ntaxa, sizeof(float));
 168   dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float));
 169
 170   /* track some memory addresses */
 171   dmat->rhandle   = dmat->r;
 172   dmat->r2handle  = dmat->r2;
 173   dmat->valhandle = dmat->val;
 174
 175   /* apply model correction to matrix */
 176   switch(nj_args->correction_model) {
 177
 178   case NJ_MODEL_JUKES:
 179
 180     if(nj_args->dna_flag) {
 181       NJ_DNA_jc_correction(dmat, alignment);
 182     } else if(nj_args->protein_flag) {
 183       NJ_PROTEIN_jc_correction(dmat, alignment);
 184     } else {
 185       fprintf(stderr, "Clearcut: Need to know sequence type for Jukes-Cantor model correction.\n");
 186       return(NULL);
 187     }
 188
 189     break;
 190
 191   case NJ_MODEL_KIMURA:
 192
 193     if(nj_args->dna_flag) {
 194       NJ_DNA_k2p_correction(dmat, alignment);
 195     } else if(nj_args->protein_flag) {
 196       NJ_PROTEIN_kimura_correction(dmat, alignment);
 197     } else {
 198       fprintf(stderr, "Clearcut: Need to know sequence type for Kimura model correction.\n");
 199       return(NULL);
 200     }
 201
 202     break;
 203
 204   case NJ_MODEL_NONE:
 205
 206     NJ_no_correction(dmat, alignment);
 207
 208     break;
 209
 210   default:
 211     fprintf(stderr, "Clearcut: Invalid distance correction model.\n");
 212     return(NULL);
 213   }
 214
 215   return(dmat);
 216 }
 217
 218
 219
 220
 221
 222 /*
 223  * NJ_no_correction() -
 224  *
 225  * Compute the distance matrix without correction
 226  * (straight percent ID)
 227  *
 228  * Resolve ambiguities in sequence data by skipping
 229  * those nucleotides/residues
 230  *
 231  */
 232 void
 233 NJ_no_correction(DMAT *dmat,
 234                  NJ_alignment *alignment) {
 235
 236   long int i, j;
 237   float pdiff;
 238
 239   /* compute pairwise percent identity */
 240   for(i=0;i<dmat->size;i++) {
 241     for(j=i+1;j<dmat->size;j++) {
 242       pdiff = 1.0 - NJ_pw_percentid(alignment, i, j);
 243       dmat->val[NJ_MAP(i, j, dmat->size)] = pdiff;
 244     }
 245   }
 246
 247   return;
 248 }
 249
 250
 251
 252
 253 /*
 254  * NJ_DNA_jc_correction() -
 255  *
 256  * Compute the distance matrix with jukes-cantor correction
 257  * and assign high distance if sequence divergence exceeds
 258  * 0.75
 259  *
 260  *   Jukes, T.H. (1969), Evolution of protein molecules.  In H.N. Munro (Ed.),
 261  *   Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132.
 262  *   New York: Academic Press
 263  *
 264  */
 265 void
 266 NJ_DNA_jc_correction(DMAT *dmat,
 267                      NJ_alignment *alignment) {
 268
 269   long int i, j;
 270   long int k;
 271   float d, cutoff, dist;
 272   long int residues;
 273
 274   cutoff = 0.75;
 275
 276   for(i=0;i<dmat->size;i++) {
 277     for(j=i+1;j<dmat->size;j++) {
 278
 279       k = NJ_pw_differences(alignment, i, j, &residues);
 280       d = 1.0 - NJ_pw_percentid(alignment, i, j);
 281
 282       if(d > cutoff) {
 283         dist = NJ_BIGDIST;
 284       } else {
 285         dist = (-0.75) * log(1.0 - (4.0/3.0)*d);
 286       }
 287
 288       if(fabs(dist) < FLT_EPSILON) {
 289         dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0;
 290       } else {
 291         dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
 292       }
 293     }
 294   }
 295
 296
 297
 298   return;
 299 }
 300
 301
 302
 303
 304
 305
 306 /*
 307  * NJ_PROTEIN_jc_correction() -
 308  *
 309  * This function performs modified jukes/cantor correction on
 310  * a protein alignment
 311  *
 312  *   Jukes, T.H. (1969), Evolution of protein molecules.  In H.N. Munro (Ed.),
 313  *   Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132.
 314  *   New York: Academic Press
 315  *
 316  */
 317 void
 318 NJ_PROTEIN_jc_correction(DMAT *dmat,
 319                          NJ_alignment *alignment) {
 320
 321   long int i, j;
 322   long int residues;
 323   long int diff;
 324   float dist, x;
 325
 326
 327   for(i=0;i<dmat->size;i++) {
 328     for(j=i+1;j<dmat->size;j++) {
 329
 330       diff = NJ_pw_differences(alignment, i, j, &residues);
 331
 332       if(!diff || !residues) {
 333         dist = 0.0;
 334       } else {
 335
 336         dist = (float)diff/(float)residues;
 337         x = ((20.0/19.0)*dist);
 338
 339         if(NJ_FLT_GT(x, 1.0)) {
 340           dist = NJ_BIGDIST;
 341         } else {
 342           dist = -(19.0/20.0) * log(1.0 - x);
 343         }
 344       }
 345
 346       dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
 347     }
 348   }
 349
 350   return;
 351 }
 352
 353
 354
 355
 356
 357
 358 /*
 359  * NJ_DNA_k2p_correction() -
 360  *
 361  * Correct a distance matrix using k2p correction using
 362  * cutoffs to avoid problems with logarithms.
 363  *
 364  * dist = -0.5ln(1-2P-Q) - 0.25ln(1-2Q)
 365  *
 366  * But due to the logarithms, this is only valid when
 367  *
 368  * (2P+Q <= 1)  &&
 369  * (2Q <= 1)
 370  *
 371  * So assign arbitary distances when these constraints are
 372  * not strictly followed.
 373  *
 374  *   Kimura, M. (1980), A simple method for estimating evolutionary
 375  *   rates of base substitutions through comparative studies of
 376  *   nucleotide sequences.  J. Mol. Evol., 16, 111-120
 377  *
 378  */
 379 void
 380 NJ_DNA_k2p_correction(DMAT *dmat,
 381                       NJ_alignment *alignment) {
 382
 383   long int i, j;
 384   float P;  /* proportion of transitions   */
 385   float Q;  /* proportion of transversions */
 386   long int nucleotides;
 387   long int transitions, transversions;
 388   float dist;
 389   float log_x = 0.0;  /* the params for the first log  */
 390   float log_y = 0.0;  /* the params for the second log */
 391
 392   int blowup;   /* a flag to specify if we have a log blowup */
 393
 394
 395   for(i=0;i<dmat->size;i++) {
 396     for(j=i+1;j<dmat->size;j++) {
 397
 398       blowup = 0;
 399
 400       /* count the number of transitions and transversions */
 401       NJ_DNA_count_tt(alignment, i, j, &transitions, &transversions, &nucleotides);
 402
 403       if(!nucleotides) {   /* sequences have no non-ambiguous overlap in alignment */
 404         P = 0.0;
 405         Q = 0.0;
 406       } else {
 407         P = (float)transitions   / (float)nucleotides;
 408         Q = (float)transversions / (float)nucleotides;
 409       }
 410
 411       /* the first log blows up if 2*P+Q = 1.0 */
 412       if(NJ_FLT_EQ((2.0 * P + Q), 1.0)) {
 413         blowup = 1;
 414       } else {
 415         if( NJ_FLT_LT(1.0 - 2.0*P - Q, 0.0) ) {
 416           blowup = 1;
 417         } else {
 418           log_x = log(1.0 - 2.0*P - Q);
 419         }
 420       }
 421
 422       /* the second log blows up if 2*Q >= 1.0 */
 423       if( NJ_FLT_EQ((2.0 * Q), 1.0) ||
 424           NJ_FLT_GT((2.0 * Q), 1.0) ) {
 425         blowup = 1;
 426       } else {
 427         log_y = log(1.0 - 2.0*Q);
 428       }
 429
 430       /* if our logarithms blow up, we just set the distance to the max */
 431       if(blowup) {
 432         dist = NJ_BIGDIST;
 433       } else {
 434         dist = (-0.5)*log_x - 0.25*log_y;
 435       }
 436
 437       if(fabs(dist) < FLT_EPSILON) {
 438         dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0;
 439       } else {
 440         dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
 441       }
 442     }
 443   }
 444
 445   return;
 446 }
 447
 448
 449
 450
 451 /*
 452  * NJ_PROTEIN_kimura_correction() -
 453  *
 454  * Perform Kimura correction for distances derived from protein
 455  * alignments.
 456  *
 457  *   Kimura, M. (1983), The Neutral Theory of Molecular Evolution.
 458  *   p. 75., Cambridge University Press, Cambridge, England
 459  *
 460  */
 461 void
 462 NJ_PROTEIN_kimura_correction(DMAT *dmat,
 463                              NJ_alignment *alignment) {
 464
 465   long int i, j;
 466   long int residues;
 467   long int diff;
 468   float dist;
 469
 470
 471   printf("NJ_PROTEIN_kimura_correction()\n");
 472
 473   for(i=0;i<dmat->size;i++) {
 474     for(j=i+1;j<dmat->size;j++) {
 475       diff = NJ_pw_differences(alignment, i, j, &residues);
 476
 477       if(!diff || !residues) {
 478         dist = 0.0;
 479       } else {
 480         dist = (float)diff/(float)residues;
 481       }
 482
 483       if(NJ_FLT_LT(dist, 0.75)) {
 484         if(NJ_FLT_GT(dist, 0.0) ) {
 485           dist = -log(1.0 - dist - (dist * dist/5.0) );
 486         }
 487       } else {
 488         if(NJ_FLT_GT(dist, 0.93) ) {
 489           dist = 10.0;
 490         } else {
 491           dist = (float)NJ_dayhoff[ (int)((dist*1000.0)-750.0) ] / 100.0 ;
 492         }
 493       }
 494
 495       dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
 496     }
 497   }
 498
 499   return;
 500 }
 501
 502
 503
 504
 505
 506 /*
 507  * NJ_DNA_count_tt() -
 508  *
 509  * Count the number of transitions and transversions
 510  * between two aligned DNA sequences
 511  *
 512  * This routine automatically skips ambiguities when
 513  * counting transitions and transversions.
 514  *
 515  */
 516 void
 517 NJ_DNA_count_tt(NJ_alignment *alignment,
 518                 long int x,
 519                 long int y,
 520                 long int *transitions,
 521                 long int *transversions,
 522                 long int *residues) {
 523
 524   long int tmp_transitions   = 0;
 525   long int tmp_transversions = 0;
 526   long int tmp_residues      = 0;
 527   char a, b;
 528   long int i;
 529
 530   for(i=0;i<alignment->length;i++) {
 531
 532     a = toupper(alignment->data[x*alignment->length+i]);
 533     b = toupper(alignment->data[y*alignment->length+i]);
 534
 535     if( (a == 'A' && b == 'T') ||
 536         (a == 'T' && b == 'A') ||
 537         (a == 'A' && b == 'C') ||
 538         (a == 'C' && b == 'A') ||
 539         (a == 'T' && b == 'G') ||
 540         (a == 'G' && b == 'T') ||
 541         (a == 'C' && b == 'G') ||
 542         (a == 'G' && b == 'C') ) {
 543       tmp_transversions++;
 544     }
 545
 546     if( (a == 'C' && b == 'T') ||
 547         (a == 'T' && b == 'C') ||
 548         (a == 'G' && b == 'A') ||
 549         (a == 'A' && b == 'G') ) {
 550       tmp_transitions++;
 551     }
 552
 553     /* count the number of residues */
 554     if(a != NJ_AMBIGUITY_CHAR &&
 555        b != NJ_AMBIGUITY_CHAR ) {
 556       tmp_residues++;
 557     }
 558
 559   }
 560
 561   *transitions   = tmp_transitions;
 562   *transversions = tmp_transversions;
 563
 564   if(residues) {
 565     *residues = tmp_residues;
 566   }
 567
 568   return;
 569 }
 570
 571
 572
 573
 574
 575 /*
 576  * NJ_pw_percentid() -
 577  *
 578  * Given an alignment and a specification
 579  * for two rows, compute the pairwise
 580  * percent identity between the two
 581  *
 582  */
 583 float
 584 NJ_pw_percentid(NJ_alignment *alignment,
 585                 long int x,
 586                 long int y) {
 587
 588   float pid;
 589   long int i;
 590   long int residues;
 591   long int same;
 592   char c1, c2;
 593
 594   residues = 0;
 595   same     = 0;
 596   for(i=0;i<alignment->length;i++) {
 597
 598     c1 = alignment->data[x*alignment->length+i];
 599     c2 = alignment->data[y*alignment->length+i];
 600
 601     if( c1 != NJ_AMBIGUITY_CHAR ||
 602         c2 != NJ_AMBIGUITY_CHAR ) {
 603
 604       residues++;
 605
 606       if(c1 == c2) {
 607         same++;
 608       }
 609     }
 610
 611   }
 612
 613   pid = (float)same/(float)residues;
 614
 615   return(pid);
 616 }
 617
 618
 619
 620 /*
 621  * NJ_pw_differences() -
 622  *
 623  * Given an alignment and a specification
 624  * for two rows in the alignment, compute the
 625  * number of differences between the two sequences
 626  *
 627  * With respect to ambiguity codes, we will want to
 628  * disregard those sites entirely in our count.
 629  *
 630  */
 631 long int
 632 NJ_pw_differences(NJ_alignment *alignment,
 633                   long int x,
 634                   long int y,
 635                   long int *residues) {
 636
 637   long int i;
 638   long int diff;
 639   char c1, c2;
 640   long int tmp_residues;
 641
 642   diff         = 0;
 643   tmp_residues = 0;
 644   for(i=0;i<alignment->length;i++) {
 645
 646     c1 = alignment->data[x*alignment->length+i];
 647     c2 = alignment->data[y*alignment->length+i];
 648
 649     if( c1 != NJ_AMBIGUITY_CHAR ||
 650         c2 != NJ_AMBIGUITY_CHAR ) {
 651
 652       tmp_residues++;
 653
 654       if(c1 != c2) {
 655         diff++;
 656       }
 657     }
 658
 659   }
 660
 661   *residues = tmp_residues;
 662
 663   return(diff);
 664 }
 665
 666
 667
 668
 669
 670