distclearcut.cpp

   1 /*
   2  * dist.c
   3  *
   4  * $Id$
   5  *
   6  *
   7  *****************************************************************************
   8  *
   9  * Copyright (c) 2004,  Luke Sheneman
  10  * All rights reserved.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  *
  16  *  + Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  *  + Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in
  20  *    the documentation and/or other materials provided with the
  21  *    distribution.
  22  *  + The names of its contributors may not be used to endorse or promote
  23  *    products derived  from this software without specific prior
  24  *    written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36  * POSSIBILITY OF SUCH DAMAGE.
  37  *
  38  *****************************************************************************
  39  *
  40  * Compute a distance matrix given a set of sequences
  41  *
  42  *****************************************************************************
  43  *
  44  * AUTHOR:
  45  *
  46  *   Luke Sheneman
  47  *   sheneman@cs.uidaho.edu
  48  *
  49  */
  50
  51 #include <stdio.h>
  52 #include <stdlib.h>
  53 #include <math.h>
  54 #include <string.h>
  55 #include <ctype.h>
  56
  57 #include "common.h"
  58 #include "dayhoff.h"
  59 #include "fasta.h"
  60 #include "distclearcut.h"
  61
  62
  63
  64
  65 /*
  66  * NJ_build_distance_matrix() -
  67  *
  68  * Given a filename for an alignment, read the alignment
  69  * into memory and then compute the distance matrix
  70  * using the appropriate correction model
  71  */
  72 DMAT *
  73 NJ_build_distance_matrix(NJ_ARGS *nj_args) {
  74
  75   DMAT *dmat;
  76   NJ_alignment *alignment;
  77
  78   /* Read an alignment in FASTA format */
  79   alignment =
  80     NJ_read_fasta(nj_args);
  81
  82   if(!alignment) {
  83     return(NULL);
  84   }
  85
  86   /*
  87    * Given a global multiple sequence alignment (MSA) and
  88    * a specified distance correction model, compute a
  89    * corrected distance matrix
  90    *
  91    * From proteins, we may want to allow users to specify
  92    * a substitution matrix (feature)
  93    */
  94   dmat =
  95     NJ_compute_dmat(nj_args,
  96                     alignment);
  97
  98   // NJ_print_taxanames(dmat);
  99
 100   if(!dmat) {
 101     fprintf(stderr, "Clearcut: Error computing distance matrix\n");
 102   }
 103
 104   /* now free the memory associated with the alignment */
 105   NJ_free_alignment(alignment);
 106
 107   return(dmat);
 108 }
 109
 110
 111
 112
 113
 114 /*
 115  * NJ_compute_dmat() -
 116  *
 117  * Given an alignment and a correction model, compute the
 118  * distance matrix and return it
 119  *
 120  */
 121 DMAT *
 122 NJ_compute_dmat(NJ_ARGS *nj_args,
 123                 NJ_alignment *alignment) {
 124
 125   DMAT *dmat;
 126   long int i;
 127
 128
 129   /* allocate distance matrix here */
 130   dmat = (DMAT *)calloc(1, sizeof(DMAT));
 131   if(!dmat) {
 132     fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
 133     return(NULL);
 134   }
 135
 136   dmat->ntaxa = alignment->nseq;
 137   dmat->size  = alignment->nseq;
 138
 139   /* allocate memory to hold the taxa names */
 140   dmat->taxaname = (char **)calloc(alignment->nseq, sizeof(char *));
 141   if(!dmat->taxaname) {
 142     fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
 143     return(NULL);
 144   }
 145
 146   /* copy sequence titles */
 147   for(i=0;i<alignment->nseq;i++) {
 148     dmat->taxaname[i] = (char *)calloc(strlen(alignment->titles[i])+1, sizeof(char));
 149     if(!dmat->taxaname[i]) {
 150       fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
 151       return(NULL);
 152     }
 153
 154     strncpy(dmat->taxaname[i], alignment->titles[i], strlen(alignment->titles[i]));
 155   }
 156
 157   /* allocate val matrix in dmat */
 158   dmat->val = (float *)calloc(dmat->ntaxa*dmat->ntaxa, sizeof(float));
 159   if(!dmat->val) {
 160     fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
 161     return(NULL);
 162   }
 163
 164   /* now lets allocate space for the r and r2 columns */
 165   dmat->r  = (float *)calloc(dmat->ntaxa, sizeof(float));
 166   dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float));
 167
 168   /* track some memory addresses */
 169   dmat->rhandle   = dmat->r;
 170   dmat->r2handle  = dmat->r2;
 171   dmat->valhandle = dmat->val;
 172
 173   /* apply model correction to matrix */
 174   switch(nj_args->correction_model) {
 175
 176   case NJ_MODEL_JUKES:
 177
 178     if(nj_args->dna_flag) {
 179       NJ_DNA_jc_correction(dmat, alignment);
 180     } else if(nj_args->protein_flag) {
 181       NJ_PROTEIN_jc_correction(dmat, alignment);
 182     } else {
 183       fprintf(stderr, "Clearcut: Need to know sequence type for Jukes-Cantor model correction.\n");
 184       return(NULL);
 185     }
 186
 187     break;
 188
 189   case NJ_MODEL_KIMURA:
 190
 191     if(nj_args->dna_flag) {
 192       NJ_DNA_k2p_correction(dmat, alignment);
 193     } else if(nj_args->protein_flag) {
 194       NJ_PROTEIN_kimura_correction(dmat, alignment);
 195     } else {
 196       fprintf(stderr, "Clearcut: Need to know sequence type for Kimura model correction.\n");
 197       return(NULL);
 198     }
 199
 200     break;
 201
 202   case NJ_MODEL_NONE:
 203
 204     NJ_no_correction(dmat, alignment);
 205
 206     break;
 207
 208   default:
 209     fprintf(stderr, "Clearcut: Invalid distance correction model.\n");
 210     return(NULL);
 211   }
 212
 213   return(dmat);
 214 }
 215
 216
 217
 218
 219
 220 /*
 221  * NJ_no_correction() -
 222  *
 223  * Compute the distance matrix without correction
 224  * (straight percent ID)
 225  *
 226  * Resolve ambiguities in sequence data by skipping
 227  * those nucleotides/residues
 228  *
 229  */
 230 void
 231 NJ_no_correction(DMAT *dmat,
 232                  NJ_alignment *alignment) {
 233
 234   long int i, j;
 235   float pdiff;
 236
 237   /* compute pairwise percent identity */
 238   for(i=0;i<dmat->size;i++) {
 239     for(j=i+1;j<dmat->size;j++) {
 240       pdiff = 1.0 - NJ_pw_percentid(alignment, i, j);
 241       dmat->val[NJ_MAP(i, j, dmat->size)] = pdiff;
 242     }
 243   }
 244
 245   return;
 246 }
 247
 248
 249
 250
 251 /*
 252  * NJ_DNA_jc_correction() -
 253  *
 254  * Compute the distance matrix with jukes-cantor correction
 255  * and assign high distance if sequence divergence exceeds
 256  * 0.75
 257  *
 258  *   Jukes, T.H. (1969), Evolution of protein molecules.  In H.N. Munro (Ed.),
 259  *   Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132.
 260  *   New York: Academic Press
 261  *
 262  */
 263 void
 264 NJ_DNA_jc_correction(DMAT *dmat,
 265                      NJ_alignment *alignment) {
 266
 267   long int i, j;
 268   long int k;
 269   float d, cutoff, dist;
 270   long int residues;
 271
 272   cutoff = 0.75;
 273
 274   for(i=0;i<dmat->size;i++) {
 275     for(j=i+1;j<dmat->size;j++) {
 276
 277       k = NJ_pw_differences(alignment, i, j, &residues);
 278       d = 1.0 - NJ_pw_percentid(alignment, i, j);
 279
 280       if(d > cutoff) {
 281         dist = NJ_BIGDIST;
 282       } else {
 283         dist = (-0.75) * log(1.0 - (4.0/3.0)*d);
 284       }
 285
 286       if(fabs(dist) < FLT_EPSILON) {
 287         dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0;
 288       } else {
 289         dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
 290       }
 291     }
 292   }
 293
 294
 295
 296   return;
 297 }
 298
 299
 300
 301
 302
 303
 304 /*
 305  * NJ_PROTEIN_jc_correction() -
 306  *
 307  * This function performs modified jukes/cantor correction on
 308  * a protein alignment
 309  *
 310  *   Jukes, T.H. (1969), Evolution of protein molecules.  In H.N. Munro (Ed.),
 311  *   Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132.
 312  *   New York: Academic Press
 313  *
 314  */
 315 void
 316 NJ_PROTEIN_jc_correction(DMAT *dmat,
 317                          NJ_alignment *alignment) {
 318
 319   long int i, j;
 320   long int residues;
 321   long int diff;
 322   float dist, x;
 323
 324
 325   for(i=0;i<dmat->size;i++) {
 326     for(j=i+1;j<dmat->size;j++) {
 327
 328       diff = NJ_pw_differences(alignment, i, j, &residues);
 329
 330       if(!diff || !residues) {
 331         dist = 0.0;
 332       } else {
 333
 334         dist = (float)diff/(float)residues;
 335         x = ((20.0/19.0)*dist);
 336
 337         if(NJ_FLT_GT(x, 1.0)) {
 338           dist = NJ_BIGDIST;
 339         } else {
 340           dist = -(19.0/20.0) * log(1.0 - x);
 341         }
 342       }
 343
 344       dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
 345     }
 346   }
 347
 348   return;
 349 }
 350
 351
 352
 353
 354
 355
 356 /*
 357  * NJ_DNA_k2p_correction() -
 358  *
 359  * Correct a distance matrix using k2p correction using
 360  * cutoffs to avoid problems with logarithms.
 361  *
 362  * dist = -0.5ln(1-2P-Q) - 0.25ln(1-2Q)
 363  *
 364  * But due to the logarithms, this is only valid when
 365  *
 366  * (2P+Q <= 1)  &&
 367  * (2Q <= 1)
 368  *
 369  * So assign arbitary distances when these constraints are
 370  * not strictly followed.
 371  *
 372  *   Kimura, M. (1980), A simple method for estimating evolutionary
 373  *   rates of base substitutions through comparative studies of
 374  *   nucleotide sequences.  J. Mol. Evol., 16, 111-120
 375  *
 376  */
 377 void
 378 NJ_DNA_k2p_correction(DMAT *dmat,
 379                       NJ_alignment *alignment) {
 380
 381   long int i, j;
 382   float P;  /* proportion of transitions   */
 383   float Q;  /* proportion of transversions */
 384   long int nucleotides;
 385   long int transitions, transversions;
 386   float dist;
 387   float log_x = 0.0;  /* the params for the first log  */
 388   float log_y = 0.0;  /* the params for the second log */
 389
 390   int blowup;   /* a flag to specify if we have a log blowup */
 391
 392
 393   for(i=0;i<dmat->size;i++) {
 394     for(j=i+1;j<dmat->size;j++) {
 395
 396       blowup = 0;
 397
 398       /* count the number of transitions and transversions */
 399       NJ_DNA_count_tt(alignment, i, j, &transitions, &transversions, &nucleotides);
 400
 401       if(!nucleotides) {   /* sequences have no non-ambiguous overlap in alignment */
 402         P = 0.0;
 403         Q = 0.0;
 404       } else {
 405         P = (float)transitions   / (float)nucleotides;
 406         Q = (float)transversions / (float)nucleotides;
 407       }
 408
 409       /* the first log blows up if 2*P+Q = 1.0 */
 410       if(NJ_FLT_EQ((2.0 * P + Q), 1.0)) {
 411         blowup = 1;
 412       } else {
 413         if( NJ_FLT_LT(1.0 - 2.0*P - Q, 0.0) ) {
 414           blowup = 1;
 415         } else {
 416           log_x = log(1.0 - 2.0*P - Q);
 417         }
 418       }
 419
 420       /* the second log blows up if 2*Q >= 1.0 */
 421       if( NJ_FLT_EQ((2.0 * Q), 1.0) ||
 422           NJ_FLT_GT((2.0 * Q), 1.0) ) {
 423         blowup = 1;
 424       } else {
 425         log_y = log(1.0 - 2.0*Q);
 426       }
 427
 428       /* if our logarithms blow up, we just set the distance to the max */
 429       if(blowup) {
 430         dist = NJ_BIGDIST;
 431       } else {
 432         dist = (-0.5)*log_x - 0.25*log_y;
 433       }
 434
 435       if(fabs(dist) < FLT_EPSILON) {
 436         dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0;
 437       } else {
 438         dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
 439       }
 440     }
 441   }
 442
 443   return;
 444 }
 445
 446
 447
 448
 449 /*
 450  * NJ_PROTEIN_kimura_correction() -
 451  *
 452  * Perform Kimura correction for distances derived from protein
 453  * alignments.
 454  *
 455  *   Kimura, M. (1983), The Neutral Theory of Molecular Evolution.
 456  *   p. 75., Cambridge University Press, Cambridge, England
 457  *
 458  */
 459 void
 460 NJ_PROTEIN_kimura_correction(DMAT *dmat,
 461                              NJ_alignment *alignment) {
 462
 463   long int i, j;
 464   long int residues;
 465   long int diff;
 466   float dist;
 467
 468
 469   printf("NJ_PROTEIN_kimura_correction()\n");
 470
 471   for(i=0;i<dmat->size;i++) {
 472     for(j=i+1;j<dmat->size;j++) {
 473       diff = NJ_pw_differences(alignment, i, j, &residues);
 474
 475       if(!diff || !residues) {
 476         dist = 0.0;
 477       } else {
 478         dist = (float)diff/(float)residues;
 479       }
 480
 481       if(NJ_FLT_LT(dist, 0.75)) {
 482         if(NJ_FLT_GT(dist, 0.0) ) {
 483           dist = -log(1.0 - dist - (dist * dist/5.0) );
 484         }
 485       } else {
 486         if(NJ_FLT_GT(dist, 0.93) ) {
 487           dist = 10.0;
 488         } else {
 489           dist = (float)NJ_dayhoff[ (int)((dist*1000.0)-750.0) ] / 100.0 ;
 490         }
 491       }
 492
 493       dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
 494     }
 495   }
 496
 497   return;
 498 }
 499
 500
 501
 502
 503
 504 /*
 505  * NJ_DNA_count_tt() -
 506  *
 507  * Count the number of transitions and transversions
 508  * between two aligned DNA sequences
 509  *
 510  * This routine automatically skips ambiguities when
 511  * counting transitions and transversions.
 512  *
 513  */
 514 void
 515 NJ_DNA_count_tt(NJ_alignment *alignment,
 516                 long int x,
 517                 long int y,
 518                 long int *transitions,
 519                 long int *transversions,
 520                 long int *residues) {
 521
 522   long int tmp_transitions   = 0;
 523   long int tmp_transversions = 0;
 524   long int tmp_residues      = 0;
 525   char a, b;
 526   long int i;
 527
 528   for(i=0;i<alignment->length;i++) {
 529
 530     a = toupper(alignment->data[x*alignment->length+i]);
 531     b = toupper(alignment->data[y*alignment->length+i]);
 532
 533     if( (a == 'A' && b == 'T') ||
 534         (a == 'T' && b == 'A') ||
 535         (a == 'A' && b == 'C') ||
 536         (a == 'C' && b == 'A') ||
 537         (a == 'T' && b == 'G') ||
 538         (a == 'G' && b == 'T') ||
 539         (a == 'C' && b == 'G') ||
 540         (a == 'G' && b == 'C') ) {
 541       tmp_transversions++;
 542     }
 543
 544     if( (a == 'C' && b == 'T') ||
 545         (a == 'T' && b == 'C') ||
 546         (a == 'G' && b == 'A') ||
 547         (a == 'A' && b == 'G') ) {
 548       tmp_transitions++;
 549     }
 550
 551     /* count the number of residues */
 552     if(a != NJ_AMBIGUITY_CHAR &&
 553        b != NJ_AMBIGUITY_CHAR ) {
 554       tmp_residues++;
 555     }
 556
 557   }
 558
 559   *transitions   = tmp_transitions;
 560   *transversions = tmp_transversions;
 561
 562   if(residues) {
 563     *residues = tmp_residues;
 564   }
 565
 566   return;
 567 }
 568
 569
 570
 571
 572
 573 /*
 574  * NJ_pw_percentid() -
 575  *
 576  * Given an alignment and a specification
 577  * for two rows, compute the pairwise
 578  * percent identity between the two
 579  *
 580  */
 581 float
 582 NJ_pw_percentid(NJ_alignment *alignment,
 583                 long int x,
 584                 long int y) {
 585
 586   float pid;
 587   long int i;
 588   long int residues;
 589   long int same;
 590   char c1, c2;
 591
 592   residues = 0;
 593   same     = 0;
 594   for(i=0;i<alignment->length;i++) {
 595
 596     c1 = alignment->data[x*alignment->length+i];
 597     c2 = alignment->data[y*alignment->length+i];
 598
 599     if( c1 != NJ_AMBIGUITY_CHAR ||
 600         c2 != NJ_AMBIGUITY_CHAR ) {
 601
 602       residues++;
 603
 604       if(c1 == c2) {
 605         same++;
 606       }
 607     }
 608
 609   }
 610
 611   pid = (float)same/(float)residues;
 612
 613   return(pid);
 614 }
 615
 616
 617
 618 /*
 619  * NJ_pw_differences() -
 620  *
 621  * Given an alignment and a specification
 622  * for two rows in the alignment, compute the
 623  * number of differences between the two sequences
 624  *
 625  * With respect to ambiguity codes, we will want to
 626  * disregard those sites entirely in our count.
 627  *
 628  */
 629 long int
 630 NJ_pw_differences(NJ_alignment *alignment,
 631                   long int x,
 632                   long int y,
 633                   long int *residues) {
 634
 635   long int i;
 636   long int diff;
 637   char c1, c2;
 638   long int tmp_residues;
 639
 640   diff         = 0;
 641   tmp_residues = 0;
 642   for(i=0;i<alignment->length;i++) {
 643
 644     c1 = alignment->data[x*alignment->length+i];
 645     c2 = alignment->data[y*alignment->length+i];
 646
 647     if( c1 != NJ_AMBIGUITY_CHAR ||
 648         c2 != NJ_AMBIGUITY_CHAR ) {
 649
 650       tmp_residues++;
 651
 652       if(c1 != c2) {
 653         diff++;
 654       }
 655     }
 656
 657   }
 658
 659   *residues = tmp_residues;
 660
 661   return(diff);
 662 }
 663
 664
 665
 666
 667
 668