7 *****************************************************************************
9 * Copyright (c) 2004, Luke Sheneman
10 * All rights reserved.
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
16 * + Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * + Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in
20 * the documentation and/or other materials provided with the
22 * + The names of its contributors may not be used to endorse or promote
23 * products derived from this software without specific prior
26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
38 *****************************************************************************
40 * Compute a distance matrix given a set of sequences
42 *****************************************************************************
47 * sheneman@cs.uidaho.edu
60 #include "distclearcut.h"
66 * NJ_build_distance_matrix() -
68 * Given a filename for an alignment, read the alignment
69 * into memory and then compute the distance matrix
70 * using the appropriate correction model
73 NJ_build_distance_matrix(NJ_ARGS *nj_args) {
76 NJ_alignment *alignment;
78 /* Read an alignment in FASTA format */
80 NJ_read_fasta(nj_args);
87 * Given a global multiple sequence alignment (MSA) and
88 * a specified distance correction model, compute a
89 * corrected distance matrix
91 * From proteins, we may want to allow users to specify
92 * a substitution matrix (feature)
96 NJ_compute_dmat(nj_args,
99 // NJ_print_taxanames(dmat);
102 fprintf(stderr, "Clearcut: Error computing distance matrix\n");
105 /* now free the memory associated with the alignment */
106 NJ_free_alignment(alignment);
116 * NJ_compute_dmat() -
118 * Given an alignment and a correction model, compute the
119 * distance matrix and return it
123 NJ_compute_dmat(NJ_ARGS *nj_args,
124 NJ_alignment *alignment) {
130 /* allocate distance matrix here */
131 dmat = (DMAT *)calloc(1, sizeof(DMAT));
133 fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
137 dmat->ntaxa = alignment->nseq;
138 dmat->size = alignment->nseq;
140 /* allocate memory to hold the taxa names */
141 dmat->taxaname = (char **)calloc(alignment->nseq, sizeof(char *));
142 if(!dmat->taxaname) {
143 fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
147 /* copy sequence titles */
148 for(i=0;i<alignment->nseq;i++) {
149 dmat->taxaname[i] = (char *)calloc(strlen(alignment->titles[i])+1, sizeof(char));
150 if(!dmat->taxaname[i]) {
151 fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
155 strncpy(dmat->taxaname[i], alignment->titles[i], strlen(alignment->titles[i]));
158 /* allocate val matrix in dmat */
159 dmat->val = (float *)calloc(dmat->ntaxa*dmat->ntaxa, sizeof(float));
162 fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
166 /* now lets allocate space for the r and r2 columns */
167 dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float));
168 dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float));
170 /* track some memory addresses */
171 dmat->rhandle = dmat->r;
172 dmat->r2handle = dmat->r2;
173 dmat->valhandle = dmat->val;
175 /* apply model correction to matrix */
176 switch(nj_args->correction_model) {
180 if(nj_args->dna_flag) {
181 NJ_DNA_jc_correction(dmat, alignment);
182 } else if(nj_args->protein_flag) {
183 NJ_PROTEIN_jc_correction(dmat, alignment);
185 fprintf(stderr, "Clearcut: Need to know sequence type for Jukes-Cantor model correction.\n");
191 case NJ_MODEL_KIMURA:
193 if(nj_args->dna_flag) {
194 NJ_DNA_k2p_correction(dmat, alignment);
195 } else if(nj_args->protein_flag) {
196 NJ_PROTEIN_kimura_correction(dmat, alignment);
198 fprintf(stderr, "Clearcut: Need to know sequence type for Kimura model correction.\n");
206 NJ_no_correction(dmat, alignment);
211 fprintf(stderr, "Clearcut: Invalid distance correction model.\n");
223 * NJ_no_correction() -
225 * Compute the distance matrix without correction
226 * (straight percent ID)
228 * Resolve ambiguities in sequence data by skipping
229 * those nucleotides/residues
233 NJ_no_correction(DMAT *dmat,
234 NJ_alignment *alignment) {
239 /* compute pairwise percent identity */
240 for(i=0;i<dmat->size;i++) {
241 for(j=i+1;j<dmat->size;j++) {
242 pdiff = 1.0 - NJ_pw_percentid(alignment, i, j);
243 dmat->val[NJ_MAP(i, j, dmat->size)] = pdiff;
254 * NJ_DNA_jc_correction() -
256 * Compute the distance matrix with jukes-cantor correction
257 * and assign high distance if sequence divergence exceeds
260 * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.),
261 * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132.
262 * New York: Academic Press
266 NJ_DNA_jc_correction(DMAT *dmat,
267 NJ_alignment *alignment) {
271 float d, cutoff, dist;
276 for(i=0;i<dmat->size;i++) {
277 for(j=i+1;j<dmat->size;j++) {
279 k = NJ_pw_differences(alignment, i, j, &residues);
280 d = 1.0 - NJ_pw_percentid(alignment, i, j);
285 dist = (-0.75) * log(1.0 - (4.0/3.0)*d);
288 if(fabs(dist) < FLT_EPSILON) {
289 dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0;
291 dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
307 * NJ_PROTEIN_jc_correction() -
309 * This function performs modified jukes/cantor correction on
310 * a protein alignment
312 * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.),
313 * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132.
314 * New York: Academic Press
318 NJ_PROTEIN_jc_correction(DMAT *dmat,
319 NJ_alignment *alignment) {
327 for(i=0;i<dmat->size;i++) {
328 for(j=i+1;j<dmat->size;j++) {
330 diff = NJ_pw_differences(alignment, i, j, &residues);
332 if(!diff || !residues) {
336 dist = (float)diff/(float)residues;
337 x = ((20.0/19.0)*dist);
339 if(NJ_FLT_GT(x, 1.0)) {
342 dist = -(19.0/20.0) * log(1.0 - x);
346 dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
359 * NJ_DNA_k2p_correction() -
361 * Correct a distance matrix using k2p correction using
362 * cutoffs to avoid problems with logarithms.
364 * dist = -0.5ln(1-2P-Q) - 0.25ln(1-2Q)
366 * But due to the logarithms, this is only valid when
371 * So assign arbitary distances when these constraints are
372 * not strictly followed.
374 * Kimura, M. (1980), A simple method for estimating evolutionary
375 * rates of base substitutions through comparative studies of
376 * nucleotide sequences. J. Mol. Evol., 16, 111-120
380 NJ_DNA_k2p_correction(DMAT *dmat,
381 NJ_alignment *alignment) {
384 float P; /* proportion of transitions */
385 float Q; /* proportion of transversions */
386 long int nucleotides;
387 long int transitions, transversions;
389 float log_x = 0.0; /* the params for the first log */
390 float log_y = 0.0; /* the params for the second log */
392 int blowup; /* a flag to specify if we have a log blowup */
395 for(i=0;i<dmat->size;i++) {
396 for(j=i+1;j<dmat->size;j++) {
400 /* count the number of transitions and transversions */
401 NJ_DNA_count_tt(alignment, i, j, &transitions, &transversions, &nucleotides);
403 if(!nucleotides) { /* sequences have no non-ambiguous overlap in alignment */
407 P = (float)transitions / (float)nucleotides;
408 Q = (float)transversions / (float)nucleotides;
411 /* the first log blows up if 2*P+Q = 1.0 */
412 if(NJ_FLT_EQ((2.0 * P + Q), 1.0)) {
415 if( NJ_FLT_LT(1.0 - 2.0*P - Q, 0.0) ) {
418 log_x = log(1.0 - 2.0*P - Q);
422 /* the second log blows up if 2*Q >= 1.0 */
423 if( NJ_FLT_EQ((2.0 * Q), 1.0) ||
424 NJ_FLT_GT((2.0 * Q), 1.0) ) {
427 log_y = log(1.0 - 2.0*Q);
430 /* if our logarithms blow up, we just set the distance to the max */
434 dist = (-0.5)*log_x - 0.25*log_y;
437 if(fabs(dist) < FLT_EPSILON) {
438 dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0;
440 dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
452 * NJ_PROTEIN_kimura_correction() -
454 * Perform Kimura correction for distances derived from protein
457 * Kimura, M. (1983), The Neutral Theory of Molecular Evolution.
458 * p. 75., Cambridge University Press, Cambridge, England
462 NJ_PROTEIN_kimura_correction(DMAT *dmat,
463 NJ_alignment *alignment) {
471 printf("NJ_PROTEIN_kimura_correction()\n");
473 for(i=0;i<dmat->size;i++) {
474 for(j=i+1;j<dmat->size;j++) {
475 diff = NJ_pw_differences(alignment, i, j, &residues);
477 if(!diff || !residues) {
480 dist = (float)diff/(float)residues;
483 if(NJ_FLT_LT(dist, 0.75)) {
484 if(NJ_FLT_GT(dist, 0.0) ) {
485 dist = -log(1.0 - dist - (dist * dist/5.0) );
488 if(NJ_FLT_GT(dist, 0.93) ) {
491 dist = (float)NJ_dayhoff[ (int)((dist*1000.0)-750.0) ] / 100.0 ;
495 dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
507 * NJ_DNA_count_tt() -
509 * Count the number of transitions and transversions
510 * between two aligned DNA sequences
512 * This routine automatically skips ambiguities when
513 * counting transitions and transversions.
517 NJ_DNA_count_tt(NJ_alignment *alignment,
520 long int *transitions,
521 long int *transversions,
522 long int *residues) {
524 long int tmp_transitions = 0;
525 long int tmp_transversions = 0;
526 long int tmp_residues = 0;
530 for(i=0;i<alignment->length;i++) {
532 a = toupper(alignment->data[x*alignment->length+i]);
533 b = toupper(alignment->data[y*alignment->length+i]);
535 if( (a == 'A' && b == 'T') ||
536 (a == 'T' && b == 'A') ||
537 (a == 'A' && b == 'C') ||
538 (a == 'C' && b == 'A') ||
539 (a == 'T' && b == 'G') ||
540 (a == 'G' && b == 'T') ||
541 (a == 'C' && b == 'G') ||
542 (a == 'G' && b == 'C') ) {
546 if( (a == 'C' && b == 'T') ||
547 (a == 'T' && b == 'C') ||
548 (a == 'G' && b == 'A') ||
549 (a == 'A' && b == 'G') ) {
553 /* count the number of residues */
554 if(a != NJ_AMBIGUITY_CHAR &&
555 b != NJ_AMBIGUITY_CHAR ) {
561 *transitions = tmp_transitions;
562 *transversions = tmp_transversions;
565 *residues = tmp_residues;
576 * NJ_pw_percentid() -
578 * Given an alignment and a specification
579 * for two rows, compute the pairwise
580 * percent identity between the two
584 NJ_pw_percentid(NJ_alignment *alignment,
596 for(i=0;i<alignment->length;i++) {
598 c1 = alignment->data[x*alignment->length+i];
599 c2 = alignment->data[y*alignment->length+i];
601 if( c1 != NJ_AMBIGUITY_CHAR ||
602 c2 != NJ_AMBIGUITY_CHAR ) {
613 pid = (float)same/(float)residues;
621 * NJ_pw_differences() -
623 * Given an alignment and a specification
624 * for two rows in the alignment, compute the
625 * number of differences between the two sequences
627 * With respect to ambiguity codes, we will want to
628 * disregard those sites entirely in our count.
632 NJ_pw_differences(NJ_alignment *alignment,
635 long int *residues) {
640 long int tmp_residues;
644 for(i=0;i<alignment->length;i++) {
646 c1 = alignment->data[x*alignment->length+i];
647 c2 = alignment->data[y*alignment->length+i];
649 if( c1 != NJ_AMBIGUITY_CHAR ||
650 c2 != NJ_AMBIGUITY_CHAR ) {
661 *residues = tmp_residues;