X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=distclearcut.cpp;fp=distclearcut.cpp;h=cf696497032ed81700f8eb1321df24a5f098bc4d;hb=0caf3fbabaa3ece404f8ce77f4c883dc5b1bf1dc;hp=0000000000000000000000000000000000000000;hpb=1b73ff67c83892a025e597dabd9df6fe7b58206a;p=mothur.git diff --git a/distclearcut.cpp b/distclearcut.cpp new file mode 100644 index 0000000..cf69649 --- /dev/null +++ b/distclearcut.cpp @@ -0,0 +1,670 @@ +/* + * dist.c + * + * $Id$ + * + * + ***************************************************************************** + * + * Copyright (c) 2004, Luke Sheneman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ***************************************************************************** + * + * Compute a distance matrix given a set of sequences + * + ***************************************************************************** + * + * AUTHOR: + * + * Luke Sheneman + * sheneman@cs.uidaho.edu + * + */ + +#include +#include +#include +#include +#include + +#include "common.h" +#include "dayhoff.h" +#include "fasta.h" +#include "distclearcut.h" + + + + +/* + * NJ_build_distance_matrix() - + * + * Given a filename for an alignment, read the alignment + * into memory and then compute the distance matrix + * using the appropriate correction model + */ +DMAT * +NJ_build_distance_matrix(NJ_ARGS *nj_args) { + + DMAT *dmat; + NJ_alignment *alignment; + + /* Read an alignment in FASTA format */ + alignment = + NJ_read_fasta(nj_args); + + if(!alignment) { + return(NULL); + } + + /* + * Given a global multiple sequence alignment (MSA) and + * a specified distance correction model, compute a + * corrected distance matrix + * + * From proteins, we may want to allow users to specify + * a substitution matrix (feature) + */ + + dmat = + NJ_compute_dmat(nj_args, + alignment); + + // NJ_print_taxanames(dmat); + + if(!dmat) { + fprintf(stderr, "Clearcut: Error computing distance matrix\n"); + } + + /* now free the memory associated with the alignment */ + NJ_free_alignment(alignment); + + return(dmat); +} + + + + + +/* + * NJ_compute_dmat() - + * + * Given an alignment and a correction model, compute the + * distance matrix and return it + * + */ +DMAT * +NJ_compute_dmat(NJ_ARGS *nj_args, + NJ_alignment *alignment) { + + DMAT *dmat; + long int i; + + + /* allocate distance matrix here */ + dmat = (DMAT *)calloc(1, sizeof(DMAT)); + if(!dmat) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + dmat->ntaxa = alignment->nseq; + dmat->size = alignment->nseq; + + /* allocate memory to hold the taxa names */ + dmat->taxaname = (char **)calloc(alignment->nseq, sizeof(char *)); + if(!dmat->taxaname) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + /* copy sequence titles */ + for(i=0;inseq;i++) { + dmat->taxaname[i] = (char *)calloc(strlen(alignment->titles[i])+1, sizeof(char)); + if(!dmat->taxaname[i]) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + strncpy(dmat->taxaname[i], alignment->titles[i], strlen(alignment->titles[i])); + } + + /* allocate val matrix in dmat */ + dmat->val = (float *)calloc(dmat->ntaxa*dmat->ntaxa, sizeof(float)); + + if(!dmat->val) { + fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n"); + return(NULL); + } + + /* now lets allocate space for the r and r2 columns */ + dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float)); + dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float)); + + /* track some memory addresses */ + dmat->rhandle = dmat->r; + dmat->r2handle = dmat->r2; + dmat->valhandle = dmat->val; + + /* apply model correction to matrix */ + switch(nj_args->correction_model) { + + case NJ_MODEL_JUKES: + + if(nj_args->dna_flag) { + NJ_DNA_jc_correction(dmat, alignment); + } else if(nj_args->protein_flag) { + NJ_PROTEIN_jc_correction(dmat, alignment); + } else { + fprintf(stderr, "Clearcut: Need to know sequence type for Jukes-Cantor model correction.\n"); + return(NULL); + } + + break; + + case NJ_MODEL_KIMURA: + + if(nj_args->dna_flag) { + NJ_DNA_k2p_correction(dmat, alignment); + } else if(nj_args->protein_flag) { + NJ_PROTEIN_kimura_correction(dmat, alignment); + } else { + fprintf(stderr, "Clearcut: Need to know sequence type for Kimura model correction.\n"); + return(NULL); + } + + break; + + case NJ_MODEL_NONE: + + NJ_no_correction(dmat, alignment); + + break; + + default: + fprintf(stderr, "Clearcut: Invalid distance correction model.\n"); + return(NULL); + } + + return(dmat); +} + + + + + +/* + * NJ_no_correction() - + * + * Compute the distance matrix without correction + * (straight percent ID) + * + * Resolve ambiguities in sequence data by skipping + * those nucleotides/residues + * + */ +void +NJ_no_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + float pdiff; + + /* compute pairwise percent identity */ + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + pdiff = 1.0 - NJ_pw_percentid(alignment, i, j); + dmat->val[NJ_MAP(i, j, dmat->size)] = pdiff; + } + } + + return; +} + + + + +/* + * NJ_DNA_jc_correction() - + * + * Compute the distance matrix with jukes-cantor correction + * and assign high distance if sequence divergence exceeds + * 0.75 + * + * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.), + * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132. + * New York: Academic Press + * + */ +void +NJ_DNA_jc_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + long int k; + float d, cutoff, dist; + long int residues; + + cutoff = 0.75; + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + k = NJ_pw_differences(alignment, i, j, &residues); + d = 1.0 - NJ_pw_percentid(alignment, i, j); + + if(d > cutoff) { + dist = NJ_BIGDIST; + } else { + dist = (-0.75) * log(1.0 - (4.0/3.0)*d); + } + + if(fabs(dist) < FLT_EPSILON) { + dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0; + } else { + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + } + + + + return; +} + + + + + + +/* + * NJ_PROTEIN_jc_correction() - + * + * This function performs modified jukes/cantor correction on + * a protein alignment + * + * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.), + * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132. + * New York: Academic Press + * + */ +void +NJ_PROTEIN_jc_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + long int residues; + long int diff; + float dist, x; + + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + diff = NJ_pw_differences(alignment, i, j, &residues); + + if(!diff || !residues) { + dist = 0.0; + } else { + + dist = (float)diff/(float)residues; + x = ((20.0/19.0)*dist); + + if(NJ_FLT_GT(x, 1.0)) { + dist = NJ_BIGDIST; + } else { + dist = -(19.0/20.0) * log(1.0 - x); + } + } + + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + + return; +} + + + + + + +/* + * NJ_DNA_k2p_correction() - + * + * Correct a distance matrix using k2p correction using + * cutoffs to avoid problems with logarithms. + * + * dist = -0.5ln(1-2P-Q) - 0.25ln(1-2Q) + * + * But due to the logarithms, this is only valid when + * + * (2P+Q <= 1) && + * (2Q <= 1) + * + * So assign arbitary distances when these constraints are + * not strictly followed. + * + * Kimura, M. (1980), A simple method for estimating evolutionary + * rates of base substitutions through comparative studies of + * nucleotide sequences. J. Mol. Evol., 16, 111-120 + * + */ +void +NJ_DNA_k2p_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + float P; /* proportion of transitions */ + float Q; /* proportion of transversions */ + long int nucleotides; + long int transitions, transversions; + float dist; + float log_x = 0.0; /* the params for the first log */ + float log_y = 0.0; /* the params for the second log */ + + int blowup; /* a flag to specify if we have a log blowup */ + + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + + blowup = 0; + + /* count the number of transitions and transversions */ + NJ_DNA_count_tt(alignment, i, j, &transitions, &transversions, &nucleotides); + + if(!nucleotides) { /* sequences have no non-ambiguous overlap in alignment */ + P = 0.0; + Q = 0.0; + } else { + P = (float)transitions / (float)nucleotides; + Q = (float)transversions / (float)nucleotides; + } + + /* the first log blows up if 2*P+Q = 1.0 */ + if(NJ_FLT_EQ((2.0 * P + Q), 1.0)) { + blowup = 1; + } else { + if( NJ_FLT_LT(1.0 - 2.0*P - Q, 0.0) ) { + blowup = 1; + } else { + log_x = log(1.0 - 2.0*P - Q); + } + } + + /* the second log blows up if 2*Q >= 1.0 */ + if( NJ_FLT_EQ((2.0 * Q), 1.0) || + NJ_FLT_GT((2.0 * Q), 1.0) ) { + blowup = 1; + } else { + log_y = log(1.0 - 2.0*Q); + } + + /* if our logarithms blow up, we just set the distance to the max */ + if(blowup) { + dist = NJ_BIGDIST; + } else { + dist = (-0.5)*log_x - 0.25*log_y; + } + + if(fabs(dist) < FLT_EPSILON) { + dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0; + } else { + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + } + + return; +} + + + + +/* + * NJ_PROTEIN_kimura_correction() - + * + * Perform Kimura correction for distances derived from protein + * alignments. + * + * Kimura, M. (1983), The Neutral Theory of Molecular Evolution. + * p. 75., Cambridge University Press, Cambridge, England + * + */ +void +NJ_PROTEIN_kimura_correction(DMAT *dmat, + NJ_alignment *alignment) { + + long int i, j; + long int residues; + long int diff; + float dist; + + + printf("NJ_PROTEIN_kimura_correction()\n"); + + for(i=0;isize;i++) { + for(j=i+1;jsize;j++) { + diff = NJ_pw_differences(alignment, i, j, &residues); + + if(!diff || !residues) { + dist = 0.0; + } else { + dist = (float)diff/(float)residues; + } + + if(NJ_FLT_LT(dist, 0.75)) { + if(NJ_FLT_GT(dist, 0.0) ) { + dist = -log(1.0 - dist - (dist * dist/5.0) ); + } + } else { + if(NJ_FLT_GT(dist, 0.93) ) { + dist = 10.0; + } else { + dist = (float)NJ_dayhoff[ (int)((dist*1000.0)-750.0) ] / 100.0 ; + } + } + + dmat->val[NJ_MAP(i, j, dmat->size)] = dist; + } + } + + return; +} + + + + + +/* + * NJ_DNA_count_tt() - + * + * Count the number of transitions and transversions + * between two aligned DNA sequences + * + * This routine automatically skips ambiguities when + * counting transitions and transversions. + * + */ +void +NJ_DNA_count_tt(NJ_alignment *alignment, + long int x, + long int y, + long int *transitions, + long int *transversions, + long int *residues) { + + long int tmp_transitions = 0; + long int tmp_transversions = 0; + long int tmp_residues = 0; + char a, b; + long int i; + + for(i=0;ilength;i++) { + + a = toupper(alignment->data[x*alignment->length+i]); + b = toupper(alignment->data[y*alignment->length+i]); + + if( (a == 'A' && b == 'T') || + (a == 'T' && b == 'A') || + (a == 'A' && b == 'C') || + (a == 'C' && b == 'A') || + (a == 'T' && b == 'G') || + (a == 'G' && b == 'T') || + (a == 'C' && b == 'G') || + (a == 'G' && b == 'C') ) { + tmp_transversions++; + } + + if( (a == 'C' && b == 'T') || + (a == 'T' && b == 'C') || + (a == 'G' && b == 'A') || + (a == 'A' && b == 'G') ) { + tmp_transitions++; + } + + /* count the number of residues */ + if(a != NJ_AMBIGUITY_CHAR && + b != NJ_AMBIGUITY_CHAR ) { + tmp_residues++; + } + + } + + *transitions = tmp_transitions; + *transversions = tmp_transversions; + + if(residues) { + *residues = tmp_residues; + } + + return; +} + + + + + +/* + * NJ_pw_percentid() - + * + * Given an alignment and a specification + * for two rows, compute the pairwise + * percent identity between the two + * + */ +float +NJ_pw_percentid(NJ_alignment *alignment, + long int x, + long int y) { + + float pid; + long int i; + long int residues; + long int same; + char c1, c2; + + residues = 0; + same = 0; + for(i=0;ilength;i++) { + + c1 = alignment->data[x*alignment->length+i]; + c2 = alignment->data[y*alignment->length+i]; + + if( c1 != NJ_AMBIGUITY_CHAR || + c2 != NJ_AMBIGUITY_CHAR ) { + + residues++; + + if(c1 == c2) { + same++; + } + } + + } + + pid = (float)same/(float)residues; + + return(pid); +} + + + +/* + * NJ_pw_differences() - + * + * Given an alignment and a specification + * for two rows in the alignment, compute the + * number of differences between the two sequences + * + * With respect to ambiguity codes, we will want to + * disregard those sites entirely in our count. + * + */ +long int +NJ_pw_differences(NJ_alignment *alignment, + long int x, + long int y, + long int *residues) { + + long int i; + long int diff; + char c1, c2; + long int tmp_residues; + + diff = 0; + tmp_residues = 0; + for(i=0;ilength;i++) { + + c1 = alignment->data[x*alignment->length+i]; + c2 = alignment->data[y*alignment->length+i]; + + if( c1 != NJ_AMBIGUITY_CHAR || + c2 != NJ_AMBIGUITY_CHAR ) { + + tmp_residues++; + + if(c1 != c2) { + diff++; + } + } + + } + + *residues = tmp_residues; + + return(diff); +} + + + + + +