--- /dev/null
+/*
+ * dist.c
+ *
+ * $Id$
+ *
+ *
+ *****************************************************************************
+ *
+ * Copyright (c) 2004, Luke Sheneman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * + The names of its contributors may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************
+ *
+ * Compute a distance matrix given a set of sequences
+ *
+ *****************************************************************************
+ *
+ * AUTHOR:
+ *
+ * Luke Sheneman
+ * sheneman@cs.uidaho.edu
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "common.h"
+#include "dayhoff.h"
+#include "fasta.h"
+#include "distclearcut.h"
+
+
+
+
+/*
+ * NJ_build_distance_matrix() -
+ *
+ * Given a filename for an alignment, read the alignment
+ * into memory and then compute the distance matrix
+ * using the appropriate correction model
+ */
+DMAT *
+NJ_build_distance_matrix(NJ_ARGS *nj_args) {
+
+ DMAT *dmat;
+ NJ_alignment *alignment;
+
+ /* Read an alignment in FASTA format */
+ alignment =
+ NJ_read_fasta(nj_args);
+
+ if(!alignment) {
+ return(NULL);
+ }
+
+ /*
+ * Given a global multiple sequence alignment (MSA) and
+ * a specified distance correction model, compute a
+ * corrected distance matrix
+ *
+ * From proteins, we may want to allow users to specify
+ * a substitution matrix (feature)
+ */
+
+ dmat =
+ NJ_compute_dmat(nj_args,
+ alignment);
+
+ // NJ_print_taxanames(dmat);
+
+ if(!dmat) {
+ fprintf(stderr, "Clearcut: Error computing distance matrix\n");
+ }
+
+ /* now free the memory associated with the alignment */
+ NJ_free_alignment(alignment);
+
+ return(dmat);
+}
+
+
+
+
+
+/*
+ * NJ_compute_dmat() -
+ *
+ * Given an alignment and a correction model, compute the
+ * distance matrix and return it
+ *
+ */
+DMAT *
+NJ_compute_dmat(NJ_ARGS *nj_args,
+ NJ_alignment *alignment) {
+
+ DMAT *dmat;
+ long int i;
+
+
+ /* allocate distance matrix here */
+ dmat = (DMAT *)calloc(1, sizeof(DMAT));
+ if(!dmat) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
+ return(NULL);
+ }
+
+ dmat->ntaxa = alignment->nseq;
+ dmat->size = alignment->nseq;
+
+ /* allocate memory to hold the taxa names */
+ dmat->taxaname = (char **)calloc(alignment->nseq, sizeof(char *));
+ if(!dmat->taxaname) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
+ return(NULL);
+ }
+
+ /* copy sequence titles */
+ for(i=0;i<alignment->nseq;i++) {
+ dmat->taxaname[i] = (char *)calloc(strlen(alignment->titles[i])+1, sizeof(char));
+ if(!dmat->taxaname[i]) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
+ return(NULL);
+ }
+
+ strncpy(dmat->taxaname[i], alignment->titles[i], strlen(alignment->titles[i]));
+ }
+
+ /* allocate val matrix in dmat */
+ dmat->val = (float *)calloc(dmat->ntaxa*dmat->ntaxa, sizeof(float));
+
+ if(!dmat->val) {
+ fprintf(stderr, "Clearcut: Memory allocation error in NJ_compute_dmat()\n");
+ return(NULL);
+ }
+
+ /* now lets allocate space for the r and r2 columns */
+ dmat->r = (float *)calloc(dmat->ntaxa, sizeof(float));
+ dmat->r2 = (float *)calloc(dmat->ntaxa, sizeof(float));
+
+ /* track some memory addresses */
+ dmat->rhandle = dmat->r;
+ dmat->r2handle = dmat->r2;
+ dmat->valhandle = dmat->val;
+
+ /* apply model correction to matrix */
+ switch(nj_args->correction_model) {
+
+ case NJ_MODEL_JUKES:
+
+ if(nj_args->dna_flag) {
+ NJ_DNA_jc_correction(dmat, alignment);
+ } else if(nj_args->protein_flag) {
+ NJ_PROTEIN_jc_correction(dmat, alignment);
+ } else {
+ fprintf(stderr, "Clearcut: Need to know sequence type for Jukes-Cantor model correction.\n");
+ return(NULL);
+ }
+
+ break;
+
+ case NJ_MODEL_KIMURA:
+
+ if(nj_args->dna_flag) {
+ NJ_DNA_k2p_correction(dmat, alignment);
+ } else if(nj_args->protein_flag) {
+ NJ_PROTEIN_kimura_correction(dmat, alignment);
+ } else {
+ fprintf(stderr, "Clearcut: Need to know sequence type for Kimura model correction.\n");
+ return(NULL);
+ }
+
+ break;
+
+ case NJ_MODEL_NONE:
+
+ NJ_no_correction(dmat, alignment);
+
+ break;
+
+ default:
+ fprintf(stderr, "Clearcut: Invalid distance correction model.\n");
+ return(NULL);
+ }
+
+ return(dmat);
+}
+
+
+
+
+
+/*
+ * NJ_no_correction() -
+ *
+ * Compute the distance matrix without correction
+ * (straight percent ID)
+ *
+ * Resolve ambiguities in sequence data by skipping
+ * those nucleotides/residues
+ *
+ */
+void
+NJ_no_correction(DMAT *dmat,
+ NJ_alignment *alignment) {
+
+ long int i, j;
+ float pdiff;
+
+ /* compute pairwise percent identity */
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+ pdiff = 1.0 - NJ_pw_percentid(alignment, i, j);
+ dmat->val[NJ_MAP(i, j, dmat->size)] = pdiff;
+ }
+ }
+
+ return;
+}
+
+
+
+
+/*
+ * NJ_DNA_jc_correction() -
+ *
+ * Compute the distance matrix with jukes-cantor correction
+ * and assign high distance if sequence divergence exceeds
+ * 0.75
+ *
+ * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.),
+ * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132.
+ * New York: Academic Press
+ *
+ */
+void
+NJ_DNA_jc_correction(DMAT *dmat,
+ NJ_alignment *alignment) {
+
+ long int i, j;
+ long int k;
+ float d, cutoff, dist;
+ long int residues;
+
+ cutoff = 0.75;
+
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+
+ k = NJ_pw_differences(alignment, i, j, &residues);
+ d = 1.0 - NJ_pw_percentid(alignment, i, j);
+
+ if(d > cutoff) {
+ dist = NJ_BIGDIST;
+ } else {
+ dist = (-0.75) * log(1.0 - (4.0/3.0)*d);
+ }
+
+ if(fabs(dist) < FLT_EPSILON) {
+ dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0;
+ } else {
+ dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
+ }
+ }
+ }
+
+
+
+ return;
+}
+
+
+
+
+
+
+/*
+ * NJ_PROTEIN_jc_correction() -
+ *
+ * This function performs modified jukes/cantor correction on
+ * a protein alignment
+ *
+ * Jukes, T.H. (1969), Evolution of protein molecules. In H.N. Munro (Ed.),
+ * Mammalian Protein Metabolism, Volume III, Chapter 24, pp. 21-132.
+ * New York: Academic Press
+ *
+ */
+void
+NJ_PROTEIN_jc_correction(DMAT *dmat,
+ NJ_alignment *alignment) {
+
+ long int i, j;
+ long int residues;
+ long int diff;
+ float dist, x;
+
+
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+
+ diff = NJ_pw_differences(alignment, i, j, &residues);
+
+ if(!diff || !residues) {
+ dist = 0.0;
+ } else {
+
+ dist = (float)diff/(float)residues;
+ x = ((20.0/19.0)*dist);
+
+ if(NJ_FLT_GT(x, 1.0)) {
+ dist = NJ_BIGDIST;
+ } else {
+ dist = -(19.0/20.0) * log(1.0 - x);
+ }
+ }
+
+ dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
+ }
+ }
+
+ return;
+}
+
+
+
+
+
+
+/*
+ * NJ_DNA_k2p_correction() -
+ *
+ * Correct a distance matrix using k2p correction using
+ * cutoffs to avoid problems with logarithms.
+ *
+ * dist = -0.5ln(1-2P-Q) - 0.25ln(1-2Q)
+ *
+ * But due to the logarithms, this is only valid when
+ *
+ * (2P+Q <= 1) &&
+ * (2Q <= 1)
+ *
+ * So assign arbitary distances when these constraints are
+ * not strictly followed.
+ *
+ * Kimura, M. (1980), A simple method for estimating evolutionary
+ * rates of base substitutions through comparative studies of
+ * nucleotide sequences. J. Mol. Evol., 16, 111-120
+ *
+ */
+void
+NJ_DNA_k2p_correction(DMAT *dmat,
+ NJ_alignment *alignment) {
+
+ long int i, j;
+ float P; /* proportion of transitions */
+ float Q; /* proportion of transversions */
+ long int nucleotides;
+ long int transitions, transversions;
+ float dist;
+ float log_x = 0.0; /* the params for the first log */
+ float log_y = 0.0; /* the params for the second log */
+
+ int blowup; /* a flag to specify if we have a log blowup */
+
+
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+
+ blowup = 0;
+
+ /* count the number of transitions and transversions */
+ NJ_DNA_count_tt(alignment, i, j, &transitions, &transversions, &nucleotides);
+
+ if(!nucleotides) { /* sequences have no non-ambiguous overlap in alignment */
+ P = 0.0;
+ Q = 0.0;
+ } else {
+ P = (float)transitions / (float)nucleotides;
+ Q = (float)transversions / (float)nucleotides;
+ }
+
+ /* the first log blows up if 2*P+Q = 1.0 */
+ if(NJ_FLT_EQ((2.0 * P + Q), 1.0)) {
+ blowup = 1;
+ } else {
+ if( NJ_FLT_LT(1.0 - 2.0*P - Q, 0.0) ) {
+ blowup = 1;
+ } else {
+ log_x = log(1.0 - 2.0*P - Q);
+ }
+ }
+
+ /* the second log blows up if 2*Q >= 1.0 */
+ if( NJ_FLT_EQ((2.0 * Q), 1.0) ||
+ NJ_FLT_GT((2.0 * Q), 1.0) ) {
+ blowup = 1;
+ } else {
+ log_y = log(1.0 - 2.0*Q);
+ }
+
+ /* if our logarithms blow up, we just set the distance to the max */
+ if(blowup) {
+ dist = NJ_BIGDIST;
+ } else {
+ dist = (-0.5)*log_x - 0.25*log_y;
+ }
+
+ if(fabs(dist) < FLT_EPSILON) {
+ dmat->val[NJ_MAP(i, j, dmat->size)] = 0.0;
+ } else {
+ dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
+ }
+ }
+ }
+
+ return;
+}
+
+
+
+
+/*
+ * NJ_PROTEIN_kimura_correction() -
+ *
+ * Perform Kimura correction for distances derived from protein
+ * alignments.
+ *
+ * Kimura, M. (1983), The Neutral Theory of Molecular Evolution.
+ * p. 75., Cambridge University Press, Cambridge, England
+ *
+ */
+void
+NJ_PROTEIN_kimura_correction(DMAT *dmat,
+ NJ_alignment *alignment) {
+
+ long int i, j;
+ long int residues;
+ long int diff;
+ float dist;
+
+
+ printf("NJ_PROTEIN_kimura_correction()\n");
+
+ for(i=0;i<dmat->size;i++) {
+ for(j=i+1;j<dmat->size;j++) {
+ diff = NJ_pw_differences(alignment, i, j, &residues);
+
+ if(!diff || !residues) {
+ dist = 0.0;
+ } else {
+ dist = (float)diff/(float)residues;
+ }
+
+ if(NJ_FLT_LT(dist, 0.75)) {
+ if(NJ_FLT_GT(dist, 0.0) ) {
+ dist = -log(1.0 - dist - (dist * dist/5.0) );
+ }
+ } else {
+ if(NJ_FLT_GT(dist, 0.93) ) {
+ dist = 10.0;
+ } else {
+ dist = (float)NJ_dayhoff[ (int)((dist*1000.0)-750.0) ] / 100.0 ;
+ }
+ }
+
+ dmat->val[NJ_MAP(i, j, dmat->size)] = dist;
+ }
+ }
+
+ return;
+}
+
+
+
+
+
+/*
+ * NJ_DNA_count_tt() -
+ *
+ * Count the number of transitions and transversions
+ * between two aligned DNA sequences
+ *
+ * This routine automatically skips ambiguities when
+ * counting transitions and transversions.
+ *
+ */
+void
+NJ_DNA_count_tt(NJ_alignment *alignment,
+ long int x,
+ long int y,
+ long int *transitions,
+ long int *transversions,
+ long int *residues) {
+
+ long int tmp_transitions = 0;
+ long int tmp_transversions = 0;
+ long int tmp_residues = 0;
+ char a, b;
+ long int i;
+
+ for(i=0;i<alignment->length;i++) {
+
+ a = toupper(alignment->data[x*alignment->length+i]);
+ b = toupper(alignment->data[y*alignment->length+i]);
+
+ if( (a == 'A' && b == 'T') ||
+ (a == 'T' && b == 'A') ||
+ (a == 'A' && b == 'C') ||
+ (a == 'C' && b == 'A') ||
+ (a == 'T' && b == 'G') ||
+ (a == 'G' && b == 'T') ||
+ (a == 'C' && b == 'G') ||
+ (a == 'G' && b == 'C') ) {
+ tmp_transversions++;
+ }
+
+ if( (a == 'C' && b == 'T') ||
+ (a == 'T' && b == 'C') ||
+ (a == 'G' && b == 'A') ||
+ (a == 'A' && b == 'G') ) {
+ tmp_transitions++;
+ }
+
+ /* count the number of residues */
+ if(a != NJ_AMBIGUITY_CHAR &&
+ b != NJ_AMBIGUITY_CHAR ) {
+ tmp_residues++;
+ }
+
+ }
+
+ *transitions = tmp_transitions;
+ *transversions = tmp_transversions;
+
+ if(residues) {
+ *residues = tmp_residues;
+ }
+
+ return;
+}
+
+
+
+
+
+/*
+ * NJ_pw_percentid() -
+ *
+ * Given an alignment and a specification
+ * for two rows, compute the pairwise
+ * percent identity between the two
+ *
+ */
+float
+NJ_pw_percentid(NJ_alignment *alignment,
+ long int x,
+ long int y) {
+
+ float pid;
+ long int i;
+ long int residues;
+ long int same;
+ char c1, c2;
+
+ residues = 0;
+ same = 0;
+ for(i=0;i<alignment->length;i++) {
+
+ c1 = alignment->data[x*alignment->length+i];
+ c2 = alignment->data[y*alignment->length+i];
+
+ if( c1 != NJ_AMBIGUITY_CHAR ||
+ c2 != NJ_AMBIGUITY_CHAR ) {
+
+ residues++;
+
+ if(c1 == c2) {
+ same++;
+ }
+ }
+
+ }
+
+ pid = (float)same/(float)residues;
+
+ return(pid);
+}
+
+
+
+/*
+ * NJ_pw_differences() -
+ *
+ * Given an alignment and a specification
+ * for two rows in the alignment, compute the
+ * number of differences between the two sequences
+ *
+ * With respect to ambiguity codes, we will want to
+ * disregard those sites entirely in our count.
+ *
+ */
+long int
+NJ_pw_differences(NJ_alignment *alignment,
+ long int x,
+ long int y,
+ long int *residues) {
+
+ long int i;
+ long int diff;
+ char c1, c2;
+ long int tmp_residues;
+
+ diff = 0;
+ tmp_residues = 0;
+ for(i=0;i<alignment->length;i++) {
+
+ c1 = alignment->data[x*alignment->length+i];
+ c2 = alignment->data[y*alignment->length+i];
+
+ if( c1 != NJ_AMBIGUITY_CHAR ||
+ c2 != NJ_AMBIGUITY_CHAR ) {
+
+ tmp_residues++;
+
+ if(c1 != c2) {
+ diff++;
+ }
+ }
+
+ }
+
+ *residues = tmp_residues;
+
+ return(diff);
+}
+
+
+
+
+
+