]> git.donarmstrong.com Git - mothur.git/blob - nast.cpp
fixed bugs in venn and aligner
[mothur.git] / nast.cpp
1 /*
2  *  nast.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/17/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
9  *
10  *      DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL.  2006.  NAST: a multiple
11  *              sequence alignment server for comparative analysis of 16S rRNA genes.  Nucleic Acids Research.  34:W394-9.
12  *
13  *      To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
14  *      and candidate sequence that are to be aligned to each other.
15  *
16  */
17
18 #include "sequence.hpp"
19 #include "alignment.hpp"
20 #include "nast.hpp"
21
22 /**************************************************************************************************/
23
24 Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) {
25         try {
26                 maxInsertLength = 0;
27                 pairwiseAlignSeqs();    //      This is part A in Fig. 2 of DeSantis et al.
28                 regapSequences();               //      This is parts B-F in Fig. 2 of DeSantis et al.
29
30         }
31         catch(exception& e) {
32                 errorOut(e, "Nast", "Nast");
33                 exit(1);
34         }
35
36 }
37
38 /**************************************************************************************************/
39
40 void Nast::pairwiseAlignSeqs(){ //      Here we call one of the pairwise alignment methods to align our unaligned candidate
41                                                                 //      and template sequences
42         try {
43                 
44                 alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
45         
46                 string candAln = alignment->getSeqAAln();
47                 string tempAln = alignment->getSeqBAln();
48
49                 if(candAln == ""){
50
51                         candidateSeq->setPairwise("");
52                         templateSeq->setPairwise(templateSeq->getUnaligned());
53
54                 }
55                 else{
56
57                         if(tempAln[0] == '-'){
58                                 int pairwiseAlignmentLength = tempAln.length(); //      we need to make sure that the candidate sequence alignment
59                                 for(int i=0;i<pairwiseAlignmentLength;i++){             //      starts where the template sequence alignment starts, if it
60                                         if(isalpha(tempAln[i])){                                        //      starts early, we nuke the beginning of the candidate
61                                                 candAln = candAln.substr(i);                    //      sequence
62                                                 tempAln = tempAln.substr(i);
63                                                 break;
64                                         }
65                                 }
66                         }
67                         
68                         int pairwiseAlignmentLength = tempAln.length();
69                         if(tempAln[pairwiseAlignmentLength-1] == '-'){          //      we need to make sure that the candidate sequence alignment
70                                 for(int i=pairwiseAlignmentLength-1; i>=0; i--){//      ends where the template sequence alignment ends, if it runs
71                                         if(isalpha(tempAln[i])){                                        //      long, we nuke the end of the candidate sequence
72                                                 candAln = candAln.substr(0,i+1);
73                                                 tempAln = tempAln.substr(0,i+1);
74                                                 break;
75                                         }               
76                                 }
77                         }
78
79                 }
80
81                 candidateSeq->setPairwise(candAln);                                     //      set the pairwise sequences in the Sequence objects for
82                 templateSeq->setPairwise(tempAln);                                      //      the candidate and template sequences
83
84         }
85         catch(exception& e) {
86                 errorOut(e, "Nast", "pairwiseAlignSeqs");
87                 exit(1);
88         }       
89 }
90
91 /**************************************************************************************************/
92
93 void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){
94
95 //      here we do steps C-F of Fig. 2 from DeSantis et al.
96         try {
97         
98                 int longAlignmentLength = newTemplateAlign.length();    
99                 
100                 for(int i=0; i<longAlignmentLength; i++){                               //      use the long alignment as the standard
101                         int rightIndex, rightRoom, leftIndex, leftRoom;
102                         
103                         //      Part C of Fig. 2 from DeSantis et al.
104                         if((isalpha(newTemplateAlign[i]) != isalpha(tempAln[i]))){      //if there is a discrepancy between the regapped
105                                 
106                                 rightRoom = 0; leftRoom = 0;
107                                 
108                                 //      Part D of Fig. 2 from DeSantis et al.           //      template sequence and the official template sequence
109                                 for(leftIndex=i-1;leftIndex>0;leftIndex--){     //      then we've got problems...
110                                         if(!isalpha(candAln[leftIndex])){
111                                                 leftRoom = 1;   //count how far it is to the nearest gap on the LEFT side of the anomaly
112                                                 while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom]))   {       leftRoom++;             }
113                                                 break;
114                                         }
115                                 }
116                                 
117                                 
118                                 for(rightIndex=i+1;rightIndex<longAlignmentLength-1;rightIndex++){
119                                         if(!isalpha(candAln[rightIndex])){
120                                                 rightRoom = 1;  //count how far it is to the nearest gap on the RIGHT side of the anomaly
121                                                 while(rightIndex+rightRoom<longAlignmentLength && !isalpha(candAln[rightIndex+rightRoom]))      {       rightRoom++;    }
122                                                 break;
123                                         }
124                                 }
125                                         
126                                 int insertLength = 0;                                                   //      figure out how long the anomaly is
127                                 while(!isalpha(newTemplateAlign[i + insertLength]))     {       insertLength++; }
128                                 if(insertLength > maxInsertLength){     maxInsertLength = insertLength; }
129                                         
130                                 if((leftRoom + rightRoom) >= insertLength){
131         
132                                         //      Parts D & E from Fig. 2 of DeSantis et al.
133                                         if((i-leftIndex) <= (rightIndex-i)){            //      the left gap is closer - > move stuff left there's
134         
135                                                 if(leftRoom >= insertLength){                   //      enough room to the left to move
136         
137                                                         string leftTemplateString = newTemplateAlign.substr(0,i);
138                                                         string rightTemplateString = newTemplateAlign.substr(i+insertLength);
139                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
140                                                         longAlignmentLength = newTemplateAlign.length();
141                                                         
142                                                         string leftCandidateString = candAln.substr(0,leftIndex-insertLength+1);
143                                                         string rightCandidateString = candAln.substr(leftIndex+1);
144                                                         candAln = leftCandidateString + rightCandidateString;
145                 
146                                                 }
147                                                 else{                                                                   //      not enough room to the left, have to steal some space to
148                                                         string leftTemplateString = newTemplateAlign.substr(0,i);       //      the right
149                                                         string rightTemplateString = newTemplateAlign.substr(i+insertLength);
150                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
151                                                         longAlignmentLength = newTemplateAlign.length();
152                                                         
153                                                         string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
154                                                         string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
155                                                         string rightCandidateString = candAln.substr(rightIndex+(insertLength-leftRoom));
156                                                         candAln = leftCandidateString + insertString + rightCandidateString;
157                                 
158                                                 }
159                                         }
160                                         else{                                                                           //      the right gap is closer - > move stuff right there's
161                                                 if(rightRoom >= insertLength){                  //      enough room to the right to move
162                         
163                                                         string leftTemplateString = newTemplateAlign.substr(0,i);
164                                                         string rightTemplateString = newTemplateAlign.substr(i+insertLength);
165                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
166                                                         longAlignmentLength = newTemplateAlign.length();
167                                                         
168                                                         string leftCandidateString = candAln.substr(0,rightIndex);
169                                                         string rightCandidateString = candAln.substr(rightIndex+insertLength);
170                                                         candAln = leftCandidateString + rightCandidateString;   
171                                                                         
172                                                 }
173                                                 else{                                                                   //      not enough room to the right, have to steal some 
174                                                         //      space to the left lets move left and then right...
175                                                         string leftTemplateString = newTemplateAlign.substr(0,i);
176                                                         string rightTemplateString = newTemplateAlign.substr(i+insertLength);
177                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
178                                                         longAlignmentLength = newTemplateAlign.length();
179                                                                         
180                                                         string leftCandidateString = candAln.substr(0,leftIndex-(insertLength-rightRoom)+1);
181                                                         string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
182                                                         string rightCandidateString = candAln.substr(rightIndex+rightRoom);
183                                                         candAln = leftCandidateString + insertString + rightCandidateString;    
184                                                                         
185                                                 }
186                                         }
187                                 }
188                                 else{
189                                                                                                         //      there could be a case where there isn't enough room in
190                                         string leftTemplateString = newTemplateAlign.substr(0,i);                       //      either direction to move stuff
191                                         string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom);
192                                         newTemplateAlign = leftTemplateString + rightTemplateString;
193                                         longAlignmentLength = newTemplateAlign.length();
194                                                         
195                                         string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
196                                         string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
197                                         string rightCandidateString = candAln.substr(rightIndex+rightRoom);
198                                         candAln = leftCandidateString + insertString + rightCandidateString;
199         
200                                 }
201                                 
202                                 i -= insertLength;
203                         } 
204                 }
205         }
206         catch(exception& e) {
207                 errorOut(e, "Nast", "removeExtraGaps");
208                 exit(1);
209         }       
210         
211 }
212
213 /**************************************************************************************************/
214
215 void Nast::regapSequences(){    //This is essentially part B in Fig 2. of DeSantis et al.
216         try { 
217         
218                 string candPair = candidateSeq->getPairwise();
219                 string candAln = "";
220                 
221                 string tempPair = templateSeq->getPairwise();
222                 string tempAln = templateSeq->getAligned();             //      we use the template aligned sequence as our guide
223                 
224                 int pairwiseLength = candPair.length();
225                 int fullAlignLength = tempAln.length();
226                 
227                 if(candPair == ""){
228                         for(int i=0;i<fullAlignLength;i++)      {       candAln += '.';         }
229                         candidateSeq->setAligned(candAln);
230                         return;
231                 }
232                 
233                 int fullAlignIndex = 0;
234                 int pairwiseAlignIndex = 0;
235                 string newTemplateAlign = "";                                   //      this is going to be messy so we want a temporary template
236                 //      alignment string
237                 while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex]  == '-'){
238                         candAln += '.';                                                         //      add the initial '-' and '.' to the candidate and template
239                         newTemplateAlign += tempAln[fullAlignIndex];//  pairwise sequences
240                         fullAlignIndex++;
241                 }
242                 
243                 string lastLoop = "";
244                 
245                 while(pairwiseAlignIndex<pairwiseLength){
246                         if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
247                            && isalpha(candPair[pairwiseAlignIndex])){
248                                 //  the template and candidate pairwise and template aligned have characters
249                                 //      need to add character onto the candidatSeq.aligned sequence
250                                 
251                                 candAln += candPair[pairwiseAlignIndex];
252                                 newTemplateAlign += tempPair[pairwiseAlignIndex];//
253                                 
254                                 pairwiseAlignIndex++;
255                                 fullAlignIndex++;
256                         }
257                         else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
258                                         && isalpha(candPair[pairwiseAlignIndex])){
259                                 //      the template pairwise and candidate pairwise are characters and the template aligned is a gap
260                                 //      need to insert gaps into the candidateSeq.aligned sequence
261                                 
262                                 candAln += '-';
263                                 newTemplateAlign += '-';//
264                                 fullAlignIndex++;
265                         }
266                         else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
267                                         && isalpha(candPair[pairwiseAlignIndex])){
268                                 //  the template pairwise is a gap and the template aligned and pairwise sequences have characters
269                                 //      this is the alpha scenario.  add character to the candidateSeq.aligned sequence without progressing
270                                 //      further through the tempAln sequence.
271                                 
272                                 candAln += candPair[pairwiseAlignIndex];
273                                 newTemplateAlign += '-';//
274                                 pairwiseAlignIndex++;
275                         }
276                         else if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
277                                         && !isalpha(candPair[pairwiseAlignIndex])){
278                                 //  the template pairwise and full alignment are characters and the candidate sequence has a gap
279                                 //      should not be a big deal, just add the gap position to the candidateSeq.aligned sequence;
280                                 
281                                 candAln += candPair[pairwiseAlignIndex];
282                                 newTemplateAlign += tempAln[fullAlignIndex];//
283                                 fullAlignIndex++;                       
284                                 pairwiseAlignIndex++;
285                         }
286                         else if(!isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
287                                         && isalpha(candPair[pairwiseAlignIndex])){
288                                 //      the template pairwise and aligned are gaps while the candidate pairwise has a character
289                                 //      this would be an insertion, go ahead and add the character->seems to be the opposite of the alpha scenario
290                                 
291                                 candAln += candPair[pairwiseAlignIndex];
292                                 newTemplateAlign += tempAln[fullAlignIndex];//
293                                 pairwiseAlignIndex++;
294                                 fullAlignIndex++;                       
295                         }
296                         else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
297                                         && !isalpha(candPair[pairwiseAlignIndex])){
298                                 //      template pairwise has a character, but its full aligned sequence and candidate sequence have gaps
299                                 //      this would happen like we need to add a gap.  basically the opposite of the alpha situation
300                                 
301                                 newTemplateAlign += tempAln[fullAlignIndex];//
302                                 candAln += "-";
303                                 fullAlignIndex++;                       
304                         }
305                         else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
306                                         && !isalpha(candPair[pairwiseAlignIndex])){
307                                 //      template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible
308                                 //      would skip the gaps and not progress through full alignment sequence
309                                 //      not tested yet
310                                 
311                                 mothurOut("We're into D " + toString(fullAlignIndex) + " " +  toString(pairwiseAlignIndex)); mothurOutEndLine();
312                                 pairwiseAlignIndex++;
313                         }
314                         else{
315                                 //      everything has a gap - not possible
316                                 //      not tested yet
317                                 
318                                 mothurOut("We're into F " +  toString(fullAlignIndex) + " " +  toString(pairwiseAlignIndex)); mothurOutEndLine();
319                                 pairwiseAlignIndex++;
320                                 fullAlignIndex++;                       
321                         }               
322                 }
323                 
324                 for(int i=fullAlignIndex;i<fullAlignLength;i++){
325                         candAln += '.';
326                         newTemplateAlign += tempAln[i];//
327                 }
328                 
329                 int start, end;
330                 for(int i=0;i<candAln.length();i++){
331                         if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }       //      if we padded the alignemnt from
332                         else{                   start = i;                      break;          }                                                       //      blast with Z's, change them to
333                 }                                                                                                                                                               //      '.' characters
334                 
335                 for(int i=candAln.length()-1;i>=0;i--){                                                                                 //      ditto.
336                         if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }
337                         else{                   end = i;                        break;          }
338                 }
339                 
340                 for(int i=start;i<=end;i++){                                    //      go through the candidate alignment sequence and make sure that
341                         candAln[i] = toupper(candAln[i]);                       //      everything is upper case
342                 }
343                 
344                 
345                 if(candAln.length() != tempAln.length()){               //      if the regapped candidate sequence is longer than the official
346                         removeExtraGaps(candAln, tempAln, newTemplateAlign);//  template alignment then we need to do steps C-F in Fig.
347                 }                                                                                               //      2 of Desantis et al.
348                 
349                 
350                 candidateSeq->setAligned(candAln);
351         }
352         catch(exception& e) {
353                 errorOut(e, "Nast", "regapSequences");
354                 exit(1);
355         }       
356 }
357
358 /**************************************************************************************************/
359
360 float Nast::getSimilarityScore(){
361         try {
362         
363                 string cand = candidateSeq->getAligned();
364                 string temp = templateSeq->getAligned();
365                 int alignmentLength = temp.length();
366                 int mismatch = 0;
367                 int denominator = 0;
368                 
369                 for(int i=0;i<alignmentLength;i++){
370                         if(cand[i] == '-' && temp[i] == '-'){
371                                 
372                         }
373                         else if(cand[i] != '.' && temp[i] != '.'){
374                                 denominator++;
375                                 
376                                 if(cand[i] != temp[i]){
377                                         mismatch++;
378                                 }
379                         }
380                 }
381                 float similarity = 100 * (1. - mismatch / (float)denominator);
382                 if(denominator == 0){   similarity = 0.0000;    }
383                 
384                 return similarity;
385                 
386         }
387         catch(exception& e) {
388                 errorOut(e, "Nast", "getSimilarityScore");
389                 exit(1);
390         }       
391 }
392
393 /**************************************************************************************************/
394
395 int Nast::getMaxInsertLength(){
396         
397         return maxInsertLength;
398         
399 }
400         
401 /**************************************************************************************************/