]> git.donarmstrong.com Git - mothur.git/blob - nast.cpp
fixed bug in nast:removeextragaps
[mothur.git] / nast.cpp
1 /*
2  *  nast.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/17/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
9  *
10  *      DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL.  2006.  NAST: a multiple
11  *              sequence alignment server for comparative analysis of 16S rRNA genes.  Nucleic Acids Research.  34:W394-9.
12  *
13  *      To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
14  *      and candidate sequence that are to be aligned to each other.
15  *
16  */
17
18 #include "sequence.hpp"
19 #include "alignment.hpp"
20 #include "nast.hpp"
21
22 /**************************************************************************************************/
23
24 Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) {
25         try {
26                 m = MothurOut::getInstance();
27                 maxInsertLength = 0;
28                 pairwiseAlignSeqs();    //      This is part A in Fig. 2 of DeSantis et al.
29                 regapSequences();               //      This is parts B-F in Fig. 2 of DeSantis et al.
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "Nast", "Nast");
33                 exit(1);
34         }
35 }
36
37 /**************************************************************************************************/
38
39 void Nast::pairwiseAlignSeqs(){ //      Here we call one of the pairwise alignment methods to align our unaligned candidate
40                                                                 //      and template sequences
41         try {
42
43                 alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
44
45                 string candAln = alignment->getSeqAAln();
46                 string tempAln = alignment->getSeqBAln();
47         
48                 if(candAln == ""){
49
50                         candidateSeq->setPairwise("");
51                         templateSeq->setPairwise(templateSeq->getUnaligned());
52
53                 }
54                 else{
55                         if(tempAln[0] == '-'){
56                                 int pairwiseAlignmentLength = tempAln.length(); //      we need to make sure that the candidate sequence alignment
57                                 for(int i=0;i<pairwiseAlignmentLength;i++){             //      starts where the template sequence alignment starts, if it
58                                         if(isalpha(tempAln[i])){                                        //      starts early, we nuke the beginning of the candidate
59                                                 candAln = candAln.substr(i);                    //      sequence
60                                                 tempAln = tempAln.substr(i);
61                                                 break;
62                                         }
63                                 }
64                         }
65                         int pairwiseAlignmentLength = tempAln.length();
66                         if(tempAln[pairwiseAlignmentLength-1] == '-'){          //      we need to make sure that the candidate sequence alignment
67                                 for(int i=pairwiseAlignmentLength-1; i>=0; i--){//      ends where the template sequence alignment ends, if it runs
68                                         if(isalpha(tempAln[i])){                                        //      long, we nuke the end of the candidate sequence
69                                                 candAln = candAln.substr(0,i+1);
70                                                 tempAln = tempAln.substr(0,i+1);
71                                                 break;
72                                         }               
73                                 }
74                         }
75
76                 }
77
78                 candidateSeq->setPairwise(candAln);                                     //      set the pairwise sequences in the Sequence objects for
79                 templateSeq->setPairwise(tempAln);                                      //      the candidate and template sequences
80         }
81         catch(exception& e) {
82                 m->errorOut(e, "Nast", "pairwiseAlignSeqs");
83                 exit(1);
84         }       
85 }
86
87 /**************************************************************************************************/
88
89 void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){
90
91 //      here we do steps C-F of Fig. 2 from DeSantis et al.
92         try {
93         
94                 int longAlignmentLength = newTemplateAlign.length();    
95         
96                 for(int i=0; i<longAlignmentLength; i++){                               //      use the long alignment as the standard
97                         int rightIndex, rightRoom, leftIndex, leftRoom;
98                         
99                         //      Part C of Fig. 2 from DeSantis et al.
100                         if((isalpha(newTemplateAlign[i]) != isalpha(tempAln[i]))){      //if there is a discrepancy between the regapped
101                                 
102                                 rightRoom = 0; leftRoom = 0;
103                                 
104                                 //      Part D of Fig. 2 from DeSantis et al.           //      template sequence and the official template sequence
105                                 for(leftIndex=i-1;leftIndex>0;leftIndex--){             //      then we've got problems...
106                                         if(!isalpha(candAln[leftIndex])){
107                                                 leftRoom = 1;   //count how far it is to the nearest gap on the LEFT side of the anomaly
108                                                 while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom]))   {       leftRoom++;             }
109                                                 break;
110                                         }
111                                 }
112
113                                 for(rightIndex=i+1;rightIndex<longAlignmentLength-1;rightIndex++){
114                                         if(!isalpha(candAln[rightIndex])){
115                                                 rightRoom = 1;  //count how far it is to the nearest gap on the RIGHT side of the anomaly
116                                                 while(rightIndex+rightRoom<longAlignmentLength && !isalpha(candAln[rightIndex+rightRoom]))      {       rightRoom++;    }
117                                                 break;
118                                         }
119                                 }
120                                                                 
121                                 int insertLength = 0;                                                   //      figure out how long the anomaly is
122                                 while(!isalpha(newTemplateAlign[i + insertLength]))     {       insertLength++; }
123                                 if(insertLength > maxInsertLength){     maxInsertLength = insertLength; }
124                 
125                                 if((leftRoom + rightRoom) >= insertLength){
126         
127                                         //      Parts D & E from Fig. 2 of DeSantis et al.
128                                         if((i-leftIndex) <= (rightIndex-i)){            //      the left gap is closer - > move stuff left there's
129         
130                                                 if(leftRoom >= insertLength){                   //      enough room to the left to move
131                                                         string leftTemplateString = newTemplateAlign.substr(0,i);
132                                                         string rightTemplateString = newTemplateAlign.substr(i+insertLength);
133                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
134                                                         longAlignmentLength = newTemplateAlign.length();
135                                                         
136                                                         string leftCandidateString = candAln.substr(0,leftIndex-insertLength+1);
137                                                         string rightCandidateString = candAln.substr(leftIndex+1);
138                                                         candAln = leftCandidateString + rightCandidateString;
139                 
140                                                 }
141                                                 else{                                                                   //      not enough room to the left, have to steal some space to
142                                                         string leftTemplateString = newTemplateAlign.substr(0,i);       //      the right
143                                                         string rightTemplateString = newTemplateAlign.substr(i+insertLength);
144                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
145                                                         longAlignmentLength = newTemplateAlign.length();
146                                                         
147                                                         string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
148                                                         string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
149                                                         string rightCandidateString = candAln.substr(rightIndex+(insertLength-leftRoom));
150                                                         candAln = leftCandidateString + insertString + rightCandidateString;
151                                 
152                                                 }
153                                         }
154                                         else{                                                                           //      the right gap is closer - > move stuff right there's
155                                                 if(rightRoom >= insertLength){                  //      enough room to the right to move
156                                                         string leftTemplateString = newTemplateAlign.substr(0,i);
157                                                         string rightTemplateString = newTemplateAlign.substr(i+insertLength);
158                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
159                                                         longAlignmentLength = newTemplateAlign.length();
160                                                         
161                                                         string leftCandidateString = candAln.substr(0,rightIndex);
162                                                         string rightCandidateString = candAln.substr(rightIndex+insertLength);
163                                                         candAln = leftCandidateString + rightCandidateString;   
164                                                                         
165                                                 }
166                                                 else{                                                                   //      not enough room to the right, have to steal some        
167                                                         //      space to the left lets move left and then right...
168                                                         string leftTemplateString = newTemplateAlign.substr(0,i);
169                                                         string rightTemplateString = newTemplateAlign.substr(i+insertLength);
170                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
171                                                         longAlignmentLength = newTemplateAlign.length();
172                                                                         
173                                                         string leftCandidateString = candAln.substr(0,leftIndex-(insertLength-rightRoom)+1);
174                                                         string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
175                                                         string rightCandidateString = candAln.substr(rightIndex+rightRoom);
176                                                         candAln = leftCandidateString + insertString + rightCandidateString;    
177                                                                         
178                                                 }
179                                         }
180                                         i -= insertLength;
181
182                                 }
183                                 else{
184                         //      there could be a case where there isn't enough room in either direction to move stuff
185
186                                         string leftTemplateString = newTemplateAlign.substr(0,i);       
187                                         string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom);
188                                         newTemplateAlign = leftTemplateString + rightTemplateString;
189                                         longAlignmentLength = newTemplateAlign.length();
190                                                         
191                                         string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
192                                         string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
193                                         string rightCandidateString = candAln.substr(rightIndex+rightRoom);
194                                         candAln = leftCandidateString + insertString + rightCandidateString;
195                                         
196                                         i -= (leftRoom + rightRoom);
197                                 }
198                         
199 //                              i -= insertLength;
200                         } 
201                 }
202         }
203         catch(exception& e) {
204                 m->errorOut(e, "Nast", "removeExtraGaps");
205                 exit(1);
206         }       
207 }
208
209 /**************************************************************************************************/
210
211 void Nast::regapSequences(){    //This is essentially part B in Fig 2. of DeSantis et al.
212         try { 
213         
214                 string candPair = candidateSeq->getPairwise();
215                 string candAln = "";
216                 
217                 string tempPair = templateSeq->getPairwise();
218                 string tempAln = templateSeq->getAligned();             //      we use the template aligned sequence as our guide
219                 
220                 int pairwiseLength = candPair.length();
221                 int fullAlignLength = tempAln.length();
222                 
223                 if(candPair == ""){
224                         for(int i=0;i<fullAlignLength;i++)      {       candAln += '.';         }
225                         candidateSeq->setAligned(candAln);
226                         return;
227                 }
228         
229                 int fullAlignIndex = 0;
230                 int pairwiseAlignIndex = 0;
231                 string newTemplateAlign = "";                                   //      this is going to be messy so we want a temporary template
232                 //      alignment string
233                 while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex]  == '-'){
234                         candAln += '.';                                                         //      add the initial '-' and '.' to the candidate and template
235                         newTemplateAlign += tempAln[fullAlignIndex];//  pairwise sequences
236                         fullAlignIndex++;
237                 }
238
239                 string lastLoop = "";
240                 
241                 while(pairwiseAlignIndex<pairwiseLength){
242                         if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
243                            && isalpha(candPair[pairwiseAlignIndex])){
244                                 //  the template and candidate pairwise and template aligned have characters
245                                 //      need to add character onto the candidatSeq.aligned sequence
246                                 
247                                 candAln += candPair[pairwiseAlignIndex];
248                                 newTemplateAlign += tempPair[pairwiseAlignIndex];//
249                                 
250                                 pairwiseAlignIndex++;
251                                 fullAlignIndex++;
252                         }
253                         else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
254                                         && isalpha(candPair[pairwiseAlignIndex])){
255                                 //      the template pairwise and candidate pairwise are characters and the template aligned is a gap
256                                 //      need to insert gaps into the candidateSeq.aligned sequence
257                                 
258                                 candAln += '-';
259                                 newTemplateAlign += '-';//
260                                 fullAlignIndex++;
261                         }
262                         else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
263                                         && isalpha(candPair[pairwiseAlignIndex])){
264                                 //  the template pairwise is a gap and the template aligned and pairwise sequences have characters
265                                 //      this is the alpha scenario.  add character to the candidateSeq.aligned sequence without progressing
266                                 //      further through the tempAln sequence.
267                                 
268                                 candAln += candPair[pairwiseAlignIndex];
269                                 newTemplateAlign += '-';//
270                                 pairwiseAlignIndex++;
271                         }
272                         else if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
273                                         && !isalpha(candPair[pairwiseAlignIndex])){
274                                 //  the template pairwise and full alignment are characters and the candidate sequence has a gap
275                                 //      should not be a big deal, just add the gap position to the candidateSeq.aligned sequence;
276                                 
277                                 candAln += candPair[pairwiseAlignIndex];
278                                 newTemplateAlign += tempAln[fullAlignIndex];//
279                                 fullAlignIndex++;                       
280                                 pairwiseAlignIndex++;
281                         }
282                         else if(!isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
283                                         && isalpha(candPair[pairwiseAlignIndex])){
284                                 //      the template pairwise and aligned are gaps while the candidate pairwise has a character
285                                 //      this would be an insertion, go ahead and add the character->seems to be the opposite of the alpha scenario
286                                 
287                                 candAln += candPair[pairwiseAlignIndex];
288                                 newTemplateAlign += tempAln[fullAlignIndex];//
289                                 pairwiseAlignIndex++;
290                                 fullAlignIndex++;                       
291                         }
292                         else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
293                                         && !isalpha(candPair[pairwiseAlignIndex])){
294                                 //      template pairwise has a character, but its full aligned sequence and candidate sequence have gaps
295                                 //      this would happen like we need to add a gap.  basically the opposite of the alpha situation
296                                 
297                                 newTemplateAlign += tempAln[fullAlignIndex];//
298                                 candAln += "-";
299                                 fullAlignIndex++;                       
300                         }
301                         else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
302                                         && !isalpha(candPair[pairwiseAlignIndex])){
303                                 //      template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible
304                                 //      would skip the gaps and not progress through full alignment sequence
305                                 //      not tested yet
306                                 
307                                 m->mothurOut("We're into D " + toString(fullAlignIndex) + " " +  toString(pairwiseAlignIndex)); m->mothurOutEndLine();
308                                 pairwiseAlignIndex++;
309                         }
310                         else{
311                                 //      everything has a gap - not possible
312                                 //      not tested yet
313                                 
314                                 m->mothurOut("We're into F " +  toString(fullAlignIndex) + " " +  toString(pairwiseAlignIndex)); m->mothurOutEndLine();
315                                 pairwiseAlignIndex++;
316                                 fullAlignIndex++;                       
317                         }               
318                 }
319                 
320                 for(int i=fullAlignIndex;i<fullAlignLength;i++){
321                         candAln += '.';
322                         newTemplateAlign += tempAln[i];//
323                 }
324                 
325                 int start = 0;
326                 int end = candAln.length()-1;
327
328                 for(int i=0;i<candAln.length();i++){
329                         if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }       //      if we padded the alignemnt from
330                         else{                   start = i;                      break;          }                                                       //      blast with Z's, change them to
331                 }                                                                                                                                                               //      '.' characters
332                 
333                 for(int i=candAln.length()-1;i>=0;i--){                                                                                 //      ditto.
334                         if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }
335                         else{                   end = i;                        break;          }
336                 }
337                 
338                 for(int i=start;i<=end;i++){                                    //      go through the candidate alignment sequence and make sure that
339                         candAln[i] = toupper(candAln[i]);                       //      everything is upper case
340                 }
341                 
342
343                 if(candAln.length() != tempAln.length()){               //      if the regapped candidate sequence is longer than the official
344                         removeExtraGaps(candAln, tempAln, newTemplateAlign);//  template alignment then we need to do steps C-F in Fig.
345                 }                                                                                               //      2 of Desantis et al.
346
347                 candidateSeq->setAligned(candAln);
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "Nast", "regapSequences");
351                 exit(1);
352         }       
353 }
354
355 /**************************************************************************************************/
356
357 float Nast::getSimilarityScore(){
358         try {
359         
360                 string cand = candidateSeq->getAligned();
361                 string temp = templateSeq->getAligned();
362                 int alignmentLength = temp.length();
363                 int mismatch = 0;
364                 int denominator = 0;
365                 
366                 for(int i=0;i<alignmentLength;i++){
367                         if(cand[i] == '-' && temp[i] == '-'){
368                                 
369                         }
370                         else if(cand[i] != '.' && temp[i] != '.'){
371                                 denominator++;
372                                 
373                                 if(cand[i] != temp[i]){
374                                         mismatch++;
375                                 }
376                         }
377                 }
378                 float similarity = 100 * (1. - mismatch / (float)denominator);
379                 if(denominator == 0){   similarity = 0.0000;    }
380                 
381                 return similarity;
382                 
383         }
384         catch(exception& e) {
385                 m->errorOut(e, "Nast", "getSimilarityScore");
386                 exit(1);
387         }       
388 }
389
390 /**************************************************************************************************/
391
392 int Nast::getMaxInsertLength(){
393         
394         return maxInsertLength;
395         
396 }
397         
398 /**************************************************************************************************/