]> git.donarmstrong.com Git - mothur.git/blob - nast.cpp
fixed bug with aligner and longest base. added deunique.seqs command.
[mothur.git] / nast.cpp
1 /*
2  *  nast.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/17/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
9  *
10  *      DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL.  2006.  NAST: a multiple
11  *              sequence alignment server for comparative analysis of 16S rRNA genes.  Nucleic Acids Research.  34:W394-9.
12  *
13  *      To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
14  *      and candidate sequence that are to be aligned to each other.
15  *
16  */
17
18 #include "sequence.hpp"
19 #include "alignment.hpp"
20 #include "nast.hpp"
21
22 /**************************************************************************************************/
23
24 Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) {
25         try {
26                 m = MothurOut::getInstance();
27                 maxInsertLength = 0;
28         
29                 pairwiseAlignSeqs();    //      This is part A in Fig. 2 of DeSantis et al.
30                 regapSequences();               //      This is parts B-F in Fig. 2 of DeSantis et al.
31                 
32         }
33         catch(exception& e) {
34                 m->errorOut(e, "Nast", "Nast");
35                 exit(1);
36         }
37 }
38
39 /**************************************************************************************************/
40
41 void Nast::pairwiseAlignSeqs(){ //      Here we call one of the pairwise alignment methods to align our unaligned candidate
42                                                                 //      and template sequences
43         try {   
44                 alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
45
46                 string candAln = alignment->getSeqAAln();
47                 string tempAln = alignment->getSeqBAln();
48
49                 if(candAln == ""){
50
51                         candidateSeq->setPairwise("");
52                         templateSeq->setPairwise(templateSeq->getUnaligned());
53
54                 }
55                 else{
56                         if(tempAln[0] == '-'){
57                                 int pairwiseAlignmentLength = tempAln.length(); //      we need to make sure that the candidate sequence alignment
58                                 for(int i=0;i<pairwiseAlignmentLength;i++){             //      starts where the template sequence alignment starts, if it
59                                         if(isalpha(tempAln[i])){                                        //      starts early, we nuke the beginning of the candidate
60                                                 candAln = candAln.substr(i);                    //      sequence
61                                                 tempAln = tempAln.substr(i);
62                                                 break;
63                                         }
64                                 }
65                         }
66                         int pairwiseAlignmentLength = tempAln.length();
67                         if(tempAln[pairwiseAlignmentLength-1] == '-'){          //      we need to make sure that the candidate sequence alignment
68                                 for(int i=pairwiseAlignmentLength-1; i>=0; i--){//      ends where the template sequence alignment ends, if it runs
69                                         if(isalpha(tempAln[i])){                                        //      long, we nuke the end of the candidate sequence
70                                                 candAln = candAln.substr(0,i+1);
71                                                 tempAln = tempAln.substr(0,i+1);
72                                                 break;
73                                         }               
74                                 }
75                         }
76
77                 }
78
79                 candidateSeq->setPairwise(candAln);                                     //      set the pairwise sequences in the Sequence objects for
80                 templateSeq->setPairwise(tempAln);                                      //      the candidate and template sequences
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "Nast", "pairwiseAlignSeqs");
84                 exit(1);
85         }       
86 }
87
88 /**************************************************************************************************/
89
90 void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){
91
92 //      here we do steps C-F of Fig. 2 from DeSantis et al.
93         try {
94         
95                 //cout << candAln << endl;
96                 //cout << tempAln << endl;
97                 //cout << newTemplateAlign << endl;
98                 //cout << endl;
99                 
100                 int longAlignmentLength = newTemplateAlign.length();    
101         
102                 for(int i=0; i<longAlignmentLength; i++){                               //      use the long alignment as the standard
103                         int rightIndex, rightRoom, leftIndex, leftRoom;
104         
105                         //      Part C of Fig. 2 from DeSantis et al.
106                         if((isalpha(newTemplateAlign[i]) != isalpha(tempAln[i]))){      //if there is a discrepancy between the regapped
107                                 
108                                 rightRoom = 0; leftRoom = 0;
109                                 
110                                 //      Part D of Fig. 2 from DeSantis et al.           //      template sequence and the official template sequence
111                                 for(leftIndex=i-1;leftIndex>0;leftIndex--){             //      then we've got problems...
112                                         if(!isalpha(candAln[leftIndex])){
113                                                 leftRoom = 1;   //count how far it is to the nearest gap on the LEFT side of the anomaly
114                                                 while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom]))   {       leftRoom++;             }
115                                                 break;
116                                         }
117                                 }
118
119                                 for(rightIndex=i+1;rightIndex<longAlignmentLength-1;rightIndex++){
120                                         if(!isalpha(candAln[rightIndex])){
121                                                 rightRoom = 1;  //count how far it is to the nearest gap on the RIGHT side of the anomaly
122                                                 while(rightIndex+rightRoom<longAlignmentLength && !isalpha(candAln[rightIndex+rightRoom]))      {       rightRoom++;    }
123                                                 break;
124                                         }
125                                 }
126                                                                 
127                                 int insertLength = 0;                                                   //      figure out how long the anomaly is
128                                 while(!isalpha(newTemplateAlign[i + insertLength]))     {       insertLength++; }
129                                 if(insertLength > maxInsertLength){     maxInsertLength = insertLength; }
130                 
131                                 if((leftRoom + rightRoom) >= insertLength){
132         
133                                         //      Parts D & E from Fig. 2 of DeSantis et al.
134                                         if((i-leftIndex) <= (rightIndex-i)){            //      the left gap is closer - > move stuff left there's
135         
136                                                 if(leftRoom >= insertLength){                   //      enough room to the left to move
137                         //cout << "lr newTemplateAlign = " << newTemplateAlign.length() << '\t' << i << '\t' << insertLength << endl;
138                                                         string leftTemplateString = newTemplateAlign.substr(0,i);
139                                                         string rightTemplateString = newTemplateAlign.substr((i+insertLength));
140                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
141                                                         longAlignmentLength = newTemplateAlign.length();
142                         //cout << "lr candAln = " << candAln.length() << '\t' << leftIndex << '\t'  << endl;                            
143                                                         string leftCandidateString = candAln.substr(0,(leftIndex-insertLength+1));
144                                                         string rightCandidateString = candAln.substr((leftIndex+1));
145                                                         candAln = leftCandidateString + rightCandidateString;
146                 
147                                                 }
148                                                 else{                                                                   //      not enough room to the left, have to steal some space to
149                                                 
150                         //cout << "in else lr newTemplateAlign = " << newTemplateAlign.length() << '\t' << i << '\t' << insertLength << endl;
151                                                         string leftTemplateString = newTemplateAlign.substr(0,i);       //      the right
152                                                         string rightTemplateString = newTemplateAlign.substr((i+insertLength));
153                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
154                                                         longAlignmentLength = newTemplateAlign.length();
155                         //cout << " in else lr candAln = " << candAln.length() << '\t' << " leftIndex = " << leftIndex << " leftroom = " << leftRoom << " rightIndex = " << rightIndex << '\t' << endl;                                 
156                                                         string leftCandidateString = candAln.substr(0,(leftIndex-leftRoom+1));
157                                                         string insertString = candAln.substr((leftIndex+1),(rightIndex-leftIndex-1));
158                                                         string rightCandidateString = candAln.substr((rightIndex+(insertLength-leftRoom)));
159                                                         candAln = leftCandidateString + insertString + rightCandidateString;
160                                 
161                                                 }
162                                         }
163                                         else{                                                                           //      the right gap is closer - > move stuff right there's
164                                                 if(rightRoom >= insertLength){                  //      enough room to the right to move
165                         //cout << "rr newTemplateAlign = " << newTemplateAlign.length() << '\t' << i << '\t' << i+insertLength << endl;
166                                                         string leftTemplateString = newTemplateAlign.substr(0,i);
167                                                         string rightTemplateString = newTemplateAlign.substr((i+insertLength));
168                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
169                                                         longAlignmentLength = newTemplateAlign.length();
170                         //cout << "rr candAln = " << candAln.length() << '\t' << i << '\t' << rightIndex << '\t' << rightIndex+insertLength << endl;                            
171                                                         string leftCandidateString = candAln.substr(0,rightIndex);
172                                                         string rightCandidateString = candAln.substr((rightIndex+insertLength));
173                                                         candAln = leftCandidateString + rightCandidateString;   
174                                                                         
175                                                 }
176                                                 else{                                                                   //      not enough room to the right, have to steal some        
177                                                         //      space to the left lets move left and then right...
178                                         //cout << "in else rr newTemplateAlign = " << newTemplateAlign.length() << '\t' << i << '\t' << i+insertLength << endl;
179                                                         string leftTemplateString = newTemplateAlign.substr(0,i);
180                                                         string rightTemplateString = newTemplateAlign.substr((i+insertLength));
181                                                         newTemplateAlign = leftTemplateString + rightTemplateString;
182                                                         longAlignmentLength = newTemplateAlign.length();
183                                         //cout << "in else rr candAln = " << candAln.length() << '\t' << '\t' << (leftIndex-(insertLength-rightRoom)+1) << '\t' <<  (leftIndex+1,rightIndex-leftIndex-1) << '\t' << (rightIndex+rightRoom) << endl;                             
184                                                         string leftCandidateString = candAln.substr(0,(leftIndex-(insertLength-rightRoom)+1));
185                                                         string insertString = candAln.substr((leftIndex+1),(rightIndex-leftIndex-1));
186                                                         string rightCandidateString = candAln.substr((rightIndex+rightRoom));
187                                                         candAln = leftCandidateString + insertString + rightCandidateString;    
188                                                                         
189                                                 }
190                                         }
191                                         i -= insertLength;
192
193                                 }
194                                 else{
195                         //      there could be a case where there isn't enough room in either direction to move stuff
196 //cout << "in else else newTemplateAlign = " << newTemplateAlign.length() << '\t' << i << '\t' << (i+leftRoom+rightRoom) << endl;
197                                         string leftTemplateString = newTemplateAlign.substr(0,i);       
198                                         string rightTemplateString = newTemplateAlign.substr((i+leftRoom+rightRoom));
199                                         newTemplateAlign = leftTemplateString + rightTemplateString;
200                                         longAlignmentLength = newTemplateAlign.length();
201                                                         
202                 //cout << "in else else newTemplateAlign = " << candAln.length() << '\t' << (leftIndex-leftRoom+1) << '\t' << (leftIndex+1) << '\t' << (rightIndex-leftIndex-1) << '\t' << (rightIndex+rightRoom) << endl;      
203                                         string leftCandidateString = candAln.substr(0,(leftIndex-leftRoom+1));
204                                         string insertString = candAln.substr((leftIndex+1),(rightIndex-leftIndex-1));
205                                         string rightCandidateString = candAln.substr((rightIndex+rightRoom));
206                                         candAln = leftCandidateString + insertString + rightCandidateString;
207                                         
208                                         i -= (leftRoom + rightRoom);
209                                 }
210                         
211 //                              i -= insertLength;
212                                 
213                                 //if i is negative, we want to remove the extra gaps to the right
214                                 if (i < 0) { cout << "i is negative" << endl; }
215                         } 
216                 }
217         }
218         catch(exception& e) {
219                 m->errorOut(e, "Nast", "removeExtraGaps");
220                 exit(1);
221         }       
222 }
223
224 /**************************************************************************************************/
225
226 void Nast::regapSequences(){    //This is essentially part B in Fig 2. of DeSantis et al.
227         try { 
228         //cout << candidateSeq->getName() << endl;
229                 string candPair = candidateSeq->getPairwise();
230                 string candAln = "";
231                 
232                 string tempPair = templateSeq->getPairwise();
233                 string tempAln = templateSeq->getAligned();             //      we use the template aligned sequence as our guide
234                 
235                 int pairwiseLength = candPair.length();
236                 int fullAlignLength = tempAln.length();
237                 
238                 if(candPair == ""){
239                         for(int i=0;i<fullAlignLength;i++)      {       candAln += '.';         }
240                         candidateSeq->setAligned(candAln);
241                         return;
242                 }
243         
244                 int fullAlignIndex = 0;
245                 int pairwiseAlignIndex = 0;
246                 string newTemplateAlign = "";                                   //      this is going to be messy so we want a temporary template
247                 //      alignment string
248                 while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex]  == '-'){
249                         candAln += '.';                                                         //      add the initial '-' and '.' to the candidate and template
250                         newTemplateAlign += tempAln[fullAlignIndex];//  pairwise sequences
251                         fullAlignIndex++;
252                 }
253
254                 string lastLoop = "";
255                 
256                 while(pairwiseAlignIndex<pairwiseLength){
257         //cout << pairwiseAlignIndex << '\t' << fullAlignIndex << '\t' << pairwiseLength << endl;
258                         if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
259                            && isalpha(candPair[pairwiseAlignIndex])){
260                                 //  the template and candidate pairwise and template aligned have characters
261                                 //      need to add character onto the candidatSeq.aligned sequence
262                                 
263                                 candAln += candPair[pairwiseAlignIndex];
264                                 newTemplateAlign += tempPair[pairwiseAlignIndex];//
265                                 
266                                 pairwiseAlignIndex++;
267                                 fullAlignIndex++;
268                         }
269                         else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
270                                         && isalpha(candPair[pairwiseAlignIndex])){
271                                 //      the template pairwise and candidate pairwise are characters and the template aligned is a gap
272                                 //      need to insert gaps into the candidateSeq.aligned sequence
273                                 
274                                 candAln += '-';
275                                 newTemplateAlign += '-';//
276                                 fullAlignIndex++;
277                         }
278                         else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
279                                         && isalpha(candPair[pairwiseAlignIndex])){
280                                 //  the template pairwise is a gap and the template aligned and pairwise sequences have characters
281                                 //      this is the alpha scenario.  add character to the candidateSeq.aligned sequence without progressing
282                                 //      further through the tempAln sequence.
283                                 
284                                 candAln += candPair[pairwiseAlignIndex];
285                                 newTemplateAlign += '-';//
286                                 pairwiseAlignIndex++;
287                         }
288                         else if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
289                                         && !isalpha(candPair[pairwiseAlignIndex])){
290                                 //  the template pairwise and full alignment are characters and the candidate sequence has a gap
291                                 //      should not be a big deal, just add the gap position to the candidateSeq.aligned sequence;
292                                 
293                                 candAln += candPair[pairwiseAlignIndex];
294                                 newTemplateAlign += tempAln[fullAlignIndex];//
295                                 fullAlignIndex++;                       
296                                 pairwiseAlignIndex++;
297                         }
298                         else if(!isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
299                                         && isalpha(candPair[pairwiseAlignIndex])){
300                                 //      the template pairwise and aligned are gaps while the candidate pairwise has a character
301                                 //      this would be an insertion, go ahead and add the character->seems to be the opposite of the alpha scenario
302                                 
303                                 candAln += candPair[pairwiseAlignIndex];
304                                 newTemplateAlign += tempAln[fullAlignIndex];//
305                                 pairwiseAlignIndex++;
306                                 fullAlignIndex++;                       
307                         }
308                         else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
309                                         && !isalpha(candPair[pairwiseAlignIndex])){
310                                 //      template pairwise has a character, but its full aligned sequence and candidate sequence have gaps
311                                 //      this would happen like we need to add a gap.  basically the opposite of the alpha situation
312                                 
313                                 newTemplateAlign += tempAln[fullAlignIndex];//
314                                 candAln += "-";
315                                 fullAlignIndex++;                       
316                         }
317                         else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
318                                         && !isalpha(candPair[pairwiseAlignIndex])){
319                                 //      template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible
320                                 //      would skip the gaps and not progress through full alignment sequence
321                                 //      not tested yet
322                                 
323                                 m->mothurOut("We're into D " + toString(fullAlignIndex) + " " +  toString(pairwiseAlignIndex)); m->mothurOutEndLine();
324                                 pairwiseAlignIndex++;
325                         }
326                         else{
327                                 //      everything has a gap - not possible
328                                 //      not tested yet
329                                 
330                                 m->mothurOut("We're into F " +  toString(fullAlignIndex) + " " +  toString(pairwiseAlignIndex)); m->mothurOutEndLine();
331                                 pairwiseAlignIndex++;
332                                 fullAlignIndex++;                       
333                         }               
334                 }
335                 
336                 for(int i=fullAlignIndex;i<fullAlignLength;i++){
337                         candAln += '.';
338                         newTemplateAlign += tempAln[i];//
339                 }
340                 
341                 int start = 0;
342                 int end = candAln.length()-1;
343
344                 for(int i=0;i<candAln.length();i++){
345                         if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }       //      if we padded the alignemnt from
346                         else{                   start = i;                      break;          }                                                       //      blast with Z's, change them to
347                 }                                                                                                                                                               //      '.' characters
348                 
349                 for(int i=candAln.length()-1;i>=0;i--){                                                                                 //      ditto.
350                         if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }
351                         else{                   end = i;                        break;          }
352                 }
353                 
354                 for(int i=start;i<=end;i++){                                    //      go through the candidate alignment sequence and make sure that
355                         candAln[i] = toupper(candAln[i]);                       //      everything is upper case
356                 }
357                 
358
359                 if(candAln.length() != tempAln.length()){               //      if the regapped candidate sequence is longer than the official
360                         removeExtraGaps(candAln, tempAln, newTemplateAlign);//  template alignment then we need to do steps C-F in Fig.
361                 }                                                                                               //      2 of Desantis et al.
362
363                 candidateSeq->setAligned(candAln);
364         //cout << "here" << endl;
365         }
366         catch(exception& e) {
367                 m->errorOut(e, "Nast", "regapSequences");
368                 exit(1);
369         }       
370 }
371
372 /**************************************************************************************************/
373
374 float Nast::getSimilarityScore(){
375         try {
376         
377                 string cand = candidateSeq->getAligned();
378                 string temp = templateSeq->getAligned();
379                 int alignmentLength = temp.length();
380                 int mismatch = 0;
381                 int denominator = 0;
382                 
383                 for(int i=0;i<alignmentLength;i++){
384                         if(cand[i] == '-' && temp[i] == '-'){
385                                 
386                         }
387                         else if(cand[i] != '.' && temp[i] != '.'){
388                                 denominator++;
389                                 
390                                 if(cand[i] != temp[i]){
391                                         mismatch++;
392                                 }
393                         }
394                 }
395                 float similarity = 100 * (1. - mismatch / (float)denominator);
396                 if(denominator == 0){   similarity = 0.0000;    }
397                 
398                 return similarity;
399                 
400         }
401         catch(exception& e) {
402                 m->errorOut(e, "Nast", "getSimilarityScore");
403                 exit(1);
404         }       
405 }
406
407 /**************************************************************************************************/
408
409 int Nast::getMaxInsertLength(){
410         
411         return maxInsertLength;
412         
413 }
414         
415 /**************************************************************************************************/