]> git.donarmstrong.com Git - mothur.git/blob - nast.cpp
fixed some bugs
[mothur.git] / nast.cpp
1 /*
2  *  nast.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/17/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
9  *
10  *      DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL.  2006.  NAST: a multiple
11  *              sequence alignment server for comparative analysis of 16S rRNA genes.  Nucleic Acids Research.  34:W394-9.
12  *
13  *      To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
14  *      and candidate sequence that are to be aligned to each other.
15  *
16  */
17
18 #include "sequence.hpp"
19 #include "alignment.hpp"
20 #include "nast.hpp"
21
22 /**************************************************************************************************/
23
24 Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) {
25         maxInsertLength = 0;
26         pairwiseAlignSeqs();    //      This is part A in Fig. 2 of DeSantis et al.
27         regapSequences();               //      This is parts B-F in Fig. 2 of DeSantis et al.
28 }
29
30 /**************************************************************************************************/
31
32 void Nast::pairwiseAlignSeqs(){ //      Here we call one of the pairwise alignment methods to align our unaligned candidate
33                                                                 //      and template sequences
34         alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
35         
36         string candAln = alignment->getSeqAAln();
37         string tempAln = alignment->getSeqBAln();
38
39         if(candAln == ""){
40                 candidateSeq->setPairwise("");
41                 templateSeq->setPairwise(templateSeq->getUnaligned());
42         }
43         else{
44                 if(tempAln[0] == '-'){
45                         int pairwiseAlignmentLength = tempAln.length(); //      we need to make sure that the candidate sequence alignment
46                         for(int i=0;i<pairwiseAlignmentLength;i++){             //      starts where the template sequence alignment starts, if it
47                                 if(isalpha(tempAln[i])){                                        //      starts early, we nuke the beginning of the candidate
48                                         candAln = candAln.substr(i);                    //      sequence
49                                         tempAln = tempAln.substr(i);
50                                         break;
51                                 }
52                         }
53                 }
54                 
55                 int pairwiseAlignmentLength = tempAln.length();
56                 if(tempAln[pairwiseAlignmentLength-1] == '-'){          //      we need to make sure that the candidate sequence alignment
57                         for(int i=pairwiseAlignmentLength-1; i>=0; i--){//      ends where the template sequence alignment ends, if it runs
58                                 if(isalpha(tempAln[i])){                                        //      long, we nuke the end of the candidate sequence
59                                         candAln = candAln.substr(0,i+1);
60                                         tempAln = tempAln.substr(0,i+1);
61                                         break;
62                                 }               
63                         }
64                 }
65         }
66         candidateSeq->setPairwise(candAln);                                     //      set the pairwise sequences in the Sequence objects for
67         templateSeq->setPairwise(tempAln);                                      //      the candidate and template sequences
68         
69 }
70
71 /**************************************************************************************************/
72
73 void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){
74
75 //      here we do steps C-F of Fig. 2 from DeSantis et al.
76         
77         int longAlignmentLength = newTemplateAlign.length();    
78
79         for(int i=0; i<longAlignmentLength; i++){                               //      use the long alignment as the standard
80                 int rightIndex, rightRoom, leftIndex, leftRoom;
81
82                 //      Part C of Fig. 2 from DeSantis et al.
83                 if((isalpha(newTemplateAlign[i]) != isalpha(tempAln[i]))){      //if there is a discrepancy between the regapped
84                         
85 //                      cout << i << '\t';cout.flush();
86                         
87                         //      Part D of Fig. 2 from DeSantis et al.           //      template sequence and the official template sequence
88                         for(leftIndex=i-1;leftIndex>=0;leftIndex--){    //      then we've got problems...
89                                 if(!isalpha(candAln[leftIndex])){
90                                         leftRoom = 1;   //count how far it is to the nearest gap on the LEFT side of the anomaly
91                                         while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom]))   {       leftRoom++;             }
92                                         break;
93                                 }
94                         }
95 //                      cout << leftIndex << '\t' << leftRoom << endl;
96                         
97                         for(rightIndex=i+1;rightIndex<longAlignmentLength;rightIndex++){
98                                 if(!isalpha(candAln[rightIndex])){
99                                         rightRoom = 1;  //count how far it is to the nearest gap on the RIGHT side of the anomaly
100                                         while(rightIndex+rightRoom<longAlignmentLength && !isalpha(candAln[rightIndex+rightRoom]))      {       rightRoom++;    }
101                                         break;
102                                 }
103                         }
104                         
105                         int insertLength = 0;                                                   //      figure out how long the anomaly is
106                         while(!isalpha(newTemplateAlign[i + insertLength]))     {       insertLength++; }
107                         if(insertLength > maxInsertLength){     maxInsertLength = insertLength; }
108                         
109                         if((leftRoom + rightRoom) >= insertLength){
110                                 //      Parts D & E from Fig. 2 of DeSantis et al.
111                                 if((i-leftIndex) <= (rightIndex-i)){            //      the left gap is closer - > move stuff left there's
112                                         if(leftRoom >= insertLength){                   //      enough room to the left to move
113                                                 string leftTemplateString = newTemplateAlign.substr(0,i);
114                                                 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
115                                                 newTemplateAlign = leftTemplateString + rightTemplateString;
116                                                 longAlignmentLength = newTemplateAlign.length();
117                                                 
118                                                 string leftCandidateString = candAln.substr(0,leftIndex-insertLength+1);
119                                                 string rightCandidateString = candAln.substr(leftIndex+1);
120                                                 candAln = leftCandidateString + rightCandidateString;   
121                                         }
122                                         else{                                                                   //      not enough room to the left, have to steal some space to
123                                                 string leftTemplateString = newTemplateAlign.substr(0,i);       //      the right
124                                                 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
125                                                 newTemplateAlign = leftTemplateString + rightTemplateString;
126                                                 longAlignmentLength = newTemplateAlign.length();
127                                                 
128                                                 string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
129                                                 string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
130                                                 string rightCandidateString = candAln.substr(rightIndex+(insertLength-leftRoom));
131                                                 candAln = leftCandidateString + insertString + rightCandidateString;    
132                                         }
133                                 }
134                                 else{                                                                           //      the right gap is closer - > move stuff right there's
135                                         if(rightRoom >= insertLength){                  //      enough room to the right to move
136                                                 string leftTemplateString = newTemplateAlign.substr(0,i);
137                                                 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
138                                                 newTemplateAlign = leftTemplateString + rightTemplateString;
139                                                 longAlignmentLength = newTemplateAlign.length();
140                                                 
141                                                 string leftCandidateString = candAln.substr(0,rightIndex);
142                                                 string rightCandidateString = candAln.substr(rightIndex+insertLength);
143                                                 candAln = leftCandidateString + rightCandidateString;   
144                                                 
145                                         }
146                                         else{                                                                   //      not enough room to the right, have to steal some 
147                                                                                                                         //      space to the left lets move left and then right...
148                                                 string leftTemplateString = newTemplateAlign.substr(0,i);
149                                                 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
150                                                 newTemplateAlign = leftTemplateString + rightTemplateString;
151                                                 longAlignmentLength = newTemplateAlign.length();
152                                                 
153                                                 string leftCandidateString = candAln.substr(0,leftIndex-(insertLength-rightRoom)+1);
154                                                 string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
155                                                 string rightCandidateString = candAln.substr(rightIndex+rightRoom);
156                                                 candAln = leftCandidateString + insertString + rightCandidateString;    
157                                                 
158                                         }
159                                 }
160                         }
161                         else{                                                                                   //      there could be a case where there isn't enough room in
162                                 string leftTemplateString = newTemplateAlign.substr(0,i);                       //      either direction to move stuff
163                                 string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom);
164                                 newTemplateAlign = leftTemplateString + rightTemplateString;
165                                 longAlignmentLength = newTemplateAlign.length();
166                                 
167                                 string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
168                                 string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
169                                 string rightCandidateString = candAln.substr(rightIndex+rightRoom);
170                                 candAln = leftCandidateString + insertString + rightCandidateString;    
171                         }
172                         
173                         i -= insertLength;
174                 } 
175         }
176 //      cout << candAln << endl << tempAln << endl << newTemplateAlign << endl;
177 }
178
179 /**************************************************************************************************/
180
181 void Nast::regapSequences(){    //This is essentially part B in Fig 2. of DeSantis et al.
182         
183         string candPair = candidateSeq->getPairwise();
184         string candAln = "";
185         
186         string tempPair = templateSeq->getPairwise();
187         string tempAln = templateSeq->getAligned();             //      we use the template aligned sequence as our guide
188         
189         int pairwiseLength = candPair.length();
190         int fullAlignLength = tempAln.length();
191         
192         if(candPair == ""){
193                 for(int i=0;i<fullAlignLength;i++)      {       candAln += '.';         }
194                 candidateSeq->setAligned(candAln);
195                 return;
196         }
197         
198         int fullAlignIndex = 0;
199         int pairwiseAlignIndex = 0;
200         string newTemplateAlign = "";                                   //      this is going to be messy so we want a temporary template
201                                                                                                         //      alignment string
202         while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex]  == '-'){
203                 candAln += '.';                                                         //      add the initial '-' and '.' to the candidate and template
204                 newTemplateAlign += tempAln[fullAlignIndex];//  pairwise sequences
205                 fullAlignIndex++;
206         }
207         
208         string lastLoop = "";
209         
210         while(pairwiseAlignIndex<pairwiseLength){
211                 if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
212                    && isalpha(candPair[pairwiseAlignIndex])){
213                         //  the template and candidate pairwise and template aligned have characters
214                         //      need to add character onto the candidatSeq.aligned sequence
215                         
216                         candAln += candPair[pairwiseAlignIndex];
217                         newTemplateAlign += tempPair[pairwiseAlignIndex];//
218                         
219                         pairwiseAlignIndex++;
220                         fullAlignIndex++;
221                 }
222                 else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
223                                 && isalpha(candPair[pairwiseAlignIndex])){
224                         //      the template pairwise and candidate pairwise are characters and the template aligned is a gap
225                         //      need to insert gaps into the candidateSeq.aligned sequence
226                         
227                         candAln += '-';
228                         newTemplateAlign += '-';//
229                         fullAlignIndex++;
230                 }
231                 else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
232                                 && isalpha(candPair[pairwiseAlignIndex])){
233                         //  the template pairwise is a gap and the template aligned and pairwise sequences have characters
234                         //      this is the alpha scenario.  add character to the candidateSeq.aligned sequence without progressing
235                         //      further through the tempAln sequence.
236                         
237                         candAln += candPair[pairwiseAlignIndex];
238                         newTemplateAlign += '-';//
239                         pairwiseAlignIndex++;
240                 }
241                 else if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
242                                 && !isalpha(candPair[pairwiseAlignIndex])){
243                         //  the template pairwise and full alignment are characters and the candidate sequence has a gap
244                         //      should not be a big deal, just add the gap position to the candidateSeq.aligned sequence;
245                         
246                         candAln += candPair[pairwiseAlignIndex];
247                         newTemplateAlign += tempAln[fullAlignIndex];//
248                         fullAlignIndex++;                       
249                         pairwiseAlignIndex++;
250                 }
251                 else if(!isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
252                                 && isalpha(candPair[pairwiseAlignIndex])){
253                         //      the template pairwise and aligned are gaps while the candidate pairwise has a character
254                         //      this would be an insertion, go ahead and add the character->seems to be the opposite of the alpha scenario
255                         
256                         candAln += candPair[pairwiseAlignIndex];
257                         newTemplateAlign += tempAln[fullAlignIndex];//
258                         pairwiseAlignIndex++;
259                         fullAlignIndex++;                       
260                 }
261                 else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
262                                 && !isalpha(candPair[pairwiseAlignIndex])){
263                         //      template pairwise has a character, but its full aligned sequence and candidate sequence have gaps
264                         //      this would happen like we need to add a gap.  basically the opposite of the alpha situation
265                         
266                         newTemplateAlign += tempAln[fullAlignIndex];//
267                         candAln += "-";
268                         fullAlignIndex++;                       
269                 }
270                 else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
271                                 && !isalpha(candPair[pairwiseAlignIndex])){
272                         //      template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible
273                         //      would skip the gaps and not progress through full alignment sequence
274                         //      not tested yet
275                         
276                         cout << "We're into D" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex <<  endl;
277                         pairwiseAlignIndex++;
278                 }
279                 else{
280                         //      everything has a gap - not possible
281                         //      not tested yet
282                         
283                         cout << "We're into F" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex <<  endl;
284                         pairwiseAlignIndex++;
285                         fullAlignIndex++;                       
286                 }               
287         }
288         
289         for(int i=fullAlignIndex;i<fullAlignLength;i++){
290                 candAln += '.';
291                 newTemplateAlign += tempAln[i];//
292         }
293         
294         int start, end;
295         for(int i=0;i<candAln.length();i++){
296                 if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }       //      if we padded the alignemnt from
297                 else{                   start = i;                      break;          }                                                       //      blast with Z's, change them to
298         }                                                                                                                                                               //      '.' characters
299         
300         for(int i=candAln.length()-1;i>=0;i--){                                                                                 //      ditto.
301                 if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }
302                 else{                   end = i;                        break;          }
303         }
304         
305         for(int i=start;i<=end;i++){                                    //      go through the candidate alignment sequence and make sure that
306                 candAln[i] = toupper(candAln[i]);                       //      everything is upper case
307         }
308
309 //      cout << candAln << endl;
310 //      cout << tempAln << endl;
311 //      cout << newTemplateAlign << endl;
312
313         if(candAln.length() != tempAln.length()){               //      if the regapped candidate sequence is longer than the official
314                 removeExtraGaps(candAln, tempAln, newTemplateAlign);//  template alignment then we need to do steps C-F in Fig.
315         }                                                                                               //      2 of Desantis et al.
316         
317 //      cout << candAln << endl;
318 //      cout << tempAln << endl;
319 //      cout << newTemplateAlign << endl;
320
321         candidateSeq->setAligned(candAln);
322 }
323
324 /**************************************************************************************************/
325
326 float Nast::getSimilarityScore(){
327
328         string cand = candidateSeq->getAligned();
329         string temp = templateSeq->getAligned();
330         int alignmentLength = temp.length();
331         int mismatch = 0;
332         int denominator = 0;
333         
334         for(int i=0;i<alignmentLength;i++){
335                 if(cand[i] == '-' && temp[i] == '-'){
336                         
337                 }
338                 else if(cand[i] != '.' && temp[i] != '.'){
339                         denominator++;
340                         
341                         if(cand[i] != temp[i]){
342                                 mismatch++;
343                         }
344                 }
345         }
346         float similarity = 100 * (1. - mismatch / (float)denominator);
347         if(denominator == 0){   similarity = 0.0000;    }
348
349         return similarity;
350 }
351
352 /**************************************************************************************************/
353
354 int Nast::getMaxInsertLength(){
355         
356         return maxInsertLength;
357         
358 }
359         
360 /**************************************************************************************************/