]> git.donarmstrong.com Git - mothur.git/blob - nast.cpp
added alignment code
[mothur.git] / nast.cpp
1 /*
2  *  nast.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/17/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
9  *
10  *      DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL.  2006.  NAST: a multiple
11  *              sequence alignment server for comparative analysis of 16S rRNA genes.  Nucleic Acids Research.  34:W394-9.
12  *
13  *      To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
14  *      and candidate sequence that are to be aligned to each other.
15  *
16  */
17
18 using namespace std;
19
20 #include "sequence.hpp"
21 #include "alignment.hpp"
22 #include "nast.hpp"
23
24 /**************************************************************************************************/
25
26 Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) {
27         maxInsertLength = 0;
28         pairwiseAlignSeqs();    //      This is part A in Fig. 2 of DeSantis et al.
29         regapSequences();               //      This is parts B-F in Fig. 2 of DeSantis et al.
30 }
31
32 /**************************************************************************************************/
33
34 void Nast::pairwiseAlignSeqs(){ //      Here we call one of the pairwise alignment methods to align our unaligned candidate
35                                                                 //      and template sequences
36         alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
37         
38         string candAln = alignment->getSeqAAln();
39         string tempAln = alignment->getSeqBAln();
40
41         if(candAln == ""){
42                 candidateSeq->setPairwise("");
43                 templateSeq->setPairwise(templateSeq->getUnaligned());
44         }
45         else{
46                 if(tempAln[0] == '-'){
47                         int pairwiseAlignmentLength = tempAln.length(); //      we need to make sure that the candidate sequence alignment
48                         for(int i=0;i<pairwiseAlignmentLength;i++){             //      starts where the template sequence alignment starts, if it
49                                 if(isalpha(tempAln[i])){                                        //      starts early, we nuke the beginning of the candidate
50                                         candAln = candAln.substr(i);                    //      sequence
51                                         tempAln = tempAln.substr(i);
52                                         break;
53                                 }
54                         }
55                 }
56                 
57                 int pairwiseAlignmentLength = tempAln.length();
58                 if(tempAln[pairwiseAlignmentLength-1] == '-'){          //      we need to make sure that the candidate sequence alignment
59                         for(int i=pairwiseAlignmentLength-1; i>=0; i--){//      ends where the template sequence alignment ends, if it runs
60                                 if(isalpha(tempAln[i])){                                        //      long, we nuke the end of the candidate sequence
61                                         candAln = candAln.substr(0,i+1);
62                                         tempAln = tempAln.substr(0,i+1);
63                                         break;
64                                 }               
65                         }
66                 }
67         }
68         candidateSeq->setPairwise(candAln);                                     //      set the pairwise sequences in the Sequence objects for
69         templateSeq->setPairwise(tempAln);                                      //      the candidate and template sequences
70         
71 }
72
73 /**************************************************************************************************/
74
75 void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){
76
77 //      here we do steps C-F of Fig. 2 from DeSantis et al.
78         
79         int longAlignmentLength = newTemplateAlign.length();    
80
81         for(int i=0; i<longAlignmentLength; i++){                               //      use the long alignment as the standard
82                 int rightIndex, rightRoom, leftIndex, leftRoom;
83
84                 //      Part C of Fig. 2 from DeSantis et al.
85                 if((isalpha(newTemplateAlign[i]) != isalpha(tempAln[i]))){      //if there is a discrepancy between the regapped
86                         
87 //                      cout << i << '\t';cout.flush();
88                         
89                         //      Part D of Fig. 2 from DeSantis et al.           //      template sequence and the official template sequence
90                         for(leftIndex=i-1;leftIndex>=0;leftIndex--){    //      then we've got problems...
91                                 if(!isalpha(candAln[leftIndex])){
92                                         leftRoom = 1;   //count how far it is to the nearest gap on the LEFT side of the anomaly
93                                         while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom]))   {       leftRoom++;             }
94                                         break;
95                                 }
96                         }
97 //                      cout << leftIndex << '\t' << leftRoom << endl;
98                         
99                         for(rightIndex=i+1;rightIndex<longAlignmentLength;rightIndex++){
100                                 if(!isalpha(candAln[rightIndex])){
101                                         rightRoom = 1;  //count how far it is to the nearest gap on the RIGHT side of the anomaly
102                                         while(rightIndex+rightRoom<longAlignmentLength && !isalpha(candAln[rightIndex+rightRoom]))      {       rightRoom++;    }
103                                         break;
104                                 }
105                         }
106                         
107                         int insertLength = 0;                                                   //      figure out how long the anomaly is
108                         while(!isalpha(newTemplateAlign[i + insertLength]))     {       insertLength++; }
109                         if(insertLength > maxInsertLength){     maxInsertLength = insertLength; }
110                         
111                         if((leftRoom + rightRoom) >= insertLength){
112                                 //      Parts D & E from Fig. 2 of DeSantis et al.
113                                 if((i-leftIndex) <= (rightIndex-i)){            //      the left gap is closer - > move stuff left there's
114                                         if(leftRoom >= insertLength){                   //      enough room to the left to move
115                                                 string leftTemplateString = newTemplateAlign.substr(0,i);
116                                                 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
117                                                 newTemplateAlign = leftTemplateString + rightTemplateString;
118                                                 longAlignmentLength = newTemplateAlign.length();
119                                                 
120                                                 string leftCandidateString = candAln.substr(0,leftIndex-insertLength+1);
121                                                 string rightCandidateString = candAln.substr(leftIndex+1);
122                                                 candAln = leftCandidateString + rightCandidateString;   
123                                         }
124                                         else{                                                                   //      not enough room to the left, have to steal some space to
125                                                 string leftTemplateString = newTemplateAlign.substr(0,i);       //      the right
126                                                 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
127                                                 newTemplateAlign = leftTemplateString + rightTemplateString;
128                                                 longAlignmentLength = newTemplateAlign.length();
129                                                 
130                                                 string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
131                                                 string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
132                                                 string rightCandidateString = candAln.substr(rightIndex+(insertLength-leftRoom));
133                                                 candAln = leftCandidateString + insertString + rightCandidateString;    
134                                         }
135                                 }
136                                 else{                                                                           //      the right gap is closer - > move stuff right there's
137                                         if(rightRoom >= insertLength){                  //      enough room to the right to move
138                                                 string leftTemplateString = newTemplateAlign.substr(0,i);
139                                                 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
140                                                 newTemplateAlign = leftTemplateString + rightTemplateString;
141                                                 longAlignmentLength = newTemplateAlign.length();
142                                                 
143                                                 string leftCandidateString = candAln.substr(0,rightIndex);
144                                                 string rightCandidateString = candAln.substr(rightIndex+insertLength);
145                                                 candAln = leftCandidateString + rightCandidateString;   
146                                                 
147                                         }
148                                         else{                                                                   //      not enough room to the right, have to steal some 
149                                                                                                                         //      space to the left lets move left and then right...
150                                                 string leftTemplateString = newTemplateAlign.substr(0,i);
151                                                 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
152                                                 newTemplateAlign = leftTemplateString + rightTemplateString;
153                                                 longAlignmentLength = newTemplateAlign.length();
154                                                 
155                                                 string leftCandidateString = candAln.substr(0,leftIndex-(insertLength-rightRoom)+1);
156                                                 string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
157                                                 string rightCandidateString = candAln.substr(rightIndex+rightRoom);
158                                                 candAln = leftCandidateString + insertString + rightCandidateString;    
159                                                 
160                                         }
161                                 }
162                         }
163                         else{                                                                                   //      there could be a case where there isn't enough room in
164                                 string leftTemplateString = newTemplateAlign.substr(0,i);                       //      either direction to move stuff
165                                 string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom);
166                                 newTemplateAlign = leftTemplateString + rightTemplateString;
167                                 longAlignmentLength = newTemplateAlign.length();
168                                 
169                                 string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
170                                 string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
171                                 string rightCandidateString = candAln.substr(rightIndex+rightRoom);
172                                 candAln = leftCandidateString + insertString + rightCandidateString;    
173                         }
174                         
175                         i -= insertLength;
176                 } 
177         }
178 //      cout << candAln << endl << tempAln << endl << newTemplateAlign << endl;
179 }
180
181 /**************************************************************************************************/
182
183 void Nast::regapSequences(){    //This is essentially part B in Fig 2. of DeSantis et al.
184         
185         string candPair = candidateSeq->getPairwise();
186         string candAln = "";
187         
188         string tempPair = templateSeq->getPairwise();
189         string tempAln = templateSeq->getAligned();             //      we use the template aligned sequence as our guide
190         
191         int pairwiseLength = candPair.length();
192         int fullAlignLength = tempAln.length();
193         
194         if(candPair == ""){
195                 for(int i=0;i<fullAlignLength;i++)      {       candAln += '.';         }
196                 candidateSeq->setAligned(candAln);
197                 return;
198         }
199         
200         int fullAlignIndex = 0;
201         int pairwiseAlignIndex = 0;
202         string newTemplateAlign = "";                                   //      this is going to be messy so we want a temporary template
203                                                                                                         //      alignment string
204         while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex]  == '-'){
205                 candAln += '.';                                                         //      add the initial '-' and '.' to the candidate and template
206                 newTemplateAlign += tempAln[fullAlignIndex];//  pairwise sequences
207                 fullAlignIndex++;
208         }
209         
210         string lastLoop = "";
211         
212         while(pairwiseAlignIndex<pairwiseLength){
213                 if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
214                    && isalpha(candPair[pairwiseAlignIndex])){
215                         //  the template and candidate pairwise and template aligned have characters
216                         //      need to add character onto the candidatSeq.aligned sequence
217                         
218                         candAln += candPair[pairwiseAlignIndex];
219                         newTemplateAlign += tempPair[pairwiseAlignIndex];//
220                         
221                         pairwiseAlignIndex++;
222                         fullAlignIndex++;
223                 }
224                 else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
225                                 && isalpha(candPair[pairwiseAlignIndex])){
226                         //      the template pairwise and candidate pairwise are characters and the template aligned is a gap
227                         //      need to insert gaps into the candidateSeq.aligned sequence
228                         
229                         candAln += '-';
230                         newTemplateAlign += '-';//
231                         fullAlignIndex++;
232                 }
233                 else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
234                                 && isalpha(candPair[pairwiseAlignIndex])){
235                         //  the template pairwise is a gap and the template aligned and pairwise sequences have characters
236                         //      this is the alpha scenario.  add character to the candidateSeq.aligned sequence without progressing
237                         //      further through the tempAln sequence.
238                         
239                         candAln += candPair[pairwiseAlignIndex];
240                         newTemplateAlign += '-';//
241                         pairwiseAlignIndex++;
242                 }
243                 else if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
244                                 && !isalpha(candPair[pairwiseAlignIndex])){
245                         //  the template pairwise and full alignment are characters and the candidate sequence has a gap
246                         //      should not be a big deal, just add the gap position to the candidateSeq.aligned sequence;
247                         
248                         candAln += candPair[pairwiseAlignIndex];
249                         newTemplateAlign += tempAln[fullAlignIndex];//
250                         fullAlignIndex++;                       
251                         pairwiseAlignIndex++;
252                 }
253                 else if(!isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
254                                 && isalpha(candPair[pairwiseAlignIndex])){
255                         //      the template pairwise and aligned are gaps while the candidate pairwise has a character
256                         //      this would be an insertion, go ahead and add the character->seems to be the opposite of the alpha scenario
257                         
258                         candAln += candPair[pairwiseAlignIndex];
259                         newTemplateAlign += tempAln[fullAlignIndex];//
260                         pairwiseAlignIndex++;
261                         fullAlignIndex++;                       
262                 }
263                 else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
264                                 && !isalpha(candPair[pairwiseAlignIndex])){
265                         //      template pairwise has a character, but its full aligned sequence and candidate sequence have gaps
266                         //      this would happen like we need to add a gap.  basically the opposite of the alpha situation
267                         
268                         newTemplateAlign += tempAln[fullAlignIndex];//
269                         candAln += "-";
270                         fullAlignIndex++;                       
271                 }
272                 else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
273                                 && !isalpha(candPair[pairwiseAlignIndex])){
274                         //      template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible
275                         //      would skip the gaps and not progress through full alignment sequence
276                         //      not tested yet
277                         
278                         cout << "We're into D" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex <<  endl;
279                         pairwiseAlignIndex++;
280                 }
281                 else{
282                         //      everything has a gap - not possible
283                         //      not tested yet
284                         
285                         cout << "We're into F" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex <<  endl;
286                         pairwiseAlignIndex++;
287                         fullAlignIndex++;                       
288                 }               
289         }
290         
291         for(int i=fullAlignIndex;i<fullAlignLength;i++){
292                 candAln += '.';
293                 newTemplateAlign += tempAln[i];//
294         }
295         
296         int start, end;
297         for(int i=0;i<candAln.length();i++){
298                 if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }       //      if we padded the alignemnt from
299                 else{                   start = i;                      break;          }                                                       //      blast with Z's, change them to
300         }                                                                                                                                                               //      '.' characters
301         
302         for(int i=candAln.length()-1;i>=0;i--){                                                                                 //      ditto.
303                 if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }
304                 else{                   end = i;                        break;          }
305         }
306         
307         for(int i=start;i<=end;i++){                                    //      go through the candidate alignment sequence and make sure that
308                 candAln[i] = toupper(candAln[i]);                       //      everything is upper case
309         }
310
311 //      cout << candAln << endl;
312 //      cout << tempAln << endl;
313 //      cout << newTemplateAlign << endl;
314
315         if(candAln.length() != tempAln.length()){               //      if the regapped candidate sequence is longer than the official
316                 removeExtraGaps(candAln, tempAln, newTemplateAlign);//  template alignment then we need to do steps C-F in Fig.
317         }                                                                                               //      2 of Desantis et al.
318         
319 //      cout << candAln << endl;
320 //      cout << tempAln << endl;
321 //      cout << newTemplateAlign << endl;
322
323         candidateSeq->setAligned(candAln);
324 }
325
326 /**************************************************************************************************/
327
328 float Nast::getSimilarityScore(){
329
330         string cand = candidateSeq->getAligned();
331         string temp = templateSeq->getAligned();
332         int alignmentLength = temp.length();
333         int mismatch = 0;
334         int denominator = 0;
335         
336         for(int i=0;i<alignmentLength;i++){
337                 if(cand[i] == '-' && temp[i] == '-'){
338                         
339                 }
340                 else if(cand[i] != '.' && temp[i] != '.'){
341                         denominator++;
342                         
343                         if(cand[i] != temp[i]){
344                                 mismatch++;
345                         }
346                 }
347         }
348         float similarity = 100 * (1. - mismatch / (float)denominator);
349         if(denominator == 0){   similarity = 0.0000;    }
350
351         return similarity;
352 }
353
354 /**************************************************************************************************/
355
356 int Nast::getMaxInsertLength(){
357         
358         return maxInsertLength;
359         
360 }
361         
362 /**************************************************************************************************/