5 * Created by Pat Schloss on 12/17/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
8 * This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
10 * DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL. 2006. NAST: a multiple
11 * sequence alignment server for comparative analysis of 16S rRNA genes. Nucleic Acids Research. 34:W394-9.
13 * To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
14 * and candidate sequence that are to be aligned to each other.
18 #include "sequence.hpp"
19 #include "alignment.hpp"
22 /**************************************************************************************************/
24 Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) {
26 m = MothurOut::getInstance();
29 pairwiseAlignSeqs(); // This is part A in Fig. 2 of DeSantis et al.
30 regapSequences(); // This is parts B-F in Fig. 2 of DeSantis et al.
34 m->errorOut(e, "Nast", "Nast");
39 /**************************************************************************************************/
41 void Nast::pairwiseAlignSeqs(){ // Here we call one of the pairwise alignment methods to align our unaligned candidate
42 // and template sequences
44 alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
46 string candAln = alignment->getSeqAAln();
47 string tempAln = alignment->getSeqBAln();
51 candidateSeq->setPairwise("");
52 templateSeq->setPairwise(templateSeq->getUnaligned());
56 if(tempAln[0] == '-'){
57 int pairwiseAlignmentLength = tempAln.length(); // we need to make sure that the candidate sequence alignment
58 for(int i=0;i<pairwiseAlignmentLength;i++){ // starts where the template sequence alignment starts, if it
59 if(isalpha(tempAln[i])){ // starts early, we nuke the beginning of the candidate
60 candAln = candAln.substr(i); // sequence
61 tempAln = tempAln.substr(i);
66 int pairwiseAlignmentLength = tempAln.length();
67 if(tempAln[pairwiseAlignmentLength-1] == '-'){ // we need to make sure that the candidate sequence alignment
68 for(int i=pairwiseAlignmentLength-1; i>=0; i--){// ends where the template sequence alignment ends, if it runs
69 if(isalpha(tempAln[i])){ // long, we nuke the end of the candidate sequence
70 candAln = candAln.substr(0,i+1);
71 tempAln = tempAln.substr(0,i+1);
79 candidateSeq->setPairwise(candAln); // set the pairwise sequences in the Sequence objects for
80 templateSeq->setPairwise(tempAln); // the candidate and template sequences
83 m->errorOut(e, "Nast", "pairwiseAlignSeqs");
88 /**************************************************************************************************/
90 void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){
92 // here we do steps C-F of Fig. 2 from DeSantis et al.
95 //cout << candAln << endl;
96 //cout << tempAln << endl;
97 //cout << newTemplateAlign << endl;
100 int longAlignmentLength = newTemplateAlign.length();
102 for(int i=0; i<longAlignmentLength; i++){ // use the long alignment as the standard
103 int rightIndex, rightRoom, leftIndex, leftRoom;
105 // Part C of Fig. 2 from DeSantis et al.
106 if((isalpha(newTemplateAlign[i]) != isalpha(tempAln[i]))){ //if there is a discrepancy between the regapped
108 rightRoom = 0; leftRoom = 0;
110 // Part D of Fig. 2 from DeSantis et al. // template sequence and the official template sequence
111 for(leftIndex=i-1;leftIndex>0;leftIndex--){ // then we've got problems...
112 if(!isalpha(candAln[leftIndex])){
113 leftRoom = 1; //count how far it is to the nearest gap on the LEFT side of the anomaly
114 while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom])) { leftRoom++; }
119 for(rightIndex=i+1;rightIndex<longAlignmentLength-1;rightIndex++){
120 if(!isalpha(candAln[rightIndex])){
121 rightRoom = 1; //count how far it is to the nearest gap on the RIGHT side of the anomaly
122 while(rightIndex+rightRoom<longAlignmentLength && !isalpha(candAln[rightIndex+rightRoom])) { rightRoom++; }
127 int insertLength = 0; // figure out how long the anomaly is
128 while(!isalpha(newTemplateAlign[i + insertLength])) { insertLength++; }
129 if(insertLength > maxInsertLength){ maxInsertLength = insertLength; }
131 if((leftRoom + rightRoom) >= insertLength){
133 // Parts D & E from Fig. 2 of DeSantis et al.
134 if((i-leftIndex) <= (rightIndex-i)){ // the left gap is closer - > move stuff left there's
136 if(leftRoom >= insertLength){ // enough room to the left to move
137 //cout << "lr newTemplateAlign = " << newTemplateAlign.length() << '\t' << i << '\t' << insertLength << endl;
138 string leftTemplateString = newTemplateAlign.substr(0,i);
139 string rightTemplateString = newTemplateAlign.substr((i+insertLength));
140 newTemplateAlign = leftTemplateString + rightTemplateString;
141 longAlignmentLength = newTemplateAlign.length();
142 //cout << "lr candAln = " << candAln.length() << '\t' << leftIndex << '\t' << endl;
143 string leftCandidateString = candAln.substr(0,(leftIndex-insertLength+1));
144 string rightCandidateString = candAln.substr((leftIndex+1));
145 candAln = leftCandidateString + rightCandidateString;
148 else{ // not enough room to the left, have to steal some space to
150 //cout << "in else lr newTemplateAlign = " << newTemplateAlign.length() << '\t' << i << '\t' << insertLength << endl;
151 string leftTemplateString = newTemplateAlign.substr(0,i); // the right
152 string rightTemplateString = newTemplateAlign.substr((i+insertLength));
153 newTemplateAlign = leftTemplateString + rightTemplateString;
154 longAlignmentLength = newTemplateAlign.length();
155 //cout << " in else lr candAln = " << candAln.length() << '\t' << " leftIndex = " << leftIndex << " leftroom = " << leftRoom << " rightIndex = " << rightIndex << '\t' << endl;
156 string leftCandidateString = candAln.substr(0,(leftIndex-leftRoom+1));
157 string insertString = candAln.substr((leftIndex+1),(rightIndex-leftIndex-1));
158 string rightCandidateString = candAln.substr((rightIndex+(insertLength-leftRoom)));
159 candAln = leftCandidateString + insertString + rightCandidateString;
163 else{ // the right gap is closer - > move stuff right there's
164 if(rightRoom >= insertLength){ // enough room to the right to move
165 //cout << "rr newTemplateAlign = " << newTemplateAlign.length() << '\t' << i << '\t' << i+insertLength << endl;
166 string leftTemplateString = newTemplateAlign.substr(0,i);
167 string rightTemplateString = newTemplateAlign.substr((i+insertLength));
168 newTemplateAlign = leftTemplateString + rightTemplateString;
169 longAlignmentLength = newTemplateAlign.length();
170 //cout << "rr candAln = " << candAln.length() << '\t' << i << '\t' << rightIndex << '\t' << rightIndex+insertLength << endl;
171 string leftCandidateString = candAln.substr(0,rightIndex);
172 string rightCandidateString = candAln.substr((rightIndex+insertLength));
173 candAln = leftCandidateString + rightCandidateString;
176 else{ // not enough room to the right, have to steal some
177 // space to the left lets move left and then right...
178 //cout << "in else rr newTemplateAlign = " << newTemplateAlign.length() << '\t' << i << '\t' << i+insertLength << endl;
179 string leftTemplateString = newTemplateAlign.substr(0,i);
180 string rightTemplateString = newTemplateAlign.substr((i+insertLength));
181 newTemplateAlign = leftTemplateString + rightTemplateString;
182 longAlignmentLength = newTemplateAlign.length();
183 //cout << "in else rr candAln = " << candAln.length() << '\t' << '\t' << (leftIndex-(insertLength-rightRoom)+1) << '\t' << (leftIndex+1,rightIndex-leftIndex-1) << '\t' << (rightIndex+rightRoom) << endl;
184 string leftCandidateString = candAln.substr(0,(leftIndex-(insertLength-rightRoom)+1));
185 string insertString = candAln.substr((leftIndex+1),(rightIndex-leftIndex-1));
186 string rightCandidateString = candAln.substr((rightIndex+rightRoom));
187 candAln = leftCandidateString + insertString + rightCandidateString;
195 // there could be a case where there isn't enough room in either direction to move stuff
196 //cout << "in else else newTemplateAlign = " << newTemplateAlign.length() << '\t' << i << '\t' << (i+leftRoom+rightRoom) << endl;
197 string leftTemplateString = newTemplateAlign.substr(0,i);
198 string rightTemplateString = newTemplateAlign.substr((i+leftRoom+rightRoom));
199 newTemplateAlign = leftTemplateString + rightTemplateString;
200 longAlignmentLength = newTemplateAlign.length();
202 //cout << "in else else newTemplateAlign = " << candAln.length() << '\t' << (leftIndex-leftRoom+1) << '\t' << (leftIndex+1) << '\t' << (rightIndex-leftIndex-1) << '\t' << (rightIndex+rightRoom) << endl;
203 string leftCandidateString = candAln.substr(0,(leftIndex-leftRoom+1));
204 string insertString = candAln.substr((leftIndex+1),(rightIndex-leftIndex-1));
205 string rightCandidateString = candAln.substr((rightIndex+rightRoom));
206 candAln = leftCandidateString + insertString + rightCandidateString;
208 i -= (leftRoom + rightRoom);
211 // i -= insertLength;
213 //if i is negative, we want to remove the extra gaps to the right
214 if (i < 0) { cout << "i is negative" << endl; }
218 catch(exception& e) {
219 m->errorOut(e, "Nast", "removeExtraGaps");
224 /**************************************************************************************************/
226 void Nast::regapSequences(){ //This is essentially part B in Fig 2. of DeSantis et al.
228 //cout << candidateSeq->getName() << endl;
229 string candPair = candidateSeq->getPairwise();
232 string tempPair = templateSeq->getPairwise();
233 string tempAln = templateSeq->getAligned(); // we use the template aligned sequence as our guide
235 int pairwiseLength = candPair.length();
236 int fullAlignLength = tempAln.length();
239 for(int i=0;i<fullAlignLength;i++) { candAln += '.'; }
240 candidateSeq->setAligned(candAln);
244 int fullAlignIndex = 0;
245 int pairwiseAlignIndex = 0;
246 string newTemplateAlign = ""; // this is going to be messy so we want a temporary template
248 while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex] == '-'){
249 candAln += '.'; // add the initial '-' and '.' to the candidate and template
250 newTemplateAlign += tempAln[fullAlignIndex];// pairwise sequences
254 string lastLoop = "";
256 while(pairwiseAlignIndex<pairwiseLength){
257 //cout << pairwiseAlignIndex << '\t' << fullAlignIndex << '\t' << pairwiseLength << endl;
258 if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
259 && isalpha(candPair[pairwiseAlignIndex])){
260 // the template and candidate pairwise and template aligned have characters
261 // need to add character onto the candidatSeq.aligned sequence
263 candAln += candPair[pairwiseAlignIndex];
264 newTemplateAlign += tempPair[pairwiseAlignIndex];//
266 pairwiseAlignIndex++;
269 else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
270 && isalpha(candPair[pairwiseAlignIndex])){
271 // the template pairwise and candidate pairwise are characters and the template aligned is a gap
272 // need to insert gaps into the candidateSeq.aligned sequence
275 newTemplateAlign += '-';//
278 else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
279 && isalpha(candPair[pairwiseAlignIndex])){
280 // the template pairwise is a gap and the template aligned and pairwise sequences have characters
281 // this is the alpha scenario. add character to the candidateSeq.aligned sequence without progressing
282 // further through the tempAln sequence.
284 candAln += candPair[pairwiseAlignIndex];
285 newTemplateAlign += '-';//
286 pairwiseAlignIndex++;
288 else if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
289 && !isalpha(candPair[pairwiseAlignIndex])){
290 // the template pairwise and full alignment are characters and the candidate sequence has a gap
291 // should not be a big deal, just add the gap position to the candidateSeq.aligned sequence;
293 candAln += candPair[pairwiseAlignIndex];
294 newTemplateAlign += tempAln[fullAlignIndex];//
296 pairwiseAlignIndex++;
298 else if(!isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
299 && isalpha(candPair[pairwiseAlignIndex])){
300 // the template pairwise and aligned are gaps while the candidate pairwise has a character
301 // this would be an insertion, go ahead and add the character->seems to be the opposite of the alpha scenario
303 candAln += candPair[pairwiseAlignIndex];
304 newTemplateAlign += tempAln[fullAlignIndex];//
305 pairwiseAlignIndex++;
308 else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
309 && !isalpha(candPair[pairwiseAlignIndex])){
310 // template pairwise has a character, but its full aligned sequence and candidate sequence have gaps
311 // this would happen like we need to add a gap. basically the opposite of the alpha situation
313 newTemplateAlign += tempAln[fullAlignIndex];//
317 else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
318 && !isalpha(candPair[pairwiseAlignIndex])){
319 // template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible
320 // would skip the gaps and not progress through full alignment sequence
323 m->mothurOut("We're into D " + toString(fullAlignIndex) + " " + toString(pairwiseAlignIndex)); m->mothurOutEndLine();
324 pairwiseAlignIndex++;
327 // everything has a gap - not possible
330 m->mothurOut("We're into F " + toString(fullAlignIndex) + " " + toString(pairwiseAlignIndex)); m->mothurOutEndLine();
331 pairwiseAlignIndex++;
336 for(int i=fullAlignIndex;i<fullAlignLength;i++){
338 newTemplateAlign += tempAln[i];//
342 int end = candAln.length()-1;
344 for(int i=0;i<candAln.length();i++){
345 if(candAln[i] == 'Z' || !isalnum(candAln[i])) { candAln[i] = '.'; } // if we padded the alignemnt from
346 else{ start = i; break; } // blast with Z's, change them to
349 for(int i=candAln.length()-1;i>=0;i--){ // ditto.
350 if(candAln[i] == 'Z' || !isalnum(candAln[i])) { candAln[i] = '.'; }
351 else{ end = i; break; }
354 for(int i=start;i<=end;i++){ // go through the candidate alignment sequence and make sure that
355 candAln[i] = toupper(candAln[i]); // everything is upper case
359 if(candAln.length() != tempAln.length()){ // if the regapped candidate sequence is longer than the official
360 removeExtraGaps(candAln, tempAln, newTemplateAlign);// template alignment then we need to do steps C-F in Fig.
361 } // 2 of Desantis et al.
363 candidateSeq->setAligned(candAln);
364 //cout << "here" << endl;
366 catch(exception& e) {
367 m->errorOut(e, "Nast", "regapSequences");
372 /**************************************************************************************************/
374 float Nast::getSimilarityScore(){
377 string cand = candidateSeq->getAligned();
378 string temp = templateSeq->getAligned();
379 int alignmentLength = temp.length();
383 for(int i=0;i<alignmentLength;i++){
384 if(cand[i] == '-' && temp[i] == '-'){
387 else if(cand[i] != '.' && temp[i] != '.'){
390 if(cand[i] != temp[i]){
395 float similarity = 100 * (1. - mismatch / (float)denominator);
396 if(denominator == 0){ similarity = 0.0000; }
401 catch(exception& e) {
402 m->errorOut(e, "Nast", "getSimilarityScore");
407 /**************************************************************************************************/
409 int Nast::getMaxInsertLength(){
411 return maxInsertLength;
415 /**************************************************************************************************/