5 * Created by Pat Schloss on 12/17/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
8 * This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
10 * DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL. 2006. NAST: a multiple
11 * sequence alignment server for comparative analysis of 16S rRNA genes. Nucleic Acids Research. 34:W394-9.
13 * To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
14 * and candidate sequence that are to be aligned to each other.
20 #include "sequence.hpp"
21 #include "alignment.hpp"
24 /**************************************************************************************************/
26 Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) {
28 pairwiseAlignSeqs(); // This is part A in Fig. 2 of DeSantis et al.
29 regapSequences(); // This is parts B-F in Fig. 2 of DeSantis et al.
32 /**************************************************************************************************/
34 void Nast::pairwiseAlignSeqs(){ // Here we call one of the pairwise alignment methods to align our unaligned candidate
35 // and template sequences
36 alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
38 string candAln = alignment->getSeqAAln();
39 string tempAln = alignment->getSeqBAln();
42 candidateSeq->setPairwise("");
43 templateSeq->setPairwise(templateSeq->getUnaligned());
46 if(tempAln[0] == '-'){
47 int pairwiseAlignmentLength = tempAln.length(); // we need to make sure that the candidate sequence alignment
48 for(int i=0;i<pairwiseAlignmentLength;i++){ // starts where the template sequence alignment starts, if it
49 if(isalpha(tempAln[i])){ // starts early, we nuke the beginning of the candidate
50 candAln = candAln.substr(i); // sequence
51 tempAln = tempAln.substr(i);
57 int pairwiseAlignmentLength = tempAln.length();
58 if(tempAln[pairwiseAlignmentLength-1] == '-'){ // we need to make sure that the candidate sequence alignment
59 for(int i=pairwiseAlignmentLength-1; i>=0; i--){// ends where the template sequence alignment ends, if it runs
60 if(isalpha(tempAln[i])){ // long, we nuke the end of the candidate sequence
61 candAln = candAln.substr(0,i+1);
62 tempAln = tempAln.substr(0,i+1);
68 candidateSeq->setPairwise(candAln); // set the pairwise sequences in the Sequence objects for
69 templateSeq->setPairwise(tempAln); // the candidate and template sequences
73 /**************************************************************************************************/
75 void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){
77 // here we do steps C-F of Fig. 2 from DeSantis et al.
79 int longAlignmentLength = newTemplateAlign.length();
81 for(int i=0; i<longAlignmentLength; i++){ // use the long alignment as the standard
82 int rightIndex, rightRoom, leftIndex, leftRoom;
84 // Part C of Fig. 2 from DeSantis et al.
85 if((isalpha(newTemplateAlign[i]) != isalpha(tempAln[i]))){ //if there is a discrepancy between the regapped
87 // cout << i << '\t';cout.flush();
89 // Part D of Fig. 2 from DeSantis et al. // template sequence and the official template sequence
90 for(leftIndex=i-1;leftIndex>=0;leftIndex--){ // then we've got problems...
91 if(!isalpha(candAln[leftIndex])){
92 leftRoom = 1; //count how far it is to the nearest gap on the LEFT side of the anomaly
93 while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom])) { leftRoom++; }
97 // cout << leftIndex << '\t' << leftRoom << endl;
99 for(rightIndex=i+1;rightIndex<longAlignmentLength;rightIndex++){
100 if(!isalpha(candAln[rightIndex])){
101 rightRoom = 1; //count how far it is to the nearest gap on the RIGHT side of the anomaly
102 while(rightIndex+rightRoom<longAlignmentLength && !isalpha(candAln[rightIndex+rightRoom])) { rightRoom++; }
107 int insertLength = 0; // figure out how long the anomaly is
108 while(!isalpha(newTemplateAlign[i + insertLength])) { insertLength++; }
109 if(insertLength > maxInsertLength){ maxInsertLength = insertLength; }
111 if((leftRoom + rightRoom) >= insertLength){
112 // Parts D & E from Fig. 2 of DeSantis et al.
113 if((i-leftIndex) <= (rightIndex-i)){ // the left gap is closer - > move stuff left there's
114 if(leftRoom >= insertLength){ // enough room to the left to move
115 string leftTemplateString = newTemplateAlign.substr(0,i);
116 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
117 newTemplateAlign = leftTemplateString + rightTemplateString;
118 longAlignmentLength = newTemplateAlign.length();
120 string leftCandidateString = candAln.substr(0,leftIndex-insertLength+1);
121 string rightCandidateString = candAln.substr(leftIndex+1);
122 candAln = leftCandidateString + rightCandidateString;
124 else{ // not enough room to the left, have to steal some space to
125 string leftTemplateString = newTemplateAlign.substr(0,i); // the right
126 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
127 newTemplateAlign = leftTemplateString + rightTemplateString;
128 longAlignmentLength = newTemplateAlign.length();
130 string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
131 string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
132 string rightCandidateString = candAln.substr(rightIndex+(insertLength-leftRoom));
133 candAln = leftCandidateString + insertString + rightCandidateString;
136 else{ // the right gap is closer - > move stuff right there's
137 if(rightRoom >= insertLength){ // enough room to the right to move
138 string leftTemplateString = newTemplateAlign.substr(0,i);
139 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
140 newTemplateAlign = leftTemplateString + rightTemplateString;
141 longAlignmentLength = newTemplateAlign.length();
143 string leftCandidateString = candAln.substr(0,rightIndex);
144 string rightCandidateString = candAln.substr(rightIndex+insertLength);
145 candAln = leftCandidateString + rightCandidateString;
148 else{ // not enough room to the right, have to steal some
149 // space to the left lets move left and then right...
150 string leftTemplateString = newTemplateAlign.substr(0,i);
151 string rightTemplateString = newTemplateAlign.substr(i+insertLength);
152 newTemplateAlign = leftTemplateString + rightTemplateString;
153 longAlignmentLength = newTemplateAlign.length();
155 string leftCandidateString = candAln.substr(0,leftIndex-(insertLength-rightRoom)+1);
156 string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
157 string rightCandidateString = candAln.substr(rightIndex+rightRoom);
158 candAln = leftCandidateString + insertString + rightCandidateString;
163 else{ // there could be a case where there isn't enough room in
164 string leftTemplateString = newTemplateAlign.substr(0,i); // either direction to move stuff
165 string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom);
166 newTemplateAlign = leftTemplateString + rightTemplateString;
167 longAlignmentLength = newTemplateAlign.length();
169 string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
170 string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
171 string rightCandidateString = candAln.substr(rightIndex+rightRoom);
172 candAln = leftCandidateString + insertString + rightCandidateString;
178 // cout << candAln << endl << tempAln << endl << newTemplateAlign << endl;
181 /**************************************************************************************************/
183 void Nast::regapSequences(){ //This is essentially part B in Fig 2. of DeSantis et al.
185 string candPair = candidateSeq->getPairwise();
188 string tempPair = templateSeq->getPairwise();
189 string tempAln = templateSeq->getAligned(); // we use the template aligned sequence as our guide
191 int pairwiseLength = candPair.length();
192 int fullAlignLength = tempAln.length();
195 for(int i=0;i<fullAlignLength;i++) { candAln += '.'; }
196 candidateSeq->setAligned(candAln);
200 int fullAlignIndex = 0;
201 int pairwiseAlignIndex = 0;
202 string newTemplateAlign = ""; // this is going to be messy so we want a temporary template
204 while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex] == '-'){
205 candAln += '.'; // add the initial '-' and '.' to the candidate and template
206 newTemplateAlign += tempAln[fullAlignIndex];// pairwise sequences
210 string lastLoop = "";
212 while(pairwiseAlignIndex<pairwiseLength){
213 if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
214 && isalpha(candPair[pairwiseAlignIndex])){
215 // the template and candidate pairwise and template aligned have characters
216 // need to add character onto the candidatSeq.aligned sequence
218 candAln += candPair[pairwiseAlignIndex];
219 newTemplateAlign += tempPair[pairwiseAlignIndex];//
221 pairwiseAlignIndex++;
224 else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
225 && isalpha(candPair[pairwiseAlignIndex])){
226 // the template pairwise and candidate pairwise are characters and the template aligned is a gap
227 // need to insert gaps into the candidateSeq.aligned sequence
230 newTemplateAlign += '-';//
233 else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
234 && isalpha(candPair[pairwiseAlignIndex])){
235 // the template pairwise is a gap and the template aligned and pairwise sequences have characters
236 // this is the alpha scenario. add character to the candidateSeq.aligned sequence without progressing
237 // further through the tempAln sequence.
239 candAln += candPair[pairwiseAlignIndex];
240 newTemplateAlign += '-';//
241 pairwiseAlignIndex++;
243 else if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
244 && !isalpha(candPair[pairwiseAlignIndex])){
245 // the template pairwise and full alignment are characters and the candidate sequence has a gap
246 // should not be a big deal, just add the gap position to the candidateSeq.aligned sequence;
248 candAln += candPair[pairwiseAlignIndex];
249 newTemplateAlign += tempAln[fullAlignIndex];//
251 pairwiseAlignIndex++;
253 else if(!isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
254 && isalpha(candPair[pairwiseAlignIndex])){
255 // the template pairwise and aligned are gaps while the candidate pairwise has a character
256 // this would be an insertion, go ahead and add the character->seems to be the opposite of the alpha scenario
258 candAln += candPair[pairwiseAlignIndex];
259 newTemplateAlign += tempAln[fullAlignIndex];//
260 pairwiseAlignIndex++;
263 else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
264 && !isalpha(candPair[pairwiseAlignIndex])){
265 // template pairwise has a character, but its full aligned sequence and candidate sequence have gaps
266 // this would happen like we need to add a gap. basically the opposite of the alpha situation
268 newTemplateAlign += tempAln[fullAlignIndex];//
272 else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
273 && !isalpha(candPair[pairwiseAlignIndex])){
274 // template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible
275 // would skip the gaps and not progress through full alignment sequence
278 cout << "We're into D" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex << endl;
279 pairwiseAlignIndex++;
282 // everything has a gap - not possible
285 cout << "We're into F" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex << endl;
286 pairwiseAlignIndex++;
291 for(int i=fullAlignIndex;i<fullAlignLength;i++){
293 newTemplateAlign += tempAln[i];//
297 for(int i=0;i<candAln.length();i++){
298 if(candAln[i] == 'Z' || !isalnum(candAln[i])) { candAln[i] = '.'; } // if we padded the alignemnt from
299 else{ start = i; break; } // blast with Z's, change them to
302 for(int i=candAln.length()-1;i>=0;i--){ // ditto.
303 if(candAln[i] == 'Z' || !isalnum(candAln[i])) { candAln[i] = '.'; }
304 else{ end = i; break; }
307 for(int i=start;i<=end;i++){ // go through the candidate alignment sequence and make sure that
308 candAln[i] = toupper(candAln[i]); // everything is upper case
311 // cout << candAln << endl;
312 // cout << tempAln << endl;
313 // cout << newTemplateAlign << endl;
315 if(candAln.length() != tempAln.length()){ // if the regapped candidate sequence is longer than the official
316 removeExtraGaps(candAln, tempAln, newTemplateAlign);// template alignment then we need to do steps C-F in Fig.
317 } // 2 of Desantis et al.
319 // cout << candAln << endl;
320 // cout << tempAln << endl;
321 // cout << newTemplateAlign << endl;
323 candidateSeq->setAligned(candAln);
326 /**************************************************************************************************/
328 float Nast::getSimilarityScore(){
330 string cand = candidateSeq->getAligned();
331 string temp = templateSeq->getAligned();
332 int alignmentLength = temp.length();
336 for(int i=0;i<alignmentLength;i++){
337 if(cand[i] == '-' && temp[i] == '-'){
340 else if(cand[i] != '.' && temp[i] != '.'){
343 if(cand[i] != temp[i]){
348 float similarity = 100 * (1. - mismatch / (float)denominator);
349 if(denominator == 0){ similarity = 0.0000; }
354 /**************************************************************************************************/
356 int Nast::getMaxInsertLength(){
358 return maxInsertLength;
362 /**************************************************************************************************/