]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
final commit before testing 1.11
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13 Sequence::Sequence(){
14         m = MothurOut::getInstance();
15         initialize();
16 }
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
19         try {
20                 m = MothurOut::getInstance();
21                 initialize();   
22                 name = newName;
23                 
24                 //setUnaligned removes any gap characters for us
25                 setUnaligned(sequence);
26                 setAligned(sequence);
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "Sequence", "Sequence");
30                 exit(1);
31         }                       
32 }
33 /***********************************************************************/
34 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
35         try {
36                 m = MothurOut::getInstance();
37                 initialize();   
38                 name = newName;
39                 
40                 //setUnaligned removes any gap characters for us
41                 setUnaligned(sequence);
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "Sequence", "Sequence");
45                 exit(1);
46         }                       
47 }
48
49 //********************************************************************************************************************
50 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
51 Sequence::Sequence(istringstream& fastaString){
52         try {
53                 m = MothurOut::getInstance();
54         
55                 initialize();
56                 fastaString >> name;
57                 name = name.substr(1);
58                 string sequence;
59         
60                 //read comments
61                 while ((name[0] == '#') && fastaString) { 
62                         while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
63                         sequence = getCommentString(fastaString);
64                         
65                         if (fastaString) {  
66                                 fastaString >> name;  
67                                 name = name.substr(1);  
68                         }else { 
69                                 name = "";
70                                 break;
71                         }
72                 }
73                 
74                 while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){   break;  }       } // get rest of line if there's any crap there
75                 
76                 sequence = getSequenceString(fastaString);              
77                 setAligned(sequence);   
78                 //setUnaligned removes any gap characters for us                                                
79                 setUnaligned(sequence);         
80         }
81         catch(exception& e) {
82                 m->errorOut(e, "Sequence", "Sequence");
83                 exit(1);
84         }                                                               
85 }
86 //********************************************************************************************************************
87 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
88 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
89         try {
90                 m = MothurOut::getInstance();
91         
92                 initialize();
93                 fastaString >> name;
94                 name = name.substr(1);
95                 string sequence;
96         
97                 //read comments
98                 while ((name[0] == '#') && fastaString) { 
99                         while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
100                         sequence = getCommentString(fastaString);
101                         
102                         if (fastaString) {  
103                                 fastaString >> name;  
104                                 name = name.substr(1);  
105                         }else { 
106                                 name = "";
107                                 break;
108                         }
109                 }
110                 
111                 while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){   break;  }       } // get rest of line if there's any crap there
112                 
113                 sequence = getSequenceString(fastaString);              
114                 
115                 //setUnaligned removes any gap characters for us                                                
116                 setUnaligned(sequence);         
117         }
118         catch(exception& e) {
119                 m->errorOut(e, "Sequence", "Sequence");
120                 exit(1);
121         }                                                               
122 }
123
124
125 //********************************************************************************************************************
126 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
127 Sequence::Sequence(ifstream& fastaFile){
128         try {
129                 m = MothurOut::getInstance();
130                 initialize();
131                 fastaFile >> name;
132
133                 if (name.length() != 0) { 
134                 
135                         name = name.substr(1); 
136                         
137                         string sequence;
138                 
139                         //read comments
140                         while ((name[0] == '#') && fastaFile) { 
141                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
142                                 sequence = getCommentString(fastaFile);
143                                 
144                                 if (fastaFile) {  
145                                         fastaFile >> name;  
146                                         name = name.substr(1);  
147                                 }else { 
148                                         name = "";
149                                         break;
150                                 }
151                         }
152                         
153                         //read real sequence
154                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
155                         
156                         sequence = getSequenceString(fastaFile);                
157                 
158                         setAligned(sequence);   
159                         //setUnaligned removes any gap characters for us                                                
160                         setUnaligned(sequence); 
161                 }
162                 
163         }
164         catch(exception& e) {
165                 m->errorOut(e, "Sequence", "Sequence");
166                 exit(1);
167         }                                                       
168 }
169 //********************************************************************************************************************
170 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
171 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
172         try {
173                 m = MothurOut::getInstance();
174                 initialize();
175                 fastaFile >> name;
176                 name = name.substr(1);
177                 string sequence;
178                 
179                 //read comments
180                 while ((name[0] == '#') && fastaFile) { 
181                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
182                         sequence = getCommentString(fastaFile);
183                         
184                         if (fastaFile) {  
185                                 fastaFile >> name;  
186                                 name = name.substr(1);  
187                         }else { 
188                                 name = "";
189                                 break;
190                         }
191                 }
192                 
193                 //read real sequence
194                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
195                 
196                 sequence = getSequenceString(fastaFile);                
197                 
198                 //setUnaligned removes any gap characters for us                                                
199                 setUnaligned(sequence); 
200         }
201         catch(exception& e) {
202                 m->errorOut(e, "Sequence", "Sequence");
203                 exit(1);
204         }                                                       
205 }
206
207 //********************************************************************************************************************
208 string Sequence::getSequenceString(ifstream& fastaFile) {
209         try {
210                 char letter;
211                 string sequence = "";   
212                 
213                 while(fastaFile){
214                         letter= fastaFile.get();
215                         if(letter == '>'){
216                                 fastaFile.putback(letter);
217                                 break;
218                         }
219                         else if(isprint(letter)){
220                                 letter = toupper(letter);
221                                 if(letter == 'U'){letter = 'T';}
222                                 sequence += letter;
223                         }
224                 }
225                 
226                 return sequence;
227         }
228         catch(exception& e) {
229                 m->errorOut(e, "Sequence", "getSequenceString");
230                 exit(1);
231         }
232 }
233 //********************************************************************************************************************
234 //comment can contain '>' so we need to account for that
235 string Sequence::getCommentString(ifstream& fastaFile) {
236         try {
237                 char letter;
238                 string sequence = "";
239                 
240                 while(fastaFile){
241                         letter=fastaFile.get();
242                         if((letter == '\r') || (letter == '\n')){  
243                                 gobble(fastaFile);  //in case its a \r\n situation
244                                 break;
245                         }
246                 }
247                 
248                 return sequence;
249         }
250         catch(exception& e) {
251                 m->errorOut(e, "Sequence", "getCommentString");
252                 exit(1);
253         }
254 }
255 //********************************************************************************************************************
256 string Sequence::getSequenceString(istringstream& fastaFile) {
257         try {
258                 char letter;
259                 string sequence = "";   
260                 
261                 while(!fastaFile.eof()){
262                         letter= fastaFile.get();
263         
264                         if(letter == '>'){
265                                 fastaFile.putback(letter);
266                                 break;
267                         }
268                         else if(isprint(letter)){
269                                 letter = toupper(letter);
270                                 if(letter == 'U'){letter = 'T';}
271                                 sequence += letter;
272                         }
273                 }
274                 
275                 return sequence;
276         }
277         catch(exception& e) {
278                 m->errorOut(e, "Sequence", "getSequenceString");
279                 exit(1);
280         }
281 }
282 //********************************************************************************************************************
283 //comment can contain '>' so we need to account for that
284 string Sequence::getCommentString(istringstream& fastaFile) {
285         try {
286                 char letter;
287                 string sequence = "";
288                 
289                 while(fastaFile){
290                         letter=fastaFile.get();
291                         if((letter == '\r') || (letter == '\n')){  
292                                 gobble(fastaFile);  //in case its a \r\n situation
293                                 break;
294                         }
295                 }
296                 
297                 return sequence;
298         }
299         catch(exception& e) {
300                 m->errorOut(e, "Sequence", "getCommentString");
301                 exit(1);
302         }
303 }
304 //********************************************************************************************************************
305
306 void Sequence::initialize(){
307         
308         name = "";
309         unaligned = "";
310         aligned = "";
311         pairwise = "";
312         
313         numBases = 0;
314         alignmentLength = 0;
315         isAligned = 0;
316         startPos = -1;
317         endPos = -1;
318         longHomoPolymer = -1;
319         ambigBases = -1;
320         
321 }       
322
323 //********************************************************************************************************************
324
325 void Sequence::setName(string seqName) {
326         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
327         else                                    {       name = seqName;                         }
328 }
329
330 //********************************************************************************************************************
331
332 void Sequence::setUnaligned(string sequence){
333         
334         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
335                 string temp = "";
336                 for(int j=0;j<sequence.length();j++) {
337                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
338                 }
339                 unaligned = temp;
340         }
341         else {
342                 unaligned = sequence;
343         }
344         numBases = unaligned.length();
345         
346 }
347
348 //********************************************************************************************************************
349
350 void Sequence::setAligned(string sequence){
351         
352         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
353         aligned = sequence;
354         alignmentLength = aligned.length();
355         setUnaligned(sequence); 
356
357         if(aligned[0] == '-'){
358                 for(int i=0;i<alignmentLength;i++){
359                         if(aligned[i] == '-'){
360                                 aligned[i] = '.';
361                         }
362                         else{
363                                 break;
364                         }
365                 }
366                 for(int i=alignmentLength-1;i>=0;i--){
367                         if(aligned[i] == '-'){
368                                 aligned[i] = '.';
369                         }
370                         else{
371                                 break;
372                         }
373                 }
374         }
375         isAligned = 1;  
376 }
377
378 //********************************************************************************************************************
379
380 void Sequence::setPairwise(string sequence){
381         pairwise = sequence;
382 }
383
384 //********************************************************************************************************************
385
386 string Sequence::convert2ints() {
387         
388         if(unaligned == "")     {       /* need to throw an error */    }
389         
390         string processed;
391         
392         for(int i=0;i<unaligned.length();i++) {
393                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
394                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
395                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
396                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
397                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
398                 else                                                                    {       processed += '4';       }
399         }
400         return processed;
401 }
402
403 //********************************************************************************************************************
404
405 string Sequence::getName(){
406         return name;
407 }
408
409 //********************************************************************************************************************
410
411 string Sequence::getAligned(){
412         if(isAligned == 0)      { return unaligned; }
413         else                            {  return aligned;  }
414 }
415
416 //********************************************************************************************************************
417
418 string Sequence::getPairwise(){
419         return pairwise;
420 }
421
422 //********************************************************************************************************************
423
424 string Sequence::getUnaligned(){
425         return unaligned;
426 }
427
428 //********************************************************************************************************************
429
430 int Sequence::getNumBases(){
431         return numBases;
432 }
433
434 //********************************************************************************************************************
435
436 void Sequence::printSequence(ostream& out){
437
438         out << ">" << name << endl;
439         if(isAligned){
440                 out << aligned << endl;
441         }
442         else{
443                 out << unaligned << endl;
444         }
445 }
446
447 //********************************************************************************************************************
448
449 int Sequence::getAlignLength(){
450         return alignmentLength;
451 }
452
453 //********************************************************************************************************************
454
455 int Sequence::getAmbigBases(){
456         if(ambigBases == -1){
457                 ambigBases = 0;
458                 for(int j=0;j<numBases;j++){
459                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
460                                 ambigBases++;
461                         }
462                 }
463         }       
464         
465         return ambigBases;
466 }
467
468 //********************************************************************************************************************
469
470 int Sequence::getLongHomoPolymer(){
471         if(longHomoPolymer == -1){
472                 longHomoPolymer = 1;
473                 int homoPolymer = 1;
474                 for(int j=1;j<numBases;j++){
475                         if(unaligned[j] == unaligned[j-1]){
476                                 homoPolymer++;
477                         }
478                         else{
479                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
480                                 homoPolymer = 1;
481                         }
482                 }
483                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
484         }
485         return longHomoPolymer;
486 }
487
488 //********************************************************************************************************************
489
490 int Sequence::getStartPos(){
491         if(endPos == -1){
492                 for(int j = 0; j < alignmentLength; j++) {
493                         if(aligned[j] != '.'){
494                                 startPos = j + 1;
495                                 break;
496                         }
497                 }
498         }
499         if(isAligned == 0){     startPos = 1;   }
500
501         return startPos;
502 }
503
504 //********************************************************************************************************************
505
506 int Sequence::getEndPos(){
507         if(endPos == -1){
508                 for(int j=alignmentLength-1;j>=0;j--){
509                         if(aligned[j] != '.'){
510                                 endPos = j + 1;
511                                 break;
512                         }
513                 }
514         }
515         if(isAligned == 0){     endPos = numBases;      }
516         
517         return endPos;
518 }
519
520 //********************************************************************************************************************
521
522 bool Sequence::getIsAligned(){
523         return isAligned;
524 }
525
526 //********************************************************************************************************************
527
528 void Sequence::reverseComplement(){
529
530         string temp;
531         for(int i=numBases-1;i>=0;i--){
532                 if(unaligned[i] == 'A')         {       temp += 'T';    }
533                 else if(unaligned[i] == 'T'){   temp += 'A';    }
534                 else if(unaligned[i] == 'G'){   temp += 'C';    }
535                 else if(unaligned[i] == 'C'){   temp += 'G';    }
536                 else                                            {       temp += 'N';    }
537         }
538         unaligned = temp;
539         aligned = temp;
540         
541 }
542 /**************************************************************************************************/