]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
fixes while testing 1.33.0
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13 Sequence::Sequence(){
14         m = MothurOut::getInstance();
15         initialize();
16 }
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
19         try {
20                 m = MothurOut::getInstance();
21                 initialize();   
22                 name = newName;
23         
24         m->checkName(name);
25                 
26                 //setUnaligned removes any gap characters for us
27                 setUnaligned(sequence);
28                 setAligned(sequence);
29         }
30         catch(exception& e) {
31                 m->errorOut(e, "Sequence", "Sequence");
32                 exit(1);
33         }                       
34 }
35 /***********************************************************************/
36 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
37         try {
38                 m = MothurOut::getInstance();
39                 initialize();   
40                 name = newName;
41         
42         m->checkName(name);
43                 
44                 //setUnaligned removes any gap characters for us
45                 setUnaligned(sequence);
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "Sequence", "Sequence");
49                 exit(1);
50         }                       
51 }
52
53 //********************************************************************************************************************
54 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
55 Sequence::Sequence(istringstream& fastaString){
56         try {
57                 m = MothurOut::getInstance();
58         
59                 initialize();
60         name = getSequenceName(fastaString);
61                 
62                 if (!m->control_pressed) { 
63                         string sequence;
64                 
65                         //read comments
66                         while ((name[0] == '#') && fastaString) { 
67                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
68                                 sequence = getCommentString(fastaString);
69                                 
70                                 if (fastaString) {  
71                                         fastaString >> name;  
72                                         name = name.substr(1);  
73                                 }else { 
74                                         name = "";
75                                         break;
76                                 }
77                         }
78                         
79                         //while (!fastaString.eof())    {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
80             comment = getCommentString(fastaString);
81                         
82                         int numAmbig = 0;
83                         sequence = getSequenceString(fastaString, numAmbig);
84                         
85                         setAligned(sequence);   
86                         //setUnaligned removes any gap characters for us                                                
87                         setUnaligned(sequence); 
88                         
89                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
90                 }
91                 
92         }
93         catch(exception& e) {
94                 m->errorOut(e, "Sequence", "Sequence");
95                 exit(1);
96         }                                                               
97 }
98 //********************************************************************************************************************
99 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
100 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
101         try {
102                 m = MothurOut::getInstance();
103         
104                 initialize();
105                 name = getSequenceName(fastaString);
106                 
107                 if (!m->control_pressed) { 
108                         string sequence;
109                 
110                         //read comments
111                         while ((name[0] == '#') && fastaString) { 
112                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
113                                 sequence = getCommentString(fastaString);
114                                 
115                                 if (fastaString) {  
116                                         fastaString >> name;  
117                                         name = name.substr(1);  
118                                 }else { 
119                                         name = "";
120                                         break;
121                                 }
122                         }
123                         
124                         //while (!fastaString.eof())    {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
125             comment = getCommentString(fastaString);
126                         
127                         int numAmbig = 0;
128                         sequence = getSequenceString(fastaString, numAmbig);
129                         
130                         //setUnaligned removes any gap characters for us                                                
131                         setUnaligned(sequence); 
132                         
133                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
134                         
135                 }
136                 
137         }
138         catch(exception& e) {
139                 m->errorOut(e, "Sequence", "Sequence");
140                 exit(1);
141         }                                                               
142 }
143
144
145 //********************************************************************************************************************
146 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
147 Sequence::Sequence(ifstream& fastaFile){
148         try {
149                 m = MothurOut::getInstance();
150                 initialize();
151                 name = getSequenceName(fastaFile);
152                 
153                 if (!m->control_pressed) { 
154                         
155                         string sequence;
156                 
157                         //read comments
158                         while ((name[0] == '#') && fastaFile) { 
159                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
160                                 sequence = getCommentString(fastaFile);
161                                 
162                                 if (fastaFile) {  
163                                         fastaFile >> name;  
164                                         name = name.substr(1);  
165                                 }else { 
166                                         name = "";
167                                         break;
168                                 }
169                         }
170                         
171                         //while (!fastaFile.eof())      {       char c = fastaFile.get(); if (c == 10 || c == 13){  break;      }       } // get rest of line if there's any crap there
172             comment = getCommentString(fastaFile);
173                         
174                         int numAmbig = 0;
175                         sequence = getSequenceString(fastaFile, numAmbig);
176                         
177                         setAligned(sequence);   
178                         //setUnaligned removes any gap characters for us                                                
179                         setUnaligned(sequence); 
180                         
181                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
182                         
183                 }
184
185         }
186         catch(exception& e) {
187                 m->errorOut(e, "Sequence", "Sequence");
188                 exit(1);
189         }                                                       
190 }
191 //********************************************************************************************************************
192 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
193 Sequence::Sequence(ifstream& fastaFile, string& extraInfo, bool getInfo){
194         try {
195                 m = MothurOut::getInstance();
196                 initialize();
197         extraInfo = "";
198                 
199                 name = getSequenceName(fastaFile);
200                 
201                 if (!m->control_pressed) {                      
202                         string sequence;
203             
204                         //read comments
205                         while ((name[0] == '#') && fastaFile) { 
206                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
207                                 sequence = getCommentString(fastaFile);
208                                 
209                                 if (fastaFile) {  
210                                         fastaFile >> name;  
211                                         name = name.substr(1);  
212                                 }else { 
213                                         name = "";
214                                         break;
215                                 }
216                         }
217                         
218                         //read info after sequence name
219                         while (!fastaFile.eof())        {       
220                 char c = fastaFile.get(); 
221                 if (c == 10 || c == 13 || c == -1){   break;    }
222                 extraInfo += c;
223             }
224             
225             comment = extraInfo;
226                         
227                         int numAmbig = 0;
228                         sequence = getSequenceString(fastaFile, numAmbig);
229                         
230                         setAligned(sequence);   
231                         //setUnaligned removes any gap characters for us                                                
232                         setUnaligned(sequence); 
233                         
234                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
235                 }
236         
237         }
238         catch(exception& e) {
239                 m->errorOut(e, "Sequence", "Sequence");
240                 exit(1);
241         }                                                       
242 }
243 //********************************************************************************************************************
244 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
245 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
246         try {
247                 m = MothurOut::getInstance();
248                 initialize();
249                 name = getSequenceName(fastaFile);
250                 
251                 if (!m->control_pressed) { 
252                         string sequence;
253                         
254                         //read comments
255                         while ((name[0] == '#') && fastaFile) { 
256                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
257                                 sequence = getCommentString(fastaFile);
258                                 
259                                 if (fastaFile) {  
260                                         fastaFile >> name;  
261                                         name = name.substr(1);  
262                                 }else { 
263                                         name = "";
264                                         break;
265                                 }
266                         }
267                         
268                         //while (!fastaFile.eof())      {       char c = fastaFile.get(); if (c == 10 || c == 13){       break; }       } // get rest of line if there's any crap there
269             comment = getCommentString(fastaFile);
270                         
271                         int numAmbig = 0;
272                         sequence = getSequenceString(fastaFile, numAmbig);
273                         
274                         //setUnaligned removes any gap characters for us                                                
275                         setUnaligned(sequence); 
276                         
277                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
278                         
279                 }
280                 
281         }
282         catch(exception& e) {
283                 m->errorOut(e, "Sequence", "Sequence");
284                 exit(1);
285         }                                                       
286 }
287 //********************************************************************************************************************
288 string Sequence::getSequenceName(ifstream& fastaFile) {
289         try {
290                 string name = "";
291                 
292         fastaFile >> name;
293                 
294                 if (name.length() != 0) { 
295             
296                         name = name.substr(1); 
297             
298             m->checkName(name);
299             
300         }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true;  }
301         
302                 return name;
303         }
304         catch(exception& e) {
305                 m->errorOut(e, "Sequence", "getSequenceName");
306                 exit(1);
307         }
308 }
309 //********************************************************************************************************************
310 string Sequence::getSequenceName(istringstream& fastaFile) {
311         try {
312                 string name = "";
313                 
314         fastaFile >> name;
315                 
316                 if (name.length() != 0) { 
317             
318                         name = name.substr(1); 
319             
320             m->checkName(name);
321             
322         }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true;  }
323         
324                 return name;
325         }
326         catch(exception& e) {
327                 m->errorOut(e, "Sequence", "getSequenceName");
328                 exit(1);
329         }
330 }
331 //********************************************************************************************************************
332 string Sequence::getSequenceString(ifstream& fastaFile, int& numAmbig) {
333         try {
334                 char letter;
335                 string sequence = "";   
336                 numAmbig = 0;
337                 
338                 while(fastaFile){
339                         letter= fastaFile.get();
340                         if(letter == '>'){
341                                 fastaFile.putback(letter);
342                                 break;
343                         }else if (letter == ' ') {;}
344                         else if(isprint(letter)){
345                                 letter = toupper(letter);
346                                 if(letter == 'U'){letter = 'T';}
347                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C' && letter != 'N'){
348                                         letter = 'N';
349                                         numAmbig++;
350                                 }
351                                 sequence += letter;
352                         }
353                 }
354                 
355                 return sequence;
356         }
357         catch(exception& e) {
358                 m->errorOut(e, "Sequence", "getSequenceString");
359                 exit(1);
360         }
361 }
362 //********************************************************************************************************************
363 //comment can contain '>' so we need to account for that
364 string Sequence::getCommentString(ifstream& fastaFile) {
365         try {
366                 char letter;
367                 string temp = "";
368                 
369                 while(fastaFile){
370                         letter=fastaFile.get();
371                         if((letter == '\r') || (letter == '\n') || letter == -1){
372                                 m->gobble(fastaFile);  //in case its a \r\n situation
373                                 break;
374                         }else {
375                 temp += letter;
376             }
377                 }
378                 
379                 return temp;
380         }
381         catch(exception& e) {
382                 m->errorOut(e, "Sequence", "getCommentString");
383                 exit(1);
384         }
385 }
386 //********************************************************************************************************************
387 string Sequence::getSequenceString(istringstream& fastaFile, int& numAmbig) {
388         try {
389                 char letter;
390                 string sequence = "";
391                 numAmbig = 0;
392                 
393                 while(!fastaFile.eof()){
394                         letter= fastaFile.get();
395         
396                         if(letter == '>'){
397                                 fastaFile.putback(letter);
398                                 break;
399                         }else if (letter == ' ') {;}
400                         else if(isprint(letter)){
401                                 letter = toupper(letter);
402                                 if(letter == 'U'){letter = 'T';}
403                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C' && letter != 'N'){
404                                         letter = 'N';
405                                         numAmbig++;
406                                 }
407                                 sequence += letter;
408                         }
409                 }
410                 
411                 return sequence;
412         }
413         catch(exception& e) {
414                 m->errorOut(e, "Sequence", "getSequenceString");
415                 exit(1);
416         }
417 }
418 //********************************************************************************************************************
419 //comment can contain '>' so we need to account for that
420 string Sequence::getCommentString(istringstream& fastaFile) {
421         try {
422                 char letter;
423                 string temp = "";
424                 
425                 while(fastaFile){
426                         letter=fastaFile.get();
427                         if((letter == '\r') || (letter == '\n') || letter == -1){  
428                                 m->gobble(fastaFile);  //in case its a \r\n situation
429                                 break;
430                         }else {
431                 temp += letter;
432             }
433                 }
434                 
435                 return temp;
436         }
437         catch(exception& e) {
438                 m->errorOut(e, "Sequence", "getCommentString");
439                 exit(1);
440         }
441 }
442 //********************************************************************************************************************
443
444 void Sequence::initialize(){
445         
446         name = "";
447         unaligned = "";
448         aligned = "";
449         pairwise = "";
450     comment = "";
451         
452         numBases = 0;
453         alignmentLength = 0;
454         isAligned = 0;
455         startPos = -1;
456         endPos = -1;
457         longHomoPolymer = -1;
458         ambigBases = -1;
459         
460 }       
461
462 //********************************************************************************************************************
463
464 void Sequence::setName(string seqName) {
465         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
466         else                                    {       name = seqName;                         }
467 }
468
469 //********************************************************************************************************************
470
471 void Sequence::setUnaligned(string sequence){
472         
473         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
474                 string temp = "";
475                 for(int j=0;j<sequence.length();j++) {
476                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
477                 }
478                 unaligned = temp;
479         }
480         else {
481                 unaligned = sequence;
482         }
483         numBases = unaligned.length();
484         
485 }
486
487 //********************************************************************************************************************
488
489 void Sequence::setAligned(string sequence){
490         
491         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
492         aligned = sequence;
493         alignmentLength = aligned.length();
494         setUnaligned(sequence); 
495
496         if(aligned[0] == '-'){
497                 for(int i=0;i<alignmentLength;i++){
498                         if(aligned[i] == '-'){
499                                 aligned[i] = '.';
500                         }
501                         else{
502                                 break;
503                         }
504                 }
505                 for(int i=alignmentLength-1;i>=0;i--){
506                         if(aligned[i] == '-'){
507                                 aligned[i] = '.';
508                         }
509                         else{
510                                 break;
511                         }
512                 }
513         }
514         isAligned = 1;  
515 }
516
517 //********************************************************************************************************************
518
519 void Sequence::setPairwise(string sequence){
520         pairwise = sequence;
521 }
522
523 //********************************************************************************************************************
524
525 string Sequence::convert2ints() {
526         
527         if(unaligned == "")     {       /* need to throw an error */    }
528         
529         string processed;
530         
531         for(int i=0;i<unaligned.length();i++) {
532                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
533                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
534                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
535                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
536                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
537                 else                                                                    {       processed += '4';       }
538         }
539         return processed;
540 }
541
542 //********************************************************************************************************************
543
544 string Sequence::getName(){
545         return name;
546 }
547
548 //********************************************************************************************************************
549
550 string Sequence::getAligned(){
551         if(isAligned == 0)      { return unaligned; }
552         else                            {  return aligned;  }
553 }
554
555 //********************************************************************************************************************
556
557 string Sequence::getInlineSeq(){
558         return name + '\t' + aligned;   
559 }
560
561
562 //********************************************************************************************************************
563
564 string Sequence::getPairwise(){
565         return pairwise;
566 }
567
568 //********************************************************************************************************************
569
570 string Sequence::getUnaligned(){
571         return unaligned;
572 }
573
574 //********************************************************************************************************************
575
576 int Sequence::getNumBases(){
577         return numBases;
578 }
579 //********************************************************************************************************************
580
581 int Sequence::getNumNs(){
582     int numNs = 0;
583         for (int i = 0; i < unaligned.length(); i++) {
584         if(toupper(unaligned[i]) == 'N') { numNs++; }
585     }
586     return numNs;
587 }
588
589 //********************************************************************************************************************
590
591 void Sequence::printSequence(ostream& out){
592
593         out << ">" << name << comment << endl;
594         if(isAligned){
595                 out << aligned << endl;
596         }
597         else{
598                 out << unaligned << endl;
599         }
600 }
601
602 //********************************************************************************************************************
603
604 int Sequence::getAlignLength(){
605         return alignmentLength;
606 }
607
608 //********************************************************************************************************************
609
610 int Sequence::getAmbigBases(){
611         if(ambigBases == -1){
612                 ambigBases = 0;
613                 for(int j=0;j<numBases;j++){
614                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
615                                 ambigBases++;
616                         }
617                 }
618         }       
619         
620         return ambigBases;
621 }
622
623 //********************************************************************************************************************
624
625 void Sequence::removeAmbigBases(){
626         
627         for(int j=0;j<alignmentLength;j++){
628                 if(aligned[j] != 'A' && aligned[j] != 'T' && aligned[j] != 'G' && aligned[j] != 'C'){
629                         aligned[j] = '-';
630                 }
631         }
632         setUnaligned(aligned);
633 }
634         
635 //********************************************************************************************************************
636
637 int Sequence::getLongHomoPolymer(){
638         if(longHomoPolymer == -1){
639                 longHomoPolymer = 1;
640                 int homoPolymer = 1;
641                 for(int j=1;j<numBases;j++){
642                         if(unaligned[j] == unaligned[j-1]){
643                                 homoPolymer++;
644                         }
645                         else{
646                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
647                                 homoPolymer = 1;
648                         }
649                 }
650                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
651         }
652         return longHomoPolymer;
653 }
654
655 //********************************************************************************************************************
656
657 int Sequence::getStartPos(){
658         if(startPos == -1){
659                 for(int j = 0; j < alignmentLength; j++) {
660                         if((aligned[j] != '.')&&(aligned[j] != '-')){
661                                 startPos = j + 1;
662                                 break;
663                         }
664                 }
665         }
666         if(isAligned == 0){     startPos = 1;   }
667
668         return startPos;
669 }
670
671 //********************************************************************************************************************
672
673 void Sequence::padToPos(int start){
674
675         for(int j = startPos-1; j < start-1; j++) {
676                 aligned[j] = '.';
677         }
678         startPos = start;
679
680 }
681 //********************************************************************************************************************
682
683 int Sequence::filterToPos(int start){
684     
685     if (start > aligned.length()) { start = aligned.length(); m->mothurOut("[ERROR]: start to large.\n"); }
686     
687         for(int j = 0; j < start; j++) {
688                 aligned[j] = '.';
689         }
690         
691     //things like ......----------AT become ................AT
692     for(int j = start; j < aligned.length(); j++) {
693         if (isalpha(aligned[j])) { break; }
694         else { aligned[j] = '.'; }
695     }
696     setUnaligned(aligned);
697     
698     return 0;
699     
700 }
701 //********************************************************************************************************************
702
703 int Sequence::filterFromPos(int end){
704     
705     if (end > aligned.length()) { end = aligned.length(); m->mothurOut("[ERROR]: end to large.\n"); }
706     
707         for(int j = end; j < aligned.length(); j++) {
708                 aligned[j] = '.';
709         }
710         
711     for(int j = aligned.length()-1; j < 0; j--) {
712         if (isalpha(aligned[j])) { break; }
713         else { aligned[j] = '.'; }
714     }
715     
716     setUnaligned(aligned);
717     
718     return 0;
719 }
720 //********************************************************************************************************************
721
722 int Sequence::getEndPos(){
723         if(endPos == -1){
724                 for(int j=alignmentLength-1;j>=0;j--){
725                         if((aligned[j] != '.')&&(aligned[j] != '-')){
726                                 endPos = j + 1;
727                                 break;
728                         }
729                 }
730         }
731         if(isAligned == 0){     endPos = numBases;      }
732         
733         return endPos;
734 }
735
736 //********************************************************************************************************************
737
738 void Sequence::padFromPos(int end){
739         //cout << end << '\t' << endPos << endl;
740         for(int j = end; j < endPos; j++) {
741                 aligned[j] = '.';
742         }
743         endPos = end;
744         
745 }
746
747 //********************************************************************************************************************
748
749 bool Sequence::getIsAligned(){
750         return isAligned;
751 }
752 //********************************************************************************************************************
753
754 void Sequence::reverseComplement(){
755
756         string temp;
757         for(int i=numBases-1;i>=0;i--){
758                 if(unaligned[i] == 'A')         {       temp += 'T';    }
759                 else if(unaligned[i] == 'T'){   temp += 'A';    }
760                 else if(unaligned[i] == 'G'){   temp += 'C';    }
761                 else if(unaligned[i] == 'C'){   temp += 'G';    }
762                 else                                            {       temp += 'N';    }
763         }
764         unaligned = temp;
765         aligned = temp;
766         
767 }
768
769 //********************************************************************************************************************
770
771 void Sequence::trim(int length){
772         
773         if(numBases > length){
774                 unaligned = unaligned.substr(0,length);
775                 numBases = length;
776         aligned = "";
777         isAligned = 0;
778         }
779         
780 }
781
782 ///**************************************************************************************************/