]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
changed random forest output filename
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13 Sequence::Sequence(){
14         m = MothurOut::getInstance();
15         initialize();
16 }
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
19         try {
20                 m = MothurOut::getInstance();
21                 initialize();   
22                 name = newName;
23         
24         m->checkName(name);
25                 
26                 //setUnaligned removes any gap characters for us
27                 setUnaligned(sequence);
28                 setAligned(sequence);
29         }
30         catch(exception& e) {
31                 m->errorOut(e, "Sequence", "Sequence");
32                 exit(1);
33         }                       
34 }
35 /***********************************************************************/
36 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
37         try {
38                 m = MothurOut::getInstance();
39                 initialize();   
40                 name = newName;
41         
42         m->checkName(name);
43                 
44                 //setUnaligned removes any gap characters for us
45                 setUnaligned(sequence);
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "Sequence", "Sequence");
49                 exit(1);
50         }                       
51 }
52
53 //********************************************************************************************************************
54 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
55 Sequence::Sequence(istringstream& fastaString){
56         try {
57                 m = MothurOut::getInstance();
58         
59                 initialize();
60         name = getSequenceName(fastaString);
61                 
62                 if (!m->control_pressed) { 
63                         string sequence;
64                 
65                         //read comments
66                         while ((name[0] == '#') && fastaString) { 
67                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
68                                 sequence = getCommentString(fastaString);
69                                 
70                                 if (fastaString) {  
71                                         fastaString >> name;  
72                                         name = name.substr(1);  
73                                 }else { 
74                                         name = "";
75                                         break;
76                                 }
77                         }
78                         
79                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
80                         
81                         int numAmbig = 0;
82                         sequence = getSequenceString(fastaString, numAmbig);
83                         
84                         setAligned(sequence);   
85                         //setUnaligned removes any gap characters for us                                                
86                         setUnaligned(sequence); 
87                         
88                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
89                 }
90                 
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "Sequence", "Sequence");
94                 exit(1);
95         }                                                               
96 }
97 //********************************************************************************************************************
98 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
99 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
100         try {
101                 m = MothurOut::getInstance();
102         
103                 initialize();
104                 name = getSequenceName(fastaString);
105                 
106                 if (!m->control_pressed) { 
107                         string sequence;
108                 
109                         //read comments
110                         while ((name[0] == '#') && fastaString) { 
111                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
112                                 sequence = getCommentString(fastaString);
113                                 
114                                 if (fastaString) {  
115                                         fastaString >> name;  
116                                         name = name.substr(1);  
117                                 }else { 
118                                         name = "";
119                                         break;
120                                 }
121                         }
122                         
123                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
124                         
125                         int numAmbig = 0;
126                         sequence = getSequenceString(fastaString, numAmbig);
127                         
128                         //setUnaligned removes any gap characters for us                                                
129                         setUnaligned(sequence); 
130                         
131                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
132                         
133                 }
134                 
135         }
136         catch(exception& e) {
137                 m->errorOut(e, "Sequence", "Sequence");
138                 exit(1);
139         }                                                               
140 }
141
142
143 //********************************************************************************************************************
144 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
145 Sequence::Sequence(ifstream& fastaFile){
146         try {
147                 m = MothurOut::getInstance();
148                 initialize();
149                 name = getSequenceName(fastaFile);
150                 
151                 if (!m->control_pressed) { 
152                         
153                         string sequence;
154                 
155                         //read comments
156                         while ((name[0] == '#') && fastaFile) { 
157                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
158                                 sequence = getCommentString(fastaFile);
159                                 
160                                 if (fastaFile) {  
161                                         fastaFile >> name;  
162                                         name = name.substr(1);  
163                                 }else { 
164                                         name = "";
165                                         break;
166                                 }
167                         }
168                         
169                         //read real sequence
170                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){  break;      }       } // get rest of line if there's any crap there
171                         
172                         int numAmbig = 0;
173                         sequence = getSequenceString(fastaFile, numAmbig);
174                         
175                         setAligned(sequence);   
176                         //setUnaligned removes any gap characters for us                                                
177                         setUnaligned(sequence); 
178                         
179                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
180                         
181                 }
182
183         }
184         catch(exception& e) {
185                 m->errorOut(e, "Sequence", "Sequence");
186                 exit(1);
187         }                                                       
188 }
189 //********************************************************************************************************************
190 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
191 Sequence::Sequence(ifstream& fastaFile, string& extraInfo, bool getInfo){
192         try {
193                 m = MothurOut::getInstance();
194                 initialize();
195         extraInfo = "";
196                 
197                 name = getSequenceName(fastaFile);
198                 
199                 if (!m->control_pressed) {                      
200                         string sequence;
201             
202                         //read comments
203                         while ((name[0] == '#') && fastaFile) { 
204                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
205                                 sequence = getCommentString(fastaFile);
206                                 
207                                 if (fastaFile) {  
208                                         fastaFile >> name;  
209                                         name = name.substr(1);  
210                                 }else { 
211                                         name = "";
212                                         break;
213                                 }
214                         }
215                         
216                         //read info after sequence name
217                         while (!fastaFile.eof())        {       
218                 char c = fastaFile.get(); 
219                 if (c == 10 || c == 13 || c == -1){  break;     }
220                 extraInfo += c;
221             } 
222                         
223                         int numAmbig = 0;
224                         sequence = getSequenceString(fastaFile, numAmbig);
225                         
226                         setAligned(sequence);   
227                         //setUnaligned removes any gap characters for us                                                
228                         setUnaligned(sequence); 
229                         
230                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
231                 }
232         
233         }
234         catch(exception& e) {
235                 m->errorOut(e, "Sequence", "Sequence");
236                 exit(1);
237         }                                                       
238 }
239 //********************************************************************************************************************
240 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
241 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
242         try {
243                 m = MothurOut::getInstance();
244                 initialize();
245                 name = getSequenceName(fastaFile);
246                 
247                 if (!m->control_pressed) { 
248                         string sequence;
249                         
250                         //read comments
251                         while ((name[0] == '#') && fastaFile) { 
252                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
253                                 sequence = getCommentString(fastaFile);
254                                 
255                                 if (fastaFile) {  
256                                         fastaFile >> name;  
257                                         name = name.substr(1);  
258                                 }else { 
259                                         name = "";
260                                         break;
261                                 }
262                         }
263                         
264                         //read real sequence
265                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){       break; }       } // get rest of line if there's any crap there
266                         
267                         int numAmbig = 0;
268                         sequence = getSequenceString(fastaFile, numAmbig);
269                         
270                         //setUnaligned removes any gap characters for us                                                
271                         setUnaligned(sequence); 
272                         
273                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
274                         
275                 }
276                 
277         }
278         catch(exception& e) {
279                 m->errorOut(e, "Sequence", "Sequence");
280                 exit(1);
281         }                                                       
282 }
283 //********************************************************************************************************************
284 string Sequence::getSequenceName(ifstream& fastaFile) {
285         try {
286                 string name = "";
287                 
288         fastaFile >> name;
289                 
290                 if (name.length() != 0) { 
291             
292                         name = name.substr(1); 
293             
294             m->checkName(name);
295             
296         }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true;  }
297         
298                 return name;
299         }
300         catch(exception& e) {
301                 m->errorOut(e, "Sequence", "getSequenceName");
302                 exit(1);
303         }
304 }
305 //********************************************************************************************************************
306 string Sequence::getSequenceName(istringstream& fastaFile) {
307         try {
308                 string name = "";
309                 
310         fastaFile >> name;
311                 
312                 if (name.length() != 0) { 
313             
314                         name = name.substr(1); 
315             
316             m->checkName(name);
317             
318         }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true;  }
319         
320                 return name;
321         }
322         catch(exception& e) {
323                 m->errorOut(e, "Sequence", "getSequenceName");
324                 exit(1);
325         }
326 }
327 //********************************************************************************************************************
328 string Sequence::getSequenceString(ifstream& fastaFile, int& numAmbig) {
329         try {
330                 char letter;
331                 string sequence = "";   
332                 numAmbig = 0;
333                 
334                 while(fastaFile){
335                         letter= fastaFile.get();
336                         if(letter == '>'){
337                                 fastaFile.putback(letter);
338                                 break;
339                         }else if (letter == ' ') {;}
340                         else if(isprint(letter)){
341                                 letter = toupper(letter);
342                                 if(letter == 'U'){letter = 'T';}
343                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C' && letter != 'N'){
344                                         letter = 'N';
345                                         numAmbig++;
346                                 }
347                                 sequence += letter;
348                         }
349                 }
350                 
351                 return sequence;
352         }
353         catch(exception& e) {
354                 m->errorOut(e, "Sequence", "getSequenceString");
355                 exit(1);
356         }
357 }
358 //********************************************************************************************************************
359 //comment can contain '>' so we need to account for that
360 string Sequence::getCommentString(ifstream& fastaFile) {
361         try {
362                 char letter;
363                 string sequence = "";
364                 
365                 while(fastaFile){
366                         letter=fastaFile.get();
367                         if((letter == '\r') || (letter == '\n')){  
368                                 m->gobble(fastaFile);  //in case its a \r\n situation
369                                 break;
370                         }
371                 }
372                 
373                 return sequence;
374         }
375         catch(exception& e) {
376                 m->errorOut(e, "Sequence", "getCommentString");
377                 exit(1);
378         }
379 }
380 //********************************************************************************************************************
381 string Sequence::getSequenceString(istringstream& fastaFile, int& numAmbig) {
382         try {
383                 char letter;
384                 string sequence = "";
385                 numAmbig = 0;
386                 
387                 while(!fastaFile.eof()){
388                         letter= fastaFile.get();
389         
390                         if(letter == '>'){
391                                 fastaFile.putback(letter);
392                                 break;
393                         }else if (letter == ' ') {;}
394                         else if(isprint(letter)){
395                                 letter = toupper(letter);
396                                 if(letter == 'U'){letter = 'T';}
397                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C' && letter != 'N'){
398                                         letter = 'N';
399                                         numAmbig++;
400                                 }
401                                 sequence += letter;
402                         }
403                 }
404                 
405                 return sequence;
406         }
407         catch(exception& e) {
408                 m->errorOut(e, "Sequence", "getSequenceString");
409                 exit(1);
410         }
411 }
412 //********************************************************************************************************************
413 //comment can contain '>' so we need to account for that
414 string Sequence::getCommentString(istringstream& fastaFile) {
415         try {
416                 char letter;
417                 string sequence = "";
418                 
419                 while(fastaFile){
420                         letter=fastaFile.get();
421                         if((letter == '\r') || (letter == '\n')){  
422                                 m->gobble(fastaFile);  //in case its a \r\n situation
423                                 break;
424                         }
425                 }
426                 
427                 return sequence;
428         }
429         catch(exception& e) {
430                 m->errorOut(e, "Sequence", "getCommentString");
431                 exit(1);
432         }
433 }
434 //********************************************************************************************************************
435
436 void Sequence::initialize(){
437         
438         name = "";
439         unaligned = "";
440         aligned = "";
441         pairwise = "";
442         
443         numBases = 0;
444         alignmentLength = 0;
445         isAligned = 0;
446         startPos = -1;
447         endPos = -1;
448         longHomoPolymer = -1;
449         ambigBases = -1;
450         
451 }       
452
453 //********************************************************************************************************************
454
455 void Sequence::setName(string seqName) {
456         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
457         else                                    {       name = seqName;                         }
458 }
459
460 //********************************************************************************************************************
461
462 void Sequence::setUnaligned(string sequence){
463         
464         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
465                 string temp = "";
466                 for(int j=0;j<sequence.length();j++) {
467                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
468                 }
469                 unaligned = temp;
470         }
471         else {
472                 unaligned = sequence;
473         }
474         numBases = unaligned.length();
475         
476 }
477
478 //********************************************************************************************************************
479
480 void Sequence::setAligned(string sequence){
481         
482         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
483         aligned = sequence;
484         alignmentLength = aligned.length();
485         setUnaligned(sequence); 
486
487         if(aligned[0] == '-'){
488                 for(int i=0;i<alignmentLength;i++){
489                         if(aligned[i] == '-'){
490                                 aligned[i] = '.';
491                         }
492                         else{
493                                 break;
494                         }
495                 }
496                 for(int i=alignmentLength-1;i>=0;i--){
497                         if(aligned[i] == '-'){
498                                 aligned[i] = '.';
499                         }
500                         else{
501                                 break;
502                         }
503                 }
504         }
505         isAligned = 1;  
506 }
507
508 //********************************************************************************************************************
509
510 void Sequence::setPairwise(string sequence){
511         pairwise = sequence;
512 }
513
514 //********************************************************************************************************************
515
516 string Sequence::convert2ints() {
517         
518         if(unaligned == "")     {       /* need to throw an error */    }
519         
520         string processed;
521         
522         for(int i=0;i<unaligned.length();i++) {
523                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
524                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
525                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
526                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
527                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
528                 else                                                                    {       processed += '4';       }
529         }
530         return processed;
531 }
532
533 //********************************************************************************************************************
534
535 string Sequence::getName(){
536         return name;
537 }
538
539 //********************************************************************************************************************
540
541 string Sequence::getAligned(){
542         if(isAligned == 0)      { return unaligned; }
543         else                            {  return aligned;  }
544 }
545
546 //********************************************************************************************************************
547
548 string Sequence::getInlineSeq(){
549         return name + '\t' + aligned;   
550 }
551
552
553 //********************************************************************************************************************
554
555 string Sequence::getPairwise(){
556         return pairwise;
557 }
558
559 //********************************************************************************************************************
560
561 string Sequence::getUnaligned(){
562         return unaligned;
563 }
564
565 //********************************************************************************************************************
566
567 int Sequence::getNumBases(){
568         return numBases;
569 }
570 //********************************************************************************************************************
571
572 int Sequence::getNumNs(){
573     int numNs = 0;
574         for (int i = 0; i < unaligned.length(); i++) {
575         if(toupper(unaligned[i]) == 'N') { numNs++; }
576     }
577     return numNs;
578 }
579
580 //********************************************************************************************************************
581
582 void Sequence::printSequence(ostream& out){
583
584         out << ">" << name << endl;
585         if(isAligned){
586                 out << aligned << endl;
587         }
588         else{
589                 out << unaligned << endl;
590         }
591 }
592
593 //********************************************************************************************************************
594
595 int Sequence::getAlignLength(){
596         return alignmentLength;
597 }
598
599 //********************************************************************************************************************
600
601 int Sequence::getAmbigBases(){
602         if(ambigBases == -1){
603                 ambigBases = 0;
604                 for(int j=0;j<numBases;j++){
605                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
606                                 ambigBases++;
607                         }
608                 }
609         }       
610         
611         return ambigBases;
612 }
613
614 //********************************************************************************************************************
615
616 void Sequence::removeAmbigBases(){
617         
618         for(int j=0;j<alignmentLength;j++){
619                 if(aligned[j] != 'A' && aligned[j] != 'T' && aligned[j] != 'G' && aligned[j] != 'C'){
620                         aligned[j] = '-';
621                 }
622         }
623         setUnaligned(aligned);
624 }
625         
626 //********************************************************************************************************************
627
628 int Sequence::getLongHomoPolymer(){
629         if(longHomoPolymer == -1){
630                 longHomoPolymer = 1;
631                 int homoPolymer = 1;
632                 for(int j=1;j<numBases;j++){
633                         if(unaligned[j] == unaligned[j-1]){
634                                 homoPolymer++;
635                         }
636                         else{
637                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
638                                 homoPolymer = 1;
639                         }
640                 }
641                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
642         }
643         return longHomoPolymer;
644 }
645
646 //********************************************************************************************************************
647
648 int Sequence::getStartPos(){
649         if(startPos == -1){
650                 for(int j = 0; j < alignmentLength; j++) {
651                         if((aligned[j] != '.')&&(aligned[j] != '-')){
652                                 startPos = j + 1;
653                                 break;
654                         }
655                 }
656         }
657         if(isAligned == 0){     startPos = 1;   }
658
659         return startPos;
660 }
661
662 //********************************************************************************************************************
663
664 void Sequence::padToPos(int start){
665
666         for(int j = startPos-1; j < start-1; j++) {
667                 aligned[j] = '.';
668         }
669         startPos = start;
670
671 }
672 //********************************************************************************************************************
673
674 int Sequence::filterToPos(int start){
675     
676     if (start > aligned.length()) { start = aligned.length(); m->mothurOut("[ERROR]: start to large.\n"); }
677     
678         for(int j = 0; j < start; j++) {
679                 aligned[j] = '.';
680         }
681         
682     //things like ......----------AT become ................AT
683     for(int j = start; j < aligned.length(); j++) {
684         if (isalpha(aligned[j])) { break; }
685         else { aligned[j] = '.'; }
686     }
687     setUnaligned(aligned);
688     
689     return 0;
690     
691 }
692 //********************************************************************************************************************
693
694 int Sequence::filterFromPos(int end){
695     
696     if (end > aligned.length()) { end = aligned.length(); m->mothurOut("[ERROR]: end to large.\n"); }
697     
698         for(int j = end; j < aligned.length(); j++) {
699                 aligned[j] = '.';
700         }
701         
702     for(int j = aligned.length()-1; j < 0; j--) {
703         if (isalpha(aligned[j])) { break; }
704         else { aligned[j] = '.'; }
705     }
706     
707     setUnaligned(aligned);
708     
709     return 0;
710 }
711 //********************************************************************************************************************
712
713 int Sequence::getEndPos(){
714         if(endPos == -1){
715                 for(int j=alignmentLength-1;j>=0;j--){
716                         if((aligned[j] != '.')&&(aligned[j] != '-')){
717                                 endPos = j + 1;
718                                 break;
719                         }
720                 }
721         }
722         if(isAligned == 0){     endPos = numBases;      }
723         
724         return endPos;
725 }
726
727 //********************************************************************************************************************
728
729 void Sequence::padFromPos(int end){
730         //cout << end << '\t' << endPos << endl;
731         for(int j = end; j < endPos; j++) {
732                 aligned[j] = '.';
733         }
734         endPos = end;
735         
736 }
737
738 //********************************************************************************************************************
739
740 bool Sequence::getIsAligned(){
741         return isAligned;
742 }
743 //********************************************************************************************************************
744
745 void Sequence::reverseComplement(){
746
747         string temp;
748         for(int i=numBases-1;i>=0;i--){
749                 if(unaligned[i] == 'A')         {       temp += 'T';    }
750                 else if(unaligned[i] == 'T'){   temp += 'A';    }
751                 else if(unaligned[i] == 'G'){   temp += 'C';    }
752                 else if(unaligned[i] == 'C'){   temp += 'G';    }
753                 else                                            {       temp += 'N';    }
754         }
755         unaligned = temp;
756         aligned = temp;
757         
758 }
759
760 //********************************************************************************************************************
761
762 void Sequence::trim(int length){
763         
764         if(numBases > length){
765                 unaligned = unaligned.substr(0,length);
766                 numBases = length;
767         aligned = "";
768         isAligned = 0;
769         }
770         
771 }
772
773 ///**************************************************************************************************/