]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
sffinfo bug with flow grams right index when clipQualRight=0
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13 Sequence::Sequence(){
14         m = MothurOut::getInstance();
15         initialize();
16 }
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
19         try {
20                 m = MothurOut::getInstance();
21                 initialize();   
22                 name = newName;
23         
24         for (int i = 0; i < name.length(); i++) {
25             if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; }
26         }
27                 
28                 //setUnaligned removes any gap characters for us
29                 setUnaligned(sequence);
30                 setAligned(sequence);
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "Sequence", "Sequence");
34                 exit(1);
35         }                       
36 }
37 /***********************************************************************/
38 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
39         try {
40                 m = MothurOut::getInstance();
41                 initialize();   
42                 name = newName;
43         
44         for (int i = 0; i < name.length(); i++) {
45             if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; }
46         }
47                 
48                 //setUnaligned removes any gap characters for us
49                 setUnaligned(sequence);
50         }
51         catch(exception& e) {
52                 m->errorOut(e, "Sequence", "Sequence");
53                 exit(1);
54         }                       
55 }
56
57 //********************************************************************************************************************
58 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
59 Sequence::Sequence(istringstream& fastaString){
60         try {
61                 m = MothurOut::getInstance();
62         
63                 initialize();
64         name = getSequenceName(fastaString);
65                 
66                 if (!m->control_pressed) { 
67                         string sequence;
68                 
69                         //read comments
70                         while ((name[0] == '#') && fastaString) { 
71                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
72                                 sequence = getCommentString(fastaString);
73                                 
74                                 if (fastaString) {  
75                                         fastaString >> name;  
76                                         name = name.substr(1);  
77                                 }else { 
78                                         name = "";
79                                         break;
80                                 }
81                         }
82                         
83                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
84                         
85                         int numAmbig = 0;
86                         sequence = getSequenceString(fastaString, numAmbig);
87                         
88                         setAligned(sequence);   
89                         //setUnaligned removes any gap characters for us                                                
90                         setUnaligned(sequence); 
91                         
92                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
93                 }
94                 
95         }
96         catch(exception& e) {
97                 m->errorOut(e, "Sequence", "Sequence");
98                 exit(1);
99         }                                                               
100 }
101 //********************************************************************************************************************
102 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
103 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
104         try {
105                 m = MothurOut::getInstance();
106         
107                 initialize();
108                 name = getSequenceName(fastaString);
109                 
110                 if (!m->control_pressed) { 
111                         string sequence;
112                 
113                         //read comments
114                         while ((name[0] == '#') && fastaString) { 
115                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
116                                 sequence = getCommentString(fastaString);
117                                 
118                                 if (fastaString) {  
119                                         fastaString >> name;  
120                                         name = name.substr(1);  
121                                 }else { 
122                                         name = "";
123                                         break;
124                                 }
125                         }
126                         
127                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
128                         
129                         int numAmbig = 0;
130                         sequence = getSequenceString(fastaString, numAmbig);
131                         
132                         //setUnaligned removes any gap characters for us                                                
133                         setUnaligned(sequence); 
134                         
135                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
136                         
137                 }
138                 
139         }
140         catch(exception& e) {
141                 m->errorOut(e, "Sequence", "Sequence");
142                 exit(1);
143         }                                                               
144 }
145
146
147 //********************************************************************************************************************
148 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
149 Sequence::Sequence(ifstream& fastaFile){
150         try {
151                 m = MothurOut::getInstance();
152                 initialize();
153                 name = getSequenceName(fastaFile);
154                 
155                 if (!m->control_pressed) { 
156                         
157                         string sequence;
158                 
159                         //read comments
160                         while ((name[0] == '#') && fastaFile) { 
161                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
162                                 sequence = getCommentString(fastaFile);
163                                 
164                                 if (fastaFile) {  
165                                         fastaFile >> name;  
166                                         name = name.substr(1);  
167                                 }else { 
168                                         name = "";
169                                         break;
170                                 }
171                         }
172                         
173                         //read real sequence
174                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){  break;      }       } // get rest of line if there's any crap there
175                         
176                         int numAmbig = 0;
177                         sequence = getSequenceString(fastaFile, numAmbig);
178                         
179                         setAligned(sequence);   
180                         //setUnaligned removes any gap characters for us                                                
181                         setUnaligned(sequence); 
182                         
183                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
184                         
185                 }
186
187         }
188         catch(exception& e) {
189                 m->errorOut(e, "Sequence", "Sequence");
190                 exit(1);
191         }                                                       
192 }
193 //********************************************************************************************************************
194 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
195 Sequence::Sequence(ifstream& fastaFile, string& extraInfo, bool getInfo){
196         try {
197                 m = MothurOut::getInstance();
198                 initialize();
199         extraInfo = "";
200                 
201                 name = getSequenceName(fastaFile);
202                 
203                 if (!m->control_pressed) {                      
204                         string sequence;
205             
206                         //read comments
207                         while ((name[0] == '#') && fastaFile) { 
208                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
209                                 sequence = getCommentString(fastaFile);
210                                 
211                                 if (fastaFile) {  
212                                         fastaFile >> name;  
213                                         name = name.substr(1);  
214                                 }else { 
215                                         name = "";
216                                         break;
217                                 }
218                         }
219                         
220                         //read info after sequence name
221                         while (!fastaFile.eof())        {       
222                 char c = fastaFile.get(); 
223                 if (c == 10 || c == 13 || c == -1){  break;     }
224                 extraInfo += c;
225             } 
226                         
227                         int numAmbig = 0;
228                         sequence = getSequenceString(fastaFile, numAmbig);
229                         
230                         setAligned(sequence);   
231                         //setUnaligned removes any gap characters for us                                                
232                         setUnaligned(sequence); 
233                         
234                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
235                 }
236         
237         }
238         catch(exception& e) {
239                 m->errorOut(e, "Sequence", "Sequence");
240                 exit(1);
241         }                                                       
242 }
243 //********************************************************************************************************************
244 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
245 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
246         try {
247                 m = MothurOut::getInstance();
248                 initialize();
249                 name = getSequenceName(fastaFile);
250                 
251                 if (!m->control_pressed) { 
252                         string sequence;
253                         
254                         //read comments
255                         while ((name[0] == '#') && fastaFile) { 
256                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
257                                 sequence = getCommentString(fastaFile);
258                                 
259                                 if (fastaFile) {  
260                                         fastaFile >> name;  
261                                         name = name.substr(1);  
262                                 }else { 
263                                         name = "";
264                                         break;
265                                 }
266                         }
267                         
268                         //read real sequence
269                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){       break; }       } // get rest of line if there's any crap there
270                         
271                         int numAmbig = 0;
272                         sequence = getSequenceString(fastaFile, numAmbig);
273                         
274                         //setUnaligned removes any gap characters for us                                                
275                         setUnaligned(sequence); 
276                         
277                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
278                         
279                 }
280                 
281         }
282         catch(exception& e) {
283                 m->errorOut(e, "Sequence", "Sequence");
284                 exit(1);
285         }                                                       
286 }
287 //********************************************************************************************************************
288 string Sequence::getSequenceName(ifstream& fastaFile) {
289         try {
290                 string name = "";
291                 
292         fastaFile >> name;
293                 
294                 if (name.length() != 0) { 
295             
296                         name = name.substr(1); 
297             
298             for (int i = 0; i < name.length(); i++) {
299                 if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; }
300             }
301             
302         }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true;  }
303         
304                 return name;
305         }
306         catch(exception& e) {
307                 m->errorOut(e, "Sequence", "getSequenceName");
308                 exit(1);
309         }
310 }
311 //********************************************************************************************************************
312 string Sequence::getSequenceName(istringstream& fastaFile) {
313         try {
314                 string name = "";
315                 
316         fastaFile >> name;
317                 
318                 if (name.length() != 0) { 
319             
320                         name = name.substr(1); 
321             
322             for (int i = 0; i < name.length(); i++) {
323                 if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; }
324             }
325             
326         }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true;  }
327         
328                 return name;
329         }
330         catch(exception& e) {
331                 m->errorOut(e, "Sequence", "getSequenceName");
332                 exit(1);
333         }
334 }
335 //********************************************************************************************************************
336 string Sequence::getSequenceString(ifstream& fastaFile, int& numAmbig) {
337         try {
338                 char letter;
339                 string sequence = "";   
340                 numAmbig = 0;
341                 
342                 while(fastaFile){
343                         letter= fastaFile.get();
344                         if(letter == '>'){
345                                 fastaFile.putback(letter);
346                                 break;
347                         }else if (letter == ' ') {;}
348                         else if(isprint(letter)){
349                                 letter = toupper(letter);
350                                 if(letter == 'U'){letter = 'T';}
351                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C' && letter != 'N'){
352                                         letter = 'N';
353                                         numAmbig++;
354                                 }
355                                 sequence += letter;
356                         }
357                 }
358                 
359                 return sequence;
360         }
361         catch(exception& e) {
362                 m->errorOut(e, "Sequence", "getSequenceString");
363                 exit(1);
364         }
365 }
366 //********************************************************************************************************************
367 //comment can contain '>' so we need to account for that
368 string Sequence::getCommentString(ifstream& fastaFile) {
369         try {
370                 char letter;
371                 string sequence = "";
372                 
373                 while(fastaFile){
374                         letter=fastaFile.get();
375                         if((letter == '\r') || (letter == '\n')){  
376                                 m->gobble(fastaFile);  //in case its a \r\n situation
377                                 break;
378                         }
379                 }
380                 
381                 return sequence;
382         }
383         catch(exception& e) {
384                 m->errorOut(e, "Sequence", "getCommentString");
385                 exit(1);
386         }
387 }
388 //********************************************************************************************************************
389 string Sequence::getSequenceString(istringstream& fastaFile, int& numAmbig) {
390         try {
391                 char letter;
392                 string sequence = "";
393                 numAmbig = 0;
394                 
395                 while(!fastaFile.eof()){
396                         letter= fastaFile.get();
397         
398                         if(letter == '>'){
399                                 fastaFile.putback(letter);
400                                 break;
401                         }else if (letter == ' ') {;}
402                         else if(isprint(letter)){
403                                 letter = toupper(letter);
404                                 if(letter == 'U'){letter = 'T';}
405                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C' && letter != 'N'){
406                                         letter = 'N';
407                                         numAmbig++;
408                                 }
409                                 sequence += letter;
410                         }
411                 }
412                 
413                 return sequence;
414         }
415         catch(exception& e) {
416                 m->errorOut(e, "Sequence", "getSequenceString");
417                 exit(1);
418         }
419 }
420 //********************************************************************************************************************
421 //comment can contain '>' so we need to account for that
422 string Sequence::getCommentString(istringstream& fastaFile) {
423         try {
424                 char letter;
425                 string sequence = "";
426                 
427                 while(fastaFile){
428                         letter=fastaFile.get();
429                         if((letter == '\r') || (letter == '\n')){  
430                                 m->gobble(fastaFile);  //in case its a \r\n situation
431                                 break;
432                         }
433                 }
434                 
435                 return sequence;
436         }
437         catch(exception& e) {
438                 m->errorOut(e, "Sequence", "getCommentString");
439                 exit(1);
440         }
441 }
442 //********************************************************************************************************************
443
444 void Sequence::initialize(){
445         
446         name = "";
447         unaligned = "";
448         aligned = "";
449         pairwise = "";
450         
451         numBases = 0;
452         alignmentLength = 0;
453         isAligned = 0;
454         startPos = -1;
455         endPos = -1;
456         longHomoPolymer = -1;
457         ambigBases = -1;
458         
459 }       
460
461 //********************************************************************************************************************
462
463 void Sequence::setName(string seqName) {
464         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
465         else                                    {       name = seqName;                         }
466 }
467
468 //********************************************************************************************************************
469
470 void Sequence::setUnaligned(string sequence){
471         
472         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
473                 string temp = "";
474                 for(int j=0;j<sequence.length();j++) {
475                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
476                 }
477                 unaligned = temp;
478         }
479         else {
480                 unaligned = sequence;
481         }
482         numBases = unaligned.length();
483         
484 }
485
486 //********************************************************************************************************************
487
488 void Sequence::setAligned(string sequence){
489         
490         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
491         aligned = sequence;
492         alignmentLength = aligned.length();
493         setUnaligned(sequence); 
494
495         if(aligned[0] == '-'){
496                 for(int i=0;i<alignmentLength;i++){
497                         if(aligned[i] == '-'){
498                                 aligned[i] = '.';
499                         }
500                         else{
501                                 break;
502                         }
503                 }
504                 for(int i=alignmentLength-1;i>=0;i--){
505                         if(aligned[i] == '-'){
506                                 aligned[i] = '.';
507                         }
508                         else{
509                                 break;
510                         }
511                 }
512         }
513         isAligned = 1;  
514 }
515
516 //********************************************************************************************************************
517
518 void Sequence::setPairwise(string sequence){
519         pairwise = sequence;
520 }
521
522 //********************************************************************************************************************
523
524 string Sequence::convert2ints() {
525         
526         if(unaligned == "")     {       /* need to throw an error */    }
527         
528         string processed;
529         
530         for(int i=0;i<unaligned.length();i++) {
531                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
532                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
533                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
534                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
535                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
536                 else                                                                    {       processed += '4';       }
537         }
538         return processed;
539 }
540
541 //********************************************************************************************************************
542
543 string Sequence::getName(){
544         return name;
545 }
546
547 //********************************************************************************************************************
548
549 string Sequence::getAligned(){
550         if(isAligned == 0)      { return unaligned; }
551         else                            {  return aligned;  }
552 }
553
554 //********************************************************************************************************************
555
556 string Sequence::getInlineSeq(){
557         return name + '\t' + aligned;   
558 }
559
560
561 //********************************************************************************************************************
562
563 string Sequence::getPairwise(){
564         return pairwise;
565 }
566
567 //********************************************************************************************************************
568
569 string Sequence::getUnaligned(){
570         return unaligned;
571 }
572
573 //********************************************************************************************************************
574
575 int Sequence::getNumBases(){
576         return numBases;
577 }
578 //********************************************************************************************************************
579
580 int Sequence::getNumNs(){
581     int numNs = 0;
582         for (int i = 0; i < unaligned.length(); i++) {
583         if(toupper(unaligned[i]) == 'N') { numNs++; }
584     }
585     return numNs;
586 }
587
588 //********************************************************************************************************************
589
590 void Sequence::printSequence(ostream& out){
591
592         out << ">" << name << endl;
593         if(isAligned){
594                 out << aligned << endl;
595         }
596         else{
597                 out << unaligned << endl;
598         }
599 }
600
601 //********************************************************************************************************************
602
603 int Sequence::getAlignLength(){
604         return alignmentLength;
605 }
606
607 //********************************************************************************************************************
608
609 int Sequence::getAmbigBases(){
610         if(ambigBases == -1){
611                 ambigBases = 0;
612                 for(int j=0;j<numBases;j++){
613                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
614                                 ambigBases++;
615                         }
616                 }
617         }       
618         
619         return ambigBases;
620 }
621
622 //********************************************************************************************************************
623
624 void Sequence::removeAmbigBases(){
625         
626         for(int j=0;j<alignmentLength;j++){
627                 if(aligned[j] != 'A' && aligned[j] != 'T' && aligned[j] != 'G' && aligned[j] != 'C'){
628                         aligned[j] = '-';
629                 }
630         }
631         setUnaligned(aligned);
632 }
633         
634 //********************************************************************************************************************
635
636 int Sequence::getLongHomoPolymer(){
637         if(longHomoPolymer == -1){
638                 longHomoPolymer = 1;
639                 int homoPolymer = 1;
640                 for(int j=1;j<numBases;j++){
641                         if(unaligned[j] == unaligned[j-1]){
642                                 homoPolymer++;
643                         }
644                         else{
645                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
646                                 homoPolymer = 1;
647                         }
648                 }
649                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
650         }
651         return longHomoPolymer;
652 }
653
654 //********************************************************************************************************************
655
656 int Sequence::getStartPos(){
657         if(startPos == -1){
658                 for(int j = 0; j < alignmentLength; j++) {
659                         if((aligned[j] != '.')&&(aligned[j] != '-')){
660                                 startPos = j + 1;
661                                 break;
662                         }
663                 }
664         }
665         if(isAligned == 0){     startPos = 1;   }
666
667         return startPos;
668 }
669
670 //********************************************************************************************************************
671
672 void Sequence::padToPos(int start){
673
674         for(int j = startPos-1; j < start-1; j++) {
675                 aligned[j] = '.';
676         }
677         startPos = start;
678
679 }
680 //********************************************************************************************************************
681
682 int Sequence::filterToPos(int start){
683     
684     if (start > aligned.length()) { start = aligned.length(); m->mothurOut("[ERROR]: start to large.\n"); }
685     
686         for(int j = 0; j < start-1; j++) {
687                 aligned[j] = '.';
688         }
689         
690     //things like ......----------AT become ................AT
691     for(int j = start-1; j < aligned.length(); j++) {
692         if (isalpha(aligned[j])) { break; }
693         else { aligned[j] = '.'; }
694     }
695     setUnaligned(aligned);
696     
697     return 0;
698     
699 }
700 //********************************************************************************************************************
701
702 int Sequence::filterFromPos(int end){
703     
704     if (end > aligned.length()) { end = aligned.length(); m->mothurOut("[ERROR]: end to large.\n"); }
705     
706         for(int j = end; j < aligned.length(); j++) {
707                 aligned[j] = '.';
708         }
709         
710     for(int j = aligned.length()-1; j < 0; j--) {
711         if (isalpha(aligned[j])) { break; }
712         else { aligned[j] = '.'; }
713     }
714     
715     setUnaligned(aligned);
716     
717     return 0;
718 }
719 //********************************************************************************************************************
720
721 int Sequence::getEndPos(){
722         if(endPos == -1){
723                 for(int j=alignmentLength-1;j>=0;j--){
724                         if((aligned[j] != '.')&&(aligned[j] != '-')){
725                                 endPos = j + 1;
726                                 break;
727                         }
728                 }
729         }
730         if(isAligned == 0){     endPos = numBases;      }
731         
732         return endPos;
733 }
734
735 //********************************************************************************************************************
736
737 void Sequence::padFromPos(int end){
738         //cout << end << '\t' << endPos << endl;
739         for(int j = end; j < endPos; j++) {
740                 aligned[j] = '.';
741         }
742         endPos = end;
743         
744 }
745
746 //********************************************************************************************************************
747
748 bool Sequence::getIsAligned(){
749         return isAligned;
750 }
751 //********************************************************************************************************************
752
753 void Sequence::reverseComplement(){
754
755         string temp;
756         for(int i=numBases-1;i>=0;i--){
757                 if(unaligned[i] == 'A')         {       temp += 'T';    }
758                 else if(unaligned[i] == 'T'){   temp += 'A';    }
759                 else if(unaligned[i] == 'G'){   temp += 'C';    }
760                 else if(unaligned[i] == 'C'){   temp += 'G';    }
761                 else                                            {       temp += 'N';    }
762         }
763         unaligned = temp;
764         aligned = temp;
765         
766 }
767
768 //********************************************************************************************************************
769
770 void Sequence::trim(int length){
771         
772         if(numBases > length){
773                 unaligned = unaligned.substr(0,length);
774                 numBases = length;
775         aligned = "";
776         isAligned = 0;
777         }
778         
779 }
780
781 ///**************************************************************************************************/