]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
Merge remote-tracking branch 'origin/master'
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13 Sequence::Sequence(){
14         m = MothurOut::getInstance();
15         initialize();
16 }
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
19         try {
20                 m = MothurOut::getInstance();
21                 initialize();   
22                 name = newName;
23                 
24                 //setUnaligned removes any gap characters for us
25                 setUnaligned(sequence);
26                 setAligned(sequence);
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "Sequence", "Sequence");
30                 exit(1);
31         }                       
32 }
33 /***********************************************************************/
34 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
35         try {
36                 m = MothurOut::getInstance();
37                 initialize();   
38                 name = newName;
39                 
40                 //setUnaligned removes any gap characters for us
41                 setUnaligned(sequence);
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "Sequence", "Sequence");
45                 exit(1);
46         }                       
47 }
48
49 //********************************************************************************************************************
50 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
51 Sequence::Sequence(istringstream& fastaString){
52         try {
53                 m = MothurOut::getInstance();
54         
55                 initialize();
56                 fastaString >> name;
57                 
58                 if (name.length() != 0) { 
59                 
60                         name = name.substr(1);
61                         string sequence;
62                 
63                         //read comments
64                         while ((name[0] == '#') && fastaString) { 
65                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
66                                 sequence = getCommentString(fastaString);
67                                 
68                                 if (fastaString) {  
69                                         fastaString >> name;  
70                                         name = name.substr(1);  
71                                 }else { 
72                                         name = "";
73                                         break;
74                                 }
75                         }
76                         
77                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
78                         
79                         int numAmbig = 0;
80                         sequence = getSequenceString(fastaString, numAmbig);
81                         
82                         setAligned(sequence);   
83                         //setUnaligned removes any gap characters for us                                                
84                         setUnaligned(sequence); 
85                         
86                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
87                 
88                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
89                 
90         }
91         catch(exception& e) {
92                 m->errorOut(e, "Sequence", "Sequence");
93                 exit(1);
94         }                                                               
95 }
96 //********************************************************************************************************************
97 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
98 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
99         try {
100                 m = MothurOut::getInstance();
101         
102                 initialize();
103                 fastaString >> name;
104                 
105                 if (name.length() != 0) { 
106                 
107                         name = name.substr(1);
108                         string sequence;
109                 
110                         //read comments
111                         while ((name[0] == '#') && fastaString) { 
112                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
113                                 sequence = getCommentString(fastaString);
114                                 
115                                 if (fastaString) {  
116                                         fastaString >> name;  
117                                         name = name.substr(1);  
118                                 }else { 
119                                         name = "";
120                                         break;
121                                 }
122                         }
123                         
124                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
125                         
126                         int numAmbig = 0;
127                         sequence = getSequenceString(fastaString, numAmbig);
128                         
129                         //setUnaligned removes any gap characters for us                                                
130                         setUnaligned(sequence); 
131                         
132                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
133                         
134                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
135                 
136         }
137         catch(exception& e) {
138                 m->errorOut(e, "Sequence", "Sequence");
139                 exit(1);
140         }                                                               
141 }
142
143
144 //********************************************************************************************************************
145 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
146 Sequence::Sequence(ifstream& fastaFile){
147         try {
148                 m = MothurOut::getInstance();
149                 initialize();
150                 fastaFile >> name;
151                 
152                 if (name.length() != 0) { 
153                 
154                         name = name.substr(1); 
155                         
156                         string sequence;
157                 
158                         //read comments
159                         while ((name[0] == '#') && fastaFile) { 
160                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
161                                 sequence = getCommentString(fastaFile);
162                                 
163                                 if (fastaFile) {  
164                                         fastaFile >> name;  
165                                         name = name.substr(1);  
166                                 }else { 
167                                         name = "";
168                                         break;
169                                 }
170                         }
171                         
172                         //read real sequence
173                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){  break;      }       } // get rest of line if there's any crap there
174                         
175                         int numAmbig = 0;
176                         sequence = getSequenceString(fastaFile, numAmbig);
177                         
178                         setAligned(sequence);   
179                         //setUnaligned removes any gap characters for us                                                
180                         setUnaligned(sequence); 
181                         
182                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
183                         
184                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
185
186         }
187         catch(exception& e) {
188                 m->errorOut(e, "Sequence", "Sequence");
189                 exit(1);
190         }                                                       
191 }
192 //********************************************************************************************************************
193 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
194 Sequence::Sequence(ifstream& fastaFile, string& extraInfo, bool getInfo){
195         try {
196                 m = MothurOut::getInstance();
197                 initialize();
198                 fastaFile >> name;
199         extraInfo = "";
200                 
201                 if (name.length() != 0) { 
202             
203                         name = name.substr(1); 
204                         
205                         string sequence;
206             
207                         //read comments
208                         while ((name[0] == '#') && fastaFile) { 
209                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
210                                 sequence = getCommentString(fastaFile);
211                                 
212                                 if (fastaFile) {  
213                                         fastaFile >> name;  
214                                         name = name.substr(1);  
215                                 }else { 
216                                         name = "";
217                                         break;
218                                 }
219                         }
220                         
221                         //read info after sequence name
222                         while (!fastaFile.eof())        {       
223                 char c = fastaFile.get(); 
224                 if (c == 10 || c == 13){  break;        }       
225                 extraInfo += c;
226             } 
227                         
228                         int numAmbig = 0;
229                         sequence = getSequenceString(fastaFile, numAmbig);
230                         
231                         setAligned(sequence);   
232                         //setUnaligned removes any gap characters for us                                                
233                         setUnaligned(sequence); 
234                         
235                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
236                         
237                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
238         
239         }
240         catch(exception& e) {
241                 m->errorOut(e, "Sequence", "Sequence");
242                 exit(1);
243         }                                                       
244 }
245 //********************************************************************************************************************
246 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
247 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
248         try {
249                 m = MothurOut::getInstance();
250                 initialize();
251                 fastaFile >> name;
252                 
253                 if (name.length() != 0) { 
254                         name = name.substr(1);
255                         string sequence;
256                         
257                         //read comments
258                         while ((name[0] == '#') && fastaFile) { 
259                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
260                                 sequence = getCommentString(fastaFile);
261                                 
262                                 if (fastaFile) {  
263                                         fastaFile >> name;  
264                                         name = name.substr(1);  
265                                 }else { 
266                                         name = "";
267                                         break;
268                                 }
269                         }
270                         
271                         //read real sequence
272                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){       break; }       } // get rest of line if there's any crap there
273                         
274                         int numAmbig = 0;
275                         sequence = getSequenceString(fastaFile, numAmbig);
276                         
277                         //setUnaligned removes any gap characters for us                                                
278                         setUnaligned(sequence); 
279                         
280                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
281                         
282                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
283                 
284         }
285         catch(exception& e) {
286                 m->errorOut(e, "Sequence", "Sequence");
287                 exit(1);
288         }                                                       
289 }
290
291 //********************************************************************************************************************
292 string Sequence::getSequenceString(ifstream& fastaFile, int& numAmbig) {
293         try {
294                 char letter;
295                 string sequence = "";   
296                 numAmbig = 0;
297                 
298                 while(fastaFile){
299                         letter= fastaFile.get();
300                         if(letter == '>'){
301                                 fastaFile.putback(letter);
302                                 break;
303                         }else if (letter == ' ') {;}
304                         else if(isprint(letter)){
305                                 letter = toupper(letter);
306                                 if(letter == 'U'){letter = 'T';}
307                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C' && letter != 'N'){
308                                         letter = 'N';
309                                         numAmbig++;
310                                 }
311                                 sequence += letter;
312                         }
313                 }
314                 
315                 return sequence;
316         }
317         catch(exception& e) {
318                 m->errorOut(e, "Sequence", "getSequenceString");
319                 exit(1);
320         }
321 }
322 //********************************************************************************************************************
323 //comment can contain '>' so we need to account for that
324 string Sequence::getCommentString(ifstream& fastaFile) {
325         try {
326                 char letter;
327                 string sequence = "";
328                 
329                 while(fastaFile){
330                         letter=fastaFile.get();
331                         if((letter == '\r') || (letter == '\n')){  
332                                 m->gobble(fastaFile);  //in case its a \r\n situation
333                                 break;
334                         }
335                 }
336                 
337                 return sequence;
338         }
339         catch(exception& e) {
340                 m->errorOut(e, "Sequence", "getCommentString");
341                 exit(1);
342         }
343 }
344 //********************************************************************************************************************
345 string Sequence::getSequenceString(istringstream& fastaFile, int& numAmbig) {
346         try {
347                 char letter;
348                 string sequence = "";
349                 numAmbig = 0;
350                 
351                 while(!fastaFile.eof()){
352                         letter= fastaFile.get();
353         
354                         if(letter == '>'){
355                                 fastaFile.putback(letter);
356                                 break;
357                         }else if (letter == ' ') {;}
358                         else if(isprint(letter)){
359                                 letter = toupper(letter);
360                                 if(letter == 'U'){letter = 'T';}
361                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C' && letter != 'N'){
362                                         letter = 'N';
363                                         numAmbig++;
364                                 }
365                                 sequence += letter;
366                         }
367                 }
368                 
369                 return sequence;
370         }
371         catch(exception& e) {
372                 m->errorOut(e, "Sequence", "getSequenceString");
373                 exit(1);
374         }
375 }
376 //********************************************************************************************************************
377 //comment can contain '>' so we need to account for that
378 string Sequence::getCommentString(istringstream& fastaFile) {
379         try {
380                 char letter;
381                 string sequence = "";
382                 
383                 while(fastaFile){
384                         letter=fastaFile.get();
385                         if((letter == '\r') || (letter == '\n')){  
386                                 m->gobble(fastaFile);  //in case its a \r\n situation
387                                 break;
388                         }
389                 }
390                 
391                 return sequence;
392         }
393         catch(exception& e) {
394                 m->errorOut(e, "Sequence", "getCommentString");
395                 exit(1);
396         }
397 }
398 //********************************************************************************************************************
399
400 void Sequence::initialize(){
401         
402         name = "";
403         unaligned = "";
404         aligned = "";
405         pairwise = "";
406         
407         numBases = 0;
408         alignmentLength = 0;
409         isAligned = 0;
410         startPos = -1;
411         endPos = -1;
412         longHomoPolymer = -1;
413         ambigBases = -1;
414         
415 }       
416
417 //********************************************************************************************************************
418
419 void Sequence::setName(string seqName) {
420         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
421         else                                    {       name = seqName;                         }
422 }
423
424 //********************************************************************************************************************
425
426 void Sequence::setUnaligned(string sequence){
427         
428         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
429                 string temp = "";
430                 for(int j=0;j<sequence.length();j++) {
431                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
432                 }
433                 unaligned = temp;
434         }
435         else {
436                 unaligned = sequence;
437         }
438         numBases = unaligned.length();
439         
440 }
441
442 //********************************************************************************************************************
443
444 void Sequence::setAligned(string sequence){
445         
446         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
447         aligned = sequence;
448         alignmentLength = aligned.length();
449         setUnaligned(sequence); 
450
451         if(aligned[0] == '-'){
452                 for(int i=0;i<alignmentLength;i++){
453                         if(aligned[i] == '-'){
454                                 aligned[i] = '.';
455                         }
456                         else{
457                                 break;
458                         }
459                 }
460                 for(int i=alignmentLength-1;i>=0;i--){
461                         if(aligned[i] == '-'){
462                                 aligned[i] = '.';
463                         }
464                         else{
465                                 break;
466                         }
467                 }
468         }
469         isAligned = 1;  
470 }
471
472 //********************************************************************************************************************
473
474 void Sequence::setPairwise(string sequence){
475         pairwise = sequence;
476 }
477
478 //********************************************************************************************************************
479
480 string Sequence::convert2ints() {
481         
482         if(unaligned == "")     {       /* need to throw an error */    }
483         
484         string processed;
485         
486         for(int i=0;i<unaligned.length();i++) {
487                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
488                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
489                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
490                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
491                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
492                 else                                                                    {       processed += '4';       }
493         }
494         return processed;
495 }
496
497 //********************************************************************************************************************
498
499 string Sequence::getName(){
500         return name;
501 }
502
503 //********************************************************************************************************************
504
505 string Sequence::getAligned(){
506         if(isAligned == 0)      { return unaligned; }
507         else                            {  return aligned;  }
508 }
509
510 //********************************************************************************************************************
511
512 string Sequence::getInlineSeq(){
513         return name + '\t' + aligned;   
514 }
515
516
517 //********************************************************************************************************************
518
519 string Sequence::getPairwise(){
520         return pairwise;
521 }
522
523 //********************************************************************************************************************
524
525 string Sequence::getUnaligned(){
526         return unaligned;
527 }
528
529 //********************************************************************************************************************
530
531 int Sequence::getNumBases(){
532         return numBases;
533 }
534
535 //********************************************************************************************************************
536
537 void Sequence::printSequence(ostream& out){
538
539         out << ">" << name << endl;
540         if(isAligned){
541                 out << aligned << endl;
542         }
543         else{
544                 out << unaligned << endl;
545         }
546 }
547
548 //********************************************************************************************************************
549
550 int Sequence::getAlignLength(){
551         return alignmentLength;
552 }
553
554 //********************************************************************************************************************
555
556 int Sequence::getAmbigBases(){
557         if(ambigBases == -1){
558                 ambigBases = 0;
559                 for(int j=0;j<numBases;j++){
560                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
561                                 ambigBases++;
562                         }
563                 }
564         }       
565         
566         return ambigBases;
567 }
568
569 //********************************************************************************************************************
570
571 void Sequence::removeAmbigBases(){
572         
573         for(int j=0;j<alignmentLength;j++){
574                 if(aligned[j] != 'A' && aligned[j] != 'T' && aligned[j] != 'G' && aligned[j] != 'C'){
575                         aligned[j] = '-';
576                 }
577         }
578         setUnaligned(aligned);
579 }
580         
581 //********************************************************************************************************************
582
583 int Sequence::getLongHomoPolymer(){
584         if(longHomoPolymer == -1){
585                 longHomoPolymer = 1;
586                 int homoPolymer = 1;
587                 for(int j=1;j<numBases;j++){
588                         if(unaligned[j] == unaligned[j-1]){
589                                 homoPolymer++;
590                         }
591                         else{
592                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
593                                 homoPolymer = 1;
594                         }
595                 }
596                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
597         }
598         return longHomoPolymer;
599 }
600
601 //********************************************************************************************************************
602
603 int Sequence::getStartPos(){
604         if(startPos == -1){
605                 for(int j = 0; j < alignmentLength; j++) {
606                         if((aligned[j] != '.')&&(aligned[j] != '-')){
607                                 startPos = j + 1;
608                                 break;
609                         }
610                 }
611         }
612         if(isAligned == 0){     startPos = 1;   }
613
614         return startPos;
615 }
616
617 //********************************************************************************************************************
618
619 void Sequence::padToPos(int start){
620
621         for(int j = startPos-1; j < start-1; j++) {
622                 aligned[j] = '.';
623         }
624         startPos = start;
625
626 }
627 //********************************************************************************************************************
628
629 int Sequence::filterToPos(int start){
630     
631     if (start > aligned.length()) { start = aligned.length(); m->mothurOut("[ERROR]: start to large.\n"); }
632     
633         for(int j = 0; j < start-1; j++) {
634                 aligned[j] = '.';
635         }
636         
637     //things like ......----------AT become ................AT
638     for(int j = start-1; j < aligned.length(); j++) {
639         if (isalpha(aligned[j])) { break; }
640         else { aligned[j] = '.'; }
641     }
642     setUnaligned(aligned);
643     
644     return 0;
645     
646 }
647 //********************************************************************************************************************
648
649 int Sequence::filterFromPos(int end){
650     
651     if (end > aligned.length()) { end = aligned.length(); m->mothurOut("[ERROR]: end to large.\n"); }
652     
653         for(int j = end; j < aligned.length(); j++) {
654                 aligned[j] = '.';
655         }
656         
657     for(int j = aligned.length()-1; j < 0; j--) {
658         if (isalpha(aligned[j])) { break; }
659         else { aligned[j] = '.'; }
660     }
661     
662     setUnaligned(aligned);
663     
664     return 0;
665 }
666 //********************************************************************************************************************
667
668 int Sequence::getEndPos(){
669         if(endPos == -1){
670                 for(int j=alignmentLength-1;j>=0;j--){
671                         if((aligned[j] != '.')&&(aligned[j] != '-')){
672                                 endPos = j + 1;
673                                 break;
674                         }
675                 }
676         }
677         if(isAligned == 0){     endPos = numBases;      }
678         
679         return endPos;
680 }
681
682 //********************************************************************************************************************
683
684 void Sequence::padFromPos(int end){
685         //cout << end << '\t' << endPos << endl;
686         for(int j = end; j < endPos; j++) {
687                 aligned[j] = '.';
688         }
689         endPos = end;
690         
691 }
692
693 //********************************************************************************************************************
694
695 bool Sequence::getIsAligned(){
696         return isAligned;
697 }
698 //********************************************************************************************************************
699
700 void Sequence::reverseComplement(){
701
702         string temp;
703         for(int i=numBases-1;i>=0;i--){
704                 if(unaligned[i] == 'A')         {       temp += 'T';    }
705                 else if(unaligned[i] == 'T'){   temp += 'A';    }
706                 else if(unaligned[i] == 'G'){   temp += 'C';    }
707                 else if(unaligned[i] == 'C'){   temp += 'G';    }
708                 else                                            {       temp += 'N';    }
709         }
710         unaligned = temp;
711         aligned = temp;
712         
713 }
714
715 //********************************************************************************************************************
716
717 void Sequence::trim(int length){
718         
719         if(numBases > length){
720                 unaligned = unaligned.substr(0,length);
721                 numBases = length;
722         }
723         
724 }
725
726 ///**************************************************************************************************/