]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
fixed bug with shhh.flow from file path name in write functions, added "smart" featur...
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13 Sequence::Sequence(){
14         m = MothurOut::getInstance();
15         initialize();
16 }
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
19         try {
20                 m = MothurOut::getInstance();
21                 initialize();   
22                 name = newName;
23                 
24                 //setUnaligned removes any gap characters for us
25                 setUnaligned(sequence);
26                 setAligned(sequence);
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "Sequence", "Sequence");
30                 exit(1);
31         }                       
32 }
33 /***********************************************************************/
34 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
35         try {
36                 m = MothurOut::getInstance();
37                 initialize();   
38                 name = newName;
39                 
40                 //setUnaligned removes any gap characters for us
41                 setUnaligned(sequence);
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "Sequence", "Sequence");
45                 exit(1);
46         }                       
47 }
48
49 //********************************************************************************************************************
50 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
51 Sequence::Sequence(istringstream& fastaString){
52         try {
53                 m = MothurOut::getInstance();
54         
55                 initialize();
56                 fastaString >> name;
57                 
58                 if (name.length() != 0) { 
59                 
60                         name = name.substr(1);
61                         string sequence;
62                 
63                         //read comments
64                         while ((name[0] == '#') && fastaString) { 
65                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
66                                 sequence = getCommentString(fastaString);
67                                 
68                                 if (fastaString) {  
69                                         fastaString >> name;  
70                                         name = name.substr(1);  
71                                 }else { 
72                                         name = "";
73                                         break;
74                                 }
75                         }
76                         
77                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
78                         
79                         int numAmbig = 0;
80                         sequence = getSequenceString(fastaString, numAmbig);
81                         
82                         setAligned(sequence);   
83                         //setUnaligned removes any gap characters for us                                                
84                         setUnaligned(sequence); 
85                         
86                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
87                 
88                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
89                 
90         }
91         catch(exception& e) {
92                 m->errorOut(e, "Sequence", "Sequence");
93                 exit(1);
94         }                                                               
95 }
96 //********************************************************************************************************************
97 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
98 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
99         try {
100                 m = MothurOut::getInstance();
101         
102                 initialize();
103                 fastaString >> name;
104                 
105                 if (name.length() != 0) { 
106                 
107                         name = name.substr(1);
108                         string sequence;
109                 
110                         //read comments
111                         while ((name[0] == '#') && fastaString) { 
112                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
113                                 sequence = getCommentString(fastaString);
114                                 
115                                 if (fastaString) {  
116                                         fastaString >> name;  
117                                         name = name.substr(1);  
118                                 }else { 
119                                         name = "";
120                                         break;
121                                 }
122                         }
123                         
124                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
125                         
126                         int numAmbig = 0;
127                         sequence = getSequenceString(fastaString, numAmbig);
128                         
129                         //setUnaligned removes any gap characters for us                                                
130                         setUnaligned(sequence); 
131                         
132                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
133                         
134                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
135                 
136         }
137         catch(exception& e) {
138                 m->errorOut(e, "Sequence", "Sequence");
139                 exit(1);
140         }                                                               
141 }
142
143
144 //********************************************************************************************************************
145 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
146 Sequence::Sequence(ifstream& fastaFile){
147         try {
148                 m = MothurOut::getInstance();
149                 initialize();
150                 fastaFile >> name;
151                 
152                 if (name.length() != 0) { 
153                 
154                         name = name.substr(1); 
155                         
156                         string sequence;
157                 
158                         //read comments
159                         while ((name[0] == '#') && fastaFile) { 
160                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
161                                 sequence = getCommentString(fastaFile);
162                                 
163                                 if (fastaFile) {  
164                                         fastaFile >> name;  
165                                         name = name.substr(1);  
166                                 }else { 
167                                         name = "";
168                                         break;
169                                 }
170                         }
171                         
172                         //read real sequence
173                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){  break;      }       } // get rest of line if there's any crap there
174                         
175                         int numAmbig = 0;
176                         sequence = getSequenceString(fastaFile, numAmbig);
177                         
178                         setAligned(sequence);   
179                         //setUnaligned removes any gap characters for us                                                
180                         setUnaligned(sequence); 
181                         
182                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
183                         
184                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
185
186         }
187         catch(exception& e) {
188                 m->errorOut(e, "Sequence", "Sequence");
189                 exit(1);
190         }                                                       
191 }
192 //********************************************************************************************************************
193 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
194 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
195         try {
196                 m = MothurOut::getInstance();
197                 initialize();
198                 fastaFile >> name;
199                 
200                 if (name.length() != 0) { 
201                         name = name.substr(1);
202                         string sequence;
203                         
204                         //read comments
205                         while ((name[0] == '#') && fastaFile) { 
206                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
207                                 sequence = getCommentString(fastaFile);
208                                 
209                                 if (fastaFile) {  
210                                         fastaFile >> name;  
211                                         name = name.substr(1);  
212                                 }else { 
213                                         name = "";
214                                         break;
215                                 }
216                         }
217                         
218                         //read real sequence
219                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){       break; }       } // get rest of line if there's any crap there
220                         
221                         int numAmbig = 0;
222                         sequence = getSequenceString(fastaFile, numAmbig);
223                         
224                         //setUnaligned removes any gap characters for us                                                
225                         setUnaligned(sequence); 
226                         
227                         if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
228                         
229                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
230                 
231         }
232         catch(exception& e) {
233                 m->errorOut(e, "Sequence", "Sequence");
234                 exit(1);
235         }                                                       
236 }
237
238 //********************************************************************************************************************
239 string Sequence::getSequenceString(ifstream& fastaFile, int& numAmbig) {
240         try {
241                 char letter;
242                 string sequence = "";   
243                 numAmbig = 0;
244                 
245                 while(fastaFile){
246                         letter= fastaFile.get();
247                         if(letter == '>'){
248                                 fastaFile.putback(letter);
249                                 break;
250                         }
251                         else if(isprint(letter)){
252                                 letter = toupper(letter);
253                                 if(letter == 'U'){letter = 'T';}
254                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C' && letter != 'N'){
255                                         letter = 'N';
256                                         numAmbig++;
257                                 }
258                                 sequence += letter;
259                         }
260                 }
261                 
262                 return sequence;
263         }
264         catch(exception& e) {
265                 m->errorOut(e, "Sequence", "getSequenceString");
266                 exit(1);
267         }
268 }
269 //********************************************************************************************************************
270 //comment can contain '>' so we need to account for that
271 string Sequence::getCommentString(ifstream& fastaFile) {
272         try {
273                 char letter;
274                 string sequence = "";
275                 
276                 while(fastaFile){
277                         letter=fastaFile.get();
278                         if((letter == '\r') || (letter == '\n')){  
279                                 m->gobble(fastaFile);  //in case its a \r\n situation
280                                 break;
281                         }
282                 }
283                 
284                 return sequence;
285         }
286         catch(exception& e) {
287                 m->errorOut(e, "Sequence", "getCommentString");
288                 exit(1);
289         }
290 }
291 //********************************************************************************************************************
292 string Sequence::getSequenceString(istringstream& fastaFile, int& numAmbig) {
293         try {
294                 char letter;
295                 string sequence = "";
296                 numAmbig = 0;
297                 
298                 while(!fastaFile.eof()){
299                         letter= fastaFile.get();
300         
301                         if(letter == '>'){
302                                 fastaFile.putback(letter);
303                                 break;
304                         }
305                         else if(isprint(letter)){
306                                 letter = toupper(letter);
307                                 if(letter == 'U'){letter = 'T';}
308                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C' && letter != 'N'){
309                                         letter = 'N';
310                                         numAmbig++;
311                                 }
312                                 sequence += letter;
313                         }
314                 }
315                 
316                 return sequence;
317         }
318         catch(exception& e) {
319                 m->errorOut(e, "Sequence", "getSequenceString");
320                 exit(1);
321         }
322 }
323 //********************************************************************************************************************
324 //comment can contain '>' so we need to account for that
325 string Sequence::getCommentString(istringstream& fastaFile) {
326         try {
327                 char letter;
328                 string sequence = "";
329                 
330                 while(fastaFile){
331                         letter=fastaFile.get();
332                         if((letter == '\r') || (letter == '\n')){  
333                                 m->gobble(fastaFile);  //in case its a \r\n situation
334                                 break;
335                         }
336                 }
337                 
338                 return sequence;
339         }
340         catch(exception& e) {
341                 m->errorOut(e, "Sequence", "getCommentString");
342                 exit(1);
343         }
344 }
345 //********************************************************************************************************************
346
347 void Sequence::initialize(){
348         
349         name = "";
350         unaligned = "";
351         aligned = "";
352         pairwise = "";
353         
354         numBases = 0;
355         alignmentLength = 0;
356         isAligned = 0;
357         startPos = -1;
358         endPos = -1;
359         longHomoPolymer = -1;
360         ambigBases = -1;
361         
362 }       
363
364 //********************************************************************************************************************
365
366 void Sequence::setName(string seqName) {
367         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
368         else                                    {       name = seqName;                         }
369 }
370
371 //********************************************************************************************************************
372
373 void Sequence::setUnaligned(string sequence){
374         
375         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
376                 string temp = "";
377                 for(int j=0;j<sequence.length();j++) {
378                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
379                 }
380                 unaligned = temp;
381         }
382         else {
383                 unaligned = sequence;
384         }
385         numBases = unaligned.length();
386         
387 }
388
389 //********************************************************************************************************************
390
391 void Sequence::setAligned(string sequence){
392         
393         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
394         aligned = sequence;
395         alignmentLength = aligned.length();
396         setUnaligned(sequence); 
397
398         if(aligned[0] == '-'){
399                 for(int i=0;i<alignmentLength;i++){
400                         if(aligned[i] == '-'){
401                                 aligned[i] = '.';
402                         }
403                         else{
404                                 break;
405                         }
406                 }
407                 for(int i=alignmentLength-1;i>=0;i--){
408                         if(aligned[i] == '-'){
409                                 aligned[i] = '.';
410                         }
411                         else{
412                                 break;
413                         }
414                 }
415         }
416         isAligned = 1;  
417 }
418
419 //********************************************************************************************************************
420
421 void Sequence::setPairwise(string sequence){
422         pairwise = sequence;
423 }
424
425 //********************************************************************************************************************
426
427 string Sequence::convert2ints() {
428         
429         if(unaligned == "")     {       /* need to throw an error */    }
430         
431         string processed;
432         
433         for(int i=0;i<unaligned.length();i++) {
434                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
435                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
436                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
437                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
438                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
439                 else                                                                    {       processed += '4';       }
440         }
441         return processed;
442 }
443
444 //********************************************************************************************************************
445
446 string Sequence::getName(){
447         return name;
448 }
449
450 //********************************************************************************************************************
451
452 string Sequence::getAligned(){
453         if(isAligned == 0)      { return unaligned; }
454         else                            {  return aligned;  }
455 }
456
457 //********************************************************************************************************************
458
459 string Sequence::getInlineSeq(){
460         return name + '\t' + aligned;   
461 }
462
463
464 //********************************************************************************************************************
465
466 string Sequence::getPairwise(){
467         return pairwise;
468 }
469
470 //********************************************************************************************************************
471
472 string Sequence::getUnaligned(){
473         return unaligned;
474 }
475
476 //********************************************************************************************************************
477
478 int Sequence::getNumBases(){
479         return numBases;
480 }
481
482 //********************************************************************************************************************
483
484 void Sequence::printSequence(ostream& out){
485
486         out << ">" << name << endl;
487         if(isAligned){
488                 out << aligned << endl;
489         }
490         else{
491                 out << unaligned << endl;
492         }
493 }
494
495 //********************************************************************************************************************
496
497 int Sequence::getAlignLength(){
498         return alignmentLength;
499 }
500
501 //********************************************************************************************************************
502
503 int Sequence::getAmbigBases(){
504         if(ambigBases == -1){
505                 ambigBases = 0;
506                 for(int j=0;j<numBases;j++){
507                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
508                                 ambigBases++;
509                         }
510                 }
511         }       
512         
513         return ambigBases;
514 }
515
516 //********************************************************************************************************************
517
518 void Sequence::removeAmbigBases(){
519         
520         for(int j=0;j<alignmentLength;j++){
521                 if(aligned[j] != 'A' && aligned[j] != 'T' && aligned[j] != 'G' && aligned[j] != 'C'){
522                         aligned[j] = '-';
523                 }
524         }
525         setUnaligned(aligned);
526 }
527         
528 //********************************************************************************************************************
529
530 int Sequence::getLongHomoPolymer(){
531         if(longHomoPolymer == -1){
532                 longHomoPolymer = 1;
533                 int homoPolymer = 1;
534                 for(int j=1;j<numBases;j++){
535                         if(unaligned[j] == unaligned[j-1]){
536                                 homoPolymer++;
537                         }
538                         else{
539                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
540                                 homoPolymer = 1;
541                         }
542                 }
543                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
544         }
545         return longHomoPolymer;
546 }
547
548 //********************************************************************************************************************
549
550 int Sequence::getStartPos(){
551         if(startPos == -1){
552                 for(int j = 0; j < alignmentLength; j++) {
553                         if(aligned[j] != '.'){
554                                 startPos = j + 1;
555                                 break;
556                         }
557                 }
558         }
559         if(isAligned == 0){     startPos = 1;   }
560
561         return startPos;
562 }
563
564 //********************************************************************************************************************
565
566 void Sequence::padToPos(int start){
567
568         for(int j = startPos-1; j < start-1; j++) {
569                 aligned[j] = '.';
570         }
571         startPos = start;
572
573 }
574
575 //********************************************************************************************************************
576
577 int Sequence::getEndPos(){
578         if(endPos == -1){
579                 for(int j=alignmentLength-1;j>=0;j--){
580                         if(aligned[j] != '.'){
581                                 endPos = j + 1;
582                                 break;
583                         }
584                 }
585         }
586         if(isAligned == 0){     endPos = numBases;      }
587         
588         return endPos;
589 }
590
591 //********************************************************************************************************************
592
593 void Sequence::padFromPos(int end){
594         
595         for(int j = end; j < endPos; j++) {
596                 aligned[j] = '.';
597         }
598         endPos = end;
599         
600 }
601
602 //********************************************************************************************************************
603
604 bool Sequence::getIsAligned(){
605         return isAligned;
606 }
607 //********************************************************************************************************************
608
609 void Sequence::reverseComplement(){
610
611         string temp;
612         for(int i=numBases-1;i>=0;i--){
613                 if(unaligned[i] == 'A')         {       temp += 'T';    }
614                 else if(unaligned[i] == 'T'){   temp += 'A';    }
615                 else if(unaligned[i] == 'G'){   temp += 'C';    }
616                 else if(unaligned[i] == 'C'){   temp += 'G';    }
617                 else                                            {       temp += 'N';    }
618         }
619         unaligned = temp;
620         aligned = temp;
621         
622 }
623
624 //********************************************************************************************************************
625
626 void Sequence::trim(int length){
627         
628         if(numBases > length){
629                 unaligned = unaligned.substr(0,length);
630                 numBases = length;
631         }
632         
633 }
634
635 ///**************************************************************************************************/