]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
some alterations to chimera.slayer
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13 Sequence::Sequence(){
14         m = MothurOut::getInstance();
15         initialize();
16 }
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
19         try {
20                 m = MothurOut::getInstance();
21                 initialize();   
22                 name = newName;
23                 
24                 //setUnaligned removes any gap characters for us
25                 setUnaligned(sequence);
26                 setAligned(sequence);
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "Sequence", "Sequence");
30                 exit(1);
31         }                       
32 }
33 /***********************************************************************/
34 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
35         try {
36                 m = MothurOut::getInstance();
37                 initialize();   
38                 name = newName;
39                 
40                 //setUnaligned removes any gap characters for us
41                 setUnaligned(sequence);
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "Sequence", "Sequence");
45                 exit(1);
46         }                       
47 }
48
49 //********************************************************************************************************************
50 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
51 Sequence::Sequence(istringstream& fastaString){
52         try {
53                 m = MothurOut::getInstance();
54         
55                 initialize();
56                 fastaString >> name;
57                 
58                 if (name.length() != 0) { 
59                 
60                         name = name.substr(1);
61                         string sequence;
62                 
63                         //read comments
64                         while ((name[0] == '#') && fastaString) { 
65                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
66                                 sequence = getCommentString(fastaString);
67                                 
68                                 if (fastaString) {  
69                                         fastaString >> name;  
70                                         name = name.substr(1);  
71                                 }else { 
72                                         name = "";
73                                         break;
74                                 }
75                         }
76                         
77                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
78                         
79                         sequence = getSequenceString(fastaString);              
80                         setAligned(sequence);   
81                         //setUnaligned removes any gap characters for us                                                
82                         setUnaligned(sequence);         
83                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
84                 
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "Sequence", "Sequence");
88                 exit(1);
89         }                                                               
90 }
91 //********************************************************************************************************************
92 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
93 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
94         try {
95                 m = MothurOut::getInstance();
96         
97                 initialize();
98                 fastaString >> name;
99                 
100                 if (name.length() != 0) { 
101                 
102                         name = name.substr(1);
103                         string sequence;
104                 
105                         //read comments
106                         while ((name[0] == '#') && fastaString) { 
107                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
108                                 sequence = getCommentString(fastaString);
109                                 
110                                 if (fastaString) {  
111                                         fastaString >> name;  
112                                         name = name.substr(1);  
113                                 }else { 
114                                         name = "";
115                                         break;
116                                 }
117                         }
118                         
119                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
120                         
121                         sequence = getSequenceString(fastaString);              
122                         
123                         //setUnaligned removes any gap characters for us                                                
124                         setUnaligned(sequence);         
125                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
126                 
127         }
128         catch(exception& e) {
129                 m->errorOut(e, "Sequence", "Sequence");
130                 exit(1);
131         }                                                               
132 }
133
134
135 //********************************************************************************************************************
136 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
137 Sequence::Sequence(ifstream& fastaFile){
138         try {
139                 m = MothurOut::getInstance();
140                 initialize();
141                 fastaFile >> name;
142                 
143                 if (name.length() != 0) { 
144                 
145                         name = name.substr(1); 
146                         
147                         string sequence;
148                 
149                         //read comments
150                         while ((name[0] == '#') && fastaFile) { 
151                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
152                                 sequence = getCommentString(fastaFile);
153                                 
154                                 if (fastaFile) {  
155                                         fastaFile >> name;  
156                                         name = name.substr(1);  
157                                 }else { 
158                                         name = "";
159                                         break;
160                                 }
161                         }
162                         
163                         //read real sequence
164                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){  break;      }       } // get rest of line if there's any crap there
165                         
166                         sequence = getSequenceString(fastaFile);                
167         
168                         setAligned(sequence);   
169                         //setUnaligned removes any gap characters for us                                                
170                         setUnaligned(sequence); 
171                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
172
173         }
174         catch(exception& e) {
175                 m->errorOut(e, "Sequence", "Sequence");
176                 exit(1);
177         }                                                       
178 }
179 //********************************************************************************************************************
180 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
181 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
182         try {
183                 m = MothurOut::getInstance();
184                 initialize();
185                 fastaFile >> name;
186                 
187                 if (name.length() != 0) { 
188                         name = name.substr(1);
189                         string sequence;
190                         
191                         //read comments
192                         while ((name[0] == '#') && fastaFile) { 
193                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
194                                 sequence = getCommentString(fastaFile);
195                                 
196                                 if (fastaFile) {  
197                                         fastaFile >> name;  
198                                         name = name.substr(1);  
199                                 }else { 
200                                         name = "";
201                                         break;
202                                 }
203                         }
204                         
205                         //read real sequence
206                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){       break; }       } // get rest of line if there's any crap there
207                         
208                         sequence = getSequenceString(fastaFile);                
209                         
210                         //setUnaligned removes any gap characters for us                                                
211                         setUnaligned(sequence); 
212                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
213                 
214         }
215         catch(exception& e) {
216                 m->errorOut(e, "Sequence", "Sequence");
217                 exit(1);
218         }                                                       
219 }
220
221 //********************************************************************************************************************
222 string Sequence::getSequenceString(ifstream& fastaFile) {
223         try {
224                 char letter;
225                 string sequence = "";   
226                 
227                 while(fastaFile){
228                         letter= fastaFile.get();
229                         if(letter == '>'){
230                                 fastaFile.putback(letter);
231                                 break;
232                         }
233                         else if(isprint(letter)){
234                                 letter = toupper(letter);
235                                 if(letter == 'U'){letter = 'T';}
236                                 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C'){
237                                         letter = 'N';
238                                 }
239                                 sequence += letter;
240                         }
241                 }
242                 
243                 return sequence;
244         }
245         catch(exception& e) {
246                 m->errorOut(e, "Sequence", "getSequenceString");
247                 exit(1);
248         }
249 }
250 //********************************************************************************************************************
251 //comment can contain '>' so we need to account for that
252 string Sequence::getCommentString(ifstream& fastaFile) {
253         try {
254                 char letter;
255                 string sequence = "";
256                 
257                 while(fastaFile){
258                         letter=fastaFile.get();
259                         if((letter == '\r') || (letter == '\n')){  
260                                 m->gobble(fastaFile);  //in case its a \r\n situation
261                                 break;
262                         }
263                 }
264                 
265                 return sequence;
266         }
267         catch(exception& e) {
268                 m->errorOut(e, "Sequence", "getCommentString");
269                 exit(1);
270         }
271 }
272 //********************************************************************************************************************
273 string Sequence::getSequenceString(istringstream& fastaFile) {
274         try {
275                 char letter;
276                 string sequence = "";   
277                 
278                 while(!fastaFile.eof()){
279                         letter= fastaFile.get();
280         
281                         if(letter == '>'){
282                                 fastaFile.putback(letter);
283                                 break;
284                         }
285                         else if(isprint(letter)){
286                                 letter = toupper(letter);
287                                 if(letter == 'U'){letter = 'T';}
288                                 sequence += letter;
289                         }
290                 }
291                 
292                 return sequence;
293         }
294         catch(exception& e) {
295                 m->errorOut(e, "Sequence", "getSequenceString");
296                 exit(1);
297         }
298 }
299 //********************************************************************************************************************
300 //comment can contain '>' so we need to account for that
301 string Sequence::getCommentString(istringstream& fastaFile) {
302         try {
303                 char letter;
304                 string sequence = "";
305                 
306                 while(fastaFile){
307                         letter=fastaFile.get();
308                         if((letter == '\r') || (letter == '\n')){  
309                                 m->gobble(fastaFile);  //in case its a \r\n situation
310                                 break;
311                         }
312                 }
313                 
314                 return sequence;
315         }
316         catch(exception& e) {
317                 m->errorOut(e, "Sequence", "getCommentString");
318                 exit(1);
319         }
320 }
321 //********************************************************************************************************************
322
323 void Sequence::initialize(){
324         
325         name = "";
326         unaligned = "";
327         aligned = "";
328         pairwise = "";
329         
330         numBases = 0;
331         alignmentLength = 0;
332         isAligned = 0;
333         startPos = -1;
334         endPos = -1;
335         longHomoPolymer = -1;
336         ambigBases = -1;
337         
338 }       
339
340 //********************************************************************************************************************
341
342 void Sequence::setName(string seqName) {
343         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
344         else                                    {       name = seqName;                         }
345 }
346
347 //********************************************************************************************************************
348
349 void Sequence::setUnaligned(string sequence){
350         
351         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
352                 string temp = "";
353                 for(int j=0;j<sequence.length();j++) {
354                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
355                 }
356                 unaligned = temp;
357         }
358         else {
359                 unaligned = sequence;
360         }
361         numBases = unaligned.length();
362         
363 }
364
365 //********************************************************************************************************************
366
367 void Sequence::setAligned(string sequence){
368         
369         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
370         aligned = sequence;
371         alignmentLength = aligned.length();
372         setUnaligned(sequence); 
373
374         if(aligned[0] == '-'){
375                 for(int i=0;i<alignmentLength;i++){
376                         if(aligned[i] == '-'){
377                                 aligned[i] = '.';
378                         }
379                         else{
380                                 break;
381                         }
382                 }
383                 for(int i=alignmentLength-1;i>=0;i--){
384                         if(aligned[i] == '-'){
385                                 aligned[i] = '.';
386                         }
387                         else{
388                                 break;
389                         }
390                 }
391         }
392         isAligned = 1;  
393 }
394
395 //********************************************************************************************************************
396
397 void Sequence::setPairwise(string sequence){
398         pairwise = sequence;
399 }
400
401 //********************************************************************************************************************
402
403 string Sequence::convert2ints() {
404         
405         if(unaligned == "")     {       /* need to throw an error */    }
406         
407         string processed;
408         
409         for(int i=0;i<unaligned.length();i++) {
410                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
411                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
412                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
413                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
414                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
415                 else                                                                    {       processed += '4';       }
416         }
417         return processed;
418 }
419
420 //********************************************************************************************************************
421
422 string Sequence::getName(){
423         return name;
424 }
425
426 //********************************************************************************************************************
427
428 string Sequence::getAligned(){
429         if(isAligned == 0)      { return unaligned; }
430         else                            {  return aligned;  }
431 }
432
433 //********************************************************************************************************************
434
435 string Sequence::getInlineSeq(){
436         return name + '\t' + aligned;   
437 }
438
439
440 //********************************************************************************************************************
441
442 string Sequence::getPairwise(){
443         return pairwise;
444 }
445
446 //********************************************************************************************************************
447
448 string Sequence::getUnaligned(){
449         return unaligned;
450 }
451
452 //********************************************************************************************************************
453
454 int Sequence::getNumBases(){
455         return numBases;
456 }
457
458 //********************************************************************************************************************
459
460 void Sequence::printSequence(ostream& out){
461
462         out << ">" << name << endl;
463         if(isAligned){
464                 out << aligned << endl;
465         }
466         else{
467                 out << unaligned << endl;
468         }
469 }
470
471 //********************************************************************************************************************
472
473 int Sequence::getAlignLength(){
474         return alignmentLength;
475 }
476
477 //********************************************************************************************************************
478
479 int Sequence::getAmbigBases(){
480         if(ambigBases == -1){
481                 ambigBases = 0;
482                 for(int j=0;j<numBases;j++){
483                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
484                                 ambigBases++;
485                         }
486                 }
487         }       
488         
489         return ambigBases;
490 }
491
492 //********************************************************************************************************************
493
494 void Sequence::removeAmbigBases(){
495         
496         for(int j=0;j<alignmentLength;j++){
497                 if(aligned[j] != 'A' && aligned[j] != 'T' && aligned[j] != 'G' && aligned[j] != 'C'){
498                         aligned[j] = '-';
499                 }
500         }
501         setUnaligned(aligned);
502 }
503         
504 //********************************************************************************************************************
505
506 int Sequence::getLongHomoPolymer(){
507         if(longHomoPolymer == -1){
508                 longHomoPolymer = 1;
509                 int homoPolymer = 1;
510                 for(int j=1;j<numBases;j++){
511                         if(unaligned[j] == unaligned[j-1]){
512                                 homoPolymer++;
513                         }
514                         else{
515                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
516                                 homoPolymer = 1;
517                         }
518                 }
519                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
520         }
521         return longHomoPolymer;
522 }
523
524 //********************************************************************************************************************
525
526 int Sequence::getStartPos(){
527         if(startPos == -1){
528                 for(int j = 0; j < alignmentLength; j++) {
529                         if(aligned[j] != '.'){
530                                 startPos = j + 1;
531                                 break;
532                         }
533                 }
534         }
535         if(isAligned == 0){     startPos = 1;   }
536
537         return startPos;
538 }
539
540 //********************************************************************************************************************
541
542 void Sequence::padToPos(int start){
543
544         for(int j = startPos-1; j < start-1; j++) {
545                 aligned[j] = '.';
546         }
547         startPos = start;
548
549 }
550
551 //********************************************************************************************************************
552
553 int Sequence::getEndPos(){
554         if(endPos == -1){
555                 for(int j=alignmentLength-1;j>=0;j--){
556                         if(aligned[j] != '.'){
557                                 endPos = j + 1;
558                                 break;
559                         }
560                 }
561         }
562         if(isAligned == 0){     endPos = numBases;      }
563         
564         return endPos;
565 }
566
567 //********************************************************************************************************************
568
569 void Sequence::padFromPos(int end){
570         
571         for(int j = end; j < endPos; j++) {
572                 aligned[j] = '.';
573         }
574         endPos = end;
575         
576 }
577
578 //********************************************************************************************************************
579
580 bool Sequence::getIsAligned(){
581         return isAligned;
582 }
583
584 //********************************************************************************************************************
585
586 void Sequence::reverseComplement(){
587
588         string temp;
589         for(int i=numBases-1;i>=0;i--){
590                 if(unaligned[i] == 'A')         {       temp += 'T';    }
591                 else if(unaligned[i] == 'T'){   temp += 'A';    }
592                 else if(unaligned[i] == 'G'){   temp += 'C';    }
593                 else if(unaligned[i] == 'C'){   temp += 'G';    }
594                 else                                            {       temp += 'N';    }
595         }
596         unaligned = temp;
597         aligned = temp;
598         
599 }
600
601 //********************************************************************************************************************
602
603 void Sequence::trim(int length){
604         
605         if(numBases > length){
606                 unaligned = unaligned.substr(0,length);
607                 numBases = length;
608         }
609         
610 }
611
612 ///**************************************************************************************************/