]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
mods to seq.errror
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13 Sequence::Sequence(){
14         m = MothurOut::getInstance();
15         initialize();
16 }
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
19         try {
20                 m = MothurOut::getInstance();
21                 initialize();   
22                 name = newName;
23                 
24                 //setUnaligned removes any gap characters for us
25                 setUnaligned(sequence);
26                 setAligned(sequence);
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "Sequence", "Sequence");
30                 exit(1);
31         }                       
32 }
33 /***********************************************************************/
34 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
35         try {
36                 m = MothurOut::getInstance();
37                 initialize();   
38                 name = newName;
39                 
40                 //setUnaligned removes any gap characters for us
41                 setUnaligned(sequence);
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "Sequence", "Sequence");
45                 exit(1);
46         }                       
47 }
48
49 //********************************************************************************************************************
50 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
51 Sequence::Sequence(istringstream& fastaString){
52         try {
53                 m = MothurOut::getInstance();
54         
55                 initialize();
56                 fastaString >> name;
57                 
58                 if (name.length() != 0) { 
59                 
60                         name = name.substr(1);
61                         string sequence;
62                 
63                         //read comments
64                         while ((name[0] == '#') && fastaString) { 
65                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
66                                 sequence = getCommentString(fastaString);
67                                 
68                                 if (fastaString) {  
69                                         fastaString >> name;  
70                                         name = name.substr(1);  
71                                 }else { 
72                                         name = "";
73                                         break;
74                                 }
75                         }
76                         
77                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
78                         
79                         sequence = getSequenceString(fastaString);              
80                         setAligned(sequence);   
81                         //setUnaligned removes any gap characters for us                                                
82                         setUnaligned(sequence);         
83                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
84                 
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "Sequence", "Sequence");
88                 exit(1);
89         }                                                               
90 }
91 //********************************************************************************************************************
92 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
93 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
94         try {
95                 m = MothurOut::getInstance();
96         
97                 initialize();
98                 fastaString >> name;
99                 
100                 if (name.length() != 0) { 
101                 
102                         name = name.substr(1);
103                         string sequence;
104                 
105                         //read comments
106                         while ((name[0] == '#') && fastaString) { 
107                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
108                                 sequence = getCommentString(fastaString);
109                                 
110                                 if (fastaString) {  
111                                         fastaString >> name;  
112                                         name = name.substr(1);  
113                                 }else { 
114                                         name = "";
115                                         break;
116                                 }
117                         }
118                         
119                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){ break;    }       } // get rest of line if there's any crap there
120                         
121                         sequence = getSequenceString(fastaString);              
122                         
123                         //setUnaligned removes any gap characters for us                                                
124                         setUnaligned(sequence);         
125                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
126                 
127         }
128         catch(exception& e) {
129                 m->errorOut(e, "Sequence", "Sequence");
130                 exit(1);
131         }                                                               
132 }
133
134
135 //********************************************************************************************************************
136 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
137 Sequence::Sequence(ifstream& fastaFile){
138         try {
139                 m = MothurOut::getInstance();
140                 initialize();
141                 fastaFile >> name;
142                 
143                 if (name.length() != 0) { 
144                 
145                         name = name.substr(1); 
146                         
147                         string sequence;
148                 
149                         //read comments
150                         while ((name[0] == '#') && fastaFile) { 
151                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
152                                 sequence = getCommentString(fastaFile);
153                                 
154                                 if (fastaFile) {  
155                                         fastaFile >> name;  
156                                         name = name.substr(1);  
157                                 }else { 
158                                         name = "";
159                                         break;
160                                 }
161                         }
162                         
163                         //read real sequence
164                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){  break;      }       } // get rest of line if there's any crap there
165                         
166                         sequence = getSequenceString(fastaFile);                
167         
168                         setAligned(sequence);   
169                         //setUnaligned removes any gap characters for us                                                
170                         setUnaligned(sequence); 
171                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
172
173         }
174         catch(exception& e) {
175                 m->errorOut(e, "Sequence", "Sequence");
176                 exit(1);
177         }                                                       
178 }
179 //********************************************************************************************************************
180 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
181 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
182         try {
183                 m = MothurOut::getInstance();
184                 initialize();
185                 fastaFile >> name;
186                 
187                 if (name.length() != 0) { 
188                         name = name.substr(1);
189                         string sequence;
190                         
191                         //read comments
192                         while ((name[0] == '#') && fastaFile) { 
193                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
194                                 sequence = getCommentString(fastaFile);
195                                 
196                                 if (fastaFile) {  
197                                         fastaFile >> name;  
198                                         name = name.substr(1);  
199                                 }else { 
200                                         name = "";
201                                         break;
202                                 }
203                         }
204                         
205                         //read real sequence
206                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){       break; }       } // get rest of line if there's any crap there
207                         
208                         sequence = getSequenceString(fastaFile);                
209                         
210                         //setUnaligned removes any gap characters for us                                                
211                         setUnaligned(sequence); 
212                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
213                 
214         }
215         catch(exception& e) {
216                 m->errorOut(e, "Sequence", "Sequence");
217                 exit(1);
218         }                                                       
219 }
220
221 //********************************************************************************************************************
222 string Sequence::getSequenceString(ifstream& fastaFile) {
223         try {
224                 char letter;
225                 string sequence = "";   
226                 
227                 while(fastaFile){
228                         letter= fastaFile.get();
229                         if(letter == '>'){
230                                 fastaFile.putback(letter);
231                                 break;
232                         }
233                         else if(isprint(letter)){
234                                 letter = toupper(letter);
235                                 if(letter == 'U'){letter = 'T';}
236                                 sequence += letter;
237                         }
238                 }
239                 
240                 return sequence;
241         }
242         catch(exception& e) {
243                 m->errorOut(e, "Sequence", "getSequenceString");
244                 exit(1);
245         }
246 }
247 //********************************************************************************************************************
248 //comment can contain '>' so we need to account for that
249 string Sequence::getCommentString(ifstream& fastaFile) {
250         try {
251                 char letter;
252                 string sequence = "";
253                 
254                 while(fastaFile){
255                         letter=fastaFile.get();
256                         if((letter == '\r') || (letter == '\n')){  
257                                 m->gobble(fastaFile);  //in case its a \r\n situation
258                                 break;
259                         }
260                 }
261                 
262                 return sequence;
263         }
264         catch(exception& e) {
265                 m->errorOut(e, "Sequence", "getCommentString");
266                 exit(1);
267         }
268 }
269 //********************************************************************************************************************
270 string Sequence::getSequenceString(istringstream& fastaFile) {
271         try {
272                 char letter;
273                 string sequence = "";   
274                 
275                 while(!fastaFile.eof()){
276                         letter= fastaFile.get();
277         
278                         if(letter == '>'){
279                                 fastaFile.putback(letter);
280                                 break;
281                         }
282                         else if(isprint(letter)){
283                                 letter = toupper(letter);
284                                 if(letter == 'U'){letter = 'T';}
285                                 sequence += letter;
286                         }
287                 }
288                 
289                 return sequence;
290         }
291         catch(exception& e) {
292                 m->errorOut(e, "Sequence", "getSequenceString");
293                 exit(1);
294         }
295 }
296 //********************************************************************************************************************
297 //comment can contain '>' so we need to account for that
298 string Sequence::getCommentString(istringstream& fastaFile) {
299         try {
300                 char letter;
301                 string sequence = "";
302                 
303                 while(fastaFile){
304                         letter=fastaFile.get();
305                         if((letter == '\r') || (letter == '\n')){  
306                                 m->gobble(fastaFile);  //in case its a \r\n situation
307                                 break;
308                         }
309                 }
310                 
311                 return sequence;
312         }
313         catch(exception& e) {
314                 m->errorOut(e, "Sequence", "getCommentString");
315                 exit(1);
316         }
317 }
318 //********************************************************************************************************************
319
320 void Sequence::initialize(){
321         
322         name = "";
323         unaligned = "";
324         aligned = "";
325         pairwise = "";
326         
327         numBases = 0;
328         alignmentLength = 0;
329         isAligned = 0;
330         startPos = -1;
331         endPos = -1;
332         longHomoPolymer = -1;
333         ambigBases = -1;
334         
335 }       
336
337 //********************************************************************************************************************
338
339 void Sequence::setName(string seqName) {
340         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
341         else                                    {       name = seqName;                         }
342 }
343
344 //********************************************************************************************************************
345
346 void Sequence::setUnaligned(string sequence){
347         
348         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
349                 string temp = "";
350                 for(int j=0;j<sequence.length();j++) {
351                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
352                 }
353                 unaligned = temp;
354         }
355         else {
356                 unaligned = sequence;
357         }
358         numBases = unaligned.length();
359         
360 }
361
362 //********************************************************************************************************************
363
364 void Sequence::setAligned(string sequence){
365         
366         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
367         aligned = sequence;
368         alignmentLength = aligned.length();
369         setUnaligned(sequence); 
370
371         if(aligned[0] == '-'){
372                 for(int i=0;i<alignmentLength;i++){
373                         if(aligned[i] == '-'){
374                                 aligned[i] = '.';
375                         }
376                         else{
377                                 break;
378                         }
379                 }
380                 for(int i=alignmentLength-1;i>=0;i--){
381                         if(aligned[i] == '-'){
382                                 aligned[i] = '.';
383                         }
384                         else{
385                                 break;
386                         }
387                 }
388         }
389         isAligned = 1;  
390 }
391
392 //********************************************************************************************************************
393
394 void Sequence::setPairwise(string sequence){
395         pairwise = sequence;
396 }
397
398 //********************************************************************************************************************
399
400 string Sequence::convert2ints() {
401         
402         if(unaligned == "")     {       /* need to throw an error */    }
403         
404         string processed;
405         
406         for(int i=0;i<unaligned.length();i++) {
407                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
408                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
409                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
410                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
411                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
412                 else                                                                    {       processed += '4';       }
413         }
414         return processed;
415 }
416
417 //********************************************************************************************************************
418
419 string Sequence::getName(){
420         return name;
421 }
422
423 //********************************************************************************************************************
424
425 string Sequence::getAligned(){
426         if(isAligned == 0)      { return unaligned; }
427         else                            {  return aligned;  }
428 }
429
430 //********************************************************************************************************************
431
432 string Sequence::getInlineSeq(){
433         return name + '\t' + aligned;   
434 }
435
436
437 //********************************************************************************************************************
438
439 string Sequence::getPairwise(){
440         return pairwise;
441 }
442
443 //********************************************************************************************************************
444
445 string Sequence::getUnaligned(){
446         return unaligned;
447 }
448
449 //********************************************************************************************************************
450
451 int Sequence::getNumBases(){
452         return numBases;
453 }
454
455 //********************************************************************************************************************
456
457 void Sequence::printSequence(ostream& out){
458
459         out << ">" << name << endl;
460         if(isAligned){
461                 out << aligned << endl;
462         }
463         else{
464                 out << unaligned << endl;
465         }
466 }
467
468 //********************************************************************************************************************
469
470 int Sequence::getAlignLength(){
471         return alignmentLength;
472 }
473
474 //********************************************************************************************************************
475
476 int Sequence::getAmbigBases(){
477         if(ambigBases == -1){
478                 ambigBases = 0;
479                 for(int j=0;j<numBases;j++){
480                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
481                                 ambigBases++;
482                         }
483                 }
484         }       
485         
486         return ambigBases;
487 }
488
489 //********************************************************************************************************************
490
491 void Sequence::removeAmbigBases(){
492         
493         for(int j=0;j<alignmentLength;j++){
494                 if(aligned[j] != 'A' && aligned[j] != 'T' && aligned[j] != 'G' && aligned[j] != 'C'){
495                         aligned[j] = '-';
496                 }
497         }
498         setUnaligned(aligned);
499 }
500         
501 //********************************************************************************************************************
502
503 int Sequence::getLongHomoPolymer(){
504         if(longHomoPolymer == -1){
505                 longHomoPolymer = 1;
506                 int homoPolymer = 1;
507                 for(int j=1;j<numBases;j++){
508                         if(unaligned[j] == unaligned[j-1]){
509                                 homoPolymer++;
510                         }
511                         else{
512                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
513                                 homoPolymer = 1;
514                         }
515                 }
516                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
517         }
518         return longHomoPolymer;
519 }
520
521 //********************************************************************************************************************
522
523 int Sequence::getStartPos(){
524         if(startPos == -1){
525                 for(int j = 0; j < alignmentLength; j++) {
526                         if(aligned[j] != '.'){
527                                 startPos = j + 1;
528                                 break;
529                         }
530                 }
531         }
532         if(isAligned == 0){     startPos = 1;   }
533
534         return startPos;
535 }
536
537 //********************************************************************************************************************
538
539 void Sequence::padToPos(int start){
540
541         for(int j = startPos-1; j < start-1; j++) {
542                 aligned[j] = '.';
543         }
544         startPos = start;
545
546 }
547
548 //********************************************************************************************************************
549
550 int Sequence::getEndPos(){
551         if(endPos == -1){
552                 for(int j=alignmentLength-1;j>=0;j--){
553                         if(aligned[j] != '.'){
554                                 endPos = j + 1;
555                                 break;
556                         }
557                 }
558         }
559         if(isAligned == 0){     endPos = numBases;      }
560         
561         return endPos;
562 }
563
564 //********************************************************************************************************************
565
566 void Sequence::padFromPos(int end){
567         
568         for(int j = end; j < endPos; j++) {
569                 aligned[j] = '.';
570         }
571         endPos = end;
572         
573 }
574
575 //********************************************************************************************************************
576
577 bool Sequence::getIsAligned(){
578         return isAligned;
579 }
580
581 //********************************************************************************************************************
582
583 void Sequence::reverseComplement(){
584
585         string temp;
586         for(int i=numBases-1;i>=0;i--){
587                 if(unaligned[i] == 'A')         {       temp += 'T';    }
588                 else if(unaligned[i] == 'T'){   temp += 'A';    }
589                 else if(unaligned[i] == 'G'){   temp += 'C';    }
590                 else if(unaligned[i] == 'C'){   temp += 'G';    }
591                 else                                            {       temp += 'N';    }
592         }
593         unaligned = temp;
594         aligned = temp;
595         
596 }
597
598 //********************************************************************************************************************
599
600 void Sequence::trim(int length){
601         
602         if(numBases > length){
603                 unaligned = unaligned.substr(0,length);
604                 numBases = length;
605         }
606         
607 }
608
609 ///**************************************************************************************************/