]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
changed how we count sequences in a fastafile to allow for '>' in sequence names
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13 Sequence::Sequence(){
14         m = MothurOut::getInstance();
15         initialize();
16 }
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
19         try {
20                 m = MothurOut::getInstance();
21                 initialize();   
22                 name = newName;
23                 
24                 //setUnaligned removes any gap characters for us
25                 setUnaligned(sequence);
26                 setAligned(sequence);
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "Sequence", "Sequence");
30                 exit(1);
31         }                       
32 }
33 /***********************************************************************/
34 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
35         try {
36                 m = MothurOut::getInstance();
37                 initialize();   
38                 name = newName;
39                 
40                 //setUnaligned removes any gap characters for us
41                 setUnaligned(sequence);
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "Sequence", "Sequence");
45                 exit(1);
46         }                       
47 }
48
49 //********************************************************************************************************************
50 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
51 Sequence::Sequence(istringstream& fastaString){
52         try {
53                 m = MothurOut::getInstance();
54         
55                 initialize();
56                 fastaString >> name;
57                 
58                 if (name.length() != 0) { 
59                 
60                         name = name.substr(1);
61                         string sequence;
62                 
63                         //read comments
64                         while ((name[0] == '#') && fastaString) { 
65                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
66                                 sequence = getCommentString(fastaString);
67                                 
68                                 if (fastaString) {  
69                                         fastaString >> name;  
70                                         name = name.substr(1);  
71                                 }else { 
72                                         name = "";
73                                         break;
74                                 }
75                         }
76                         
77                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){   break;  }       } // get rest of line if there's any crap there
78                         
79                         sequence = getSequenceString(fastaString);              
80                         setAligned(sequence);   
81                         //setUnaligned removes any gap characters for us                                                
82                         setUnaligned(sequence);         
83                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
84
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "Sequence", "Sequence");
88                 exit(1);
89         }                                                               
90 }
91 //********************************************************************************************************************
92 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
93 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
94         try {
95                 m = MothurOut::getInstance();
96         
97                 initialize();
98                 fastaString >> name;
99                 
100                 if (name.length() != 0) { 
101                 
102                         name = name.substr(1);
103                         string sequence;
104                 
105                         //read comments
106                         while ((name[0] == '#') && fastaString) { 
107                                 while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
108                                 sequence = getCommentString(fastaString);
109                                 
110                                 if (fastaString) {  
111                                         fastaString >> name;  
112                                         name = name.substr(1);  
113                                 }else { 
114                                         name = "";
115                                         break;
116                                 }
117                         }
118                         
119                         while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){   break;  }       } // get rest of line if there's any crap there
120                         
121                         sequence = getSequenceString(fastaString);              
122                         
123                         //setUnaligned removes any gap characters for us                                                
124                         setUnaligned(sequence);         
125                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
126         }
127         catch(exception& e) {
128                 m->errorOut(e, "Sequence", "Sequence");
129                 exit(1);
130         }                                                               
131 }
132
133
134 //********************************************************************************************************************
135 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
136 Sequence::Sequence(ifstream& fastaFile){
137         try {
138                 m = MothurOut::getInstance();
139                 initialize();
140                 fastaFile >> name;
141
142                 if (name.length() != 0) { 
143                 
144                         name = name.substr(1); 
145                         
146                         string sequence;
147                 
148                         //read comments
149                         while ((name[0] == '#') && fastaFile) { 
150                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
151                                 sequence = getCommentString(fastaFile);
152                                 
153                                 if (fastaFile) {  
154                                         fastaFile >> name;  
155                                         name = name.substr(1);  
156                                 }else { 
157                                         name = "";
158                                         break;
159                                 }
160                         }
161                         
162                         //read real sequence
163                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
164                         
165                         sequence = getSequenceString(fastaFile);                
166                 
167                         setAligned(sequence);   
168                         //setUnaligned removes any gap characters for us                                                
169                         setUnaligned(sequence); 
170                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
171                 
172         }
173         catch(exception& e) {
174                 m->errorOut(e, "Sequence", "Sequence");
175                 exit(1);
176         }                                                       
177 }
178 //********************************************************************************************************************
179 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
180 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
181         try {
182                 m = MothurOut::getInstance();
183                 initialize();
184                 fastaFile >> name;
185                 
186                 if (name.length() != 0) { 
187                         name = name.substr(1);
188                         string sequence;
189                         
190                         //read comments
191                         while ((name[0] == '#') && fastaFile) { 
192                                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
193                                 sequence = getCommentString(fastaFile);
194                                 
195                                 if (fastaFile) {  
196                                         fastaFile >> name;  
197                                         name = name.substr(1);  
198                                 }else { 
199                                         name = "";
200                                         break;
201                                 }
202                         }
203                         
204                         //read real sequence
205                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
206                         
207                         sequence = getSequenceString(fastaFile);                
208                         
209                         //setUnaligned removes any gap characters for us                                                
210                         setUnaligned(sequence); 
211                 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
212
213         }
214         catch(exception& e) {
215                 m->errorOut(e, "Sequence", "Sequence");
216                 exit(1);
217         }                                                       
218 }
219
220 //********************************************************************************************************************
221 string Sequence::getSequenceString(ifstream& fastaFile) {
222         try {
223                 char letter;
224                 string sequence = "";   
225                 
226                 while(fastaFile){
227                         letter= fastaFile.get();
228                         if(letter == '>'){
229                                 fastaFile.putback(letter);
230                                 break;
231                         }
232                         else if(isprint(letter)){
233                                 letter = toupper(letter);
234                                 if(letter == 'U'){letter = 'T';}
235                                 sequence += letter;
236                         }
237                 }
238                 
239                 return sequence;
240         }
241         catch(exception& e) {
242                 m->errorOut(e, "Sequence", "getSequenceString");
243                 exit(1);
244         }
245 }
246 //********************************************************************************************************************
247 //comment can contain '>' so we need to account for that
248 string Sequence::getCommentString(ifstream& fastaFile) {
249         try {
250                 char letter;
251                 string sequence = "";
252                 
253                 while(fastaFile){
254                         letter=fastaFile.get();
255                         if((letter == '\r') || (letter == '\n')){  
256                                 gobble(fastaFile);  //in case its a \r\n situation
257                                 break;
258                         }
259                 }
260                 
261                 return sequence;
262         }
263         catch(exception& e) {
264                 m->errorOut(e, "Sequence", "getCommentString");
265                 exit(1);
266         }
267 }
268 //********************************************************************************************************************
269 string Sequence::getSequenceString(istringstream& fastaFile) {
270         try {
271                 char letter;
272                 string sequence = "";   
273                 
274                 while(!fastaFile.eof()){
275                         letter= fastaFile.get();
276         
277                         if(letter == '>'){
278                                 fastaFile.putback(letter);
279                                 break;
280                         }
281                         else if(isprint(letter)){
282                                 letter = toupper(letter);
283                                 if(letter == 'U'){letter = 'T';}
284                                 sequence += letter;
285                         }
286                 }
287                 
288                 return sequence;
289         }
290         catch(exception& e) {
291                 m->errorOut(e, "Sequence", "getSequenceString");
292                 exit(1);
293         }
294 }
295 //********************************************************************************************************************
296 //comment can contain '>' so we need to account for that
297 string Sequence::getCommentString(istringstream& fastaFile) {
298         try {
299                 char letter;
300                 string sequence = "";
301                 
302                 while(fastaFile){
303                         letter=fastaFile.get();
304                         if((letter == '\r') || (letter == '\n')){  
305                                 gobble(fastaFile);  //in case its a \r\n situation
306                                 break;
307                         }
308                 }
309                 
310                 return sequence;
311         }
312         catch(exception& e) {
313                 m->errorOut(e, "Sequence", "getCommentString");
314                 exit(1);
315         }
316 }
317 //********************************************************************************************************************
318
319 void Sequence::initialize(){
320         
321         name = "";
322         unaligned = "";
323         aligned = "";
324         pairwise = "";
325         
326         numBases = 0;
327         alignmentLength = 0;
328         isAligned = 0;
329         startPos = -1;
330         endPos = -1;
331         longHomoPolymer = -1;
332         ambigBases = -1;
333         
334 }       
335
336 //********************************************************************************************************************
337
338 void Sequence::setName(string seqName) {
339         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
340         else                                    {       name = seqName;                         }
341 }
342
343 //********************************************************************************************************************
344
345 void Sequence::setUnaligned(string sequence){
346         
347         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
348                 string temp = "";
349                 for(int j=0;j<sequence.length();j++) {
350                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
351                 }
352                 unaligned = temp;
353         }
354         else {
355                 unaligned = sequence;
356         }
357         numBases = unaligned.length();
358         
359 }
360
361 //********************************************************************************************************************
362
363 void Sequence::setAligned(string sequence){
364         
365         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
366         aligned = sequence;
367         alignmentLength = aligned.length();
368         setUnaligned(sequence); 
369
370         if(aligned[0] == '-'){
371                 for(int i=0;i<alignmentLength;i++){
372                         if(aligned[i] == '-'){
373                                 aligned[i] = '.';
374                         }
375                         else{
376                                 break;
377                         }
378                 }
379                 for(int i=alignmentLength-1;i>=0;i--){
380                         if(aligned[i] == '-'){
381                                 aligned[i] = '.';
382                         }
383                         else{
384                                 break;
385                         }
386                 }
387         }
388         isAligned = 1;  
389 }
390
391 //********************************************************************************************************************
392
393 void Sequence::setPairwise(string sequence){
394         pairwise = sequence;
395 }
396
397 //********************************************************************************************************************
398
399 string Sequence::convert2ints() {
400         
401         if(unaligned == "")     {       /* need to throw an error */    }
402         
403         string processed;
404         
405         for(int i=0;i<unaligned.length();i++) {
406                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
407                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
408                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
409                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
410                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
411                 else                                                                    {       processed += '4';       }
412         }
413         return processed;
414 }
415
416 //********************************************************************************************************************
417
418 string Sequence::getName(){
419         return name;
420 }
421
422 //********************************************************************************************************************
423
424 string Sequence::getAligned(){
425         if(isAligned == 0)      { return unaligned; }
426         else                            {  return aligned;  }
427 }
428
429 //********************************************************************************************************************
430
431 string Sequence::getPairwise(){
432         return pairwise;
433 }
434
435 //********************************************************************************************************************
436
437 string Sequence::getUnaligned(){
438         return unaligned;
439 }
440
441 //********************************************************************************************************************
442
443 int Sequence::getNumBases(){
444         return numBases;
445 }
446
447 //********************************************************************************************************************
448
449 void Sequence::printSequence(ostream& out){
450
451         out << ">" << name << endl;
452         if(isAligned){
453                 out << aligned << endl;
454         }
455         else{
456                 out << unaligned << endl;
457         }
458 }
459
460 //********************************************************************************************************************
461
462 int Sequence::getAlignLength(){
463         return alignmentLength;
464 }
465
466 //********************************************************************************************************************
467
468 int Sequence::getAmbigBases(){
469         if(ambigBases == -1){
470                 ambigBases = 0;
471                 for(int j=0;j<numBases;j++){
472                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
473                                 ambigBases++;
474                         }
475                 }
476         }       
477         
478         return ambigBases;
479 }
480
481 //********************************************************************************************************************
482
483 int Sequence::getLongHomoPolymer(){
484         if(longHomoPolymer == -1){
485                 longHomoPolymer = 1;
486                 int homoPolymer = 1;
487                 for(int j=1;j<numBases;j++){
488                         if(unaligned[j] == unaligned[j-1]){
489                                 homoPolymer++;
490                         }
491                         else{
492                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
493                                 homoPolymer = 1;
494                         }
495                 }
496                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
497         }
498         return longHomoPolymer;
499 }
500
501 //********************************************************************************************************************
502
503 int Sequence::getStartPos(){
504         if(endPos == -1){
505                 for(int j = 0; j < alignmentLength; j++) {
506                         if(aligned[j] != '.'){
507                                 startPos = j + 1;
508                                 break;
509                         }
510                 }
511         }
512         if(isAligned == 0){     startPos = 1;   }
513
514         return startPos;
515 }
516
517 //********************************************************************************************************************
518
519 int Sequence::getEndPos(){
520         if(endPos == -1){
521                 for(int j=alignmentLength-1;j>=0;j--){
522                         if(aligned[j] != '.'){
523                                 endPos = j + 1;
524                                 break;
525                         }
526                 }
527         }
528         if(isAligned == 0){     endPos = numBases;      }
529         
530         return endPos;
531 }
532
533 //********************************************************************************************************************
534
535 bool Sequence::getIsAligned(){
536         return isAligned;
537 }
538
539 //********************************************************************************************************************
540
541 void Sequence::reverseComplement(){
542
543         string temp;
544         for(int i=numBases-1;i>=0;i--){
545                 if(unaligned[i] == 'A')         {       temp += 'T';    }
546                 else if(unaligned[i] == 'T'){   temp += 'A';    }
547                 else if(unaligned[i] == 'G'){   temp += 'C';    }
548                 else if(unaligned[i] == 'C'){   temp += 'G';    }
549                 else                                            {       temp += 'N';    }
550         }
551         unaligned = temp;
552         aligned = temp;
553         
554 }
555 /**************************************************************************************************/