]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
fixed problem with class.fy.seqs where group totals did not add up. added constructo...
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13 Sequence::Sequence(){
14         m = MothurOut::getInstance();
15         initialize();
16 }
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
19         try {
20                 m = MothurOut::getInstance();
21                 initialize();   
22                 name = newName;
23                 
24                 //setUnaligned removes any gap characters for us
25                 setUnaligned(sequence);
26                 setAligned(sequence);
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "Sequence", "Sequence");
30                 exit(1);
31         }                       
32 }
33 /***********************************************************************/
34 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
35         try {
36                 m = MothurOut::getInstance();
37                 initialize();   
38                 name = newName;
39                 
40                 //setUnaligned removes any gap characters for us
41                 setUnaligned(sequence);
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "Sequence", "Sequence");
45                 exit(1);
46         }                       
47 }
48
49 //********************************************************************************************************************
50 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
51 Sequence::Sequence(istringstream& fastaString){
52         try {
53                 m = MothurOut::getInstance();
54         
55                 initialize();
56                 fastaString >> name;
57                 name = name.substr(1);
58                 string sequence;
59         
60                 //read comments
61                 while ((name[0] == '#') && fastaString) { 
62                         while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
63                         sequence = getCommentString(fastaString);
64                         
65                         if (fastaString) {  
66                                 fastaString >> name;  
67                                 name = name.substr(1);  
68                         }else { 
69                                 name = "";
70                                 break;
71                         }
72                 }
73                 
74                 while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){   break;  }       } // get rest of line if there's any crap there
75                 
76                 sequence = getSequenceString(fastaString);              
77                 setAligned(sequence);   
78                 //setUnaligned removes any gap characters for us                                                
79                 setUnaligned(sequence);         
80         }
81         catch(exception& e) {
82                 m->errorOut(e, "Sequence", "Sequence");
83                 exit(1);
84         }                                                               
85 }
86 //********************************************************************************************************************
87 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
88 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
89         try {
90                 m = MothurOut::getInstance();
91         
92                 initialize();
93                 fastaString >> name;
94                 name = name.substr(1);
95                 string sequence;
96         
97                 //read comments
98                 while ((name[0] == '#') && fastaString) { 
99                         while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
100                         sequence = getCommentString(fastaString);
101                         
102                         if (fastaString) {  
103                                 fastaString >> name;  
104                                 name = name.substr(1);  
105                         }else { 
106                                 name = "";
107                                 break;
108                         }
109                 }
110                 
111                 while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){   break;  }       } // get rest of line if there's any crap there
112                 
113                 sequence = getSequenceString(fastaString);              
114                 
115                 //setUnaligned removes any gap characters for us                                                
116                 setUnaligned(sequence);         
117         }
118         catch(exception& e) {
119                 m->errorOut(e, "Sequence", "Sequence");
120                 exit(1);
121         }                                                               
122 }
123
124
125 //********************************************************************************************************************
126 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
127 Sequence::Sequence(ifstream& fastaFile){
128         try {
129                 m = MothurOut::getInstance();
130                 initialize();
131                 fastaFile >> name;
132                 name = name.substr(1);
133                 string sequence;
134                 
135                 //read comments
136                 while ((name[0] == '#') && fastaFile) { 
137                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
138                         sequence = getCommentString(fastaFile);
139                         
140                         if (fastaFile) {  
141                                 fastaFile >> name;  
142                                 name = name.substr(1);  
143                         }else { 
144                                 name = "";
145                                 break;
146                         }
147                 }
148                 
149                 //read real sequence
150                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
151                 
152                 sequence = getSequenceString(fastaFile);                
153                 
154                 setAligned(sequence);   
155                 //setUnaligned removes any gap characters for us                                                
156                 setUnaligned(sequence); 
157         }
158         catch(exception& e) {
159                 m->errorOut(e, "Sequence", "Sequence");
160                 exit(1);
161         }                                                       
162 }
163 //********************************************************************************************************************
164 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
165 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
166         try {
167                 m = MothurOut::getInstance();
168                 initialize();
169                 fastaFile >> name;
170                 name = name.substr(1);
171                 string sequence;
172                 
173                 //read comments
174                 while ((name[0] == '#') && fastaFile) { 
175                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
176                         sequence = getCommentString(fastaFile);
177                         
178                         if (fastaFile) {  
179                                 fastaFile >> name;  
180                                 name = name.substr(1);  
181                         }else { 
182                                 name = "";
183                                 break;
184                         }
185                 }
186                 
187                 //read real sequence
188                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
189                 
190                 sequence = getSequenceString(fastaFile);                
191                 
192                 //setUnaligned removes any gap characters for us                                                
193                 setUnaligned(sequence); 
194         }
195         catch(exception& e) {
196                 m->errorOut(e, "Sequence", "Sequence");
197                 exit(1);
198         }                                                       
199 }
200
201 //********************************************************************************************************************
202 string Sequence::getSequenceString(ifstream& fastaFile) {
203         try {
204                 char letter;
205                 string sequence = "";   
206                 
207                 while(fastaFile){
208                         letter= fastaFile.get();
209                         if(letter == '>'){
210                                 fastaFile.putback(letter);
211                                 break;
212                         }
213                         else if(isprint(letter)){
214                                 letter = toupper(letter);
215                                 if(letter == 'U'){letter = 'T';}
216                                 sequence += letter;
217                         }
218                 }
219                 
220                 return sequence;
221         }
222         catch(exception& e) {
223                 m->errorOut(e, "Sequence", "getSequenceString");
224                 exit(1);
225         }
226 }
227 //********************************************************************************************************************
228 //comment can contain '>' so we need to account for that
229 string Sequence::getCommentString(ifstream& fastaFile) {
230         try {
231                 char letter;
232                 string sequence = "";
233                 
234                 while(fastaFile){
235                         letter=fastaFile.get();
236                         if((letter == '\r') || (letter == '\n')){  
237                                 gobble(fastaFile);  //in case its a \r\n situation
238                                 break;
239                         }
240                 }
241                 
242                 return sequence;
243         }
244         catch(exception& e) {
245                 m->errorOut(e, "Sequence", "getCommentString");
246                 exit(1);
247         }
248 }
249 //********************************************************************************************************************
250 string Sequence::getSequenceString(istringstream& fastaFile) {
251         try {
252                 char letter;
253                 string sequence = "";   
254                 
255                 while(!fastaFile.eof()){
256                         letter= fastaFile.get();
257         
258                         if(letter == '>'){
259                                 fastaFile.putback(letter);
260                                 break;
261                         }
262                         else if(isprint(letter)){
263                                 letter = toupper(letter);
264                                 if(letter == 'U'){letter = 'T';}
265                                 sequence += letter;
266                         }
267                 }
268                 
269                 return sequence;
270         }
271         catch(exception& e) {
272                 m->errorOut(e, "Sequence", "getSequenceString");
273                 exit(1);
274         }
275 }
276 //********************************************************************************************************************
277 //comment can contain '>' so we need to account for that
278 string Sequence::getCommentString(istringstream& fastaFile) {
279         try {
280                 char letter;
281                 string sequence = "";
282                 
283                 while(fastaFile){
284                         letter=fastaFile.get();
285                         if((letter == '\r') || (letter == '\n')){  
286                                 gobble(fastaFile);  //in case its a \r\n situation
287                                 break;
288                         }
289                 }
290                 
291                 return sequence;
292         }
293         catch(exception& e) {
294                 m->errorOut(e, "Sequence", "getCommentString");
295                 exit(1);
296         }
297 }
298 //********************************************************************************************************************
299
300 void Sequence::initialize(){
301         
302         name = "";
303         unaligned = "";
304         aligned = "";
305         pairwise = "";
306         
307         numBases = 0;
308         alignmentLength = 0;
309         isAligned = 0;
310         startPos = -1;
311         endPos = -1;
312         longHomoPolymer = -1;
313         ambigBases = -1;
314         
315 }       
316
317 //********************************************************************************************************************
318
319 void Sequence::setName(string seqName) {
320         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
321         else                                    {       name = seqName;                         }
322 }
323
324 //********************************************************************************************************************
325
326 void Sequence::setUnaligned(string sequence){
327         
328         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
329                 string temp = "";
330                 for(int j=0;j<sequence.length();j++) {
331                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
332                 }
333                 unaligned = temp;
334         }
335         else {
336                 unaligned = sequence;
337         }
338         numBases = unaligned.length();
339         
340 }
341
342 //********************************************************************************************************************
343
344 void Sequence::setAligned(string sequence){
345         
346         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
347         aligned = sequence;
348         alignmentLength = aligned.length();
349         setUnaligned(sequence); 
350
351         if(aligned[0] == '-'){
352                 for(int i=0;i<alignmentLength;i++){
353                         if(aligned[i] == '-'){
354                                 aligned[i] = '.';
355                         }
356                         else{
357                                 break;
358                         }
359                 }
360                 for(int i=alignmentLength-1;i>=0;i--){
361                         if(aligned[i] == '-'){
362                                 aligned[i] = '.';
363                         }
364                         else{
365                                 break;
366                         }
367                 }
368         }
369         isAligned = 1;  
370 }
371
372 //********************************************************************************************************************
373
374 void Sequence::setPairwise(string sequence){
375         pairwise = sequence;
376 }
377
378 //********************************************************************************************************************
379
380 string Sequence::convert2ints() {
381         
382         if(unaligned == "")     {       /* need to throw an error */    }
383         
384         string processed;
385         
386         for(int i=0;i<unaligned.length();i++) {
387                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
388                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
389                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
390                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
391                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
392                 else                                                                    {       processed += '4';       }
393         }
394         return processed;
395 }
396
397 //********************************************************************************************************************
398
399 string Sequence::getName(){
400         return name;
401 }
402
403 //********************************************************************************************************************
404
405 string Sequence::getAligned(){
406         if(isAligned == 0)      { return unaligned; }
407         else                            {  return aligned;  }
408 }
409
410 //********************************************************************************************************************
411
412 string Sequence::getPairwise(){
413         return pairwise;
414 }
415
416 //********************************************************************************************************************
417
418 string Sequence::getUnaligned(){
419         return unaligned;
420 }
421
422 //********************************************************************************************************************
423
424 int Sequence::getNumBases(){
425         return numBases;
426 }
427
428 //********************************************************************************************************************
429
430 void Sequence::printSequence(ostream& out){
431
432         out << ">" << name << endl;
433         if(isAligned){
434                 out << aligned << endl;
435         }
436         else{
437                 out << unaligned << endl;
438         }
439 }
440
441 //********************************************************************************************************************
442
443 int Sequence::getAlignLength(){
444         return alignmentLength;
445 }
446
447 //********************************************************************************************************************
448
449 int Sequence::getAmbigBases(){
450         if(ambigBases == -1){
451                 ambigBases = 0;
452                 for(int j=0;j<numBases;j++){
453                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
454                                 ambigBases++;
455                         }
456                 }
457         }       
458         
459         return ambigBases;
460 }
461
462 //********************************************************************************************************************
463
464 int Sequence::getLongHomoPolymer(){
465         if(longHomoPolymer == -1){
466                 longHomoPolymer = 1;
467                 int homoPolymer = 1;
468                 for(int j=1;j<numBases;j++){
469                         if(unaligned[j] == unaligned[j-1]){
470                                 homoPolymer++;
471                         }
472                         else{
473                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
474                                 homoPolymer = 1;
475                         }
476                 }
477                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
478         }
479         return longHomoPolymer;
480 }
481
482 //********************************************************************************************************************
483
484 int Sequence::getStartPos(){
485         if(endPos == -1){
486                 for(int j = 0; j < alignmentLength; j++) {
487                         if(aligned[j] != '.'){
488                                 startPos = j + 1;
489                                 break;
490                         }
491                 }
492         }
493         if(isAligned == 0){     startPos = 1;   }
494
495         return startPos;
496 }
497
498 //********************************************************************************************************************
499
500 int Sequence::getEndPos(){
501         if(endPos == -1){
502                 for(int j=alignmentLength-1;j>=0;j--){
503                         if(aligned[j] != '.'){
504                                 endPos = j + 1;
505                                 break;
506                         }
507                 }
508         }
509         if(isAligned == 0){     endPos = numBases;      }
510         
511         return endPos;
512 }
513
514 //********************************************************************************************************************
515
516 bool Sequence::getIsAligned(){
517         return isAligned;
518 }
519
520 //********************************************************************************************************************
521
522 void Sequence::reverseComplement(){
523
524         string temp;
525         for(int i=numBases-1;i>=0;i--){
526                 if(unaligned[i] == 'A')         {       temp += 'T';    }
527                 else if(unaligned[i] == 'T'){   temp += 'A';    }
528                 else if(unaligned[i] == 'G'){   temp += 'C';    }
529                 else if(unaligned[i] == 'C'){   temp += 'G';    }
530                 else                                            {       temp += 'N';    }
531         }
532         unaligned = temp;
533         aligned = temp;
534         
535 }
536 /**************************************************************************************************/