]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
testing mpi
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13
14 Sequence::Sequence(){
15         m = MothurOut::getInstance();
16         initialize();
17 }
18
19 /***********************************************************************/
20
21 Sequence::Sequence(string newName, string sequence) {
22         try {
23                 m = MothurOut::getInstance();
24                 initialize();   
25                 name = newName;
26                 
27                 //setUnaligned removes any gap characters for us
28                 setUnaligned(sequence);
29                 setAligned(sequence);
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "Sequence", "Sequence");
33                 exit(1);
34         }                       
35 }
36 //********************************************************************************************************************
37 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
38 Sequence::Sequence(istringstream& fastaString){
39         try {
40                 m = MothurOut::getInstance();
41         int pid;
42         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
43         cout << pid << " after mothur instance " << &name << endl;
44                 initialize();
45         cout << "after mothur initialize" << endl;
46                 fastaString >> name;
47         cout << "after name "  << endl;
48                 name = name.substr(1);
49                 string sequence;
50                 
51                 //read comments
52                 while ((name[0] == '#') && fastaString) { 
53                         while (fastaString)     {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
54                         sequence = getCommentString(fastaString);
55                         
56                         if (fastaString) {  
57                                 fastaString >> name;  
58                                 name = name.substr(1);  
59                         }else { 
60                                 name = "";
61                                 break;
62                         }
63                 }
64         cout << "after mothur comment" << endl; 
65                 //read real sequence
66                 while (fastaString)     {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
67         cout << "after mothur name" << endl;    
68                 sequence = getSequenceString(fastaString);              
69         cout << "after mothur sequence" << endl;        
70                 setAligned(sequence);   
71                 //setUnaligned removes any gap characters for us                                                
72                 setUnaligned(sequence);         
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "Sequence", "Sequence");
76                 exit(1);
77         }                                                               
78 }
79
80 //********************************************************************************************************************
81 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
82 Sequence::Sequence(ifstream& fastaFile){
83         try {
84                 m = MothurOut::getInstance();
85                 initialize();
86                 fastaFile >> name;
87                 name = name.substr(1);
88                 string sequence;
89                 
90                 //read comments
91                 while ((name[0] == '#') && fastaFile) { 
92                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
93                         sequence = getCommentString(fastaFile);
94                         
95                         if (fastaFile) {  
96                                 fastaFile >> name;  
97                                 name = name.substr(1);  
98                         }else { 
99                                 name = "";
100                                 break;
101                         }
102                 }
103                 
104                 //read real sequence
105                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
106                 
107                 sequence = getSequenceString(fastaFile);                
108                 
109                 setAligned(sequence);   
110                 //setUnaligned removes any gap characters for us                                                
111                 setUnaligned(sequence); 
112         }
113         catch(exception& e) {
114                 m->errorOut(e, "Sequence", "Sequence");
115                 exit(1);
116         }                                                       
117 }
118 //********************************************************************************************************************
119 string Sequence::getSequenceString(ifstream& fastaFile) {
120         try {
121                 char letter;
122                 string sequence = "";   
123                 
124                 while(fastaFile){
125                         letter= fastaFile.get();
126                         if(letter == '>'){
127                                 fastaFile.putback(letter);
128                                 break;
129                         }
130                         else if(isprint(letter)){
131                                 letter = toupper(letter);
132                                 if(letter == 'U'){letter = 'T';}
133                                 sequence += letter;
134                         }
135                 }
136                 
137                 return sequence;
138         }
139         catch(exception& e) {
140                 m->errorOut(e, "Sequence", "getSequenceString");
141                 exit(1);
142         }
143 }
144 //********************************************************************************************************************
145 //comment can contain '>' so we need to account for that
146 string Sequence::getCommentString(ifstream& fastaFile) {
147         try {
148                 char letter;
149                 string sequence = "";
150                 
151                 while(fastaFile){
152                         letter=fastaFile.get();
153                         if((letter == '\r') || (letter == '\n')){  
154                                 gobble(fastaFile);  //in case its a \r\n situation
155                                 break;
156                         }
157                 }
158                 
159                 return sequence;
160         }
161         catch(exception& e) {
162                 m->errorOut(e, "Sequence", "getCommentString");
163                 exit(1);
164         }
165 }
166 //********************************************************************************************************************
167 string Sequence::getSequenceString(istringstream& fastaFile) {
168         try {
169                 char letter;
170                 string sequence = "";   
171                 
172                 while(fastaFile){
173                         letter= fastaFile.get();
174                         if(letter == '>'){
175                                 fastaFile.putback(letter);
176                                 break;
177                         }
178                         else if(isprint(letter)){
179                                 letter = toupper(letter);
180                                 if(letter == 'U'){letter = 'T';}
181                                 sequence += letter;
182                         }
183                 }
184                 
185                 return sequence;
186         }
187         catch(exception& e) {
188                 m->errorOut(e, "Sequence", "getSequenceString");
189                 exit(1);
190         }
191 }
192 //********************************************************************************************************************
193 //comment can contain '>' so we need to account for that
194 string Sequence::getCommentString(istringstream& fastaFile) {
195         try {
196                 char letter;
197                 string sequence = "";
198                 
199                 while(fastaFile){
200                         letter=fastaFile.get();
201                         if((letter == '\r') || (letter == '\n')){  
202                                 gobble(fastaFile);  //in case its a \r\n situation
203                                 break;
204                         }
205                 }
206                 
207                 return sequence;
208         }
209         catch(exception& e) {
210                 m->errorOut(e, "Sequence", "getCommentString");
211                 exit(1);
212         }
213 }
214 //********************************************************************************************************************
215
216 void Sequence::initialize(){
217         
218         name = "";
219         unaligned = "";
220         aligned = "";
221         pairwise = "";
222         
223         numBases = 0;
224         alignmentLength = 0;
225         isAligned = 0;
226         startPos = -1;
227         endPos = -1;
228         longHomoPolymer = -1;
229         ambigBases = -1;
230         
231 }       
232
233 //********************************************************************************************************************
234
235 void Sequence::setName(string seqName) {
236         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
237         else                                    {       name = seqName;                         }
238 }
239
240 //********************************************************************************************************************
241
242 void Sequence::setUnaligned(string sequence){
243         
244         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
245                 string temp = "";
246                 for(int j=0;j<sequence.length();j++) {
247                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
248                 }
249                 unaligned = temp;
250         }
251         else {
252                 unaligned = sequence;
253         }
254         numBases = unaligned.length();
255         
256 }
257
258 //********************************************************************************************************************
259
260 void Sequence::setAligned(string sequence){
261         
262         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
263         aligned = sequence;
264         alignmentLength = aligned.length();
265         setUnaligned(sequence); 
266
267         if(aligned[0] == '-'){
268                 for(int i=0;i<alignmentLength;i++){
269                         if(aligned[i] == '-'){
270                                 aligned[i] = '.';
271                         }
272                         else{
273                                 break;
274                         }
275                 }
276                 for(int i=alignmentLength-1;i>=0;i--){
277                         if(aligned[i] == '-'){
278                                 aligned[i] = '.';
279                         }
280                         else{
281                                 break;
282                         }
283                 }
284         }
285         isAligned = 1;  
286 }
287
288 //********************************************************************************************************************
289
290 void Sequence::setPairwise(string sequence){
291         pairwise = sequence;
292 }
293
294 //********************************************************************************************************************
295
296 string Sequence::convert2ints() {
297         
298         if(unaligned == "")     {       /* need to throw an error */    }
299         
300         string processed;
301         
302         for(int i=0;i<unaligned.length();i++) {
303                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
304                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
305                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
306                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
307                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
308                 else                                                                    {       processed += '4';       }
309         }
310         return processed;
311 }
312
313 //********************************************************************************************************************
314
315 string Sequence::getName(){
316         return name;
317 }
318
319 //********************************************************************************************************************
320
321 string Sequence::getAligned(){
322         return aligned;
323 }
324
325 //********************************************************************************************************************
326
327 string Sequence::getPairwise(){
328         return pairwise;
329 }
330
331 //********************************************************************************************************************
332
333 string Sequence::getUnaligned(){
334         return unaligned;
335 }
336
337 //********************************************************************************************************************
338
339 int Sequence::getNumBases(){
340         return numBases;
341 }
342
343 //********************************************************************************************************************
344
345 void Sequence::printSequence(ostream& out){
346
347         out << ">" << name << endl;
348         if(isAligned){
349                 out << aligned << endl;
350         }
351         else{
352                 out << unaligned << endl;
353         }
354 }
355
356 //********************************************************************************************************************
357
358 int Sequence::getAlignLength(){
359         return alignmentLength;
360 }
361
362 //********************************************************************************************************************
363
364 int Sequence::getAmbigBases(){
365         if(ambigBases == -1){
366                 ambigBases = 0;
367                 for(int j=0;j<numBases;j++){
368                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
369                                 ambigBases++;
370                         }
371                 }
372         }       
373         
374         return ambigBases;
375 }
376
377 //********************************************************************************************************************
378
379 int Sequence::getLongHomoPolymer(){
380         if(longHomoPolymer == -1){
381                 longHomoPolymer = 1;
382                 int homoPolymer = 1;
383                 for(int j=1;j<numBases;j++){
384                         if(unaligned[j] == unaligned[j-1]){
385                                 homoPolymer++;
386                         }
387                         else{
388                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
389                                 homoPolymer = 1;
390                         }
391                 }
392                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
393         }
394         return longHomoPolymer;
395 }
396
397 //********************************************************************************************************************
398
399 int Sequence::getStartPos(){
400         if(endPos == -1){
401                 for(int j = 0; j < alignmentLength; j++) {
402                         if(aligned[j] != '.'){
403                                 startPos = j + 1;
404                                 break;
405                         }
406                 }
407         }
408         if(isAligned == 0){     startPos = 1;   }
409
410         return startPos;
411 }
412
413 //********************************************************************************************************************
414
415 int Sequence::getEndPos(){
416         if(endPos == -1){
417                 for(int j=alignmentLength-1;j>=0;j--){
418                         if(aligned[j] != '.'){
419                                 endPos = j + 1;
420                                 break;
421                         }
422                 }
423         }
424         if(isAligned == 0){     endPos = numBases;      }
425         
426         return endPos;
427 }
428
429 //********************************************************************************************************************
430
431 bool Sequence::getIsAligned(){
432         return isAligned;
433 }
434
435 //********************************************************************************************************************
436
437 void Sequence::reverseComplement(){
438
439         string temp;
440         for(int i=numBases-1;i>=0;i--){
441                 if(unaligned[i] == 'A')         {       temp += 'T';    }
442                 else if(unaligned[i] == 'T'){   temp += 'A';    }
443                 else if(unaligned[i] == 'G'){   temp += 'C';    }
444                 else if(unaligned[i] == 'C'){   temp += 'G';    }
445                 else                                            {       temp += 'N';    }
446         }
447         unaligned = temp;
448         aligned = temp;
449         
450 }
451
452 //********************************************************************************************************************