]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
added MPI to dist.seqs command
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13
14 Sequence::Sequence(){
15         m = MothurOut::getInstance();
16         initialize();
17 }
18
19 /***********************************************************************/
20
21 Sequence::Sequence(string newName, string sequence) {
22         try {
23                 m = MothurOut::getInstance();
24                 initialize();   
25                 name = newName;
26                 
27                 //setUnaligned removes any gap characters for us
28                 setUnaligned(sequence);
29                 setAligned(sequence);
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "Sequence", "Sequence");
33                 exit(1);
34         }                       
35 }
36 //********************************************************************************************************************
37 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
38 Sequence::Sequence(istringstream& fastaString){
39         try {
40                 m = MothurOut::getInstance();
41         int pid;
42         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
43         cout << pid << " after mothur instance " << &name << endl;
44                 initialize();
45         cout << "after mothur initialize" << endl;
46                 fastaString >> name;
47         cout << pid << " after name "  << name << endl;
48                 name = name.substr(1);
49
50                 string sequence;
51 cout << pid << " name = "  << name << endl;             
52                 //read comments
53                 while ((name[0] == '#') && fastaString) { 
54                         while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
55                         sequence = getCommentString(fastaString);
56                         
57                         if (fastaString) {  
58                                 fastaString >> name;  
59                                 name = name.substr(1);  
60                         }else { 
61                                 name = "";
62                                 break;
63                         }
64                         cout << pid << "in while comment" << endl;      
65                 }
66         cout << pid << "after mothur comment" << endl;  
67                 while (!fastaString.eof())      {       char c = fastaString.get(); cout << pid << " char = " << int(c) << endl; if (c == 10 || c == 13){       break;  }       } // get rest of line if there's any crap there
68         cout << pid << " after mothur name" << endl;    
69                 sequence = getSequenceString(fastaString);              
70         cout << pid << " after mothur sequence" << endl;        
71                 setAligned(sequence);   
72                 //setUnaligned removes any gap characters for us                                                
73                 setUnaligned(sequence);         
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "Sequence", "Sequence");
77                 exit(1);
78         }                                                               
79 }
80
81 //********************************************************************************************************************
82 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
83 Sequence::Sequence(ifstream& fastaFile){
84         try {
85                 m = MothurOut::getInstance();
86                 initialize();
87                 fastaFile >> name;
88                 name = name.substr(1);
89                 string sequence;
90                 
91                 //read comments
92                 while ((name[0] == '#') && fastaFile) { 
93                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
94                         sequence = getCommentString(fastaFile);
95                         
96                         if (fastaFile) {  
97                                 fastaFile >> name;  
98                                 name = name.substr(1);  
99                         }else { 
100                                 name = "";
101                                 break;
102                         }
103                 }
104                 
105                 //read real sequence
106                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
107                 
108                 sequence = getSequenceString(fastaFile);                
109                 
110                 setAligned(sequence);   
111                 //setUnaligned removes any gap characters for us                                                
112                 setUnaligned(sequence); 
113         }
114         catch(exception& e) {
115                 m->errorOut(e, "Sequence", "Sequence");
116                 exit(1);
117         }                                                       
118 }
119 //********************************************************************************************************************
120 string Sequence::getSequenceString(ifstream& fastaFile) {
121         try {
122                 char letter;
123                 string sequence = "";   
124                 
125                 while(fastaFile){
126                         letter= fastaFile.get();
127                         if(letter == '>'){
128                                 fastaFile.putback(letter);
129                                 break;
130                         }
131                         else if(isprint(letter)){
132                                 letter = toupper(letter);
133                                 if(letter == 'U'){letter = 'T';}
134                                 sequence += letter;
135                         }
136                 }
137                 
138                 return sequence;
139         }
140         catch(exception& e) {
141                 m->errorOut(e, "Sequence", "getSequenceString");
142                 exit(1);
143         }
144 }
145 //********************************************************************************************************************
146 //comment can contain '>' so we need to account for that
147 string Sequence::getCommentString(ifstream& fastaFile) {
148         try {
149                 char letter;
150                 string sequence = "";
151                 
152                 while(fastaFile){
153                         letter=fastaFile.get();
154                         if((letter == '\r') || (letter == '\n')){  
155                                 gobble(fastaFile);  //in case its a \r\n situation
156                                 break;
157                         }
158                 }
159                 
160                 return sequence;
161         }
162         catch(exception& e) {
163                 m->errorOut(e, "Sequence", "getCommentString");
164                 exit(1);
165         }
166 }
167 //********************************************************************************************************************
168 string Sequence::getSequenceString(istringstream& fastaFile) {
169         try {
170                 char letter;
171                 string sequence = "";   
172 int pid;
173 MPI_Comm_rank(MPI_COMM_WORLD, &pid);            
174                 while(!fastaFile.eof()){
175                         letter= fastaFile.get();
176         cout << pid << '\t' << letter << endl;
177                         if(letter == '>'){
178                                 fastaFile.putback(letter);
179                                 break;
180                         }
181                         else if(isprint(letter)){
182                                 letter = toupper(letter);
183                                 if(letter == 'U'){letter = 'T';}
184                                 sequence += letter;
185                         }
186                 }
187                 
188                 return sequence;
189         }
190         catch(exception& e) {
191                 m->errorOut(e, "Sequence", "getSequenceString");
192                 exit(1);
193         }
194 }
195 //********************************************************************************************************************
196 //comment can contain '>' so we need to account for that
197 string Sequence::getCommentString(istringstream& fastaFile) {
198         try {
199                 char letter;
200                 string sequence = "";
201                 
202                 while(fastaFile){
203                         letter=fastaFile.get();
204                         if((letter == '\r') || (letter == '\n')){  
205                                 gobble(fastaFile);  //in case its a \r\n situation
206                                 break;
207                         }
208                 }
209                 
210                 return sequence;
211         }
212         catch(exception& e) {
213                 m->errorOut(e, "Sequence", "getCommentString");
214                 exit(1);
215         }
216 }
217 //********************************************************************************************************************
218
219 void Sequence::initialize(){
220         
221         name = "";
222         unaligned = "";
223         aligned = "";
224         pairwise = "";
225         
226         numBases = 0;
227         alignmentLength = 0;
228         isAligned = 0;
229         startPos = -1;
230         endPos = -1;
231         longHomoPolymer = -1;
232         ambigBases = -1;
233         
234 }       
235
236 //********************************************************************************************************************
237
238 void Sequence::setName(string seqName) {
239         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
240         else                                    {       name = seqName;                         }
241 }
242
243 //********************************************************************************************************************
244
245 void Sequence::setUnaligned(string sequence){
246         
247         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
248                 string temp = "";
249                 for(int j=0;j<sequence.length();j++) {
250                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
251                 }
252                 unaligned = temp;
253         }
254         else {
255                 unaligned = sequence;
256         }
257         numBases = unaligned.length();
258         
259 }
260
261 //********************************************************************************************************************
262
263 void Sequence::setAligned(string sequence){
264         
265         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
266         aligned = sequence;
267         alignmentLength = aligned.length();
268         setUnaligned(sequence); 
269
270         if(aligned[0] == '-'){
271                 for(int i=0;i<alignmentLength;i++){
272                         if(aligned[i] == '-'){
273                                 aligned[i] = '.';
274                         }
275                         else{
276                                 break;
277                         }
278                 }
279                 for(int i=alignmentLength-1;i>=0;i--){
280                         if(aligned[i] == '-'){
281                                 aligned[i] = '.';
282                         }
283                         else{
284                                 break;
285                         }
286                 }
287         }
288         isAligned = 1;  
289 }
290
291 //********************************************************************************************************************
292
293 void Sequence::setPairwise(string sequence){
294         pairwise = sequence;
295 }
296
297 //********************************************************************************************************************
298
299 string Sequence::convert2ints() {
300         
301         if(unaligned == "")     {       /* need to throw an error */    }
302         
303         string processed;
304         
305         for(int i=0;i<unaligned.length();i++) {
306                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
307                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
308                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
309                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
310                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
311                 else                                                                    {       processed += '4';       }
312         }
313         return processed;
314 }
315
316 //********************************************************************************************************************
317
318 string Sequence::getName(){
319         return name;
320 }
321
322 //********************************************************************************************************************
323
324 string Sequence::getAligned(){
325         return aligned;
326 }
327
328 //********************************************************************************************************************
329
330 string Sequence::getPairwise(){
331         return pairwise;
332 }
333
334 //********************************************************************************************************************
335
336 string Sequence::getUnaligned(){
337         return unaligned;
338 }
339
340 //********************************************************************************************************************
341
342 int Sequence::getNumBases(){
343         return numBases;
344 }
345
346 //********************************************************************************************************************
347
348 void Sequence::printSequence(ostream& out){
349
350         out << ">" << name << endl;
351         if(isAligned){
352                 out << aligned << endl;
353         }
354         else{
355                 out << unaligned << endl;
356         }
357 }
358
359 //********************************************************************************************************************
360
361 int Sequence::getAlignLength(){
362         return alignmentLength;
363 }
364
365 //********************************************************************************************************************
366
367 int Sequence::getAmbigBases(){
368         if(ambigBases == -1){
369                 ambigBases = 0;
370                 for(int j=0;j<numBases;j++){
371                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
372                                 ambigBases++;
373                         }
374                 }
375         }       
376         
377         return ambigBases;
378 }
379
380 //********************************************************************************************************************
381
382 int Sequence::getLongHomoPolymer(){
383         if(longHomoPolymer == -1){
384                 longHomoPolymer = 1;
385                 int homoPolymer = 1;
386                 for(int j=1;j<numBases;j++){
387                         if(unaligned[j] == unaligned[j-1]){
388                                 homoPolymer++;
389                         }
390                         else{
391                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
392                                 homoPolymer = 1;
393                         }
394                 }
395                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
396         }
397         return longHomoPolymer;
398 }
399
400 //********************************************************************************************************************
401
402 int Sequence::getStartPos(){
403         if(endPos == -1){
404                 for(int j = 0; j < alignmentLength; j++) {
405                         if(aligned[j] != '.'){
406                                 startPos = j + 1;
407                                 break;
408                         }
409                 }
410         }
411         if(isAligned == 0){     startPos = 1;   }
412
413         return startPos;
414 }
415
416 //********************************************************************************************************************
417
418 int Sequence::getEndPos(){
419         if(endPos == -1){
420                 for(int j=alignmentLength-1;j>=0;j--){
421                         if(aligned[j] != '.'){
422                                 endPos = j + 1;
423                                 break;
424                         }
425                 }
426         }
427         if(isAligned == 0){     endPos = numBases;      }
428         
429         return endPos;
430 }
431
432 //********************************************************************************************************************
433
434 bool Sequence::getIsAligned(){
435         return isAligned;
436 }
437
438 //********************************************************************************************************************
439
440 void Sequence::reverseComplement(){
441
442         string temp;
443         for(int i=numBases-1;i>=0;i--){
444                 if(unaligned[i] == 'A')         {       temp += 'T';    }
445                 else if(unaligned[i] == 'T'){   temp += 'A';    }
446                 else if(unaligned[i] == 'G'){   temp += 'C';    }
447                 else if(unaligned[i] == 'C'){   temp += 'G';    }
448                 else                                            {       temp += 'N';    }
449         }
450         unaligned = temp;
451         aligned = temp;
452         
453 }
454
455 //********************************************************************************************************************