]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
added MPI code, broke up chimera.seqs into 5 separated commands, added parse.sff...
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13
14 Sequence::Sequence(){
15         m = MothurOut::getInstance();
16         initialize();
17 }
18
19 /***********************************************************************/
20
21 Sequence::Sequence(string newName, string sequence) {
22         try {
23                 m = MothurOut::getInstance();
24                 initialize();   
25                 name = newName;
26                 
27                 //setUnaligned removes any gap characters for us
28                 setUnaligned(sequence);
29                 setAligned(sequence);
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "Sequence", "Sequence");
33                 exit(1);
34         }                       
35 }
36 //********************************************************************************************************************
37 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
38 Sequence::Sequence(istringstream& fastaString){
39         try {
40                 m = MothurOut::getInstance();
41         
42                 initialize();
43                 fastaString >> name;
44                 name = name.substr(1);
45                 string sequence;
46         
47                 //read comments
48                 while ((name[0] == '#') && fastaString) { 
49                         while (!fastaString.eof())      {       char c = fastaString.get(); if (c == 10 || c == 13){    break;  }       } // get rest of line if there's any crap there
50                         sequence = getCommentString(fastaString);
51                         
52                         if (fastaString) {  
53                                 fastaString >> name;  
54                                 name = name.substr(1);  
55                         }else { 
56                                 name = "";
57                                 break;
58                         }
59                 }
60                 
61                 while (!fastaString.eof())      {       char c = fastaString.get();  if (c == 10 || c == 13){   break;  }       } // get rest of line if there's any crap there
62                 
63                 sequence = getSequenceString(fastaString);              
64                 setAligned(sequence);   
65                 //setUnaligned removes any gap characters for us                                                
66                 setUnaligned(sequence);         
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "Sequence", "Sequence");
70                 exit(1);
71         }                                                               
72 }
73
74 //********************************************************************************************************************
75 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
76 Sequence::Sequence(ifstream& fastaFile){
77         try {
78                 m = MothurOut::getInstance();
79                 initialize();
80                 fastaFile >> name;
81                 name = name.substr(1);
82                 string sequence;
83                 
84                 //read comments
85                 while ((name[0] == '#') && fastaFile) { 
86                         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
87                         sequence = getCommentString(fastaFile);
88                         
89                         if (fastaFile) {  
90                                 fastaFile >> name;  
91                                 name = name.substr(1);  
92                         }else { 
93                                 name = "";
94                                 break;
95                         }
96                 }
97                 
98                 //read real sequence
99                 while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
100                 
101                 sequence = getSequenceString(fastaFile);                
102                 
103                 setAligned(sequence);   
104                 //setUnaligned removes any gap characters for us                                                
105                 setUnaligned(sequence); 
106         }
107         catch(exception& e) {
108                 m->errorOut(e, "Sequence", "Sequence");
109                 exit(1);
110         }                                                       
111 }
112 //********************************************************************************************************************
113 string Sequence::getSequenceString(ifstream& fastaFile) {
114         try {
115                 char letter;
116                 string sequence = "";   
117                 
118                 while(fastaFile){
119                         letter= fastaFile.get();
120                         if(letter == '>'){
121                                 fastaFile.putback(letter);
122                                 break;
123                         }
124                         else if(isprint(letter)){
125                                 letter = toupper(letter);
126                                 if(letter == 'U'){letter = 'T';}
127                                 sequence += letter;
128                         }
129                 }
130                 
131                 return sequence;
132         }
133         catch(exception& e) {
134                 m->errorOut(e, "Sequence", "getSequenceString");
135                 exit(1);
136         }
137 }
138 //********************************************************************************************************************
139 //comment can contain '>' so we need to account for that
140 string Sequence::getCommentString(ifstream& fastaFile) {
141         try {
142                 char letter;
143                 string sequence = "";
144                 
145                 while(fastaFile){
146                         letter=fastaFile.get();
147                         if((letter == '\r') || (letter == '\n')){  
148                                 gobble(fastaFile);  //in case its a \r\n situation
149                                 break;
150                         }
151                 }
152                 
153                 return sequence;
154         }
155         catch(exception& e) {
156                 m->errorOut(e, "Sequence", "getCommentString");
157                 exit(1);
158         }
159 }
160 //********************************************************************************************************************
161 string Sequence::getSequenceString(istringstream& fastaFile) {
162         try {
163                 char letter;
164                 string sequence = "";   
165                 
166                 while(!fastaFile.eof()){
167                         letter= fastaFile.get();
168         
169                         if(letter == '>'){
170                                 fastaFile.putback(letter);
171                                 break;
172                         }
173                         else if(isprint(letter)){
174                                 letter = toupper(letter);
175                                 if(letter == 'U'){letter = 'T';}
176                                 sequence += letter;
177                         }
178                 }
179                 
180                 return sequence;
181         }
182         catch(exception& e) {
183                 m->errorOut(e, "Sequence", "getSequenceString");
184                 exit(1);
185         }
186 }
187 //********************************************************************************************************************
188 //comment can contain '>' so we need to account for that
189 string Sequence::getCommentString(istringstream& fastaFile) {
190         try {
191                 char letter;
192                 string sequence = "";
193                 
194                 while(fastaFile){
195                         letter=fastaFile.get();
196                         if((letter == '\r') || (letter == '\n')){  
197                                 gobble(fastaFile);  //in case its a \r\n situation
198                                 break;
199                         }
200                 }
201                 
202                 return sequence;
203         }
204         catch(exception& e) {
205                 m->errorOut(e, "Sequence", "getCommentString");
206                 exit(1);
207         }
208 }
209 //********************************************************************************************************************
210
211 void Sequence::initialize(){
212         
213         name = "";
214         unaligned = "";
215         aligned = "";
216         pairwise = "";
217         
218         numBases = 0;
219         alignmentLength = 0;
220         isAligned = 0;
221         startPos = -1;
222         endPos = -1;
223         longHomoPolymer = -1;
224         ambigBases = -1;
225         
226 }       
227
228 //********************************************************************************************************************
229
230 void Sequence::setName(string seqName) {
231         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
232         else                                    {       name = seqName;                         }
233 }
234
235 //********************************************************************************************************************
236
237 void Sequence::setUnaligned(string sequence){
238         
239         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
240                 string temp = "";
241                 for(int j=0;j<sequence.length();j++) {
242                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
243                 }
244                 unaligned = temp;
245         }
246         else {
247                 unaligned = sequence;
248         }
249         numBases = unaligned.length();
250         
251 }
252
253 //********************************************************************************************************************
254
255 void Sequence::setAligned(string sequence){
256         
257         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
258         aligned = sequence;
259         alignmentLength = aligned.length();
260         setUnaligned(sequence); 
261
262         if(aligned[0] == '-'){
263                 for(int i=0;i<alignmentLength;i++){
264                         if(aligned[i] == '-'){
265                                 aligned[i] = '.';
266                         }
267                         else{
268                                 break;
269                         }
270                 }
271                 for(int i=alignmentLength-1;i>=0;i--){
272                         if(aligned[i] == '-'){
273                                 aligned[i] = '.';
274                         }
275                         else{
276                                 break;
277                         }
278                 }
279         }
280         isAligned = 1;  
281 }
282
283 //********************************************************************************************************************
284
285 void Sequence::setPairwise(string sequence){
286         pairwise = sequence;
287 }
288
289 //********************************************************************************************************************
290
291 string Sequence::convert2ints() {
292         
293         if(unaligned == "")     {       /* need to throw an error */    }
294         
295         string processed;
296         
297         for(int i=0;i<unaligned.length();i++) {
298                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
299                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
300                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
301                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
302                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
303                 else                                                                    {       processed += '4';       }
304         }
305         return processed;
306 }
307
308 //********************************************************************************************************************
309
310 string Sequence::getName(){
311         return name;
312 }
313
314 //********************************************************************************************************************
315
316 string Sequence::getAligned(){
317         return aligned;
318 }
319
320 //********************************************************************************************************************
321
322 string Sequence::getPairwise(){
323         return pairwise;
324 }
325
326 //********************************************************************************************************************
327
328 string Sequence::getUnaligned(){
329         return unaligned;
330 }
331
332 //********************************************************************************************************************
333
334 int Sequence::getNumBases(){
335         return numBases;
336 }
337
338 //********************************************************************************************************************
339
340 void Sequence::printSequence(ostream& out){
341
342         out << ">" << name << endl;
343         if(isAligned){
344                 out << aligned << endl;
345         }
346         else{
347                 out << unaligned << endl;
348         }
349 }
350
351 //********************************************************************************************************************
352
353 int Sequence::getAlignLength(){
354         return alignmentLength;
355 }
356
357 //********************************************************************************************************************
358
359 int Sequence::getAmbigBases(){
360         if(ambigBases == -1){
361                 ambigBases = 0;
362                 for(int j=0;j<numBases;j++){
363                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
364                                 ambigBases++;
365                         }
366                 }
367         }       
368         
369         return ambigBases;
370 }
371
372 //********************************************************************************************************************
373
374 int Sequence::getLongHomoPolymer(){
375         if(longHomoPolymer == -1){
376                 longHomoPolymer = 1;
377                 int homoPolymer = 1;
378                 for(int j=1;j<numBases;j++){
379                         if(unaligned[j] == unaligned[j-1]){
380                                 homoPolymer++;
381                         }
382                         else{
383                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
384                                 homoPolymer = 1;
385                         }
386                 }
387                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
388         }
389         return longHomoPolymer;
390 }
391
392 //********************************************************************************************************************
393
394 int Sequence::getStartPos(){
395         if(endPos == -1){
396                 for(int j = 0; j < alignmentLength; j++) {
397                         if(aligned[j] != '.'){
398                                 startPos = j + 1;
399                                 break;
400                         }
401                 }
402         }
403         if(isAligned == 0){     startPos = 1;   }
404
405         return startPos;
406 }
407
408 //********************************************************************************************************************
409
410 int Sequence::getEndPos(){
411         if(endPos == -1){
412                 for(int j=alignmentLength-1;j>=0;j--){
413                         if(aligned[j] != '.'){
414                                 endPos = j + 1;
415                                 break;
416                         }
417                 }
418         }
419         if(isAligned == 0){     endPos = numBases;      }
420         
421         return endPos;
422 }
423
424 //********************************************************************************************************************
425
426 bool Sequence::getIsAligned(){
427         return isAligned;
428 }
429
430 //********************************************************************************************************************
431
432 void Sequence::reverseComplement(){
433
434         string temp;
435         for(int i=numBases-1;i>=0;i--){
436                 if(unaligned[i] == 'A')         {       temp += 'T';    }
437                 else if(unaligned[i] == 'T'){   temp += 'A';    }
438                 else if(unaligned[i] == 'G'){   temp += 'C';    }
439                 else if(unaligned[i] == 'C'){   temp += 'G';    }
440                 else                                            {       temp += 'N';    }
441         }
442         unaligned = temp;
443         aligned = temp;
444         
445 }
446 #ifdef USE_MPI  
447 //********************************************************************************************************************
448 int Sequence::MPISend(int receiver) {
449         try {
450                 //send name - string
451                 int length = name.length();
452                 char buf[name.length()];
453                 strcpy(buf, name.c_str()); 
454                 
455                 MPI_Send(&length, 1, MPI_INT, receiver, 2001, MPI_COMM_WORLD); 
456
457                 MPI_Send(&buf, length, MPI_CHAR, receiver, 2001, MPI_COMM_WORLD);
458         
459                 //send aligned - string
460                 length = aligned.length();
461                 char buf2[aligned.length()];
462                 strcpy(buf2, aligned.c_str()); 
463         
464                 MPI_Send(&length, 1, MPI_INT, receiver, 2001, MPI_COMM_WORLD); 
465         
466                 MPI_Send(&buf2, length, MPI_CHAR, receiver, 2001, MPI_COMM_WORLD);
467         
468                 return 0;
469
470         }
471         catch(exception& e) {
472                 m->errorOut(e, "Sequence", "MPISend");
473                 exit(1);
474         }
475 }
476 /**************************************************************************************************/
477 int Sequence::MPIRecv(int sender) {
478         try {
479                 MPI_Status status;
480         
481                 //receive name - string
482                 int length;
483                 MPI_Recv(&length, 1, MPI_INT, sender, 2001, MPI_COMM_WORLD, &status);
484         
485                 char buf[length];
486                 MPI_Recv(&buf, length, MPI_CHAR, sender, 2001, MPI_COMM_WORLD, &status);
487                 name = buf;
488                 
489                 //receive aligned - string
490                 MPI_Recv(&length, 1, MPI_INT, sender, 2001, MPI_COMM_WORLD, &status);
491         
492                 char buf2[length];
493                 MPI_Recv(&buf2, length, MPI_CHAR, sender, 2001, MPI_COMM_WORLD, &status);
494                 aligned = buf2;
495                 
496                 setAligned(aligned);
497                 
498                 return 0;
499
500         }
501         catch(exception& e) {
502                 m->errorOut(e, "Sequence", "MPIRecv");
503                 exit(1);
504         }
505 }
506 #endif
507 /**************************************************************************************************/