]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
created mothurOut class to handle logfiles
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13
14 Sequence::Sequence(){
15         m = MothurOut::getInstance();
16         initialize();
17 }
18
19 /***********************************************************************/
20
21 Sequence::Sequence(string newName, string sequence) {
22
23         initialize();   
24         name = newName;
25         
26         //setUnaligned removes any gap characters for us
27         setUnaligned(sequence);
28         setAligned(sequence);
29         
30 }
31 //********************************************************************************************************************
32 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
33 Sequence::Sequence(ifstream& fastaFile){
34
35         initialize();
36         fastaFile >> name;
37         name = name.substr(1);
38         string sequence;
39         
40         //read comments
41         while ((name[0] == '#') && fastaFile) { 
42             while (!fastaFile.eof())    {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
43                 sequence = getCommentString(fastaFile);
44                 
45                 if (fastaFile) {  
46                         fastaFile >> name;  
47                         name = name.substr(1);  
48                 }else { 
49                         name = "";
50                         break;
51                 }
52         }
53         
54         //read real sequence
55         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
56                 
57         sequence = getSequenceString(fastaFile);                
58         
59         setAligned(sequence);   
60         //setUnaligned removes any gap characters for us                                                
61         setUnaligned(sequence);                                                         
62 }
63 //********************************************************************************************************************
64 string Sequence::getSequenceString(ifstream& fastaFile) {
65         try {
66                 char letter;
67                 string sequence = "";   
68                 
69                 while(fastaFile){
70                         letter= fastaFile.get();
71                         if(letter == '>'){
72                                 fastaFile.putback(letter);
73                                 break;
74                         }
75                         else if(isprint(letter)){
76                                 letter = toupper(letter);
77                                 if(letter == 'U'){letter = 'T';}
78                                 sequence += letter;
79                         }
80                 }
81                 
82                 return sequence;
83         }
84         catch(exception& e) {
85                 m->errorOut(e, "Sequence", "getSequenceString");
86                 exit(1);
87         }
88 }
89 //********************************************************************************************************************
90 //comment can contain '>' so we need to account for that
91 string Sequence::getCommentString(ifstream& fastaFile) {
92         try {
93                 char letter;
94                 string sequence = "";
95                 
96                 while(fastaFile){
97                         letter=fastaFile.get();
98                         if((letter == '\r') || (letter == '\n')){  
99                                 gobble(fastaFile);  //in case its a \r\n situation
100                                 break;
101                         }
102                 }
103                 
104                 return sequence;
105         }
106         catch(exception& e) {
107                 m->errorOut(e, "Sequence", "getCommentString");
108                 exit(1);
109         }
110 }
111
112 //********************************************************************************************************************
113
114 void Sequence::initialize(){
115         
116         name = "";
117         unaligned = "";
118         aligned = "";
119         pairwise = "";
120         
121         numBases = 0;
122         alignmentLength = 0;
123         isAligned = 0;
124         startPos = -1;
125         endPos = -1;
126         longHomoPolymer = -1;
127         ambigBases = -1;
128         
129 }       
130
131 //********************************************************************************************************************
132
133 void Sequence::setName(string seqName) {
134         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
135         else                                    {       name = seqName;                         }
136 }
137
138 //********************************************************************************************************************
139
140 void Sequence::setUnaligned(string sequence){
141         
142         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
143                 string temp = "";
144                 for(int j=0;j<sequence.length();j++) {
145                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
146                 }
147                 unaligned = temp;
148         }
149         else {
150                 unaligned = sequence;
151         }
152         numBases = unaligned.length();
153         
154 }
155
156 //********************************************************************************************************************
157
158 void Sequence::setAligned(string sequence){
159         
160         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
161         aligned = sequence;
162         alignmentLength = aligned.length();
163         setUnaligned(sequence); 
164
165         if(aligned[0] == '-'){
166                 for(int i=0;i<alignmentLength;i++){
167                         if(aligned[i] == '-'){
168                                 aligned[i] = '.';
169                         }
170                         else{
171                                 break;
172                         }
173                 }
174                 for(int i=alignmentLength-1;i>=0;i--){
175                         if(aligned[i] == '-'){
176                                 aligned[i] = '.';
177                         }
178                         else{
179                                 break;
180                         }
181                 }
182         }
183         isAligned = 1;  
184 }
185
186 //********************************************************************************************************************
187
188 void Sequence::setPairwise(string sequence){
189         pairwise = sequence;
190 }
191
192 //********************************************************************************************************************
193
194 string Sequence::convert2ints() {
195         
196         if(unaligned == "")     {       /* need to throw an error */    }
197         
198         string processed;
199         
200         for(int i=0;i<unaligned.length();i++) {
201                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
202                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
203                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
204                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
205                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
206                 else                                                                    {       processed += '4';       }
207         }
208         return processed;
209 }
210
211 //********************************************************************************************************************
212
213 string Sequence::getName(){
214         return name;
215 }
216
217 //********************************************************************************************************************
218
219 string Sequence::getAligned(){
220         return aligned;
221 }
222
223 //********************************************************************************************************************
224
225 string Sequence::getPairwise(){
226         return pairwise;
227 }
228
229 //********************************************************************************************************************
230
231 string Sequence::getUnaligned(){
232         return unaligned;
233 }
234
235 //********************************************************************************************************************
236
237 int Sequence::getNumBases(){
238         return numBases;
239 }
240
241 //********************************************************************************************************************
242
243 void Sequence::printSequence(ostream& out){
244
245         out << ">" << name << endl;
246         if(isAligned){
247                 out << aligned << endl;
248         }
249         else{
250                 out << unaligned << endl;
251         }
252 }
253
254 //********************************************************************************************************************
255
256 int Sequence::getAlignLength(){
257         return alignmentLength;
258 }
259
260 //********************************************************************************************************************
261
262 int Sequence::getAmbigBases(){
263         if(ambigBases == -1){
264                 ambigBases = 0;
265                 for(int j=0;j<numBases;j++){
266                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
267                                 ambigBases++;
268                         }
269                 }
270         }       
271         
272         return ambigBases;
273 }
274
275 //********************************************************************************************************************
276
277 int Sequence::getLongHomoPolymer(){
278         if(longHomoPolymer == -1){
279                 longHomoPolymer = 1;
280                 int homoPolymer = 1;
281                 for(int j=1;j<numBases;j++){
282                         if(unaligned[j] == unaligned[j-1]){
283                                 homoPolymer++;
284                         }
285                         else{
286                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
287                                 homoPolymer = 1;
288                         }
289                 }
290                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
291         }
292         return longHomoPolymer;
293 }
294
295 //********************************************************************************************************************
296
297 int Sequence::getStartPos(){
298         if(endPos == -1){
299                 for(int j = 0; j < alignmentLength; j++) {
300                         if(aligned[j] != '.'){
301                                 startPos = j + 1;
302                                 break;
303                         }
304                 }
305         }
306         if(isAligned == 0){     startPos = 1;   }
307
308         return startPos;
309 }
310
311 //********************************************************************************************************************
312
313 int Sequence::getEndPos(){
314         if(endPos == -1){
315                 for(int j=alignmentLength-1;j>=0;j--){
316                         if(aligned[j] != '.'){
317                                 endPos = j + 1;
318                                 break;
319                         }
320                 }
321         }
322         if(isAligned == 0){     endPos = numBases;      }
323         
324         return endPos;
325 }
326
327 //********************************************************************************************************************
328
329 bool Sequence::getIsAligned(){
330         return isAligned;
331 }
332
333 //********************************************************************************************************************
334
335 void Sequence::reverseComplement(){
336
337         string temp;
338         for(int i=numBases-1;i>=0;i--){
339                 if(unaligned[i] == 'A')         {       temp += 'T';    }
340                 else if(unaligned[i] == 'T'){   temp += 'A';    }
341                 else if(unaligned[i] == 'G'){   temp += 'C';    }
342                 else if(unaligned[i] == 'C'){   temp += 'G';    }
343                 else                                            {       temp += 'N';    }
344         }
345         unaligned = temp;
346         aligned = temp;
347         
348 }
349
350 //********************************************************************************************************************