]> git.donarmstrong.com Git - mothur.git/blob - sequence.cpp
modified sequence class to read fasta files with comments. this required modification...
[mothur.git] / sequence.cpp
1 /*
2  *  sequence.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/15/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "sequence.hpp"
11
12 /***********************************************************************/
13
14 Sequence::Sequence(){
15         initialize();
16 }
17
18 /***********************************************************************/
19
20 Sequence::Sequence(string newName, string sequence) {
21
22         initialize();   
23         name = newName;
24         
25         //setUnaligned removes any gap characters for us
26         setUnaligned(sequence);
27         setAligned(sequence);
28         
29 }
30 //********************************************************************************************************************
31 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
32 Sequence::Sequence(ifstream& fastaFile){
33
34         initialize();
35         fastaFile >> name;
36         name = name.substr(1);
37         string sequence;
38         
39         //read comments
40         while ((name[0] == '#') && fastaFile) { 
41             while (!fastaFile.eof())    {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
42                 sequence = getCommentString(fastaFile);
43                 
44                 if (fastaFile) {  
45                         fastaFile >> name;  
46                         name = name.substr(1);  
47                 }else { 
48                         name = "";
49                         break;
50                 }
51         }
52         
53         //read real sequence
54         while (!fastaFile.eof())        {       char c = fastaFile.get(); if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
55                 
56         sequence = getSequenceString(fastaFile);                
57         
58         setAligned(sequence);   
59         //setUnaligned removes any gap characters for us                                                
60         setUnaligned(sequence);                                                         
61 }
62 //********************************************************************************************************************
63 string Sequence::getSequenceString(ifstream& fastaFile) {
64         try {
65                 char letter;
66                 string sequence = "";   
67                 
68                 while(fastaFile){
69                         letter= fastaFile.get();
70                         if(letter == '>'){
71                                 fastaFile.putback(letter);
72                                 break;
73                         }
74                         else if(isprint(letter)){
75                                 letter = toupper(letter);
76                                 if(letter == 'U'){letter = 'T';}
77                                 sequence += letter;
78                         }
79                 }
80                 
81                 return sequence;
82         }
83         catch(exception& e) {
84                 errorOut(e, "Sequence", "getSequenceString");
85                 exit(1);
86         }
87 }
88 //********************************************************************************************************************
89 //comment can contain '>' so we need to account for that
90 string Sequence::getCommentString(ifstream& fastaFile) {
91         try {
92                 char letter;
93                 string sequence = "";
94                 
95                 while(fastaFile){
96                         letter=fastaFile.get();
97                         if((letter == '\r') || (letter == '\n')){  
98                                 gobble(fastaFile);  //in case its a \r\n situation
99                                 break;
100                         }
101                 }
102                 
103                 return sequence;
104         }
105         catch(exception& e) {
106                 errorOut(e, "Sequence", "getCommentString");
107                 exit(1);
108         }
109 }
110
111 //********************************************************************************************************************
112
113 void Sequence::initialize(){
114         
115         name = "";
116         unaligned = "";
117         aligned = "";
118         pairwise = "";
119         
120         numBases = 0;
121         alignmentLength = 0;
122         isAligned = 0;
123         startPos = -1;
124         endPos = -1;
125         longHomoPolymer = -1;
126         ambigBases = -1;
127         
128 }       
129
130 //********************************************************************************************************************
131
132 void Sequence::setName(string seqName) {
133         if(seqName[0] == '>')   {       name = seqName.substr(1);       }
134         else                                    {       name = seqName;                         }
135 }
136
137 //********************************************************************************************************************
138
139 void Sequence::setUnaligned(string sequence){
140         
141         if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
142                 string temp = "";
143                 for(int j=0;j<sequence.length();j++) {
144                         if(isalpha(sequence[j]))        {       temp += sequence[j];    }
145                 }
146                 unaligned = temp;
147         }
148         else {
149                 unaligned = sequence;
150         }
151         numBases = unaligned.length();
152         
153 }
154
155 //********************************************************************************************************************
156
157 void Sequence::setAligned(string sequence){
158         
159         //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
160         aligned = sequence;
161         alignmentLength = aligned.length();
162         setUnaligned(sequence); 
163
164         if(aligned[0] == '-'){
165                 for(int i=0;i<alignmentLength;i++){
166                         if(aligned[i] == '-'){
167                                 aligned[i] = '.';
168                         }
169                         else{
170                                 break;
171                         }
172                 }
173                 for(int i=alignmentLength-1;i>=0;i--){
174                         if(aligned[i] == '-'){
175                                 aligned[i] = '.';
176                         }
177                         else{
178                                 break;
179                         }
180                 }
181         }
182         isAligned = 1;  
183 }
184
185 //********************************************************************************************************************
186
187 void Sequence::setPairwise(string sequence){
188         pairwise = sequence;
189 }
190
191 //********************************************************************************************************************
192
193 string Sequence::convert2ints() {
194         
195         if(unaligned == "")     {       /* need to throw an error */    }
196         
197         string processed;
198         
199         for(int i=0;i<unaligned.length();i++) {
200                 if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
201                 else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
202                 else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
203                 else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
204                 else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
205                 else                                                                    {       processed += '4';       }
206         }
207         return processed;
208 }
209
210 //********************************************************************************************************************
211
212 string Sequence::getName(){
213         return name;
214 }
215
216 //********************************************************************************************************************
217
218 string Sequence::getAligned(){
219         return aligned;
220 }
221
222 //********************************************************************************************************************
223
224 string Sequence::getPairwise(){
225         return pairwise;
226 }
227
228 //********************************************************************************************************************
229
230 string Sequence::getUnaligned(){
231         return unaligned;
232 }
233
234 //********************************************************************************************************************
235
236 int Sequence::getNumBases(){
237         return numBases;
238 }
239
240 //********************************************************************************************************************
241
242 void Sequence::printSequence(ostream& out){
243
244         out << ">" << name << endl;
245         if(isAligned){
246                 out << aligned << endl;
247         }
248         else{
249                 out << unaligned << endl;
250         }
251 }
252
253 //********************************************************************************************************************
254
255 int Sequence::getAlignLength(){
256         return alignmentLength;
257 }
258
259 //********************************************************************************************************************
260
261 int Sequence::getAmbigBases(){
262         if(ambigBases == -1){
263                 ambigBases = 0;
264                 for(int j=0;j<numBases;j++){
265                         if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
266                                 ambigBases++;
267                         }
268                 }
269         }       
270         
271         return ambigBases;
272 }
273
274 //********************************************************************************************************************
275
276 int Sequence::getLongHomoPolymer(){
277         if(longHomoPolymer == -1){
278                 longHomoPolymer = 1;
279                 int homoPolymer = 1;
280                 for(int j=1;j<numBases;j++){
281                         if(unaligned[j] == unaligned[j-1]){
282                                 homoPolymer++;
283                         }
284                         else{
285                                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
286                                 homoPolymer = 1;
287                         }
288                 }
289                 if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
290         }
291         return longHomoPolymer;
292 }
293
294 //********************************************************************************************************************
295
296 int Sequence::getStartPos(){
297         if(endPos == -1){
298                 for(int j = 0; j < alignmentLength; j++) {
299                         if(aligned[j] != '.'){
300                                 startPos = j + 1;
301                                 break;
302                         }
303                 }
304         }
305         if(isAligned == 0){     startPos = 1;   }
306
307         return startPos;
308 }
309
310 //********************************************************************************************************************
311
312 int Sequence::getEndPos(){
313         if(endPos == -1){
314                 for(int j=alignmentLength-1;j>=0;j--){
315                         if(aligned[j] != '.'){
316                                 endPos = j + 1;
317                                 break;
318                         }
319                 }
320         }
321         if(isAligned == 0){     endPos = numBases;      }
322         
323         return endPos;
324 }
325
326 //********************************************************************************************************************
327
328 bool Sequence::getIsAligned(){
329         return isAligned;
330 }
331
332 //********************************************************************************************************************
333
334 void Sequence::reverseComplement(){
335
336         string temp;
337         for(int i=numBases-1;i>=0;i--){
338                 if(unaligned[i] == 'A')         {       temp += 'T';    }
339                 else if(unaligned[i] == 'T'){   temp += 'A';    }
340                 else if(unaligned[i] == 'G'){   temp += 'C';    }
341                 else if(unaligned[i] == 'C'){   temp += 'G';    }
342                 else                                            {       temp += 'N';    }
343         }
344         unaligned = temp;
345         aligned = temp;
346         
347 }
348
349 //********************************************************************************************************************