]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.cpp
77da3a60f7534dca70f65d73c262c27bf9734648
[mothur.git] / trimseqscommand.cpp
1 /*
2  *  trimseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 6/6/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "trimseqscommand.h"
11
12 //***************************************************************************************************************
13
14 TrimSeqsCommand::TrimSeqsCommand(string option){
15         try {
16                 
17                 globaldata = GlobalData::getInstance();
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength"};
26                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27                         
28                         parser = new OptionParser();
29                         parser->parse(option, parameters);      delete parser; 
30                         
31                         ValidParameters* validParameter = new ValidParameters();
32                 
33                         //check to make sure all parameters are valid for command
34                         for (it = parameters.begin(); it != parameters.end(); it++) { 
35                                 if (validParameter->isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
36                         }
37                         
38                         //check for required parameters
39                         fastafile = validParameter->validFile(parameters, "fasta", true);
40                         if (fastafile == "not found") { cout << "fasta is a required parameter for the screen.seqs command." << endl; abort = true; }
41                         else if (fastafile == "not open") { abort = true; }     
42                         else { globaldata->setFastaFile(fastafile); }
43                 
44                 
45                         //check for optional parameter and set defaults
46                         // ...at some point should added some additional type checking...
47                         string temp;
48                         temp = validParameter->validFile(parameters, "flip", false);                    if (temp == "not found") { temp = "0"; }
49                         if(isTrue(temp))        {       flip = 1;       }
50                 
51                         temp = validParameter->validFile(parameters, "oligos", false);                  if (temp == "not found") { temp = ""; }
52                         if(temp != "")          {       oligos = 1;      } 
53                         else {  oligos = 0;      }
54
55                         temp = validParameter->validFile(parameters, "maxambig", false);                if (temp == "not found") { temp = "-1"; }
56                         convert(temp, maxAmbig);  
57
58                         temp = validParameter->validFile(parameters, "maxhomop", false);                if (temp == "not found") { temp = "0"; }
59                         convert(temp, maxHomoP);  
60
61                         temp = validParameter->validFile(parameters, "minlength", false);               if (temp == "not found") { temp = "0"; }
62                         convert(temp, minLength); 
63                         
64                         temp = validParameter->validFile(parameters, "maxlength", false);               if (temp == "not found") { temp = "0"; }
65                         convert(temp, maxLength); 
66                         
67                         if(!flip && !oligos && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP ){       cout << "huh?" << endl; }
68                         
69                         delete validParameter;
70                 }
71
72         }
73         catch(exception& e) {
74                 cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function TrimSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
75                 exit(1);
76         }
77         catch(...) {
78                 cout << "An unknown error has occurred in the TrimSeqsCommand class function TrimSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
79                 exit(1);
80         }       
81 }
82 //**********************************************************************************************************************
83
84 void TrimSeqsCommand::help(){
85         try {
86                 cout << "The trim.seqs command reads a fastafile and creates ....." << "\n";
87                 cout << "The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength and maxlength." << "\n";
88                 cout << "The fasta parameter is required." << "\n";
89                 cout << "The flip parameter .... The default is 0." << "\n";
90                 cout << "The oligos parameter .... The default is ""." << "\n";
91                 cout << "The maxambig parameter .... The default is -1." << "\n";
92                 cout << "The maxhomop parameter .... The default is 0." << "\n";
93                 cout << "The minlength parameter .... The default is 0." << "\n";
94                 cout << "The maxlength parameter .... The default is 0." << "\n";
95                 cout << "The trim.seqs command should be in the following format: " << "\n";
96                 cout << "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig,  " << "\n";
97                 cout << "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  " << "\n";    
98                 cout << "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...)." << "\n";
99                 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta)." << "\n" << "\n";
100
101         }
102         catch(exception& e) {
103                 cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
104                 exit(1);
105         }
106         catch(...) {
107                 cout << "An unknown error has occurred in the TrimSeqsCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
108                 exit(1);
109         }       
110 }
111
112
113 //***************************************************************************************************************
114
115 TrimSeqsCommand::~TrimSeqsCommand(){    /*      do nothing      */      }
116
117 //***************************************************************************************************************
118
119 int TrimSeqsCommand::execute(){
120         try{
121         
122                 if (abort == true) { return 0; }
123         
124                 getOligos();
125                 
126                 ifstream inFASTA;
127                 openInputFile(fastafile, inFASTA);
128
129                 ofstream outFASTA;
130                 string trimSeqFile = getRootName(fastafile) + "trim.fasta";
131                 openOutputFile(trimSeqFile, outFASTA);
132                 
133                 ofstream outGroups;
134                 string groupFile = getRootName(fastafile) + "groups"; 
135                 openOutputFile(groupFile, outGroups);
136
137                 ofstream scrapFASTA;
138                 string scrapSeqFile = getRootName(fastafile) + "scrap.fasta";
139                 openOutputFile(scrapSeqFile, scrapFASTA);
140
141                 bool success;
142                 
143                 while(!inFASTA.eof()){
144                         Sequence currSeq(inFASTA);
145                         string origSeq = currSeq.getUnaligned();
146                         string group;
147                         string trashCode = "";
148
149                         if(barcodes.size() != 0){
150                                 success = stripBarcode(currSeq, group);
151                                 if(!success){   trashCode += 'b';       }
152                         }
153                         if(numFPrimers != 0){
154                                 success = stripForward(currSeq);
155                                 if(!success){   trashCode += 'f';       }
156                         }
157                         if(numRPrimers != 0){
158                                 success = stripReverse(currSeq);
159                                 if(!success){   trashCode += 'r';       }
160                         }
161                         if(minLength > 0 || maxLength > 0){
162                                 success = cullLength(currSeq);
163                                 if(!success){   trashCode += 'l';       }
164                         }
165                         if(maxHomoP > 0){
166                                 success = cullHomoP(currSeq);
167                                 if(!success){   trashCode += 'h';       }
168                         }
169                         if(maxAmbig != -1){
170                                 success = cullAmbigs(currSeq);
171                                 if(!success){   trashCode += 'n';       }
172                         }
173                         
174                         if(flip){       currSeq.reverseComplement();    }               // should go last                       
175
176                         if(trashCode.length() == 0){
177                                 currSeq.printSequence(outFASTA);
178                                 outGroups << currSeq.getName() << '\t' << group << endl;
179                         }
180                         else{
181                                 currSeq.setName(currSeq.getName() + '|' + trashCode);
182                                 currSeq.setUnaligned(origSeq);
183                                 currSeq.printSequence(scrapFASTA);
184                         }
185                         
186                         gobble(inFASTA);
187                 }
188                 inFASTA.close();
189                 outFASTA.close();
190                 scrapFASTA.close();
191                 outGroups.close();
192                 
193                 return 0;               
194         }
195         catch(exception& e) {
196                 cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
197                 exit(1);
198         }
199         catch(...) {
200                 cout << "An unknown error has occurred in the TrimSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
201                 exit(1);
202         }
203 }
204
205 //***************************************************************************************************************
206
207 void TrimSeqsCommand::getOligos(){
208
209         ifstream inOligos;
210         //openInputFile(globaldata->getOligosFile(), inOligos);
211
212         string type, oligo, group;
213         
214         while(!inOligos.eof()){
215                 inOligos >> type;
216                 
217                 if(type == "forward"){
218                         inOligos >> oligo;
219                         forPrimer.push_back(oligo);
220                 }
221                 else if(type == "reverse"){
222                         inOligos >> oligo;
223                         revPrimer.push_back(oligo);
224                 }
225                 else if(type == "barcode"){
226                         inOligos >> oligo >> group;
227                         barcodes[oligo]=group;
228                 }
229                 else if(type[0] == '#'){
230                         char c;
231                         while ((c = inOligos.get()) != EOF)     {       if (c == 10){   break;  }       } // get rest of line
232                 }
233                 
234                 gobble(inOligos);
235         }
236
237         numFPrimers = forPrimer.size();
238         numRPrimers = revPrimer.size();
239 }
240
241 //***************************************************************************************************************
242
243 bool TrimSeqsCommand::stripBarcode(Sequence& seq, string& group){
244         
245         string rawSequence = seq.getUnaligned();
246         bool success = 0;       //guilty until proven innocent
247
248         for(map<string,string>::iterator it=barcodes.begin();it!=barcodes.end();it++){
249                 string oligo = it->first;
250                 
251                 if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
252                         success = 0;
253                         break;
254                 }
255                 
256                 if (rawSequence.compare(0,oligo.length(),oligo) == 0){
257                         group = it->second;
258                         seq.setUnaligned(rawSequence.substr(oligo.length()));
259                         success = 1;
260                         break;
261                 }
262         }
263         return success;
264         
265 }
266
267 //***************************************************************************************************************
268
269 bool TrimSeqsCommand::stripForward(Sequence& seq){
270         
271         string rawSequence = seq.getUnaligned();
272         bool success = 0;       //guilty until proven innocent
273         
274         for(int i=0;i<numFPrimers;i++){
275                 string oligo = forPrimer[i];
276
277                 if(rawSequence.length() < oligo.length()){
278                         success = 0;
279                         break;
280                 }
281                 
282                 if (rawSequence.compare(0,oligo.length(),oligo) == 0){
283                         seq.setUnaligned(rawSequence.substr(oligo.length()));
284                         success = 1;
285                         break;
286                 }
287         }
288         
289         return success;
290         
291 }
292
293 //***************************************************************************************************************
294
295 bool TrimSeqsCommand::stripReverse(Sequence& seq){
296         
297         string rawSequence = seq.getUnaligned();
298         bool success = 0;       //guilty until proven innocent
299         
300         for(int i=0;i<numRPrimers;i++){
301                 string oligo = revPrimer[i];
302                 
303                 if(rawSequence.length() < oligo.length()){
304                         success = 0;
305                         break;
306                 }
307                 
308                 if(rawSequence.compare(rawSequence.length()-oligo.length(),oligo.length(),oligo) == 0){
309                         seq.setUnaligned(rawSequence.substr(rawSequence.length()-oligo.length()));
310                         success = 1;
311                         break;
312                 }
313         }       
314         return success;
315         
316 }
317
318 //***************************************************************************************************************
319
320 bool TrimSeqsCommand::cullLength(Sequence& seq){
321         
322         int length = seq.getNumBases();
323         bool success = 0;       //guilty until proven innocent
324         
325         if(length >= minLength && maxLength == 0)                       {       success = 1;    }
326         else if(length >= minLength && length <= maxLength)     {       success = 1;    }
327         else                                                                                            {       success = 0;    }
328         
329         return success;
330         
331 }
332
333 //***************************************************************************************************************
334
335 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
336         
337         int longHomoP = seq.getLongHomoPolymer();
338         bool success = 0;       //guilty until proven innocent
339         
340         if(longHomoP <= maxHomoP){      success = 1;    }
341         else                                    {       success = 0;    }
342         
343         return success;
344         
345 }
346
347 //***************************************************************************************************************
348
349 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
350         
351         int numNs = seq.getAmbigBases();
352         bool success = 0;       //guilty until proven innocent
353         
354         if(numNs <= maxAmbig){  success = 1;    }
355         else                                    {       success = 0;    }
356         
357         return success;
358         
359 }
360
361 //***************************************************************************************************************