]> git.donarmstrong.com Git - mothur.git/blob - filterseqscommand.cpp
0e33ab2047359b432886384cc41d60a806fbdc7c
[mothur.git] / filterseqscommand.cpp
1 /*
2  *  filterseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Thomas Ryabin on 5/4/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "filterseqscommand.h"
11 #include "sequence.hpp"
12
13 /**************************************************************************************/
14
15 FilterSeqsCommand::FilterSeqsCommand(string option)  {
16         try {
17                 abort = false;
18                 filterFileName = "";
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string Array[] =  {"fasta", "trump", "soft", "hard", "vertical", "outputdir","inputdir", "processors"};
26                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27                         
28                         OptionParser parser(option);
29                         map<string,string> parameters = parser.getParameters();
30                         
31                         ValidParameters validParameter;
32                         map<string,string>::iterator it;
33                         
34                         //check to make sure all parameters are valid for command
35                         for (it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //if the user changes the input directory command factory will send this info to us in the output parameter 
40                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
41                         if (inputDir == "not found"){   inputDir = "";          }
42                         else {
43                                 string path;
44                                 it = parameters.find("fasta");
45                                 //user has given a template file
46                                 if(it != parameters.end()){ 
47                                         path = hasPath(it->second);
48                                         //if the user has not given a path then, add inputdir. else leave path alone.
49                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
50                                 }
51                                 
52                                 it = parameters.find("hard");
53                                 //user has given a template file
54                                 if(it != parameters.end()){ 
55                                         path = hasPath(it->second);
56                                         //if the user has not given a path then, add inputdir. else leave path alone.
57                                         if (path == "") {       parameters["hard"] = inputDir + it->second;             }
58                                 }
59                         }
60                         
61                         //check for required parameters
62                         fasta = validParameter.validFile(parameters, "fasta", false);
63                         if (fasta == "not found") { m->mothurOut("fasta is a required parameter for the filter.seqs command."); m->mothurOutEndLine(); abort = true;  }
64                         else { 
65                                 splitAtDash(fasta, fastafileNames);
66                                 
67                                 //go through files and make sure they are good, if not, then disregard them
68                                 for (int i = 0; i < fastafileNames.size(); i++) {
69                                         if (inputDir != "") {
70                                                 string path = hasPath(fastafileNames[i]);
71                                                 //if the user has not given a path then, add inputdir. else leave path alone.
72                                                 if (path == "") {       fastafileNames[i] = inputDir + fastafileNames[i];               }
73                                         }
74
75                                         int ableToOpen;
76                                         ifstream in;
77                                         ableToOpen = openInputFile(fastafileNames[i], in);
78                                         if (ableToOpen == 1) { 
79                                                 m->mothurOut(fastafileNames[i] + " will be disregarded."); m->mothurOutEndLine(); 
80                                                 //erase from file list
81                                                 fastafileNames.erase(fastafileNames.begin()+i);
82                                                 i--;
83                                         }else{  
84                                                 string simpleName = getSimpleName(fastafileNames[i]);
85                                                 filterFileName += simpleName.substr(0, simpleName.find_first_of('.'));
86                                         }
87                                         in.close();
88                                 }
89                                 
90                                 //make sure there is at least one valid file left
91                                 if (fastafileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
92                         }
93                         
94                         if (!abort) {
95                                 //if the user changes the output directory command factory will send this info to us in the output parameter 
96                                 outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
97                                         outputDir = ""; 
98                                         outputDir += hasPath(fastafileNames[0]); //if user entered a file with a path then preserve it  
99                                 }
100                         }
101                         //check for optional parameter and set defaults
102                         // ...at some point should added some additional type checking...
103                         
104                         string temp;
105                         temp = validParameter.validFile(parameters, "trump", false);                    if (temp == "not found") { temp = "*"; }
106                         trump = temp[0];
107                         
108                         temp = validParameter.validFile(parameters, "soft", false);                             if (temp == "not found") { soft = 0; }
109                         else {  soft = (float)atoi(temp.c_str()) / 100.0;  }
110                         
111                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
112                         convert(temp, processors); 
113                         
114                         hard = validParameter.validFile(parameters, "hard", true);                              if (hard == "not found") { hard = ""; }
115                         else if (hard == "not open") { abort = true; }  
116                         
117                         vertical = validParameter.validFile(parameters, "vertical", false);             if (vertical == "not found") { vertical = "T"; }
118                         
119                         numSeqs = 0;
120                         
121                 }
122                 
123         }
124         catch(exception& e) {
125                 m->errorOut(e, "FilterSeqsCommand", "FilterSeqsCommand");
126                 exit(1);
127         }
128 }
129
130 //**********************************************************************************************************************
131
132 void FilterSeqsCommand::help(){
133         try {
134                 m->mothurOut("The filter.seqs command reads a file containing sequences and creates a .filter and .filter.fasta file.\n");
135                 m->mothurOut("The filter.seqs command parameters are fasta, trump, soft, hard and vertical. \n");
136                 m->mothurOut("The fasta parameter is required. You may enter several fasta files to build the filter from and filter, by separating their names with -'s.\n");
137                 m->mothurOut("For example: fasta=abrecovery.fasta-amazon.fasta \n");
138                 m->mothurOut("The trump parameter .... The default is ...\n");
139                 m->mothurOut("The soft parameter .... The default is ....\n");
140                 m->mothurOut("The hard parameter .... The default is ....\n");
141                 m->mothurOut("The vertical parameter .... The default is T.\n");
142                 m->mothurOut("The filter.seqs command should be in the following format: \n");
143                 m->mothurOut("filter.seqs(fasta=yourFastaFile, trump=yourTrump, soft=yourSoft, hard=yourHard, vertical=yourVertical) \n");
144                 m->mothurOut("Example filter.seqs(fasta=abrecovery.fasta, trump=..., soft=..., hard=..., vertical=T).\n");
145                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
146                 
147         }
148         catch(exception& e) {
149                 m->errorOut(e, "FilterSeqsCommand", "help");
150                 exit(1);
151         }
152 }
153
154 /**************************************************************************************/
155
156 int FilterSeqsCommand::execute() {      
157         try {
158         
159                 if (abort == true) { return 0; }
160                 vector<string> outputNames;
161                 
162                 ifstream inFASTA;
163                 openInputFile(fastafileNames[0], inFASTA);
164                 
165                 Sequence testSeq(inFASTA);
166                 alignmentLength = testSeq.getAlignLength();
167                 inFASTA.close();
168                 
169                 ////////////create filter/////////////////
170                 
171                 filter = createFilter();
172                 
173                 ofstream outFilter;
174                 
175                 string filterFile = outputDir + filterFileName + ".filter";
176                 openOutputFile(filterFile, outFilter);
177                 outFilter << filter << endl;
178                 outFilter.close();
179                 outputNames.push_back(filterFile);
180                 
181                 
182                 ////////////run filter/////////////////
183                 
184                 numSeqs = 0;
185                 for (int i = 0; i < fastafileNames.size(); i++) {
186                         ifstream in;
187                         openInputFile(fastafileNames[i], in);
188                         string filteredFasta = outputDir + getRootName(getSimpleName(fastafileNames[i])) + "filter.fasta";
189                         ofstream outFASTA;
190                         openOutputFile(filteredFasta, outFASTA);
191                         outputNames.push_back(filteredFasta);
192                         
193                         
194                         while(!in.eof()){
195                                 if (m->control_pressed) { in.close(); outFASTA.close(); for(int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }  return 0; }
196                                 
197                                 Sequence seq(in);
198                                 if (seq.getName() != "") {
199                                         string align = seq.getAligned();
200                                         string filterSeq = "";
201                                         
202                                         for(int j=0;j<alignmentLength;j++){
203                                                 if(filter[j] == '1'){
204                                                         filterSeq += align[j];
205                                                 }
206                                         }
207                                         
208                                         outFASTA << '>' << seq.getName() << endl << filterSeq << endl;
209                                         numSeqs++;
210                                 }
211                                 gobble(in);
212                         }
213                         outFASTA.close();
214                         in.close();
215                 }
216                 
217                 int filteredLength = 0;
218                 for(int i=0;i<alignmentLength;i++){
219                         if(filter[i] == '1'){   filteredLength++;       }
220                 }
221                 
222                 if (m->control_pressed) {  for(int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }  return 0; }
223
224                 
225                 m->mothurOutEndLine();
226                 m->mothurOut("Length of filtered alignment: " + toString(filteredLength)); m->mothurOutEndLine();
227                 m->mothurOut("Number of columns removed: " + toString((alignmentLength-filteredLength))); m->mothurOutEndLine();
228                 m->mothurOut("Length of the original alignment: " + toString(alignmentLength)); m->mothurOutEndLine();
229                 m->mothurOut("Number of sequences used to construct filter: " + toString(numSeqs)); m->mothurOutEndLine();
230                 
231                 
232                 m->mothurOutEndLine();
233                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
234                 for(int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();      }
235                 m->mothurOutEndLine();
236
237                 return 0;
238                 
239         }
240         catch(exception& e) {
241                 m->errorOut(e, "FilterSeqsCommand", "execute");
242                 exit(1);
243         }
244 }
245 /**************************************************************************************/
246 string FilterSeqsCommand::createFilter() {      
247         try {
248                 string filterString = "";
249                 
250                 Filters F;
251                 
252                 if (soft != 0)                  {  F.setSoft(soft);             }
253                 if (trump != '*')               {  F.setTrump(trump);   }
254                 
255                 F.setLength(alignmentLength);
256                 
257                 if(soft != 0 || isTrue(vertical)){
258                         F.initialize();
259                 }
260                 
261                 if(hard.compare("") != 0)       {       F.doHard(hard);         }
262                 else                                            {       F.setFilter(string(alignmentLength, '1'));      }
263                 
264                 numSeqs = 0;
265                 
266                 if(trump != '*' || isTrue(vertical) || soft != 0){
267                         for (int s = 0; s < fastafileNames.size(); s++) {
268                         
269                                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
270                                 
271                                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
272                                         if(processors == 1){
273                                                 ifstream inFASTA;
274                                                 openInputFile(fastafileNames[s], inFASTA);
275                                                 int numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
276                                                 inFASTA.close();
277                                                 
278                                                 numSeqs += numFastaSeqs;
279                                 
280                                                 lines.push_back(new linePair(0, numFastaSeqs));
281                                 
282                                                 driverCreateFilter(F, fastafileNames[s], lines[0]);
283                                         }else{
284                                                 vector<int> positions;
285                                 
286                                                 ifstream inFASTA;
287                                                 openInputFile(fastafileNames[s], inFASTA);
288                                 
289                                                 string input;
290                                                 while(!inFASTA.eof()){
291                                                         input = getline(inFASTA);
292                                                         if (input.length() != 0) {
293                                                                 if(input[0] == '>'){    long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);  }
294                                                         }
295                                                 }
296                                                 inFASTA.close();
297                                 
298                                                 int numFastaSeqs = positions.size();
299                                                 
300                                                 numSeqs += numFastaSeqs;
301                                 
302                                                 int numSeqsPerProcessor = numFastaSeqs / processors;
303                                 
304                                                 for (int i = 0; i < processors; i++) {
305                                                         long int startPos = positions[ i * numSeqsPerProcessor ];
306                                                         if(i == processors - 1){
307                                                                 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
308                                                         }
309                                                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
310                                                 }
311                                 
312                                                 createProcessesCreateFilter(F, fastafileNames[s]); 
313                                         }
314                                 #else
315                                         ifstream inFASTA;
316                                         openInputFile(fastafileNames[s], inFASTA);
317                                         int numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
318                                         inFASTA.close();
319                                                 
320                                         numSeqs += numFastaSeqs;
321                                 
322                                         lines.push_back(new linePair(0, numFastaSeqs));
323                                 
324                                         driverCreateFilter(F, lines[0], fastafileNames[s]);
325                                 #endif
326                         
327                         
328                         }
329                 }
330                 
331                 F.setNumSeqs(numSeqs);
332                                 
333                 if(isTrue(vertical) == 1)       {       F.doVertical(); }
334                 if(soft != 0)                           {       F.doSoft();             }
335                 
336                 filterString = F.getFilter();
337
338                 return filterString;
339         }
340         catch(exception& e) {
341                 m->errorOut(e, "FilterSeqsCommand", "createFilter");
342                 exit(1);
343         }
344 }
345 /**************************************************************************************/
346 int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair* line) {        
347         try {
348                 
349                 ifstream in;
350                 openInputFile(filename, in);
351                                 
352                 in.seekg(line->start);
353                 
354                 for(int i=0;i<line->numSeqs;i++){
355                                 
356                         if (m->control_pressed) { in.close(); return 1; }
357                                         
358                         Sequence seq(in);
359                         if (seq.getName() != "") {
360                                         if(trump != '*'){       F.doTrump(seq); }
361                                         if(isTrue(vertical) || soft != 0){      F.getFreqs(seq);        }
362                                         cout.flush();
363                         }
364                 }
365                                 
366                 in.close();
367                 
368                 return 0;
369         }
370         catch(exception& e) {
371                 m->errorOut(e, "FilterSeqsCommand", "driverCreateFilter");
372                 exit(1);
373         }
374 }
375 /**************************************************************************************************/
376
377 int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) {
378         try {
379 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
380                 int process = 0;
381                 int exitCommand = 1;
382                 vector<int> processIDS;
383                 
384                 //loop through and create all the processes you want
385                 while (process != processors) {
386                         int pid = vfork();
387                         
388                         if (pid > 0) {
389                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
390                                 process++;
391                         }else if (pid == 0){
392                                 driverCreateFilter(F, filename, lines[process]);
393                                 exit(0);
394                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
395                 }
396                 
397                 //force parent to wait until all the processes are done
398                 for (int i=0;i<processors;i++) { 
399                         int temp = processIDS[i];
400                         wait(&temp);
401                 }
402                 
403                 return exitCommand;
404 #endif          
405         }
406         catch(exception& e) {
407                 m->errorOut(e, "FilterSeqsCommand", "createProcessesCreateFilter");
408                 exit(1);
409         }
410 }
411 /**************************************************************************************/