]> git.donarmstrong.com Git - mothur.git/blob - filterseqscommand.cpp
added getCommandInfoCommand for gui
[mothur.git] / filterseqscommand.cpp
1 /*
2  *  filterseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Thomas Ryabin on 5/4/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "filterseqscommand.h"
11 #include "sequence.hpp"
12
13
14 //**********************************************************************************************************************
15 vector<string> FilterSeqsCommand::setParameters(){      
16         try {
17                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
18                 CommandParameter phard("hard", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(phard);
19                 CommandParameter ptrump("trump", "String", "", "*", "", "", "",false,false); parameters.push_back(ptrump);
20                 CommandParameter psoft("soft", "Number", "", "0", "", "", "",false,false); parameters.push_back(psoft);
21                 CommandParameter pvertical("vertical", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pvertical);
22                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
23                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
24                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
25                 
26                 vector<string> myArray;
27                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
28                 return myArray;
29         }
30         catch(exception& e) {
31                 m->errorOut(e, "FilterSeqsCommand", "setParameters");
32                 exit(1);
33         }
34 }
35 //**********************************************************************************************************************
36 string FilterSeqsCommand::getHelpString(){      
37         try {
38                 string helpString = "";
39                 helpString += "The filter.seqs command reads a file containing sequences and creates a .filter and .filter.fasta file.\n";
40                 helpString += "The filter.seqs command parameters are fasta, trump, soft, hard, processors and vertical. \n";
41                 helpString += "The fasta parameter is required, unless you have a valid current fasta file. You may enter several fasta files to build the filter from and filter, by separating their names with -'s.\n";
42                 helpString += "For example: fasta=abrecovery.fasta-amazon.fasta \n";
43                 helpString += "The trump option will remove a column if the trump character is found at that position in any sequence of the alignment. Default=*, meaning no trump. \n";
44                 helpString += "A soft mask removes any column where the dominant base (i.e. A, T, G, C, or U) does not occur in at least a designated percentage of sequences. Default=0.\n";
45                 helpString += "The hard parameter allows you to enter a file containing the filter you want to use.\n";
46                 helpString += "The vertical parameter removes columns where all sequences contain a gap character. The default is T.\n";
47                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
48                 helpString += "The filter.seqs command should be in the following format: \n";
49                 helpString += "filter.seqs(fasta=yourFastaFile, trump=yourTrump) \n";
50                 helpString += "Example filter.seqs(fasta=abrecovery.fasta, trump=.).\n";
51                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
52                 return helpString;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "FilterSeqsCommand", "getHelpString");
56                 exit(1);
57         }
58 }
59 //**********************************************************************************************************************
60 FilterSeqsCommand::FilterSeqsCommand(){ 
61         try {
62                 abort = true; calledHelp = true; 
63                 setParameters();
64                 vector<string> tempOutNames;
65                 outputTypes["fasta"] = tempOutNames;
66                 outputTypes["filter"] = tempOutNames;
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "FilterSeqsCommand", "FilterSeqsCommand");
70                 exit(1);
71         }
72 }
73 /**************************************************************************************/
74 FilterSeqsCommand::FilterSeqsCommand(string option)  {
75         try {
76                 abort = false; calledHelp = false;   
77                 filterFileName = "";
78                 
79                 //allow user to run help
80                 if(option == "help") { help(); abort = true; calledHelp = true; }
81                 
82                 else {
83                         vector<string> myArray = setParameters();
84                         
85                         OptionParser parser(option);
86                         map<string,string> parameters = parser.getParameters();
87                         
88                         ValidParameters validParameter("filter.seqs");
89                         map<string,string>::iterator it;
90                         
91                         //check to make sure all parameters are valid for command
92                         for (it = parameters.begin(); it != parameters.end(); it++) { 
93                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
94                         }
95                         
96                         //initialize outputTypes
97                         vector<string> tempOutNames;
98                         outputTypes["fasta"] = tempOutNames;
99                         outputTypes["filter"] = tempOutNames;
100                 
101                         //if the user changes the input directory command factory will send this info to us in the output parameter 
102                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
103                         if (inputDir == "not found"){   inputDir = "";          }
104                         else {
105                                 string path;
106                                 it = parameters.find("fasta");
107                                 //user has given a template file
108                                 if(it != parameters.end()){ 
109                                         path = m->hasPath(it->second);
110                                         //if the user has not given a path then, add inputdir. else leave path alone.
111                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
112                                 }
113                                 
114                                 it = parameters.find("hard");
115                                 //user has given a template file
116                                 if(it != parameters.end()){ 
117                                         path = m->hasPath(it->second);
118                                         //if the user has not given a path then, add inputdir. else leave path alone.
119                                         if (path == "") {       parameters["hard"] = inputDir + it->second;             }
120                                 }
121                         }
122                         
123                         //check for required parameters
124                         fasta = validParameter.validFile(parameters, "fasta", false);
125                         if (fasta == "not found") {                             
126                                 fasta = m->getFastaFile(); 
127                                 if (fasta != "") { fastafileNames.push_back(fasta);  m->mothurOut("Using " + fasta + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
128                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
129                         }
130                         else { 
131                                 m->splitAtDash(fasta, fastafileNames);
132                                 
133                                 //go through files and make sure they are good, if not, then disregard them
134                                 for (int i = 0; i < fastafileNames.size(); i++) {
135                                         if (inputDir != "") {
136                                                 string path = m->hasPath(fastafileNames[i]);
137                                                 //if the user has not given a path then, add inputdir. else leave path alone.
138                                                 if (path == "") {       fastafileNames[i] = inputDir + fastafileNames[i];               }
139                                         }
140
141                                         ifstream in;
142                                         int ableToOpen = m->openInputFile(fastafileNames[i], in, "noerror");
143                                 
144                                         //if you can't open it, try default location
145                                         if (ableToOpen == 1) {
146                                                 if (m->getDefaultPath() != "") { //default path is set
147                                                         string tryPath = m->getDefaultPath() + m->getSimpleName(fastafileNames[i]);
148                                                         m->mothurOut("Unable to open " + fastafileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
149                                                         ifstream in2;
150                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
151                                                         in2.close();
152                                                         fastafileNames[i] = tryPath;
153                                                 }
154                                         }
155                                         
156                                         //if you can't open it, try default location
157                                         if (ableToOpen == 1) {
158                                                 if (m->getOutputDir() != "") { //default path is set
159                                                         string tryPath = m->getOutputDir() + m->getSimpleName(fastafileNames[i]);
160                                                         m->mothurOut("Unable to open " + fastafileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
161                                                         ifstream in2;
162                                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
163                                                         in2.close();
164                                                         fastafileNames[i] = tryPath;
165                                                 }
166                                         }
167                                         
168                                         in.close();
169                                         
170                                         if (ableToOpen == 1) { 
171                                                 m->mothurOut("Unable to open " + fastafileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
172                                                 //erase from file list
173                                                 fastafileNames.erase(fastafileNames.begin()+i);
174                                                 i--;
175                                         }else{  
176                                                 string simpleName = m->getSimpleName(fastafileNames[i]);
177                                                 filterFileName += simpleName.substr(0, simpleName.find_first_of('.'));
178                                         }
179                                         in.close();
180                                 }
181                                 
182                                 //make sure there is at least one valid file left
183                                 if (fastafileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
184                         }
185                         
186                         if (!abort) {
187                                 //if the user changes the output directory command factory will send this info to us in the output parameter 
188                                 outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
189                                         outputDir = ""; 
190                                         outputDir += m->hasPath(fastafileNames[0]); //if user entered a file with a path then preserve it       
191                                 }
192                         }
193                         //check for optional parameter and set defaults
194                         // ...at some point should added some additional type checking...
195                         
196                         string temp;
197                         hard = validParameter.validFile(parameters, "hard", true);                              if (hard == "not found") { hard = ""; }
198                         else if (hard == "not open") { hard = ""; abort = true; }       
199
200                         temp = validParameter.validFile(parameters, "trump", false);                    if (temp == "not found") { temp = "*"; }
201                         trump = temp[0];
202                         
203                         temp = validParameter.validFile(parameters, "soft", false);                             if (temp == "not found") { soft = 0; }
204                         else {  soft = (float)atoi(temp.c_str()) / 100.0;  }
205                         
206                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
207                         m->setProcessors(temp);
208                         convert(temp, processors); 
209                         
210                         vertical = validParameter.validFile(parameters, "vertical", false);             
211                         if (vertical == "not found") { 
212                                 if ((hard == "") && (trump == '*') && (soft == 0)) { vertical = "T"; } //you have not given a hard file or set the trump char.
213                                 else { vertical = "F";  }
214                         }
215                         
216                         numSeqs = 0;
217                 }
218                 
219         }
220         catch(exception& e) {
221                 m->errorOut(e, "FilterSeqsCommand", "FilterSeqsCommand");
222                 exit(1);
223         }
224 }
225 /**************************************************************************************/
226
227 int FilterSeqsCommand::execute() {      
228         try {
229         
230                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
231                 
232                 ifstream inFASTA;
233                 m->openInputFile(fastafileNames[0], inFASTA);
234                 
235                 Sequence testSeq(inFASTA);
236                 alignmentLength = testSeq.getAlignLength();
237                 inFASTA.close();
238                 
239                 ////////////create filter/////////////////
240                 m->mothurOut("Creating Filter... "); m->mothurOutEndLine();
241                 
242                 filter = createFilter();
243                 
244                 m->mothurOutEndLine();  m->mothurOutEndLine();
245                 
246                 if (m->control_pressed) { outputTypes.clear(); return 0; }
247                 
248                 #ifdef USE_MPI
249                         int pid;
250                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
251                                         
252                         if (pid == 0) { //only one process should output the filter
253                 #endif
254                 
255                 ofstream outFilter;
256                 
257                 //prevent giantic file name
258                 string filterFile;
259                 if (fastafileNames.size() > 3) { filterFile = outputDir + "merge.filter"; }
260                 else {  filterFile = outputDir + filterFileName + ".filter";  }
261                 
262                 m->openOutputFile(filterFile, outFilter);
263                 outFilter << filter << endl;
264                 outFilter.close();
265                 outputNames.push_back(filterFile); outputTypes["filter"].push_back(filterFile);
266                 
267                 #ifdef USE_MPI
268                         }
269                 #endif
270                 
271                 ////////////run filter/////////////////
272                 
273                 m->mothurOut("Running Filter... "); m->mothurOutEndLine();
274                 
275                 filterSequences();
276                 
277                 m->mothurOutEndLine();  m->mothurOutEndLine();
278                                         
279                 int filteredLength = 0;
280                 for(int i=0;i<alignmentLength;i++){
281                         if(filter[i] == '1'){   filteredLength++;       }
282                 }
283                 
284                 if (m->control_pressed) {  outputTypes.clear(); for(int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }  return 0; }
285
286                 
287                 m->mothurOutEndLine();
288                 m->mothurOut("Length of filtered alignment: " + toString(filteredLength)); m->mothurOutEndLine();
289                 m->mothurOut("Number of columns removed: " + toString((alignmentLength-filteredLength))); m->mothurOutEndLine();
290                 m->mothurOut("Length of the original alignment: " + toString(alignmentLength)); m->mothurOutEndLine();
291                 m->mothurOut("Number of sequences used to construct filter: " + toString(numSeqs)); m->mothurOutEndLine();
292                 
293                 //set fasta file as new current fastafile
294                 string current = "";
295                 itTypes = outputTypes.find("fasta");
296                 if (itTypes != outputTypes.end()) {
297                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
298                 }
299                 
300                 m->mothurOutEndLine();
301                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
302                 for(int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();      }
303                 m->mothurOutEndLine();
304                 
305                 return 0;
306                 
307         }
308         catch(exception& e) {
309                 m->errorOut(e, "FilterSeqsCommand", "execute");
310                 exit(1);
311         }
312 }
313 /**************************************************************************************/
314 int FilterSeqsCommand::filterSequences() {      
315         try {
316                 
317                 numSeqs = 0;
318                 
319                 for (int s = 0; s < fastafileNames.size(); s++) {
320                         
321                                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
322                                 
323                                 string filteredFasta = outputDir + m->getRootName(m->getSimpleName(fastafileNames[s])) + "filter.fasta";
324 #ifdef USE_MPI  
325                                 int pid, numSeqsPerProcessor, num; 
326                                 int tag = 2001;
327                                 vector<unsigned long int>MPIPos;
328                                                 
329                                 MPI_Status status; 
330                                 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
331                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
332                                 
333                                 MPI_File outMPI;
334                                 MPI_File inMPI;
335                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
336                                 int inMode=MPI_MODE_RDONLY; 
337                                 
338                                 char outFilename[1024];
339                                 strcpy(outFilename, filteredFasta.c_str());
340                         
341                                 char inFileName[1024];
342                                 strcpy(inFileName, fastafileNames[s].c_str());
343                                 
344                                 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
345                                 MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
346
347                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);  return 0;  }
348
349                                 if (pid == 0) { //you are the root process 
350                                         
351                                         MPIPos = m->setFilePosFasta(fastafileNames[s], num); //fills MPIPos, returns numSeqs
352                                         numSeqs += num;
353                                         
354                                         //send file positions to all processes
355                                         for(int i = 1; i < processors; i++) { 
356                                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
357                                                 MPI_Send(&MPIPos[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
358                                         }
359                                         
360                                         //figure out how many sequences you have to do
361                                         numSeqsPerProcessor = num / processors;
362                                         int startIndex =  pid * numSeqsPerProcessor;
363                                         if(pid == (processors - 1)){    numSeqsPerProcessor = num - pid * numSeqsPerProcessor;  }
364                                         
365                                 
366                                         //do your part
367                                         driverMPIRun(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos);
368                                         
369                                         if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);  return 0;  }
370                                         
371                                         //wait on chidren
372                                         for(int i = 1; i < processors; i++) { 
373                                                 char buf[5];
374                                                 MPI_Recv(buf, 5, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
375                                         }
376                                         
377                                 }else { //you are a child process
378                                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
379                                         MPIPos.resize(num+1);
380                                         numSeqs += num;
381                                         MPI_Recv(&MPIPos[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
382                                         
383                                         //figure out how many sequences you have to align
384                                         numSeqsPerProcessor = num / processors;
385                                         int startIndex =  pid * numSeqsPerProcessor;
386                                         if(pid == (processors - 1)){    numSeqsPerProcessor = num - pid * numSeqsPerProcessor;  }
387                                         
388                                         
389                                         //align your part
390                                         driverMPIRun(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos);           
391                                         
392                                         if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);  return 0;  }
393                                         
394                                         char buf[5];
395                                         strcpy(buf, "done"); 
396                                         
397                                         //tell parent you are done.
398                                         MPI_Send(buf, 5, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
399                                 }
400                                 
401                                 MPI_File_close(&outMPI);
402                                 MPI_File_close(&inMPI);
403                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
404                                 
405 #else
406                         vector<unsigned long int> positions = m->divideFile(fastafileNames[s], processors);
407                                 
408                         for (int i = 0; i < (positions.size()-1); i++) {
409                                 lines.push_back(new linePair(positions[i], positions[(i+1)]));
410                         }       
411                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
412                                 if(processors == 1){
413                                         int numFastaSeqs = driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]);
414                                         numSeqs += numFastaSeqs;
415                                 }else{
416                                         int numFastaSeqs = createProcessesRunFilter(filter, fastafileNames[s]); 
417                                         numSeqs += numFastaSeqs;
418                                 
419                                         rename((fastafileNames[s] + toString(processIDS[0]) + ".temp").c_str(), filteredFasta.c_str());
420                                 
421                                         //append fasta files
422                                         for(int i=1;i<processors;i++){
423                                                 m->appendFiles((fastafileNames[s] + toString(processIDS[i]) + ".temp"), filteredFasta);
424                                                 remove((fastafileNames[s] + toString(processIDS[i]) + ".temp").c_str());
425                                         }
426                                 }
427                                 
428                                 if (m->control_pressed) {  return 1; }
429                 #else
430                                 int numFastaSeqs = driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]);
431                                 numSeqs += numFastaSeqs;
432
433                                 if (m->control_pressed) {  return 1; }
434                 #endif
435 #endif
436                         outputNames.push_back(filteredFasta); outputTypes["fasta"].push_back(filteredFasta);
437                 }
438
439                 return 0;
440         }
441         catch(exception& e) {
442                 m->errorOut(e, "FilterSeqsCommand", "filterSequences");
443                 exit(1);
444         }
445 }
446 #ifdef USE_MPI
447 /**************************************************************************************/
448 int FilterSeqsCommand::driverMPIRun(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long int>& MPIPos) { 
449         try {
450                 string outputString = "";
451                 int count = 0;
452                 MPI_Status status; 
453                 
454                 for(int i=0;i<num;i++){
455                 
456                         if (m->control_pressed) { return 0; }
457                 
458                         //read next sequence
459                         int length = MPIPos[start+i+1] - MPIPos[start+i];
460                         char* buf4 = new char[length];
461                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
462                         
463                         string tempBuf = buf4;
464                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
465                         istringstream iss (tempBuf,istringstream::in);
466                         delete buf4;
467         
468                         Sequence seq(iss);  m->gobble(iss);
469                         
470                         if (seq.getName() != "") {
471                                 string align = seq.getAligned();
472                                 string filterSeq = "";
473                                         
474                                 for(int j=0;j<alignmentLength;j++){
475                                         if(filter[j] == '1'){
476                                                 filterSeq += align[j];
477                                         }
478                                 }
479                                 
480                                 count++;
481                                 outputString += ">" + seq.getName() + "\n" + filterSeq + "\n";
482                                 
483                                 if(count % 10 == 0){ //output to file 
484                                         //send results to parent
485                                         int length = outputString.length();
486                                         char* buf = new char[length];
487                                         memcpy(buf, outputString.c_str(), length);
488                                 
489                                         MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status);
490                                         outputString = "";
491                                         delete buf;
492                                 }
493
494                         }
495                         
496                         if((i+1) % 100 == 0){   cout << (i+1) << endl;   m->mothurOutJustToLog(toString(i+1) + "\n");   }
497                 }
498                 
499                 if(outputString != ""){ //output to file 
500                         //send results to parent
501                         int length = outputString.length();
502                         char* buf = new char[length];
503                         memcpy(buf, outputString.c_str(), length);
504                         
505                         MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status);
506                         outputString = "";
507                         delete buf;
508                 }
509                 
510                 if((num) % 100 != 0){   cout << (num) << endl;   m->mothurOutJustToLog(toString(num) + "\n");   }
511                         
512                 return 0;
513         }
514         catch(exception& e) {
515                 m->errorOut(e, "FilterSeqsCommand", "driverRunFilter");
516                 exit(1);
517         }
518 }
519 #endif
520 /**************************************************************************************/
521 int FilterSeqsCommand::driverRunFilter(string F, string outputFilename, string inputFilename, linePair* filePos) {      
522         try {
523                 ofstream out;
524                 m->openOutputFile(outputFilename, out);
525                 
526                 ifstream in;
527                 m->openInputFile(inputFilename, in);
528                                 
529                 in.seekg(filePos->start);
530
531                 bool done = false;
532                 int count = 0;
533         
534                 while (!done) {
535                                 
536                                 if (m->control_pressed) { in.close(); out.close(); return 0; }
537                                 
538                                 Sequence seq(in); m->gobble(in);
539                                 if (seq.getName() != "") {
540                                         string align = seq.getAligned();
541                                         string filterSeq = "";
542                                         
543                                         for(int j=0;j<alignmentLength;j++){
544                                                 if(filter[j] == '1'){
545                                                         filterSeq += align[j];
546                                                 }
547                                         }
548                                         
549                                         out << '>' << seq.getName() << endl << filterSeq << endl;
550                                 count++;
551                         }
552                         
553                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
554                                 unsigned long int pos = in.tellg();
555                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
556                         #else
557                                 if (in.eof()) { break; }
558                         #endif
559                         
560                         //report progress
561                         if((count) % 100 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine();           }
562                 }
563                 //report progress
564                 if((count) % 100 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine();           }
565                 
566                 
567                 out.close();
568                 in.close();
569                 
570                 return count;
571         }
572         catch(exception& e) {
573                 m->errorOut(e, "FilterSeqsCommand", "driverRunFilter");
574                 exit(1);
575         }
576 }
577 /**************************************************************************************************/
578
579 int FilterSeqsCommand::createProcessesRunFilter(string F, string filename) {
580         try {
581 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
582                 int process = 0;
583                 int num = 0;
584                 processIDS.clear();
585                 
586                 //loop through and create all the processes you want
587                 while (process != processors) {
588                         int pid = fork();
589                         
590                         if (pid > 0) {
591                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
592                                 process++;
593                         }else if (pid == 0){
594                                 string filteredFasta = filename + toString(getpid()) + ".temp";
595                                 num = driverRunFilter(F, filteredFasta, filename, lines[process]);
596                                 
597                                 //pass numSeqs to parent
598                                 ofstream out;
599                                 string tempFile = filename +  toString(getpid()) + ".num.temp";
600                                 m->openOutputFile(tempFile, out);
601                                 out << num << endl;
602                                 out.close();
603                                 
604                                 exit(0);
605                         }else { 
606                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
607                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
608                                 exit(0);
609                         }
610                 }
611                 
612                 //force parent to wait until all the processes are done
613                 for (int i=0;i<processors;i++) { 
614                         int temp = processIDS[i];
615                         wait(&temp);
616                 }       
617                                         
618                 for (int i = 0; i < processIDS.size(); i++) {
619                         ifstream in;
620                         string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
621                         m->openInputFile(tempFile, in);
622                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
623                         in.close(); remove(tempFile.c_str());
624                 }
625
626                 
627                 return num;
628 #endif          
629         }
630         catch(exception& e) {
631                 m->errorOut(e, "FilterSeqsCommand", "createProcessesRunFilter");
632                 exit(1);
633         }
634 }
635 /**************************************************************************************/
636 string FilterSeqsCommand::createFilter() {      
637         try {
638                 string filterString = "";                       
639                 Filters F;
640                 
641                 if (soft != 0)                  {  F.setSoft(soft);             }
642                 if (trump != '*')               {  F.setTrump(trump);   }
643                 
644                 F.setLength(alignmentLength);
645                 
646                 if(trump != '*' || m->isTrue(vertical) || soft != 0){
647                         F.initialize();
648                 }
649                 
650                 if(hard.compare("") != 0)       {       F.doHard(hard);         }
651                 else                                            {       F.setFilter(string(alignmentLength, '1'));      }
652                 
653                 numSeqs = 0;
654                 if(trump != '*' || m->isTrue(vertical) || soft != 0){
655                         for (int s = 0; s < fastafileNames.size(); s++) {
656                         
657                                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
658                         
659 #ifdef USE_MPI  
660                                 int pid, numSeqsPerProcessor, num; 
661                                 int tag = 2001;
662                                 vector<unsigned long int> MPIPos;
663                                 
664                                 MPI_Status status; 
665                                 MPI_File inMPI; 
666                                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
667                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
668                                                         
669                                 //char* tempFileName = new char(fastafileNames[s].length());
670                                 //tempFileName = &(fastafileNames[s][0]);
671                                 
672                                 char tempFileName[1024];
673                                 strcpy(tempFileName, fastafileNames[s].c_str());
674                 
675                                 MPI_File_open(MPI_COMM_WORLD, tempFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
676                                 
677                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  return 0;  }
678                                 
679                                 if (pid == 0) { //you are the root process
680                                                 MPIPos = m->setFilePosFasta(fastafileNames[s], num); //fills MPIPos, returns numSeqs
681                                                 numSeqs += num;
682                                                 
683                                                 //send file positions to all processes
684                                                 for(int i = 1; i < processors; i++) { 
685                                                         MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
686                                                         MPI_Send(&MPIPos[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
687                                                 }
688                                                                 
689                                                 //figure out how many sequences you have to do
690                                                 numSeqsPerProcessor = num / processors;
691                                                 int startIndex =  pid * numSeqsPerProcessor;
692                                                 if(pid == (processors - 1)){    numSeqsPerProcessor = num - pid * numSeqsPerProcessor;  }
693                                                 
694                                 
695                                                 //do your part
696                                                 MPICreateFilter(startIndex, numSeqsPerProcessor, F, inMPI, MPIPos);
697                                                 
698                                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  return 0;  }
699                                                                                                 
700                                 }else { //i am the child process
701                                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
702                                         MPIPos.resize(num+1);
703                                         numSeqs += num;
704                                         MPI_Recv(&MPIPos[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
705                                         
706                                         //figure out how many sequences you have to align
707                                         numSeqsPerProcessor = num / processors;
708                                         int startIndex =  pid * numSeqsPerProcessor;
709                                         if(pid == (processors - 1)){    numSeqsPerProcessor = num - pid * numSeqsPerProcessor;  }
710                                         
711                                         
712                                         //do your part
713                                         MPICreateFilter(startIndex, numSeqsPerProcessor, F, inMPI,  MPIPos);
714                                         
715                                         if (m->control_pressed) {  MPI_File_close(&inMPI);  return 0;  }
716                                 }
717                                 
718                                 MPI_File_close(&inMPI);
719                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
720                                 
721 #else
722                 vector<unsigned long int> positions = m->divideFile(fastafileNames[s], processors);
723                                 
724                 for (int i = 0; i < (positions.size()-1); i++) {
725                         lines.push_back(new linePair(positions[i], positions[(i+1)]));
726                 }       
727                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
728                                 if(processors == 1){
729                                         int numFastaSeqs = driverCreateFilter(F, fastafileNames[s], lines[0]);
730                                         numSeqs += numFastaSeqs;
731                                 }else{
732                                         int numFastaSeqs = createProcessesCreateFilter(F, fastafileNames[s]); 
733                                         numSeqs += numFastaSeqs;
734                                 }
735                                 
736                                 if (m->control_pressed) {  return filterString; }
737                 #else
738                                 int numFastaSeqs = driverCreateFilter(F, fastafileNames[s], lines[0]);
739                                 numSeqs += numFastaSeqs;
740                                 if (m->control_pressed) {  return filterString; }
741                 #endif
742 #endif
743                         
744                         }
745                 }
746
747
748 #ifdef USE_MPI  
749                 int pid;
750                 int Atag = 1; int Ttag = 2; int Ctag = 3; int Gtag = 4; int Gaptag = 5;
751                 MPI_Status status;
752                 
753                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
754                 
755                 if(trump != '*' || m->isTrue(vertical) || soft != 0){
756                         
757                         if (pid == 0) { //only one process should output the filter
758                         
759                                 vector<int> temp; temp.resize(alignmentLength+1);
760                                                                 
761                                 //get the frequencies from the child processes
762                                 for(int i = 1; i < processors; i++) { 
763                                 
764                                         for (int j = 0; j < 5; j++) {
765                                         
766                                                 MPI_Recv(&temp[0], (alignmentLength+1), MPI_INT, i, 2001, MPI_COMM_WORLD, &status); 
767                                                 int receiveTag = temp[temp.size()-1];  //child process added a int to the end to indicate what letter count this is for
768                                                 
769                                                 if (receiveTag == Atag) { //you are recieveing the A frequencies
770                                                         for (int k = 0; k < alignmentLength; k++) {             F.a[k] += temp[k];      }
771                                                 }else if (receiveTag == Ttag) { //you are recieveing the T frequencies
772                                                         for (int k = 0; k < alignmentLength; k++) {             F.t[k] += temp[k];      }
773                                                 }else if (receiveTag == Ctag) { //you are recieveing the C frequencies
774                                                         for (int k = 0; k < alignmentLength; k++) {             F.c[k] += temp[k];      }
775                                                 }else if (receiveTag == Gtag) { //you are recieveing the G frequencies
776                                                         for (int k = 0; k < alignmentLength; k++) {             F.g[k] += temp[k];      }
777                                                 }else if (receiveTag == Gaptag) { //you are recieveing the gap frequencies
778                                                         for (int k = 0; k < alignmentLength; k++) {             F.gap[k] += temp[k];    }
779                                                 }
780                                         }
781                                 } 
782                         }else{
783                         
784                                 //send my fequency counts
785                                 F.a.push_back(Atag);
786                                 int ierr = MPI_Send(&(F.a[0]), (alignmentLength+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
787                                 F.t.push_back(Ttag);
788                                 ierr = MPI_Send (&(F.t[0]), (alignmentLength+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
789                                 F.c.push_back(Ctag);
790                                 ierr = MPI_Send(&(F.c[0]), (alignmentLength+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
791                                 F.g.push_back(Gtag);
792                                 ierr = MPI_Send(&(F.g[0]), (alignmentLength+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
793                                 F.gap.push_back(Gaptag);
794                                 ierr = MPI_Send(&(F.gap[0]), (alignmentLength+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
795                         }
796                         
797                 }
798                 
799                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
800                 
801                 if (pid == 0) { //only one process should output the filter
802 #endif
803
804                 F.setNumSeqs(numSeqs);
805                 if(m->isTrue(vertical) == 1)    {       F.doVertical(); }
806                 if(soft != 0)                           {       F.doSoft();             }
807                 filterString = F.getFilter();
808                 
809 #ifdef USE_MPI
810                 //send filter string to kids
811                 //for(int i = 1; i < processors; i++) { 
812                 //      MPI_Send(&filterString[0], alignmentLength, MPI_CHAR, i, 2001, MPI_COMM_WORLD);
813                 //}
814                 MPI_Bcast(&filterString[0], alignmentLength, MPI_CHAR, 0, MPI_COMM_WORLD);
815         }else{
816                 //recieve filterString
817                 char* tempBuf = new char[alignmentLength];
818                 //MPI_Recv(&tempBuf[0], alignmentLength, MPI_CHAR, 0, 2001, MPI_COMM_WORLD, &status);
819                 MPI_Bcast(tempBuf, alignmentLength, MPI_CHAR, 0, MPI_COMM_WORLD);
820                 
821                 filterString = tempBuf;
822                 if (filterString.length() > alignmentLength) { filterString = filterString.substr(0, alignmentLength);  }
823                 delete tempBuf; 
824         }
825         
826         MPI_Barrier(MPI_COMM_WORLD);
827 #endif
828                                 
829                 return filterString;
830         }
831         catch(exception& e) {
832                 m->errorOut(e, "FilterSeqsCommand", "createFilter");
833                 exit(1);
834         }
835 }
836 /**************************************************************************************/
837 int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair* filePos) {     
838         try {
839                 
840                 ifstream in;
841                 m->openInputFile(filename, in);
842                                 
843                 in.seekg(filePos->start);
844
845                 bool done = false;
846                 int count = 0;
847         
848                 while (!done) {
849                                 
850                         if (m->control_pressed) { in.close(); return 1; }
851                                         
852                         Sequence seq(in); m->gobble(in);
853                         if (seq.getName() != "") {
854                                         if (seq.getAligned().length() != alignmentLength) { m->mothurOut("Sequences are not all the same length, please correct."); m->mothurOutEndLine(); m->control_pressed = true;  }
855                                         
856                                         if(trump != '*')                        {       F.doTrump(seq);         }
857                                         if(m->isTrue(vertical) || soft != 0)    {       F.getFreqs(seq);        }
858                                         cout.flush();
859                                         count++;
860                         }
861                         
862                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
863                                 unsigned long int pos = in.tellg();
864                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
865                         #else
866                                 if (in.eof()) { break; }
867                         #endif
868                         
869                         //report progress
870                         if((count) % 100 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine();           }
871                 }
872                 //report progress
873                 if((count) % 100 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine();           }
874                 in.close();
875                 
876                 return count;
877         }
878         catch(exception& e) {
879                 m->errorOut(e, "FilterSeqsCommand", "driverCreateFilter");
880                 exit(1);
881         }
882 }
883 #ifdef USE_MPI
884 /**************************************************************************************/
885 int FilterSeqsCommand::MPICreateFilter(int start, int num, Filters& F, MPI_File& inMPI, vector<unsigned long int>& MPIPos) {    
886         try {
887                 
888                 MPI_Status status; 
889                 int pid;
890                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
891                 
892                 for(int i=0;i<num;i++){
893                         
894                         if (m->control_pressed) { return 0; }
895                         
896                         //read next sequence
897                         int length = MPIPos[start+i+1] - MPIPos[start+i];
898         
899                         char* buf4 = new char[length];
900                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
901                         
902                         string tempBuf = buf4;
903                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
904                         istringstream iss (tempBuf,istringstream::in);
905                         delete buf4;
906
907                         Sequence seq(iss);  
908
909                         if (seq.getAligned().length() != alignmentLength) {  cout << "Alignment length is " << alignmentLength << " and sequence " << seq.getName() << " has length " << seq.getAligned().length() << ", please correct." << endl; exit(1);  }
910                         
911                         if(trump != '*'){       F.doTrump(seq); }
912                         if(m->isTrue(vertical) || soft != 0){   F.getFreqs(seq);        }
913                         cout.flush();
914                                                 
915                         //report progress
916                         if((i+1) % 100 == 0){   cout << (i+1) << endl;   m->mothurOutJustToLog(toString(i+1) + "\n");   }
917                 }
918                 
919                 //report progress
920                 if((num) % 100 != 0){   cout << num << endl; m->mothurOutJustToLog(toString(num) + "\n");       }
921                 
922                 return 0;
923         }
924         catch(exception& e) {
925                 m->errorOut(e, "FilterSeqsCommand", "MPICreateFilter");
926                 exit(1);
927         }
928 }
929 #endif
930 /**************************************************************************************************/
931
932 int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) {
933         try {
934 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
935                 int process = 1;
936                 int num = 0;
937                 processIDS.clear();
938                 
939                 //loop through and create all the processes you want
940                 while (process != processors) {
941                         int pid = fork();
942                         
943                         if (pid > 0) {
944                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
945                                 process++;
946                         }else if (pid == 0){
947                                 //reset child's filter counts to 0;
948                                 F.a.clear(); F.a.resize(alignmentLength, 0);
949                                 F.t.clear(); F.t.resize(alignmentLength, 0);
950                                 F.g.clear(); F.g.resize(alignmentLength, 0);
951                                 F.c.clear(); F.c.resize(alignmentLength, 0);
952                                 F.gap.clear(); F.gap.resize(alignmentLength, 0);
953                                 
954                                 num = driverCreateFilter(F, filename, lines[process]);
955                                 
956                                 //write out filter counts to file
957                                 filename += toString(getpid()) + "filterValues.temp";
958                                 ofstream out;
959                                 m->openOutputFile(filename, out);
960                                 
961                                 out << num << endl;
962                                 out << F.getFilter() << endl;
963                                 for (int k = 0; k < alignmentLength; k++) {             out << F.a[k] << '\t'; }  out << endl;
964                                 for (int k = 0; k < alignmentLength; k++) {             out << F.t[k] << '\t'; }  out << endl;
965                                 for (int k = 0; k < alignmentLength; k++) {             out << F.g[k] << '\t'; }  out << endl;
966                                 for (int k = 0; k < alignmentLength; k++) {             out << F.c[k] << '\t'; }  out << endl;
967                                 for (int k = 0; k < alignmentLength; k++) {             out << F.gap[k] << '\t'; }  out << endl;
968
969                                 //cout << F.getFilter() << endl;
970                                 out.close();
971                                 
972                                 exit(0);
973                         }else { 
974                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
975                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
976                                 exit(0);
977                         }
978                 }
979                 
980                 //parent do your part
981                 num = driverCreateFilter(F, filename, lines[0]);
982                 
983                 //force parent to wait until all the processes are done
984                 for (int i=0;i<(processors-1);i++) { 
985                         int temp = processIDS[i];
986                         wait(&temp);
987                 }
988                 
989                 //parent reads in and combines Filter info
990                 for (int i = 0; i < processIDS.size(); i++) {
991                         string tempFilename = filename + toString(processIDS[i]) + "filterValues.temp";
992                         ifstream in;
993                         m->openInputFile(tempFilename, in);
994                         
995                         int temp, tempNum;
996                         string tempFilterString;
997
998                         in >> tempNum; m->gobble(in); num += tempNum;
999
1000                         in >> tempFilterString;
1001                         F.mergeFilter(tempFilterString);
1002
1003                         for (int k = 0; k < alignmentLength; k++) {             in >> temp; F.a[k] += temp; }           m->gobble(in);
1004                         for (int k = 0; k < alignmentLength; k++) {             in >> temp; F.t[k] += temp; }           m->gobble(in);
1005                         for (int k = 0; k < alignmentLength; k++) {             in >> temp; F.g[k] += temp; }           m->gobble(in);
1006                         for (int k = 0; k < alignmentLength; k++) {             in >> temp; F.c[k] += temp; }           m->gobble(in);
1007                         for (int k = 0; k < alignmentLength; k++) {             in >> temp; F.gap[k] += temp; } m->gobble(in);
1008                                 
1009                         in.close();
1010                         remove(tempFilename.c_str());
1011                 }
1012                 
1013                 return num;
1014 #endif          
1015         }
1016         catch(exception& e) {
1017                 m->errorOut(e, "FilterSeqsCommand", "createProcessesCreateFilter");
1018                 exit(1);
1019         }
1020 }
1021 /**************************************************************************************/