]> git.donarmstrong.com Git - mothur.git/blob - distancecommand.cpp
removed read.dist, read.otu, read.tree and globaldata. added current to defaults...
[mothur.git] / distancecommand.cpp
1 /*
2  *  distancecommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 5/7/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "distancecommand.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13 #include "eachgapignore.h"
14 #include "onegapdist.h"
15 #include "onegapignore.h"
16
17 //**********************************************************************************************************************
18 vector<string> DistanceCommand::setParameters(){        
19         try {
20                 CommandParameter pcolumn("column", "InputTypes", "", "", "none", "none", "OldFastaColumn",false,false); parameters.push_back(pcolumn);
21                 CommandParameter poldfasta("oldfasta", "InputTypes", "", "", "none", "none", "OldFastaColumn",false,false); parameters.push_back(poldfasta);
22                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
23                 CommandParameter poutput("output", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(poutput);
24                 CommandParameter pcalc("calc", "Multiple", "nogaps-eachgap-onegap", "onegap", "", "", "",false,false); parameters.push_back(pcalc);
25                 CommandParameter pcountends("countends", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pcountends);
26                 CommandParameter pcompress("compress", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcompress);
27                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28                 CommandParameter pcutoff("cutoff", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pcutoff);
29                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
30                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
31                 
32                 vector<string> myArray;
33                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
34                 return myArray;
35         }
36         catch(exception& e) {
37                 m->errorOut(e, "DistanceCommand", "setParameters");
38                 exit(1);
39         }
40 }
41 //**********************************************************************************************************************
42 string DistanceCommand::getHelpString(){        
43         try {
44                 string helpString = "";
45                 helpString += "The dist.seqs command reads a file containing sequences and creates a distance file.\n";
46                 helpString += "The dist.seqs command parameters are fasta, oldfasta, column, calc, countends, output, compress, cutoff and processors.  \n";
47                 helpString += "The fasta parameter is required, unless you have a valid current fasta file.\n";
48                 helpString += "The oldfasta and column parameters allow you to append the distances calculated to the column file.\n";
49                 helpString += "The calc parameter allows you to specify the method of calculating the distances.  Your options are: nogaps, onegap or eachgap. The default is onegap.\n";
50                 helpString += "The countends parameter allows you to specify whether to include terminal gaps in distance.  Your options are: T or F. The default is T.\n";
51                 helpString += "The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n";
52                 helpString += "The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n";
53                 helpString += "The processors parameter allows you to specify number of processors to use.  The default is 1.\n";
54                 helpString += "The compress parameter allows you to indicate that you want the resulting distance file compressed.  The default is false.\n";
55                 helpString += "The dist.seqs command should be in the following format: \n";
56                 helpString += "dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n";
57                 helpString += "Example dist.seqs(fasta=amazon.fasta, calc=eachgap, countends=F, cutoff= 2.0, processors=3).\n";
58                 helpString += "Note: No spaces between parameter labels (i.e. calc), '=' and parameters (i.e.yourCalc).\n\n";
59                 return helpString;
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "DistanceCommand", "getHelpString");
63                 exit(1);
64         }
65 }
66 //**********************************************************************************************************************
67 DistanceCommand::DistanceCommand(){     
68         try {
69                 abort = true; calledHelp = true; 
70                 setParameters();
71                 vector<string> tempOutNames;
72                 outputTypes["phylip"] = tempOutNames;
73                 outputTypes["column"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "DistanceCommand", "DistanceCommand");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81 DistanceCommand::DistanceCommand(string option) {
82         try {
83                 abort = false; calledHelp = false;   
84                 Estimators.clear();
85                                 
86                 //allow user to run help
87                 if(option == "help") { help(); abort = true; calledHelp = true; }
88                 
89                 else {
90                         vector<string> myArray = setParameters();
91                         
92                         OptionParser parser(option);
93                         map<string, string> parameters = parser.getParameters();
94                         
95                         ValidParameters validParameter("dist.seqs");
96                         map<string, string>::iterator it2;
97                 
98                         //check to make sure all parameters are valid for command
99                         for (it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
100                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
101                         }
102                         
103                         //initialize outputTypes
104                         vector<string> tempOutNames;
105                         outputTypes["phylip"] = tempOutNames;
106                         outputTypes["column"] = tempOutNames;
107                 
108                         //if the user changes the input directory command factory will send this info to us in the output parameter 
109                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
110                         if (inputDir == "not found"){   inputDir = "";          }
111                         else {
112                                 string path;
113                                 it2 = parameters.find("fasta");
114                                 //user has given a template file
115                                 if(it2 != parameters.end()){ 
116                                         path = m->hasPath(it2->second);
117                                         //if the user has not given a path then, add inputdir. else leave path alone.
118                                         if (path == "") {       parameters["fasta"] = inputDir + it2->second;           }
119                                 }
120                                 
121                                 it2 = parameters.find("oldfasta");
122                                 //user has given a template file
123                                 if(it2 != parameters.end()){ 
124                                         path = m->hasPath(it2->second);
125                                         //if the user has not given a path then, add inputdir. else leave path alone.
126                                         if (path == "") {       parameters["oldfasta"] = inputDir + it2->second;                }
127                                 }
128                                 
129                                 it2 = parameters.find("column");
130                                 //user has given a template file
131                                 if(it2 != parameters.end()){ 
132                                         path = m->hasPath(it2->second);
133                                         //if the user has not given a path then, add inputdir. else leave path alone.
134                                         if (path == "") {       parameters["column"] = inputDir + it2->second;          }
135                                 }
136                         }
137
138                         //check for required parameters
139                         fastafile = validParameter.validFile(parameters, "fasta", true);
140                         if (fastafile == "not found") {                                 
141                                 fastafile = m->getFastaFile(); 
142                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); 
143                                         ifstream inFASTA;
144                                         m->openInputFile(fastafile, inFASTA);
145                                         alignDB = SequenceDB(inFASTA); 
146                                         inFASTA.close();
147                                 }else {         m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
148                         }else if (fastafile == "not open") { abort = true; }    
149                         else{
150                                 ifstream inFASTA;
151                                 m->openInputFile(fastafile, inFASTA);
152                                 alignDB = SequenceDB(inFASTA); 
153                                 inFASTA.close();
154                         }
155                         
156                         oldfastafile = validParameter.validFile(parameters, "oldfasta", true);
157                         if (oldfastafile == "not found") { oldfastafile = ""; }
158                         else if (oldfastafile == "not open") { abort = true; }  
159                         
160                         column = validParameter.validFile(parameters, "column", true);
161                         if (column == "not found") { column = ""; }
162                         else if (column == "not open") { abort = true; }        
163                         
164                         //if the user changes the output directory command factory will send this info to us in the output parameter 
165                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
166                                 outputDir = ""; 
167                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
168                         }
169
170                         //check for optional parameter and set defaults
171                         // ...at some point should added some additional type checking...
172                         calc = validParameter.validFile(parameters, "calc", false);                     
173                         if (calc == "not found") { calc = "onegap";  }
174                         else { 
175                                  if (calc == "default")  {  calc = "onegap";  }
176                         }
177                         m->splitAtDash(calc, Estimators);
178
179                         string temp;
180                         temp = validParameter.validFile(parameters, "countends", false);        if(temp == "not found"){        temp = "T";     }
181                         convert(temp, countends); 
182                         
183                         temp = validParameter.validFile(parameters, "cutoff", false);           if(temp == "not found"){        temp = "1.0"; }
184                         convert(temp, cutoff); 
185                         
186                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
187                         m->setProcessors(temp);
188                         convert(temp, processors);
189                         
190                         temp = validParameter.validFile(parameters, "compress", false);         if(temp == "not found"){  temp = "F"; }
191                         convert(temp, compress);
192
193                         output = validParameter.validFile(parameters, "output", false);         if(output == "not found"){      output = "column"; }
194                         
195                         if (((column != "") && (oldfastafile == "")) || ((column == "") && (oldfastafile != ""))) { m->mothurOut("If you provide column or oldfasta, you must provide both."); m->mothurOutEndLine(); abort=true; }
196                         
197                         if ((column != "") && (oldfastafile != "") && (output != "column")) { m->mothurOut("You have provided column and oldfasta, indicating you want to append distances to your column file. Your output must be in column format to do so."); m->mothurOutEndLine(); abort=true; }
198                         
199                         if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column."); m->mothurOutEndLine(); output = "column"; }
200                         
201                         ValidCalculators validCalculator;
202                         
203                         if (m->isTrue(countends) == true) {
204                                 for (int i=0; i<Estimators.size(); i++) {
205                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
206                                                 if (Estimators[i] == "nogaps")                  {       distCalculator = new ignoreGaps();      }
207                                                 else if (Estimators[i] == "eachgap")    {       distCalculator = new eachGapDist();     }
208                                                 else if (Estimators[i] == "onegap")             {       distCalculator = new oneGapDist();      }
209                                         }
210                                 }
211                         }else {
212                                 for (int i=0; i<Estimators.size(); i++) {
213                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
214                                                 if (Estimators[i] == "nogaps")          {       distCalculator = new ignoreGaps();                                      }
215                                                 else if (Estimators[i] == "eachgap"){   distCalculator = new eachGapIgnoreTermGapDist();        }
216                                                 else if (Estimators[i] == "onegap")     {       distCalculator = new oneGapIgnoreTermGapDist();         }
217                                         }
218                                 }
219                         }
220
221                 }
222                                 
223         }
224         catch(exception& e) {
225                 m->errorOut(e, "DistanceCommand", "DistanceCommand");
226                 exit(1);
227         }
228 }
229 //**********************************************************************************************************************
230
231 int DistanceCommand::execute(){
232         try {
233                 
234                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
235                 
236                 int startTime = time(NULL);
237                 
238                 //save number of new sequence
239                 numNewFasta = alignDB.getNumSeqs();
240                 
241                 //sanity check the oldfasta and column file as well as add oldfasta sequences to alignDB
242                 if ((oldfastafile != "") && (column != ""))  {  if (!(sanityCheck())) { return 0; }  }
243                 
244                 if (m->control_pressed) { return 0; }
245                 
246                 int numSeqs = alignDB.getNumSeqs();
247                 cutoff += 0.005;
248                 
249                 string outputFile;
250                                 
251                 if (output == "lt") { //does the user want lower triangle phylip formatted file 
252                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "phylip.dist";
253                         remove(outputFile.c_str()); outputTypes["phylip"].push_back(outputFile);
254                         
255                         //output numSeqs to phylip formatted dist file
256                 }else if (output == "column") { //user wants column format
257                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
258                         outputTypes["column"].push_back(outputFile);
259                         
260                         //so we don't accidentally overwrite
261                         if (outputFile == column) { 
262                                 string tempcolumn = column + ".old"; 
263                                 rename(column.c_str(), tempcolumn.c_str());
264                         }
265                         
266                         remove(outputFile.c_str());
267                 }else { //assume square
268                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "square.dist";
269                         remove(outputFile.c_str());
270                         outputTypes["phylip"].push_back(outputFile);
271                 }
272                 
273
274 #ifdef USE_MPI
275                 
276                 int pid, start, end; 
277                 int tag = 2001;
278                                 
279                 MPI_Status status; 
280                 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
281                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
282                 
283                 //each process gets where it should start and stop in the file
284                 if (output != "square") {
285                         start = int (sqrt(float(pid)/float(processors)) * numSeqs);
286                         end = int (sqrt(float(pid+1)/float(processors)) * numSeqs);
287                 }else{
288                         start = int ((float(pid)/float(processors)) * numSeqs);
289                         end = int ((float(pid+1)/float(processors)) * numSeqs);
290                 }
291                 
292                 if (output == "column") {
293                         MPI_File outMPI;
294                         int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
295
296                         //char* filename = new char[outputFile.length()];
297                         //memcpy(filename, outputFile.c_str(), outputFile.length());
298                         
299                         char filename[1024];
300                         strcpy(filename, outputFile.c_str());
301                         
302                         MPI_File_open(MPI_COMM_WORLD, filename, amode, MPI_INFO_NULL, &outMPI);
303                         //delete filename;
304
305                         if (pid == 0) { //you are the root process 
306                                 
307                                 //do your part
308                                 string outputMyPart;
309                                 
310                                 driverMPI(start, end, outMPI, cutoff); 
311                                 
312                                 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); delete distCalculator;  return 0; }
313                         
314                                 //wait on chidren
315                                 for(int i = 1; i < processors; i++) { 
316                                         if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
317                                         
318                                         char buf[5];
319                                         MPI_Recv(buf, 5, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
320                                 }
321                         }else { //you are a child process
322                                 //do your part
323                                 driverMPI(start, end, outMPI, cutoff); 
324                                 
325                                 if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
326                         
327                                 char buf[5];
328                                 strcpy(buf, "done"); 
329                                 //tell parent you are done.
330                                 MPI_Send(buf, 5, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
331                         }
332                         
333                         MPI_File_close(&outMPI);
334                         
335                 }else { //lower triangle format
336                         if (pid == 0) { //you are the root process 
337                         
338                                 //do your part
339                                 string outputMyPart;
340                                 unsigned long int mySize;
341                                 
342                                 if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
343                                 else { driverMPI(start, end, outputFile, mySize, output); }
344         
345                                 if (m->control_pressed) {  outputTypes.clear();  delete distCalculator;  return 0; }
346                                 
347                                 int amode=MPI_MODE_APPEND|MPI_MODE_WRONLY|MPI_MODE_CREATE; //
348                                 MPI_File outMPI;
349                                 MPI_File inMPI;
350
351                                 //char* filename = new char[outputFile.length()];
352                                 //memcpy(filename, outputFile.c_str(), outputFile.length());
353                                 
354                                 char filename[1024];
355                                 strcpy(filename, outputFile.c_str());
356
357                                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
358                                 //delete filename;
359
360                                 //wait on chidren
361                                 for(int b = 1; b < processors; b++) { 
362                                         unsigned long int fileSize;
363                                         
364                                         if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
365                                         
366                                         MPI_Recv(&fileSize, 1, MPI_LONG, b, tag, MPI_COMM_WORLD, &status); 
367                                         
368                                         string outTemp = outputFile + toString(b) + ".temp";
369
370                                         char* buf = new char[outTemp.length()];
371                                         memcpy(buf, outTemp.c_str(), outTemp.length());
372                                         
373                                         MPI_File_open(MPI_COMM_SELF, buf, MPI_MODE_DELETE_ON_CLOSE|MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
374                                         delete buf;
375
376                                         int count = 0;
377                                         while (count < fileSize) { 
378                                                 char buf2[1];
379                                                 MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status);
380                                                 MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status);
381                                                 count += 1;
382                                         }
383                                         
384                                         MPI_File_close(&inMPI); //deleted on close
385                                 }
386                                 
387                                 MPI_File_close(&outMPI);
388                         }else { //you are a child process
389                                 //do your part
390                                 unsigned long int size;
391                                 if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
392                                 else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
393                                 
394                                 if (m->control_pressed) { delete distCalculator;  return 0; }
395                         
396                                 //tell parent you are done.
397                                 MPI_Send(&size, 1, MPI_LONG, 0, tag, MPI_COMM_WORLD);
398                         }
399                 }
400                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
401 #else           
402                                 
403         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
404                 //if you don't need to fork anything
405                 if(processors == 1){
406                         if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
407                         else { driver(0, numSeqs, outputFile, "square");  }
408                 }else{ //you have multiple processors
409                         
410                         unsigned long int numDists = 0;
411                         
412                         if (output == "square") {
413                                  numDists = numSeqs * numSeqs;
414                         }else {
415                                 for(int i=0;i<numSeqs;i++){
416                                         for(int j=0;j<i;j++){
417                                                 numDists++;
418                                                 if (numDists > processors) { break; }
419                                         }
420                                 }
421                         }
422                         
423                         if (numDists < processors) { processors = numDists; }
424                         
425                         for (int i = 0; i < processors; i++) {
426                                 distlinePair tempLine;
427                                 lines.push_back(tempLine);
428                                 if (output != "square") {
429                                         lines[i].start = int (sqrt(float(i)/float(processors)) * numSeqs);
430                                         lines[i].end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
431                                 }else{
432                                         lines[i].start = int ((float(i)/float(processors)) * numSeqs);
433                                         lines[i].end = int ((float(i+1)/float(processors)) * numSeqs);
434                                 }
435                                 
436                         }
437                         
438                         createProcesses(outputFile); 
439                 }
440         #else
441                 //ifstream inFASTA;
442                 if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
443                 else { driver(0, numSeqs, outputFile, "square");  }
444         #endif
445         
446 #endif
447                 if (m->control_pressed) { outputTypes.clear();  delete distCalculator; remove(outputFile.c_str()); return 0; }
448                 
449                 #ifdef USE_MPI
450                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
451                                         
452                         if (pid == 0) { //only one process should output to screen
453                 #endif
454                 
455                 //if (output == "square") {  convertMatrix(outputFile); }
456                 
457                 ifstream fileHandle;
458                 fileHandle.open(outputFile.c_str());
459                 if(fileHandle) {
460                         m->gobble(fileHandle);
461                         if (fileHandle.eof()) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff.");  m->mothurOutEndLine(); }
462                 }
463                 
464                 //append the old column file to the new one
465                 if ((oldfastafile != "") && (column != ""))  {
466                         //we had to rename the column file so we didnt overwrite above, but we want to keep old name
467                         if (outputFile == column) { 
468                                 string tempcolumn = column + ".old";
469                                 m->appendFiles(tempcolumn, outputFile);
470                                 remove(tempcolumn.c_str());
471                         }else{
472                                 m->appendFiles(outputFile, column);
473                                 remove(outputFile.c_str());
474                                 outputFile = column;
475                         }
476                         
477                         if (outputDir != "") { 
478                                 string newOutputName = outputDir + m->getSimpleName(outputFile);
479                                 rename(outputFile.c_str(), newOutputName.c_str());
480                                 remove(outputFile.c_str());
481                                 outputFile = newOutputName;
482                         }
483                 }
484
485                 
486                 #ifdef USE_MPI
487                         }
488                 #endif
489                 
490                 if (m->control_pressed) { outputTypes.clear();  delete distCalculator; remove(outputFile.c_str()); return 0; }
491                 
492                 delete distCalculator;
493                 
494                 //set phylip file as new current phylipfile
495                 string current = "";
496                 itTypes = outputTypes.find("phylip");
497                 if (itTypes != outputTypes.end()) {
498                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setPhylipFile(current); }
499                 }
500                 
501                 //set column file as new current columnfile
502                 itTypes = outputTypes.find("column");
503                 if (itTypes != outputTypes.end()) {
504                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setColumnFile(current); }
505                 }
506                 
507                 m->mothurOutEndLine();
508                 m->mothurOut("Output File Name: "); m->mothurOutEndLine();
509                 m->mothurOut(outputFile); m->mothurOutEndLine();
510                 m->mothurOutEndLine();
511                 m->mothurOut("It took " + toString(time(NULL) - startTime) + " to calculate the distances for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
512
513
514                 if (m->isTrue(compress)) {
515                         m->mothurOut("Compressing..."); m->mothurOutEndLine();
516                         m->mothurOut("(Replacing " + outputFile + " with " + outputFile + ".gz)"); m->mothurOutEndLine();
517                         system(("gzip -v " + outputFile).c_str());
518                         outputNames.push_back(outputFile + ".gz");
519                 }else { outputNames.push_back(outputFile); }
520
521                 return 0;
522                 
523         }
524         catch(exception& e) {
525                 m->errorOut(e, "DistanceCommand", "execute");
526                 exit(1);
527         }
528 }
529 /**************************************************************************************************/
530 void DistanceCommand::createProcesses(string filename) {
531         try {
532 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
533                 int process = 1;
534                 processIDS.clear();
535                 
536                 //loop through and create all the processes you want
537                 while (process != processors) {
538                         int pid = fork();
539                         
540                         if (pid > 0) {
541                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
542                                 process++;
543                         }else if (pid == 0){
544                                 if (output != "square") {  driver(lines[process].start, lines[process].end, filename + toString(getpid()) + ".temp", cutoff); }
545                                 else { driver(lines[process].start, lines[process].end, filename + toString(getpid()) + ".temp", "square"); }
546                                 exit(0);
547                         }else { 
548                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
549                                 perror(" : ");
550                                 for (int i=0;i<processIDS.size();i++) {  int temp = processIDS[i]; kill (temp, SIGINT); }
551                                 exit(0);
552                         }
553                 }
554                 
555                 //parent does its part
556                 if (output != "square") {  driver(lines[0].start, lines[0].end, filename, cutoff); }
557                 else { driver(lines[0].start, lines[0].end, filename, "square"); }
558                 
559                 
560                 //force parent to wait until all the processes are done
561                 for (int i=0;i<processIDS.size();i++) { 
562                         int temp = processIDS[i];
563                         wait(&temp);
564                 }
565                 
566                 //append and remove temp files
567                 for (int i=0;i<processIDS.size();i++) { 
568                         m->appendFiles((filename + toString(processIDS[i]) + ".temp"), filename);
569                         remove((filename + toString(processIDS[i]) + ".temp").c_str());
570                 }
571 #endif
572         }
573         catch(exception& e) {
574                 m->errorOut(e, "DistanceCommand", "createProcesses");
575                 exit(1);
576         }
577 }
578
579 /**************************************************************************************************/
580 /////// need to fix to work with calcs and sequencedb
581 int DistanceCommand::driver(int startLine, int endLine, string dFileName, float cutoff){
582         try {
583
584                 int startTime = time(NULL);
585                 
586                 //column file
587                 ofstream outFile(dFileName.c_str(), ios::trunc);
588                 outFile.setf(ios::fixed, ios::showpoint);
589                 outFile << setprecision(4);
590                 
591                 if((output == "lt") && startLine == 0){ outFile << alignDB.getNumSeqs() << endl;        }
592                 
593                 for(int i=startLine;i<endLine;i++){
594                         if(output == "lt")      {       
595                                 string name = alignDB.get(i).getName();
596                                 if (name.length() < 10) { //pad with spaces to make compatible
597                                         while (name.length() < 10) {  name += " ";  }
598                                 }
599                                 outFile << name << '\t';        
600                         }
601                         for(int j=0;j<i;j++){
602                                 
603                                 if (m->control_pressed) { outFile.close(); return 0;  }
604                                 
605                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
606                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
607                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
608                                 
609                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
610                                 double dist = distCalculator->getDist();
611                                 
612                                 if(dist <= cutoff){
613                                         if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
614                                 }
615                                 if (output == "lt") {  outFile << dist << '\t'; }
616                         }
617                         
618                         if (output == "lt") { outFile << endl; }
619                         
620                         if(i % 100 == 0){
621                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
622                         }
623                         
624                 }
625                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
626                 
627                 outFile.close();
628                 
629                 return 1;
630         }
631         catch(exception& e) {
632                 m->errorOut(e, "DistanceCommand", "driver");
633                 exit(1);
634         }
635 }
636 /**************************************************************************************************/
637 /////// need to fix to work with calcs and sequencedb
638 int DistanceCommand::driver(int startLine, int endLine, string dFileName, string square){
639         try {
640
641                 int startTime = time(NULL);
642                 
643                 //column file
644                 ofstream outFile(dFileName.c_str(), ios::trunc);
645                 outFile.setf(ios::fixed, ios::showpoint);
646                 outFile << setprecision(4);
647                 
648                 if(startLine == 0){     outFile << alignDB.getNumSeqs() << endl;        }
649                 
650                 for(int i=startLine;i<endLine;i++){
651                                 
652                         string name = alignDB.get(i).getName();
653                         //pad with spaces to make compatible
654                         if (name.length() < 10) { while (name.length() < 10) {  name += " ";  } }
655                                 
656                         outFile << name << '\t';        
657                         
658                         for(int j=0;j<alignDB.getNumSeqs();j++){
659                                 
660                                 if (m->control_pressed) { outFile.close(); return 0;  }
661                                 
662                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
663                                 double dist = distCalculator->getDist();
664                                 
665                                 outFile << dist << '\t'; 
666                         }
667                         
668                         outFile << endl; 
669                         
670                         if(i % 100 == 0){
671                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
672                         }
673                         
674                 }
675                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
676                 
677                 outFile.close();
678                 
679                 return 1;
680         }
681         catch(exception& e) {
682                 m->errorOut(e, "DistanceCommand", "driver");
683                 exit(1);
684         }
685 }
686 #ifdef USE_MPI
687 /**************************************************************************************************/
688 /////// need to fix to work with calcs and sequencedb
689 int DistanceCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, float cutoff){
690         try {
691                 MPI_Status status;
692                 int startTime = time(NULL);
693                 
694                 string outputString = "";
695                 
696                 for(int i=startLine;i<endLine;i++){
697         
698                         for(int j=0;j<i;j++){
699                                 
700                                 if (m->control_pressed) {  return 0;  }
701                                 
702                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
703                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
704                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
705                                 
706                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
707                                 double dist = distCalculator->getDist();
708                                 
709                                 if(dist <= cutoff){
710                                          outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n'); 
711                                 }
712                         }
713                         
714                         if(i % 100 == 0){
715                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
716                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
717                         }
718                         
719                          
720                         //send results to parent
721                         int length = outputString.length();
722
723                         char* buf = new char[length];
724                         memcpy(buf, outputString.c_str(), length);
725                         
726                         MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status);
727                         outputString = "";
728                         delete buf;
729                         
730                 }
731                 
732                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
733                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;                
734                 return 1;
735         }
736         catch(exception& e) {
737                 m->errorOut(e, "DistanceCommand", "driverMPI");
738                 exit(1);
739         }
740 }
741 /**************************************************************************************************/
742 /////// need to fix to work with calcs and sequencedb
743 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
744         try {
745                 MPI_Status status;
746                 
747                 MPI_File outMPI;
748                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
749
750                 //char* filename = new char[file.length()];
751                 //memcpy(filename, file.c_str(), file.length());
752                 
753                 char filename[1024];
754                 strcpy(filename, file.c_str());
755
756                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
757                 //delete filename;
758
759                 int startTime = time(NULL);
760                 
761                 string outputString = "";
762                 size = 0;
763                 
764                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
765                 
766                 for(int i=startLine;i<endLine;i++){
767                                 
768                         string name = alignDB.get(i).getName();
769                         if (name.length() < 10) { //pad with spaces to make compatible
770                                 while (name.length() < 10) {  name += " ";  }
771                         }
772                         outputString += name + "\t";    
773                         
774                         for(int j=0;j<i;j++){
775                                 
776                                 if (m->control_pressed) {  return 0;  }
777                                 
778                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
779                                 double dist = distCalculator->getDist();
780                                 
781                                 outputString += toString(dist) + "\t"; 
782                         }
783                         
784                         outputString += "\n"; 
785
786                 
787                         if(i % 100 == 0){
788                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
789                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
790                         }
791                         
792                         
793                         //send results to parent
794                         int length = outputString.length();
795                         char* buf = new char[length];
796                         memcpy(buf, outputString.c_str(), length);
797                         
798                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
799                         size += outputString.length();
800                         outputString = "";
801                         delete buf;
802                 }
803                 
804                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
805                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
806                 MPI_File_close(&outMPI);
807                 
808                 return 1;
809         }
810         catch(exception& e) {
811                 m->errorOut(e, "DistanceCommand", "driverMPI");
812                 exit(1);
813         }
814 }
815 /**************************************************************************************************/
816 /////// need to fix to work with calcs and sequencedb
817 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
818         try {
819                 MPI_Status status;
820                 
821                 MPI_File outMPI;
822                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
823
824                 //char* filename = new char[file.length()];
825                 //memcpy(filename, file.c_str(), file.length());
826                 
827                 char filename[1024];
828                 strcpy(filename, file.c_str());
829
830                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
831                 //delete filename;
832
833                 int startTime = time(NULL);
834                 
835                 string outputString = "";
836                 size = 0;
837                 
838                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
839                 
840                 for(int i=startLine;i<endLine;i++){
841                                 
842                         string name = alignDB.get(i).getName();
843                         if (name.length() < 10) { //pad with spaces to make compatible
844                                 while (name.length() < 10) {  name += " ";  }
845                         }
846                         outputString += name + "\t";    
847                         
848                         for(int j=0;j<alignDB.getNumSeqs();j++){
849                                 
850                                 if (m->control_pressed) {  return 0;  }
851                                 
852                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
853                                 double dist = distCalculator->getDist();
854                                 
855                                 outputString += toString(dist) + "\t"; 
856                         }
857                         
858                         outputString += "\n"; 
859
860                 
861                         if(i % 100 == 0){
862                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
863                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
864                         }
865                         
866                         
867                         //send results to parent
868                         int length = outputString.length();
869                         char* buf = new char[length];
870                         memcpy(buf, outputString.c_str(), length);
871                         
872                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
873                         size += outputString.length();
874                         outputString = "";
875                         delete buf;
876                 }
877                 
878                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
879                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
880                 MPI_File_close(&outMPI);
881                 
882                 return 1;
883         }
884         catch(exception& e) {
885                 m->errorOut(e, "DistanceCommand", "driverMPI");
886                 exit(1);
887         }
888 }
889 #endif
890 /**************************************************************************************************
891 int DistanceCommand::convertMatrix(string outputFile) {
892         try{
893
894                 //sort file by first column so the distances for each row are together
895                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
896                 
897                 //use the unix sort 
898                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
899                         string command = "sort -n " + outputFile + " -o " + outfile;
900                         system(command.c_str());
901                 #else //sort using windows sort
902                         string command = "sort " + outputFile + " /O " + outfile;
903                         system(command.c_str());
904                 #endif
905                 
906
907                 //output to new file distance for each row and save positions in file where new row begins
908                 ifstream in;
909                 m->openInputFile(outfile, in);
910                 
911                 ofstream out;
912                 m->openOutputFile(outputFile, out);
913                 
914                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
915
916                 out << alignDB.getNumSeqs() << endl;
917                 
918                 //get first currentRow
919                 string first, currentRow, second;
920                 float dist;
921                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
922                 map<string, float>::iterator it;
923                 
924                 in >> first;
925                 currentRow = first;
926                 
927                 rowDists[first] = 0.00; //distance to yourself is 0.0
928                 
929                 in.seekg(0);
930                 //m->openInputFile(outfile, in);
931                 
932                 while(!in.eof()) {
933                         if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
934                         
935                         in >> first >> second >> dist; m->gobble(in);
936                                 
937                         if (first != currentRow) {
938                                 //print out last row
939                                 out << currentRow << '\t'; //print name
940
941                                 //print dists
942                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
943                                         out << it->second << '\t';
944                                 }
945                                 out << endl;
946                                 
947                                 //start new row
948                                 currentRow = first;
949                                 rowDists.clear();
950                                 rowDists[first] = 0.00;
951                                 rowDists[second] = dist;
952                         }else{
953                                 rowDists[second] = dist;
954                         }
955                 }
956                 //print out last row
957                 out << currentRow << '\t'; //print name
958                                 
959                 //print dists
960                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
961                         out << it->second << '\t';
962                 }
963                 out << endl;
964                 
965                 in.close();
966                 out.close();
967                 
968                 remove(outfile.c_str());
969                 
970                 return 1;
971                 
972         }
973         catch(exception& e) {
974                 m->errorOut(e, "DistanceCommand", "convertMatrix");
975                 exit(1);
976         }
977 }
978 /**************************************************************************************************
979 int DistanceCommand::convertToLowerTriangle(string outputFile) {
980         try{
981
982                 //sort file by first column so the distances for each row are together
983                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
984                 
985                 //use the unix sort 
986                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
987                         string command = "sort -n " + outputFile + " -o " + outfile;
988                         system(command.c_str());
989                 #else //sort using windows sort
990                         string command = "sort " + outputFile + " /O " + outfile;
991                         system(command.c_str());
992                 #endif
993                 
994
995                 //output to new file distance for each row and save positions in file where new row begins
996                 ifstream in;
997                 m->openInputFile(outfile, in);
998                 
999                 ofstream out;
1000                 m->openOutputFile(outputFile, out);
1001                 
1002                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
1003
1004                 out << alignDB.getNumSeqs() << endl;
1005                 
1006                 //get first currentRow
1007                 string first, currentRow, second;
1008                 float dist;
1009                 int i, j;
1010                 i = 0; j = 0;
1011                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
1012                 map<string, float>::iterator it;
1013                 
1014                 in >> first;
1015                 currentRow = first;
1016                 
1017                 rowDists[first] = 0.00; //distance to yourself is 0.0
1018                 
1019                 in.seekg(0);
1020                 //m->openInputFile(outfile, in);
1021                 
1022                 while(!in.eof()) {
1023                         if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
1024                         
1025                         in >> first >> second >> dist; m->gobble(in);
1026                                 
1027                         if (first != currentRow) {
1028                                 //print out last row
1029                                 out << currentRow << '\t'; //print name
1030
1031                                 //print dists
1032                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
1033                                         if (j >= i) { break; }
1034                                         out << it->second << '\t';
1035                                         j++;
1036                                 }
1037                                 out << endl;
1038                                 
1039                                 //start new row
1040                                 currentRow = first;
1041                                 rowDists.clear();
1042                                 rowDists[first] = 0.00;
1043                                 rowDists[second] = dist;
1044                                 j = 0;
1045                                 i++;
1046                         }else{
1047                                 rowDists[second] = dist;
1048                         }
1049                 }
1050                 //print out last row
1051                 out << currentRow << '\t'; //print name
1052                                 
1053                 //print dists
1054                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
1055                         out << it->second << '\t';
1056                 }
1057                 out << endl;
1058                 
1059                 in.close();
1060                 out.close();
1061                 
1062                 remove(outfile.c_str());
1063                 
1064                 return 1;
1065                 
1066         }
1067         catch(exception& e) {
1068                 m->errorOut(e, "DistanceCommand", "convertToLowerTriangle");
1069                 exit(1);
1070         }
1071 }
1072 /**************************************************************************************************/
1073 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
1074 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
1075 //also check to make sure the 2 files have the same alignment length.
1076 bool DistanceCommand::sanityCheck() {
1077         try{
1078                 bool good = true;
1079                 
1080                 //make sure the 2 fasta files have the same alignment length
1081                 ifstream in;
1082                 m->openInputFile(fastafile, in);
1083                 int fastaAlignLength = 0;
1084                 if (in) { 
1085                         Sequence tempIn(in);
1086                         fastaAlignLength = tempIn.getAligned().length();
1087                 }
1088                 in.close();
1089                 
1090                 ifstream in2;
1091                 m->openInputFile(oldfastafile, in2);
1092                 int oldfastaAlignLength = 0;
1093                 if (in2) { 
1094                         Sequence tempIn2(in2);
1095                         oldfastaAlignLength = tempIn2.getAligned().length();
1096                 }
1097                 in2.close();
1098                 
1099                 if (fastaAlignLength != oldfastaAlignLength) { m->mothurOut("fasta files do not have the same alignment length."); m->mothurOutEndLine(); return false;  }
1100                 
1101                 //read fasta file and save names as well as adding them to the alignDB
1102                 set<string> namesOldFasta;
1103                 
1104                 ifstream inFasta;
1105                 m->openInputFile(oldfastafile, inFasta);
1106                 
1107                 while (!inFasta.eof()) {
1108                         if (m->control_pressed) {  inFasta.close(); return good;  }
1109                 
1110                         Sequence temp(inFasta);
1111                         
1112                         if (temp.getName() != "") {
1113                                 namesOldFasta.insert(temp.getName());  //save name
1114                                 alignDB.push_back(temp);  //add to DB
1115                         }
1116                         
1117                         m->gobble(inFasta);
1118                 }
1119                 
1120                 inFasta.close();
1121                 
1122                 //read through the column file checking names and removing distances above the cutoff
1123                 ifstream inDist;
1124                 m->openInputFile(column, inDist);
1125                 
1126                 ofstream outDist;
1127                 string outputFile = column + ".temp";
1128                 m->openOutputFile(outputFile, outDist);
1129                 
1130                 string name1, name2;
1131                 float dist;
1132                 while (!inDist.eof()) {
1133                         if (m->control_pressed) {  inDist.close(); outDist.close(); remove(outputFile.c_str()); return good;  }
1134                 
1135                         inDist >> name1 >> name2 >> dist; m->gobble(inDist);
1136                         
1137                         //both names are in fasta file and distance is below cutoff
1138                         if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) {  good = false; break;  }
1139                         else{
1140                                 if (dist <= cutoff) {
1141                                         outDist << name1 << '\t' << name2 << '\t' << dist << endl;
1142                                 }
1143                         }
1144                 }
1145                 
1146                 inDist.close();
1147                 outDist.close();
1148                 
1149                 if (good) {
1150                         remove(column.c_str());
1151                         rename(outputFile.c_str(), column.c_str());
1152                 }else{
1153                         remove(outputFile.c_str()); //temp file is bad because file mismatch above
1154                 }
1155                 
1156                 return good;
1157                 
1158         }
1159         catch(exception& e) {
1160                 m->errorOut(e, "DistanceCommand", "sanityCheck");
1161                 exit(1);
1162         }
1163 }
1164 /**************************************************************************************************/
1165
1166
1167
1168