]> git.donarmstrong.com Git - mothur.git/blob - distancecommand.cpp
rewrote metastats command in c++, added mothurRemove function to handle ~ error....
[mothur.git] / distancecommand.cpp
1 /*
2  *  distancecommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 5/7/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "distancecommand.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13 #include "eachgapignore.h"
14 #include "onegapdist.h"
15 #include "onegapignore.h"
16
17 //**********************************************************************************************************************
18 vector<string> DistanceCommand::setParameters(){        
19         try {
20                 CommandParameter pcolumn("column", "InputTypes", "", "", "none", "none", "OldFastaColumn",false,false); parameters.push_back(pcolumn);
21                 CommandParameter poldfasta("oldfasta", "InputTypes", "", "", "none", "none", "OldFastaColumn",false,false); parameters.push_back(poldfasta);
22                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
23                 CommandParameter poutput("output", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(poutput);
24                 CommandParameter pcalc("calc", "Multiple", "nogaps-eachgap-onegap", "onegap", "", "", "",false,false); parameters.push_back(pcalc);
25                 CommandParameter pcountends("countends", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pcountends);
26                 CommandParameter pcompress("compress", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcompress);
27                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
28                 CommandParameter pcutoff("cutoff", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pcutoff);
29                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
30                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
31                 
32                 vector<string> myArray;
33                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
34                 return myArray;
35         }
36         catch(exception& e) {
37                 m->errorOut(e, "DistanceCommand", "setParameters");
38                 exit(1);
39         }
40 }
41 //**********************************************************************************************************************
42 string DistanceCommand::getHelpString(){        
43         try {
44                 string helpString = "";
45                 helpString += "The dist.seqs command reads a file containing sequences and creates a distance file.\n";
46                 helpString += "The dist.seqs command parameters are fasta, oldfasta, column, calc, countends, output, compress, cutoff and processors.  \n";
47                 helpString += "The fasta parameter is required, unless you have a valid current fasta file.\n";
48                 helpString += "The oldfasta and column parameters allow you to append the distances calculated to the column file.\n";
49                 helpString += "The calc parameter allows you to specify the method of calculating the distances.  Your options are: nogaps, onegap or eachgap. The default is onegap.\n";
50                 helpString += "The countends parameter allows you to specify whether to include terminal gaps in distance.  Your options are: T or F. The default is T.\n";
51                 helpString += "The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n";
52                 helpString += "The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n";
53                 helpString += "The processors parameter allows you to specify number of processors to use.  The default is 1.\n";
54                 helpString += "The compress parameter allows you to indicate that you want the resulting distance file compressed.  The default is false.\n";
55                 helpString += "The dist.seqs command should be in the following format: \n";
56                 helpString += "dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n";
57                 helpString += "Example dist.seqs(fasta=amazon.fasta, calc=eachgap, countends=F, cutoff= 2.0, processors=3).\n";
58                 helpString += "Note: No spaces between parameter labels (i.e. calc), '=' and parameters (i.e.yourCalc).\n";
59                 return helpString;
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "DistanceCommand", "getHelpString");
63                 exit(1);
64         }
65 }
66 //**********************************************************************************************************************
67 DistanceCommand::DistanceCommand(){     
68         try {
69                 abort = true; calledHelp = true; 
70                 setParameters();
71                 vector<string> tempOutNames;
72                 outputTypes["phylip"] = tempOutNames;
73                 outputTypes["column"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "DistanceCommand", "DistanceCommand");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81 DistanceCommand::DistanceCommand(string option) {
82         try {
83                 abort = false; calledHelp = false;   
84                 Estimators.clear();
85                                 
86                 //allow user to run help
87                 if(option == "help") { help(); abort = true; calledHelp = true; }
88                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
89                 
90                 else {
91                         vector<string> myArray = setParameters();
92                         
93                         OptionParser parser(option);
94                         map<string, string> parameters = parser.getParameters();
95                         
96                         ValidParameters validParameter("dist.seqs");
97                         map<string, string>::iterator it2;
98                 
99                         //check to make sure all parameters are valid for command
100                         for (it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
101                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
102                         }
103                         
104                         //initialize outputTypes
105                         vector<string> tempOutNames;
106                         outputTypes["phylip"] = tempOutNames;
107                         outputTypes["column"] = tempOutNames;
108                 
109                         //if the user changes the input directory command factory will send this info to us in the output parameter 
110                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
111                         if (inputDir == "not found"){   inputDir = "";          }
112                         else {
113                                 string path;
114                                 it2 = parameters.find("fasta");
115                                 //user has given a template file
116                                 if(it2 != parameters.end()){ 
117                                         path = m->hasPath(it2->second);
118                                         //if the user has not given a path then, add inputdir. else leave path alone.
119                                         if (path == "") {       parameters["fasta"] = inputDir + it2->second;           }
120                                 }
121                                 
122                                 it2 = parameters.find("oldfasta");
123                                 //user has given a template file
124                                 if(it2 != parameters.end()){ 
125                                         path = m->hasPath(it2->second);
126                                         //if the user has not given a path then, add inputdir. else leave path alone.
127                                         if (path == "") {       parameters["oldfasta"] = inputDir + it2->second;                }
128                                 }
129                                 
130                                 it2 = parameters.find("column");
131                                 //user has given a template file
132                                 if(it2 != parameters.end()){ 
133                                         path = m->hasPath(it2->second);
134                                         //if the user has not given a path then, add inputdir. else leave path alone.
135                                         if (path == "") {       parameters["column"] = inputDir + it2->second;          }
136                                 }
137                         }
138
139                         //check for required parameters
140                         fastafile = validParameter.validFile(parameters, "fasta", true);
141                         if (fastafile == "not found") {                                 
142                                 fastafile = m->getFastaFile(); 
143                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); 
144                                         ifstream inFASTA;
145                                         m->openInputFile(fastafile, inFASTA);
146                                         alignDB = SequenceDB(inFASTA); 
147                                         inFASTA.close();
148                                 }else {         m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
149                         }else if (fastafile == "not open") { abort = true; }    
150                         else{
151                                 ifstream inFASTA;
152                                 m->openInputFile(fastafile, inFASTA);
153                                 alignDB = SequenceDB(inFASTA); 
154                                 inFASTA.close();
155                                 m->setFastaFile(fastafile);
156                         }
157                         
158                         oldfastafile = validParameter.validFile(parameters, "oldfasta", true);
159                         if (oldfastafile == "not found") { oldfastafile = ""; }
160                         else if (oldfastafile == "not open") { abort = true; }  
161                         
162                         column = validParameter.validFile(parameters, "column", true);
163                         if (column == "not found") { column = ""; }
164                         else if (column == "not open") { abort = true; }        
165                         else { m->setColumnFile(column); }
166                         
167                         //if the user changes the output directory command factory will send this info to us in the output parameter 
168                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
169                                 outputDir = ""; 
170                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
171                         }
172
173                         //check for optional parameter and set defaults
174                         // ...at some point should added some additional type checking...
175                         calc = validParameter.validFile(parameters, "calc", false);                     
176                         if (calc == "not found") { calc = "onegap";  }
177                         else { 
178                                  if (calc == "default")  {  calc = "onegap";  }
179                         }
180                         m->splitAtDash(calc, Estimators);
181
182                         string temp;
183                         temp = validParameter.validFile(parameters, "countends", false);        if(temp == "not found"){        temp = "T";     }
184                         convert(temp, countends); 
185                         
186                         temp = validParameter.validFile(parameters, "cutoff", false);           if(temp == "not found"){        temp = "1.0"; }
187                         convert(temp, cutoff); 
188                         
189                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
190                         m->setProcessors(temp);
191                         convert(temp, processors);
192                         
193                         temp = validParameter.validFile(parameters, "compress", false);         if(temp == "not found"){  temp = "F"; }
194                         convert(temp, compress);
195
196                         output = validParameter.validFile(parameters, "output", false);         if(output == "not found"){      output = "column"; }
197                         
198                         if (((column != "") && (oldfastafile == "")) || ((column == "") && (oldfastafile != ""))) { m->mothurOut("If you provide column or oldfasta, you must provide both."); m->mothurOutEndLine(); abort=true; }
199                         
200                         if ((column != "") && (oldfastafile != "") && (output != "column")) { m->mothurOut("You have provided column and oldfasta, indicating you want to append distances to your column file. Your output must be in column format to do so."); m->mothurOutEndLine(); abort=true; }
201                         
202                         if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column."); m->mothurOutEndLine(); output = "column"; }
203                         
204                         ValidCalculators validCalculator;
205                         
206                         if (m->isTrue(countends) == true) {
207                                 for (int i=0; i<Estimators.size(); i++) {
208                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
209                                                 if (Estimators[i] == "nogaps")                  {       distCalculator = new ignoreGaps();      }
210                                                 else if (Estimators[i] == "eachgap")    {       distCalculator = new eachGapDist();     }
211                                                 else if (Estimators[i] == "onegap")             {       distCalculator = new oneGapDist();      }
212                                         }
213                                 }
214                         }else {
215                                 for (int i=0; i<Estimators.size(); i++) {
216                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
217                                                 if (Estimators[i] == "nogaps")          {       distCalculator = new ignoreGaps();                                      }
218                                                 else if (Estimators[i] == "eachgap"){   distCalculator = new eachGapIgnoreTermGapDist();        }
219                                                 else if (Estimators[i] == "onegap")     {       distCalculator = new oneGapIgnoreTermGapDist();         }
220                                         }
221                                 }
222                         }
223
224                 }
225                                 
226         }
227         catch(exception& e) {
228                 m->errorOut(e, "DistanceCommand", "DistanceCommand");
229                 exit(1);
230         }
231 }
232 //**********************************************************************************************************************
233
234 int DistanceCommand::execute(){
235         try {
236                 
237                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
238                 
239                 int startTime = time(NULL);
240                 
241                 //save number of new sequence
242                 numNewFasta = alignDB.getNumSeqs();
243                 
244                 //sanity check the oldfasta and column file as well as add oldfasta sequences to alignDB
245                 if ((oldfastafile != "") && (column != ""))  {  if (!(sanityCheck())) { return 0; }  }
246                 
247                 if (m->control_pressed) { return 0; }
248                 
249                 int numSeqs = alignDB.getNumSeqs();
250                 cutoff += 0.005;
251                 
252                 if (!alignDB.sameLength()) {  m->mothurOut("[ERROR]: your sequences are not the same length, aborting."); m->mothurOutEndLine(); return 0; }
253                 
254                 string outputFile;
255                                 
256                 if (output == "lt") { //does the user want lower triangle phylip formatted file 
257                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "phylip.dist";
258                         m->mothurRemove(outputFile); outputTypes["phylip"].push_back(outputFile);
259                         
260                         //output numSeqs to phylip formatted dist file
261                 }else if (output == "column") { //user wants column format
262                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
263                         outputTypes["column"].push_back(outputFile);
264                         
265                         //so we don't accidentally overwrite
266                         if (outputFile == column) { 
267                                 string tempcolumn = column + ".old"; 
268                                 rename(column.c_str(), tempcolumn.c_str());
269                         }
270                         
271                         m->mothurRemove(outputFile);
272                 }else { //assume square
273                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "square.dist";
274                         m->mothurRemove(outputFile);
275                         outputTypes["phylip"].push_back(outputFile);
276                 }
277                 
278
279 #ifdef USE_MPI
280                 
281                 int pid, start, end; 
282                 int tag = 2001;
283                                 
284                 MPI_Status status; 
285                 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
286                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
287                 
288                 //each process gets where it should start and stop in the file
289                 if (output != "square") {
290                         start = int (sqrt(float(pid)/float(processors)) * numSeqs);
291                         end = int (sqrt(float(pid+1)/float(processors)) * numSeqs);
292                 }else{
293                         start = int ((float(pid)/float(processors)) * numSeqs);
294                         end = int ((float(pid+1)/float(processors)) * numSeqs);
295                 }
296                 
297                 if (output == "column") {
298                         MPI_File outMPI;
299                         int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
300
301                         //char* filename = new char[outputFile.length()];
302                         //memcpy(filename, outputFile.c_str(), outputFile.length());
303                         
304                         char filename[1024];
305                         strcpy(filename, outputFile.c_str());
306                         
307                         MPI_File_open(MPI_COMM_WORLD, filename, amode, MPI_INFO_NULL, &outMPI);
308                         //delete filename;
309
310                         if (pid == 0) { //you are the root process 
311                                 
312                                 //do your part
313                                 string outputMyPart;
314                                 
315                                 driverMPI(start, end, outMPI, cutoff); 
316                                 
317                                 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); delete distCalculator;  return 0; }
318                         
319                                 //wait on chidren
320                                 for(int i = 1; i < processors; i++) { 
321                                         if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
322                                         
323                                         char buf[5];
324                                         MPI_Recv(buf, 5, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
325                                 }
326                         }else { //you are a child process
327                                 //do your part
328                                 driverMPI(start, end, outMPI, cutoff); 
329                                 
330                                 if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
331                         
332                                 char buf[5];
333                                 strcpy(buf, "done"); 
334                                 //tell parent you are done.
335                                 MPI_Send(buf, 5, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
336                         }
337                         
338                         MPI_File_close(&outMPI);
339                         
340                 }else { //lower triangle format
341                         if (pid == 0) { //you are the root process 
342                         
343                                 //do your part
344                                 string outputMyPart;
345                                 unsigned long int mySize;
346                                 
347                                 if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
348                                 else { driverMPI(start, end, outputFile, mySize, output); }
349         
350                                 if (m->control_pressed) {  outputTypes.clear();  delete distCalculator;  return 0; }
351                                 
352                                 int amode=MPI_MODE_APPEND|MPI_MODE_WRONLY|MPI_MODE_CREATE; //
353                                 MPI_File outMPI;
354                                 MPI_File inMPI;
355
356                                 //char* filename = new char[outputFile.length()];
357                                 //memcpy(filename, outputFile.c_str(), outputFile.length());
358                                 
359                                 char filename[1024];
360                                 strcpy(filename, outputFile.c_str());
361
362                                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
363                                 //delete filename;
364
365                                 //wait on chidren
366                                 for(int b = 1; b < processors; b++) { 
367                                         unsigned long int fileSize;
368                                         
369                                         if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
370                                         
371                                         MPI_Recv(&fileSize, 1, MPI_LONG, b, tag, MPI_COMM_WORLD, &status); 
372                                         
373                                         string outTemp = outputFile + toString(b) + ".temp";
374
375                                         char* buf = new char[outTemp.length()];
376                                         memcpy(buf, outTemp.c_str(), outTemp.length());
377                                         
378                                         MPI_File_open(MPI_COMM_SELF, buf, MPI_MODE_DELETE_ON_CLOSE|MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
379                                         delete buf;
380
381                                         int count = 0;
382                                         while (count < fileSize) { 
383                                                 char buf2[1];
384                                                 MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status);
385                                                 MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status);
386                                                 count += 1;
387                                         }
388                                         
389                                         MPI_File_close(&inMPI); //deleted on close
390                                 }
391                                 
392                                 MPI_File_close(&outMPI);
393                         }else { //you are a child process
394                                 //do your part
395                                 unsigned long int size;
396                                 if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
397                                 else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
398                                 
399                                 if (m->control_pressed) { delete distCalculator;  return 0; }
400                         
401                                 //tell parent you are done.
402                                 MPI_Send(&size, 1, MPI_LONG, 0, tag, MPI_COMM_WORLD);
403                         }
404                 }
405                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
406 #else           
407                                 
408         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
409                 //if you don't need to fork anything
410                 if(processors == 1){
411                         if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
412                         else { driver(0, numSeqs, outputFile, "square");  }
413                 }else{ //you have multiple processors
414                         
415                         unsigned long int numDists = 0;
416                         
417                         if (output == "square") {
418                                  numDists = numSeqs * numSeqs;
419                         }else {
420                                 for(int i=0;i<numSeqs;i++){
421                                         for(int j=0;j<i;j++){
422                                                 numDists++;
423                                                 if (numDists > processors) { break; }
424                                         }
425                                 }
426                         }
427                         
428                         if (numDists < processors) { processors = numDists; }
429                         
430                         for (int i = 0; i < processors; i++) {
431                                 distlinePair tempLine;
432                                 lines.push_back(tempLine);
433                                 if (output != "square") {
434                                         lines[i].start = int (sqrt(float(i)/float(processors)) * numSeqs);
435                                         lines[i].end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
436                                 }else{
437                                         lines[i].start = int ((float(i)/float(processors)) * numSeqs);
438                                         lines[i].end = int ((float(i+1)/float(processors)) * numSeqs);
439                                 }
440                                 
441                         }
442                         
443                         createProcesses(outputFile); 
444                 }
445         #else
446                 //ifstream inFASTA;
447                 if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
448                 else { driver(0, numSeqs, outputFile, "square");  }
449         #endif
450         
451 #endif
452                 if (m->control_pressed) { outputTypes.clear();  delete distCalculator; m->mothurRemove(outputFile); return 0; }
453                 
454                 #ifdef USE_MPI
455                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
456                                         
457                         if (pid == 0) { //only one process should output to screen
458                 #endif
459                 
460                 //if (output == "square") {  convertMatrix(outputFile); }
461                 
462                 ifstream fileHandle;
463                 fileHandle.open(outputFile.c_str());
464                 if(fileHandle) {
465                         m->gobble(fileHandle);
466                         if (fileHandle.eof()) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff.");  m->mothurOutEndLine(); }
467                 }
468                 
469                 //append the old column file to the new one
470                 if ((oldfastafile != "") && (column != ""))  {
471                         //we had to rename the column file so we didnt overwrite above, but we want to keep old name
472                         if (outputFile == column) { 
473                                 string tempcolumn = column + ".old";
474                                 m->appendFiles(tempcolumn, outputFile);
475                                 m->mothurRemove(tempcolumn);
476                         }else{
477                                 m->appendFiles(outputFile, column);
478                                 m->mothurRemove(outputFile);
479                                 outputFile = column;
480                         }
481                         
482                         if (outputDir != "") { 
483                                 string newOutputName = outputDir + m->getSimpleName(outputFile);
484                                 rename(outputFile.c_str(), newOutputName.c_str());
485                                 m->mothurRemove(outputFile);
486                                 outputFile = newOutputName;
487                         }
488                 }
489
490                 
491                 #ifdef USE_MPI
492                         }
493                 #endif
494                 
495                 if (m->control_pressed) { outputTypes.clear();  delete distCalculator; m->mothurRemove(outputFile); return 0; }
496                 
497                 delete distCalculator;
498                 
499                 //set phylip file as new current phylipfile
500                 string current = "";
501                 itTypes = outputTypes.find("phylip");
502                 if (itTypes != outputTypes.end()) {
503                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setPhylipFile(current); }
504                 }
505                 
506                 //set column file as new current columnfile
507                 itTypes = outputTypes.find("column");
508                 if (itTypes != outputTypes.end()) {
509                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setColumnFile(current); }
510                 }
511                 
512                 m->mothurOutEndLine();
513                 m->mothurOut("Output File Name: "); m->mothurOutEndLine();
514                 m->mothurOut(outputFile); m->mothurOutEndLine();
515                 m->mothurOutEndLine();
516                 m->mothurOut("It took " + toString(time(NULL) - startTime) + " to calculate the distances for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
517
518
519                 if (m->isTrue(compress)) {
520                         m->mothurOut("Compressing..."); m->mothurOutEndLine();
521                         m->mothurOut("(Replacing " + outputFile + " with " + outputFile + ".gz)"); m->mothurOutEndLine();
522                         system(("gzip -v " + outputFile).c_str());
523                         outputNames.push_back(outputFile + ".gz");
524                 }else { outputNames.push_back(outputFile); }
525
526                 return 0;
527                 
528         }
529         catch(exception& e) {
530                 m->errorOut(e, "DistanceCommand", "execute");
531                 exit(1);
532         }
533 }
534 /**************************************************************************************************/
535 void DistanceCommand::createProcesses(string filename) {
536         try {
537 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
538                 int process = 1;
539                 processIDS.clear();
540                 
541                 //loop through and create all the processes you want
542                 while (process != processors) {
543                         int pid = fork();
544                         
545                         if (pid > 0) {
546                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
547                                 process++;
548                         }else if (pid == 0){
549                                 if (output != "square") {  driver(lines[process].start, lines[process].end, filename + toString(getpid()) + ".temp", cutoff); }
550                                 else { driver(lines[process].start, lines[process].end, filename + toString(getpid()) + ".temp", "square"); }
551                                 exit(0);
552                         }else { 
553                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
554                                 perror(" : ");
555                                 for (int i=0;i<processIDS.size();i++) {  int temp = processIDS[i]; kill (temp, SIGINT); }
556                                 exit(0);
557                         }
558                 }
559                 
560                 //parent does its part
561                 if (output != "square") {  driver(lines[0].start, lines[0].end, filename, cutoff); }
562                 else { driver(lines[0].start, lines[0].end, filename, "square"); }
563                 
564                 
565                 //force parent to wait until all the processes are done
566                 for (int i=0;i<processIDS.size();i++) { 
567                         int temp = processIDS[i];
568                         wait(&temp);
569                 }
570                 
571                 //append and remove temp files
572                 for (int i=0;i<processIDS.size();i++) { 
573                         m->appendFiles((filename + toString(processIDS[i]) + ".temp"), filename);
574                         m->mothurRemove((filename + toString(processIDS[i]) + ".temp"));
575                 }
576 #endif
577         }
578         catch(exception& e) {
579                 m->errorOut(e, "DistanceCommand", "createProcesses");
580                 exit(1);
581         }
582 }
583
584 /**************************************************************************************************/
585 /////// need to fix to work with calcs and sequencedb
586 int DistanceCommand::driver(int startLine, int endLine, string dFileName, float cutoff){
587         try {
588
589                 int startTime = time(NULL);
590                 
591                 //column file
592                 ofstream outFile(dFileName.c_str(), ios::trunc);
593                 outFile.setf(ios::fixed, ios::showpoint);
594                 outFile << setprecision(4);
595                 
596                 if((output == "lt") && startLine == 0){ outFile << alignDB.getNumSeqs() << endl;        }
597                 
598                 for(int i=startLine;i<endLine;i++){
599                         if(output == "lt")      {       
600                                 string name = alignDB.get(i).getName();
601                                 if (name.length() < 10) { //pad with spaces to make compatible
602                                         while (name.length() < 10) {  name += " ";  }
603                                 }
604                                 outFile << name << '\t';        
605                         }
606                         for(int j=0;j<i;j++){
607                                 
608                                 if (m->control_pressed) { outFile.close(); return 0;  }
609                                 
610                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
611                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
612                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
613                                 
614                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
615                                 double dist = distCalculator->getDist();
616                                 
617                                 if(dist <= cutoff){
618                                         if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
619                                 }
620                                 if (output == "lt") {  outFile << dist << '\t'; }
621                         }
622                         
623                         if (output == "lt") { outFile << endl; }
624                         
625                         if(i % 100 == 0){
626                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
627                         }
628                         
629                 }
630                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
631                 
632                 outFile.close();
633                 
634                 return 1;
635         }
636         catch(exception& e) {
637                 m->errorOut(e, "DistanceCommand", "driver");
638                 exit(1);
639         }
640 }
641 /**************************************************************************************************/
642 /////// need to fix to work with calcs and sequencedb
643 int DistanceCommand::driver(int startLine, int endLine, string dFileName, string square){
644         try {
645
646                 int startTime = time(NULL);
647                 
648                 //column file
649                 ofstream outFile(dFileName.c_str(), ios::trunc);
650                 outFile.setf(ios::fixed, ios::showpoint);
651                 outFile << setprecision(4);
652                 
653                 if(startLine == 0){     outFile << alignDB.getNumSeqs() << endl;        }
654                 
655                 for(int i=startLine;i<endLine;i++){
656                                 
657                         string name = alignDB.get(i).getName();
658                         //pad with spaces to make compatible
659                         if (name.length() < 10) { while (name.length() < 10) {  name += " ";  } }
660                                 
661                         outFile << name << '\t';        
662                         
663                         for(int j=0;j<alignDB.getNumSeqs();j++){
664                                 
665                                 if (m->control_pressed) { outFile.close(); return 0;  }
666                                 
667                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
668                                 double dist = distCalculator->getDist();
669                                 
670                                 outFile << dist << '\t'; 
671                         }
672                         
673                         outFile << endl; 
674                         
675                         if(i % 100 == 0){
676                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
677                         }
678                         
679                 }
680                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
681                 
682                 outFile.close();
683                 
684                 return 1;
685         }
686         catch(exception& e) {
687                 m->errorOut(e, "DistanceCommand", "driver");
688                 exit(1);
689         }
690 }
691 #ifdef USE_MPI
692 /**************************************************************************************************/
693 /////// need to fix to work with calcs and sequencedb
694 int DistanceCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, float cutoff){
695         try {
696                 MPI_Status status;
697                 int startTime = time(NULL);
698                 
699                 string outputString = "";
700                 
701                 for(int i=startLine;i<endLine;i++){
702         
703                         for(int j=0;j<i;j++){
704                                 
705                                 if (m->control_pressed) {  return 0;  }
706                                 
707                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
708                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
709                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
710                                 
711                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
712                                 double dist = distCalculator->getDist();
713                                 
714                                 if(dist <= cutoff){
715                                          outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n'); 
716                                 }
717                         }
718                         
719                         if(i % 100 == 0){
720                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
721                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
722                         }
723                         
724                          
725                         //send results to parent
726                         int length = outputString.length();
727
728                         char* buf = new char[length];
729                         memcpy(buf, outputString.c_str(), length);
730                         
731                         MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status);
732                         outputString = "";
733                         delete buf;
734                         
735                 }
736                 
737                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
738                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;                
739                 return 1;
740         }
741         catch(exception& e) {
742                 m->errorOut(e, "DistanceCommand", "driverMPI");
743                 exit(1);
744         }
745 }
746 /**************************************************************************************************/
747 /////// need to fix to work with calcs and sequencedb
748 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
749         try {
750                 MPI_Status status;
751                 
752                 MPI_File outMPI;
753                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
754
755                 //char* filename = new char[file.length()];
756                 //memcpy(filename, file.c_str(), file.length());
757                 
758                 char filename[1024];
759                 strcpy(filename, file.c_str());
760
761                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
762                 //delete filename;
763
764                 int startTime = time(NULL);
765                 
766                 string outputString = "";
767                 size = 0;
768                 
769                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
770                 
771                 for(int i=startLine;i<endLine;i++){
772                                 
773                         string name = alignDB.get(i).getName();
774                         if (name.length() < 10) { //pad with spaces to make compatible
775                                 while (name.length() < 10) {  name += " ";  }
776                         }
777                         outputString += name + "\t";    
778                         
779                         for(int j=0;j<i;j++){
780                                 
781                                 if (m->control_pressed) {  return 0;  }
782                                 
783                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
784                                 double dist = distCalculator->getDist();
785                                 
786                                 outputString += toString(dist) + "\t"; 
787                         }
788                         
789                         outputString += "\n"; 
790
791                 
792                         if(i % 100 == 0){
793                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
794                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
795                         }
796                         
797                         
798                         //send results to parent
799                         int length = outputString.length();
800                         char* buf = new char[length];
801                         memcpy(buf, outputString.c_str(), length);
802                         
803                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
804                         size += outputString.length();
805                         outputString = "";
806                         delete buf;
807                 }
808                 
809                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
810                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
811                 MPI_File_close(&outMPI);
812                 
813                 return 1;
814         }
815         catch(exception& e) {
816                 m->errorOut(e, "DistanceCommand", "driverMPI");
817                 exit(1);
818         }
819 }
820 /**************************************************************************************************/
821 /////// need to fix to work with calcs and sequencedb
822 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
823         try {
824                 MPI_Status status;
825                 
826                 MPI_File outMPI;
827                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
828
829                 //char* filename = new char[file.length()];
830                 //memcpy(filename, file.c_str(), file.length());
831                 
832                 char filename[1024];
833                 strcpy(filename, file.c_str());
834
835                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
836                 //delete filename;
837
838                 int startTime = time(NULL);
839                 
840                 string outputString = "";
841                 size = 0;
842                 
843                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
844                 
845                 for(int i=startLine;i<endLine;i++){
846                                 
847                         string name = alignDB.get(i).getName();
848                         if (name.length() < 10) { //pad with spaces to make compatible
849                                 while (name.length() < 10) {  name += " ";  }
850                         }
851                         outputString += name + "\t";    
852                         
853                         for(int j=0;j<alignDB.getNumSeqs();j++){
854                                 
855                                 if (m->control_pressed) {  return 0;  }
856                                 
857                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
858                                 double dist = distCalculator->getDist();
859                                 
860                                 outputString += toString(dist) + "\t"; 
861                         }
862                         
863                         outputString += "\n"; 
864
865                 
866                         if(i % 100 == 0){
867                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
868                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
869                         }
870                         
871                         
872                         //send results to parent
873                         int length = outputString.length();
874                         char* buf = new char[length];
875                         memcpy(buf, outputString.c_str(), length);
876                         
877                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
878                         size += outputString.length();
879                         outputString = "";
880                         delete buf;
881                 }
882                 
883                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
884                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
885                 MPI_File_close(&outMPI);
886                 
887                 return 1;
888         }
889         catch(exception& e) {
890                 m->errorOut(e, "DistanceCommand", "driverMPI");
891                 exit(1);
892         }
893 }
894 #endif
895 /**************************************************************************************************
896 int DistanceCommand::convertMatrix(string outputFile) {
897         try{
898
899                 //sort file by first column so the distances for each row are together
900                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
901                 
902                 //use the unix sort 
903                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
904                         string command = "sort -n " + outputFile + " -o " + outfile;
905                         system(command.c_str());
906                 #else //sort using windows sort
907                         string command = "sort " + outputFile + " /O " + outfile;
908                         system(command.c_str());
909                 #endif
910                 
911
912                 //output to new file distance for each row and save positions in file where new row begins
913                 ifstream in;
914                 m->openInputFile(outfile, in);
915                 
916                 ofstream out;
917                 m->openOutputFile(outputFile, out);
918                 
919                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
920
921                 out << alignDB.getNumSeqs() << endl;
922                 
923                 //get first currentRow
924                 string first, currentRow, second;
925                 float dist;
926                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
927                 map<string, float>::iterator it;
928                 
929                 in >> first;
930                 currentRow = first;
931                 
932                 rowDists[first] = 0.00; //distance to yourself is 0.0
933                 
934                 in.seekg(0);
935                 //m->openInputFile(outfile, in);
936                 
937                 while(!in.eof()) {
938                         if (m->control_pressed) { in.close(); m->mothurRemove(outfile); out.close(); return 0; }
939                         
940                         in >> first >> second >> dist; m->gobble(in);
941                                 
942                         if (first != currentRow) {
943                                 //print out last row
944                                 out << currentRow << '\t'; //print name
945
946                                 //print dists
947                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
948                                         out << it->second << '\t';
949                                 }
950                                 out << endl;
951                                 
952                                 //start new row
953                                 currentRow = first;
954                                 rowDists.clear();
955                                 rowDists[first] = 0.00;
956                                 rowDists[second] = dist;
957                         }else{
958                                 rowDists[second] = dist;
959                         }
960                 }
961                 //print out last row
962                 out << currentRow << '\t'; //print name
963                                 
964                 //print dists
965                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
966                         out << it->second << '\t';
967                 }
968                 out << endl;
969                 
970                 in.close();
971                 out.close();
972                 
973                 m->mothurRemove(outfile);
974                 
975                 return 1;
976                 
977         }
978         catch(exception& e) {
979                 m->errorOut(e, "DistanceCommand", "convertMatrix");
980                 exit(1);
981         }
982 }
983 /**************************************************************************************************
984 int DistanceCommand::convertToLowerTriangle(string outputFile) {
985         try{
986
987                 //sort file by first column so the distances for each row are together
988                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
989                 
990                 //use the unix sort 
991                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
992                         string command = "sort -n " + outputFile + " -o " + outfile;
993                         system(command.c_str());
994                 #else //sort using windows sort
995                         string command = "sort " + outputFile + " /O " + outfile;
996                         system(command.c_str());
997                 #endif
998                 
999
1000                 //output to new file distance for each row and save positions in file where new row begins
1001                 ifstream in;
1002                 m->openInputFile(outfile, in);
1003                 
1004                 ofstream out;
1005                 m->openOutputFile(outputFile, out);
1006                 
1007                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
1008
1009                 out << alignDB.getNumSeqs() << endl;
1010                 
1011                 //get first currentRow
1012                 string first, currentRow, second;
1013                 float dist;
1014                 int i, j;
1015                 i = 0; j = 0;
1016                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
1017                 map<string, float>::iterator it;
1018                 
1019                 in >> first;
1020                 currentRow = first;
1021                 
1022                 rowDists[first] = 0.00; //distance to yourself is 0.0
1023                 
1024                 in.seekg(0);
1025                 //m->openInputFile(outfile, in);
1026                 
1027                 while(!in.eof()) {
1028                         if (m->control_pressed) { in.close(); m->mothurRemove(outfile); out.close(); return 0; }
1029                         
1030                         in >> first >> second >> dist; m->gobble(in);
1031                                 
1032                         if (first != currentRow) {
1033                                 //print out last row
1034                                 out << currentRow << '\t'; //print name
1035
1036                                 //print dists
1037                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
1038                                         if (j >= i) { break; }
1039                                         out << it->second << '\t';
1040                                         j++;
1041                                 }
1042                                 out << endl;
1043                                 
1044                                 //start new row
1045                                 currentRow = first;
1046                                 rowDists.clear();
1047                                 rowDists[first] = 0.00;
1048                                 rowDists[second] = dist;
1049                                 j = 0;
1050                                 i++;
1051                         }else{
1052                                 rowDists[second] = dist;
1053                         }
1054                 }
1055                 //print out last row
1056                 out << currentRow << '\t'; //print name
1057                                 
1058                 //print dists
1059                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
1060                         out << it->second << '\t';
1061                 }
1062                 out << endl;
1063                 
1064                 in.close();
1065                 out.close();
1066                 
1067                 m->mothurRemove(outfile);
1068                 
1069                 return 1;
1070                 
1071         }
1072         catch(exception& e) {
1073                 m->errorOut(e, "DistanceCommand", "convertToLowerTriangle");
1074                 exit(1);
1075         }
1076 }
1077 /**************************************************************************************************/
1078 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
1079 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
1080 //also check to make sure the 2 files have the same alignment length.
1081 bool DistanceCommand::sanityCheck() {
1082         try{
1083                 bool good = true;
1084                 
1085                 //make sure the 2 fasta files have the same alignment length
1086                 ifstream in;
1087                 m->openInputFile(fastafile, in);
1088                 int fastaAlignLength = 0;
1089                 if (in) { 
1090                         Sequence tempIn(in);
1091                         fastaAlignLength = tempIn.getAligned().length();
1092                 }
1093                 in.close();
1094                 
1095                 ifstream in2;
1096                 m->openInputFile(oldfastafile, in2);
1097                 int oldfastaAlignLength = 0;
1098                 if (in2) { 
1099                         Sequence tempIn2(in2);
1100                         oldfastaAlignLength = tempIn2.getAligned().length();
1101                 }
1102                 in2.close();
1103                 
1104                 if (fastaAlignLength != oldfastaAlignLength) { m->mothurOut("fasta files do not have the same alignment length."); m->mothurOutEndLine(); return false;  }
1105                 
1106                 //read fasta file and save names as well as adding them to the alignDB
1107                 set<string> namesOldFasta;
1108                 
1109                 ifstream inFasta;
1110                 m->openInputFile(oldfastafile, inFasta);
1111                 
1112                 while (!inFasta.eof()) {
1113                         if (m->control_pressed) {  inFasta.close(); return good;  }
1114                 
1115                         Sequence temp(inFasta);
1116                         
1117                         if (temp.getName() != "") {
1118                                 namesOldFasta.insert(temp.getName());  //save name
1119                                 alignDB.push_back(temp);  //add to DB
1120                         }
1121                         
1122                         m->gobble(inFasta);
1123                 }
1124                 
1125                 inFasta.close();
1126                 
1127                 //read through the column file checking names and removing distances above the cutoff
1128                 ifstream inDist;
1129                 m->openInputFile(column, inDist);
1130                 
1131                 ofstream outDist;
1132                 string outputFile = column + ".temp";
1133                 m->openOutputFile(outputFile, outDist);
1134                 
1135                 string name1, name2;
1136                 float dist;
1137                 while (!inDist.eof()) {
1138                         if (m->control_pressed) {  inDist.close(); outDist.close(); m->mothurRemove(outputFile); return good;  }
1139                 
1140                         inDist >> name1 >> name2 >> dist; m->gobble(inDist);
1141                         
1142                         //both names are in fasta file and distance is below cutoff
1143                         if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) {  good = false; break;  }
1144                         else{
1145                                 if (dist <= cutoff) {
1146                                         outDist << name1 << '\t' << name2 << '\t' << dist << endl;
1147                                 }
1148                         }
1149                 }
1150                 
1151                 inDist.close();
1152                 outDist.close();
1153                 
1154                 if (good) {
1155                         m->mothurRemove(column);
1156                         rename(outputFile.c_str(), column.c_str());
1157                 }else{
1158                         m->mothurRemove(outputFile); //temp file is bad because file mismatch above
1159                 }
1160                 
1161                 return good;
1162                 
1163         }
1164         catch(exception& e) {
1165                 m->errorOut(e, "DistanceCommand", "sanityCheck");
1166                 exit(1);
1167         }
1168 }
1169 /**************************************************************************************************/
1170
1171
1172
1173