]> git.donarmstrong.com Git - mothur.git/blob - distancecommand.cpp
paralellized phylo.diversity
[mothur.git] / distancecommand.cpp
1 /*
2  *  distancecommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 5/7/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "distancecommand.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13 #include "eachgapignore.h"
14 #include "onegapdist.h"
15 #include "onegapignore.h"
16
17 //**********************************************************************************************************************
18
19 DistanceCommand::DistanceCommand(string option) {
20         try {
21                 abort = false;
22                 Estimators.clear();
23                                 
24                 //allow user to run help
25                 if(option == "help") { help(); abort = true; }
26                 
27                 else {
28                         //valid paramters for this command
29                         string Array[] =  {"fasta","oldfasta","column", "output", "calc", "countends", "cutoff", "processors", "outputdir","inputdir","compress"};
30
31                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
32                         
33                         OptionParser parser(option);
34                         map<string, string> parameters = parser.getParameters();
35                         
36                         ValidParameters validParameter("dist.seqs");
37                         map<string, string>::iterator it2;
38                 
39                         //check to make sure all parameters are valid for command
40                         for (it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
41                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
42                         }
43                         
44                         //if the user changes the input directory command factory will send this info to us in the output parameter 
45                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
46                         if (inputDir == "not found"){   inputDir = "";          }
47                         else {
48                                 string path;
49                                 it2 = parameters.find("fasta");
50                                 //user has given a template file
51                                 if(it2 != parameters.end()){ 
52                                         path = m->hasPath(it2->second);
53                                         //if the user has not given a path then, add inputdir. else leave path alone.
54                                         if (path == "") {       parameters["fasta"] = inputDir + it2->second;           }
55                                 }
56                                 
57                                 it2 = parameters.find("oldfasta");
58                                 //user has given a template file
59                                 if(it2 != parameters.end()){ 
60                                         path = m->hasPath(it2->second);
61                                         //if the user has not given a path then, add inputdir. else leave path alone.
62                                         if (path == "") {       parameters["oldfasta"] = inputDir + it2->second;                }
63                                 }
64                                 
65                                 it2 = parameters.find("column");
66                                 //user has given a template file
67                                 if(it2 != parameters.end()){ 
68                                         path = m->hasPath(it2->second);
69                                         //if the user has not given a path then, add inputdir. else leave path alone.
70                                         if (path == "") {       parameters["column"] = inputDir + it2->second;          }
71                                 }
72                         }
73
74                         //check for required parameters
75                         fastafile = validParameter.validFile(parameters, "fasta", true);
76                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the dist.seqs command."); m->mothurOutEndLine(); abort = true; }
77                         else if (fastafile == "not open") { abort = true; }     
78                         else{
79                                 ifstream inFASTA;
80                                 m->openInputFile(fastafile, inFASTA);
81                                 alignDB = SequenceDB(inFASTA); 
82                                 inFASTA.close();
83                         }
84                         
85                         oldfastafile = validParameter.validFile(parameters, "oldfasta", true);
86                         if (oldfastafile == "not found") { oldfastafile = ""; }
87                         else if (oldfastafile == "not open") { abort = true; }  
88                         
89                         column = validParameter.validFile(parameters, "column", true);
90                         if (column == "not found") { column = ""; }
91                         else if (column == "not open") { abort = true; }        
92                         
93                         //if the user changes the output directory command factory will send this info to us in the output parameter 
94                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
95                                 outputDir = ""; 
96                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
97                         }
98
99                         //check for optional parameter and set defaults
100                         // ...at some point should added some additional type checking...
101                         calc = validParameter.validFile(parameters, "calc", false);                     
102                         if (calc == "not found") { calc = "onegap";  }
103                         else { 
104                                  if (calc == "default")  {  calc = "onegap";  }
105                         }
106                         m->splitAtDash(calc, Estimators);
107
108                         string temp;
109                         temp = validParameter.validFile(parameters, "countends", false);        if(temp == "not found"){        temp = "T";     }
110                         convert(temp, countends); 
111                         
112                         temp = validParameter.validFile(parameters, "cutoff", false);           if(temp == "not found"){        temp = "1.0"; }
113                         convert(temp, cutoff); 
114                         
115                         temp = validParameter.validFile(parameters, "processors", false);       if(temp == "not found"){        temp = "1"; }
116                         convert(temp, processors); 
117                         
118                         temp = validParameter.validFile(parameters, "compress", false);         if(temp == "not found"){  temp = "F"; }
119                         convert(temp, compress);
120
121                         output = validParameter.validFile(parameters, "output", false);         if(output == "not found"){      output = "column"; }
122                         
123                         if (((column != "") && (oldfastafile == "")) || ((column == "") && (oldfastafile != ""))) { m->mothurOut("If you provide column or oldfasta, you must provide both."); m->mothurOutEndLine(); abort=true; }
124                         
125                         if ((column != "") && (oldfastafile != "") && (output != "column")) { m->mothurOut("You have provided column and oldfasta, indicating you want to append distances to your column file. Your output must be in column format to do so."); m->mothurOutEndLine(); abort=true; }
126                         
127                         if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column."); m->mothurOutEndLine(); output = "column"; }
128                         
129                         ValidCalculators validCalculator;
130                         
131                         if (m->isTrue(countends) == true) {
132                                 for (int i=0; i<Estimators.size(); i++) {
133                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
134                                                 if (Estimators[i] == "nogaps")                  {       distCalculator = new ignoreGaps();      }
135                                                 else if (Estimators[i] == "eachgap")    {       distCalculator = new eachGapDist();     }
136                                                 else if (Estimators[i] == "onegap")             {       distCalculator = new oneGapDist();      }
137                                         }
138                                 }
139                         }else {
140                                 for (int i=0; i<Estimators.size(); i++) {
141                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
142                                                 if (Estimators[i] == "nogaps")          {       distCalculator = new ignoreGaps();                                      }
143                                                 else if (Estimators[i] == "eachgap"){   distCalculator = new eachGapIgnoreTermGapDist();        }
144                                                 else if (Estimators[i] == "onegap")     {       distCalculator = new oneGapIgnoreTermGapDist();         }
145                                         }
146                                 }
147                         }
148
149                 }
150                                 
151         }
152         catch(exception& e) {
153                 m->errorOut(e, "DistanceCommand", "DistanceCommand");
154                 exit(1);
155         }
156 }
157
158 //**********************************************************************************************************************
159
160 DistanceCommand::~DistanceCommand(){
161         
162         for(int i=0;i<lines.size();i++){
163                 delete lines[i];
164         }
165         
166 }
167         
168 //**********************************************************************************************************************
169
170 void DistanceCommand::help(){
171         try {
172                 m->mothurOut("The dist.seqs command reads a file containing sequences and creates a distance file.\n");
173                 m->mothurOut("The dist.seqs command parameters are fasta, oldfasta, column, calc, countends, output, compress, cutoff and processors.  \n");
174                 m->mothurOut("The fasta parameter is required.\n");
175                 m->mothurOut("The oldfasta and column parameters allow you to append the distances calculated to the column file.\n");
176                 m->mothurOut("The calc parameter allows you to specify the method of calculating the distances.  Your options are: nogaps, onegap or eachgap. The default is onegap.\n");
177                 m->mothurOut("The countends parameter allows you to specify whether to include terminal gaps in distance.  Your options are: T or F. The default is T.\n");
178                 m->mothurOut("The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n");
179                 m->mothurOut("The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n");
180                 m->mothurOut("The processors parameter allows you to specify number of processors to use.  The default is 1.\n");
181                 m->mothurOut("The compress parameter allows you to indicate that you want the resulting distance file compressed.  The default is false.\n");
182                 m->mothurOut("The dist.seqs command should be in the following format: \n");
183                 m->mothurOut("dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n");
184                 m->mothurOut("Example dist.seqs(fasta=amazon.fasta, calc=eachgap, countends=F, cutoff= 2.0, processors=3).\n");
185                 m->mothurOut("Note: No spaces between parameter labels (i.e. calc), '=' and parameters (i.e.yourCalc).\n\n");
186         }
187         catch(exception& e) {
188                 m->errorOut(e, "DistanceCommand", "help");
189                 exit(1);
190         }
191 }
192 //**********************************************************************************************************************
193
194 int DistanceCommand::execute(){
195         try {
196                 
197                 if (abort == true) { return 0; }
198                 
199                 int startTime = time(NULL);
200                 
201                 //save number of new sequence
202                 numNewFasta = alignDB.getNumSeqs();
203                 
204                 //sanity check the oldfasta and column file as well as add oldfasta sequences to alignDB
205                 if ((oldfastafile != "") && (column != ""))  {  if (!(sanityCheck())) { return 0; }  }
206                 
207                 if (m->control_pressed) { return 0; }
208                 
209                 int numSeqs = alignDB.getNumSeqs();
210                 cutoff += 0.005;
211                 
212                 string outputFile;
213                                 
214                 if (output == "lt") { //does the user want lower triangle phylip formatted file 
215                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "phylip.dist";
216                         remove(outputFile.c_str());
217                         
218                         //output numSeqs to phylip formatted dist file
219                 }else if (output == "column") { //user wants column format
220                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
221                         
222                         //so we don't accidentally overwrite
223                         if (outputFile == column) { 
224                                 string tempcolumn = column + ".old"; 
225                                 rename(column.c_str(), tempcolumn.c_str());
226                         }
227                         
228                         remove(outputFile.c_str());
229                 }else { //assume square
230                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "square.dist";
231                         remove(outputFile.c_str());
232                 }
233                 
234
235 #ifdef USE_MPI
236                 
237                 int pid, start, end; 
238                 int tag = 2001;
239                                 
240                 MPI_Status status; 
241                 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
242                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
243                 
244                 //each process gets where it should start and stop in the file
245                 if (output != "square") {
246                         start = int (sqrt(float(pid)/float(processors)) * numSeqs);
247                         end = int (sqrt(float(pid+1)/float(processors)) * numSeqs);
248                 }else{
249                         start = int ((float(pid)/float(processors)) * numSeqs);
250                         end = int ((float(pid+1)/float(processors)) * numSeqs);
251                 }
252                 
253                 if (output == "column") {
254                         MPI_File outMPI;
255                         int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
256
257                         //char* filename = new char[outputFile.length()];
258                         //memcpy(filename, outputFile.c_str(), outputFile.length());
259                         
260                         char filename[1024];
261                         strcpy(filename, outputFile.c_str());
262                         
263                         MPI_File_open(MPI_COMM_WORLD, filename, amode, MPI_INFO_NULL, &outMPI);
264                         //delete filename;
265
266                         if (pid == 0) { //you are the root process 
267                         
268                                 //do your part
269                                 string outputMyPart;
270                                 
271                                 driverMPI(start, end, outMPI, cutoff); 
272                                 
273                                 if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
274                         
275                                 //wait on chidren
276                                 for(int i = 1; i < processors; i++) { 
277                                         if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
278                                         
279                                         char buf[5];
280                                         MPI_Recv(buf, 5, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
281                                 }
282                         }else { //you are a child process
283                                 //do your part
284                                 driverMPI(start, end, outMPI, cutoff); 
285                                 
286                                 if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
287                         
288                                 char buf[5];
289                                 strcpy(buf, "done"); 
290                                 //tell parent you are done.
291                                 MPI_Send(buf, 5, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
292                         }
293                         
294                         MPI_File_close(&outMPI);
295                         
296                 }else { //lower triangle format
297                         if (pid == 0) { //you are the root process 
298                         
299                                 //do your part
300                                 string outputMyPart;
301                                 unsigned long int mySize;
302                                 
303                                 if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
304                                 else { driverMPI(start, end, outputFile, mySize, output); }
305         
306                                 if (m->control_pressed) {  delete distCalculator;  return 0; }
307                                 
308                                 int amode=MPI_MODE_APPEND|MPI_MODE_WRONLY|MPI_MODE_CREATE; //
309                                 MPI_File outMPI;
310                                 MPI_File inMPI;
311
312                                 //char* filename = new char[outputFile.length()];
313                                 //memcpy(filename, outputFile.c_str(), outputFile.length());
314                                 
315                                 char filename[1024];
316                                 strcpy(filename, outputFile.c_str());
317
318                                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
319                                 //delete filename;
320
321                                 //wait on chidren
322                                 for(int b = 1; b < processors; b++) { 
323                                         unsigned long int fileSize;
324                                         
325                                         if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
326                                         
327                                         MPI_Recv(&fileSize, 1, MPI_LONG, b, tag, MPI_COMM_WORLD, &status); 
328                                         
329                                         string outTemp = outputFile + toString(b) + ".temp";
330
331                                         char* buf = new char[outTemp.length()];
332                                         memcpy(buf, outTemp.c_str(), outTemp.length());
333                                         
334                                         MPI_File_open(MPI_COMM_SELF, buf, MPI_MODE_DELETE_ON_CLOSE|MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
335                                         delete buf;
336
337                                         int count = 0;
338                                         while (count < fileSize) { 
339                                                 char buf2[1];
340                                                 MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status);
341                                                 MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status);
342                                                 count += 1;
343                                         }
344                                         
345                                         MPI_File_close(&inMPI); //deleted on close
346                                 }
347                                 
348                                 MPI_File_close(&outMPI);
349                         }else { //you are a child process
350                                 //do your part
351                                 unsigned long int size;
352                                 if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
353                                 else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
354                                 
355                                 if (m->control_pressed) { delete distCalculator;  return 0; }
356                         
357                                 //tell parent you are done.
358                                 MPI_Send(&size, 1, MPI_LONG, 0, tag, MPI_COMM_WORLD);
359                         }
360                 }
361                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
362 #else           
363                                 
364         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
365                 //if you don't need to fork anything
366                 if(processors == 1){
367                         if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
368                         else { driver(0, numSeqs, outputFile, "square");  }
369                 }else{ //you have multiple processors
370                         
371                         for (int i = 0; i < processors; i++) {
372                                 lines.push_back(new linePair());
373                                 if (output != "square") {
374                                         lines[i]->start = int (sqrt(float(i)/float(processors)) * numSeqs);
375                                         lines[i]->end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
376                                 }else{
377                                         lines[i]->start = int ((float(i)/float(processors)) * numSeqs);
378                                         lines[i]->end = int ((float(i+1)/float(processors)) * numSeqs);
379                                 }
380                         }
381
382                         createProcesses(outputFile); 
383                 
384                         map<int, int>::iterator it = processIDS.begin();
385                         rename((outputFile + toString(it->second) + ".temp").c_str(), outputFile.c_str());
386                         it++;
387                         
388                         //append and remove temp files
389                         for (; it != processIDS.end(); it++) {
390                                 m->appendFiles((outputFile + toString(it->second) + ".temp"), outputFile);
391                                 remove((outputFile + toString(it->second) + ".temp").c_str());
392                         }
393                 }
394         #else
395                 //ifstream inFASTA;
396                 if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
397                 else { driver(0, numSeqs, outputFile, "square");  }
398         #endif
399         
400 #endif
401                 if (m->control_pressed) { delete distCalculator; remove(outputFile.c_str()); return 0; }
402                 
403                 #ifdef USE_MPI
404                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
405                                         
406                         if (pid == 0) { //only one process should output to screen
407                 #endif
408                 
409                 //if (output == "square") {  convertMatrix(outputFile); }
410                 
411                 ifstream fileHandle;
412                 fileHandle.open(outputFile.c_str());
413                 if(fileHandle) {
414                         m->gobble(fileHandle);
415                         if (fileHandle.eof()) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff.");  m->mothurOutEndLine(); }
416                 }
417                 
418                 //append the old column file to the new one
419                 if ((oldfastafile != "") && (column != ""))  {
420                         //we had to rename the column file so we didnt overwrite above, but we want to keep old name
421                         if (outputFile == column) { 
422                                 string tempcolumn = column + ".old";
423                                 m->appendFiles(tempcolumn, outputFile);
424                                 remove(tempcolumn.c_str());
425                         }else{
426                                 m->appendFiles(outputFile, column);
427                                 remove(outputFile.c_str());
428                                 outputFile = column;
429                         }
430                         
431                         if (outputDir != "") { 
432                                 string newOutputName = outputDir + m->getSimpleName(outputFile);
433                                 rename(outputFile.c_str(), newOutputName.c_str());
434                                 remove(outputFile.c_str());
435                                 outputFile = newOutputName;
436                         }
437                 }
438
439                 
440                 #ifdef USE_MPI
441                         }
442                 #endif
443                 
444                 if (m->control_pressed) { delete distCalculator; remove(outputFile.c_str()); return 0; }
445                 
446                 delete distCalculator;
447                 
448                 m->mothurOutEndLine();
449                 m->mothurOut("Output File Name: "); m->mothurOutEndLine();
450                 m->mothurOut(outputFile); m->mothurOutEndLine();
451                 m->mothurOutEndLine();
452                 m->mothurOut("It took " + toString(time(NULL) - startTime) + " to calculate the distances for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
453
454
455                 if (m->isTrue(compress)) {
456                         m->mothurOut("Compressing..."); m->mothurOutEndLine();
457                         m->mothurOut("(Replacing " + outputFile + " with " + outputFile + ".gz)"); m->mothurOutEndLine();
458                         system(("gzip -v " + outputFile).c_str());
459                 }
460
461                 return 0;
462                 
463         }
464         catch(exception& e) {
465                 m->errorOut(e, "DistanceCommand", "execute");
466                 exit(1);
467         }
468 }
469 /**************************************************************************************************/
470 void DistanceCommand::createProcesses(string filename) {
471         try {
472 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
473                 int process = 0;
474                 processIDS.clear();
475                 
476                 //loop through and create all the processes you want
477                 while (process != processors) {
478                         int pid = fork();
479                         
480                         if (pid > 0) {
481                                 processIDS[lines[process]->end] = pid;  //create map from line number to pid so you can append files in correct order later
482                                 process++;
483                         }else if (pid == 0){
484                                 if (output != "square") {  driver(lines[process]->start, lines[process]->end, filename + toString(getpid()) + ".temp", cutoff); }
485                                 else { driver(lines[process]->start, lines[process]->end, filename + toString(getpid()) + ".temp", "square"); }
486                                 exit(0);
487                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
488                 }
489         
490                 //force parent to wait until all the processes are done
491                 for (map<int, int>::iterator it = processIDS.begin(); it != processIDS.end(); it++) { 
492                         int temp = it->second;
493                         wait(&temp);
494                 }
495 #endif
496         }
497         catch(exception& e) {
498                 m->errorOut(e, "DistanceCommand", "createProcesses");
499                 exit(1);
500         }
501 }
502
503 /**************************************************************************************************/
504 /////// need to fix to work with calcs and sequencedb
505 int DistanceCommand::driver(int startLine, int endLine, string dFileName, float cutoff){
506         try {
507
508                 int startTime = time(NULL);
509                 
510                 //column file
511                 ofstream outFile(dFileName.c_str(), ios::trunc);
512                 outFile.setf(ios::fixed, ios::showpoint);
513                 outFile << setprecision(4);
514                 
515                 if((output == "lt") && startLine == 0){ outFile << alignDB.getNumSeqs() << endl;        }
516                 
517                 for(int i=startLine;i<endLine;i++){
518                         if(output == "lt")      {       
519                                 string name = alignDB.get(i).getName();
520                                 if (name.length() < 10) { //pad with spaces to make compatible
521                                         while (name.length() < 10) {  name += " ";  }
522                                 }
523                                 outFile << name << '\t';        
524                         }
525                         for(int j=0;j<i;j++){
526                                 
527                                 if (m->control_pressed) { outFile.close(); return 0;  }
528                                 
529                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
530                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
531                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
532                                 
533                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
534                                 double dist = distCalculator->getDist();
535                                 
536                                 if(dist <= cutoff){
537                                         if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
538                                 }
539                                 if (output == "lt") {  outFile << dist << '\t'; }
540                         }
541                         
542                         if (output == "lt") { outFile << endl; }
543                         
544                         if(i % 100 == 0){
545                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
546                         }
547                         
548                 }
549                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
550                 
551                 outFile.close();
552                 
553                 return 1;
554         }
555         catch(exception& e) {
556                 m->errorOut(e, "DistanceCommand", "driver");
557                 exit(1);
558         }
559 }
560 /**************************************************************************************************/
561 /////// need to fix to work with calcs and sequencedb
562 int DistanceCommand::driver(int startLine, int endLine, string dFileName, string square){
563         try {
564
565                 int startTime = time(NULL);
566                 
567                 //column file
568                 ofstream outFile(dFileName.c_str(), ios::trunc);
569                 outFile.setf(ios::fixed, ios::showpoint);
570                 outFile << setprecision(4);
571                 
572                 if(startLine == 0){     outFile << alignDB.getNumSeqs() << endl;        }
573                 
574                 for(int i=startLine;i<endLine;i++){
575                                 
576                         string name = alignDB.get(i).getName();
577                         //pad with spaces to make compatible
578                         if (name.length() < 10) { while (name.length() < 10) {  name += " ";  } }
579                                 
580                         outFile << name << '\t';        
581                         
582                         for(int j=0;j<alignDB.getNumSeqs();j++){
583                                 
584                                 if (m->control_pressed) { outFile.close(); return 0;  }
585                                 
586                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
587                                 double dist = distCalculator->getDist();
588                                 
589                                 outFile << dist << '\t'; 
590                         }
591                         
592                         outFile << endl; 
593                         
594                         if(i % 100 == 0){
595                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
596                         }
597                         
598                 }
599                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
600                 
601                 outFile.close();
602                 
603                 return 1;
604         }
605         catch(exception& e) {
606                 m->errorOut(e, "DistanceCommand", "driver");
607                 exit(1);
608         }
609 }
610 #ifdef USE_MPI
611 /**************************************************************************************************/
612 /////// need to fix to work with calcs and sequencedb
613 int DistanceCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, float cutoff){
614         try {
615                 MPI_Status status;
616                 int startTime = time(NULL);
617                 
618                 string outputString = "";
619                 
620                 for(int i=startLine;i<endLine;i++){
621         
622                         for(int j=0;j<i;j++){
623                                 
624                                 if (m->control_pressed) {  return 0;  }
625                                 
626                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
627                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
628                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
629                                 
630                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
631                                 double dist = distCalculator->getDist();
632                                 
633                                 if(dist <= cutoff){
634                                          outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n'); 
635                                 }
636                         }
637                         
638                         if(i % 100 == 0){
639                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
640                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
641                         }
642                         
643                          
644                         //send results to parent
645                         int length = outputString.length();
646
647                         char* buf = new char[length];
648                         memcpy(buf, outputString.c_str(), length);
649                         
650                         MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status);
651                         outputString = "";
652                         delete buf;
653                         
654                 }
655                 
656                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
657                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;                
658                 return 1;
659         }
660         catch(exception& e) {
661                 m->errorOut(e, "DistanceCommand", "driverMPI");
662                 exit(1);
663         }
664 }
665 /**************************************************************************************************/
666 /////// need to fix to work with calcs and sequencedb
667 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
668         try {
669                 MPI_Status status;
670                 
671                 MPI_File outMPI;
672                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
673
674                 //char* filename = new char[file.length()];
675                 //memcpy(filename, file.c_str(), file.length());
676                 
677                 char filename[1024];
678                 strcpy(filename, file.c_str());
679
680                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
681                 //delete filename;
682
683                 int startTime = time(NULL);
684                 
685                 string outputString = "";
686                 size = 0;
687                 
688                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
689                 
690                 for(int i=startLine;i<endLine;i++){
691                                 
692                         string name = alignDB.get(i).getName();
693                         if (name.length() < 10) { //pad with spaces to make compatible
694                                 while (name.length() < 10) {  name += " ";  }
695                         }
696                         outputString += name + "\t";    
697                         
698                         for(int j=0;j<i;j++){
699                                 
700                                 if (m->control_pressed) {  return 0;  }
701                                 
702                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
703                                 double dist = distCalculator->getDist();
704                                 
705                                 outputString += toString(dist) + "\t"; 
706                         }
707                         
708                         outputString += "\n"; 
709
710                 
711                         if(i % 100 == 0){
712                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
713                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
714                         }
715                         
716                         
717                         //send results to parent
718                         int length = outputString.length();
719                         char* buf = new char[length];
720                         memcpy(buf, outputString.c_str(), length);
721                         
722                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
723                         size += outputString.length();
724                         outputString = "";
725                         delete buf;
726                 }
727                 
728                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
729                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
730                 MPI_File_close(&outMPI);
731                 
732                 return 1;
733         }
734         catch(exception& e) {
735                 m->errorOut(e, "DistanceCommand", "driverMPI");
736                 exit(1);
737         }
738 }
739 /**************************************************************************************************/
740 /////// need to fix to work with calcs and sequencedb
741 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
742         try {
743                 MPI_Status status;
744                 
745                 MPI_File outMPI;
746                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
747
748                 //char* filename = new char[file.length()];
749                 //memcpy(filename, file.c_str(), file.length());
750                 
751                 char filename[1024];
752                 strcpy(filename, file.c_str());
753
754                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
755                 //delete filename;
756
757                 int startTime = time(NULL);
758                 
759                 string outputString = "";
760                 size = 0;
761                 
762                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
763                 
764                 for(int i=startLine;i<endLine;i++){
765                                 
766                         string name = alignDB.get(i).getName();
767                         if (name.length() < 10) { //pad with spaces to make compatible
768                                 while (name.length() < 10) {  name += " ";  }
769                         }
770                         outputString += name + "\t";    
771                         
772                         for(int j=0;j<alignDB.getNumSeqs();j++){
773                                 
774                                 if (m->control_pressed) {  return 0;  }
775                                 
776                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
777                                 double dist = distCalculator->getDist();
778                                 
779                                 outputString += toString(dist) + "\t"; 
780                         }
781                         
782                         outputString += "\n"; 
783
784                 
785                         if(i % 100 == 0){
786                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
787                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
788                         }
789                         
790                         
791                         //send results to parent
792                         int length = outputString.length();
793                         char* buf = new char[length];
794                         memcpy(buf, outputString.c_str(), length);
795                         
796                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
797                         size += outputString.length();
798                         outputString = "";
799                         delete buf;
800                 }
801                 
802                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
803                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
804                 MPI_File_close(&outMPI);
805                 
806                 return 1;
807         }
808         catch(exception& e) {
809                 m->errorOut(e, "DistanceCommand", "driverMPI");
810                 exit(1);
811         }
812 }
813 #endif
814 /**************************************************************************************************
815 int DistanceCommand::convertMatrix(string outputFile) {
816         try{
817
818                 //sort file by first column so the distances for each row are together
819                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
820                 
821                 //use the unix sort 
822                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
823                         string command = "sort -n " + outputFile + " -o " + outfile;
824                         system(command.c_str());
825                 #else //sort using windows sort
826                         string command = "sort " + outputFile + " /O " + outfile;
827                         system(command.c_str());
828                 #endif
829                 
830
831                 //output to new file distance for each row and save positions in file where new row begins
832                 ifstream in;
833                 m->openInputFile(outfile, in);
834                 
835                 ofstream out;
836                 m->openOutputFile(outputFile, out);
837                 
838                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
839
840                 out << alignDB.getNumSeqs() << endl;
841                 
842                 //get first currentRow
843                 string first, currentRow, second;
844                 float dist;
845                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
846                 map<string, float>::iterator it;
847                 
848                 in >> first;
849                 currentRow = first;
850                 
851                 rowDists[first] = 0.00; //distance to yourself is 0.0
852                 
853                 in.seekg(0);
854                 //m->openInputFile(outfile, in);
855                 
856                 while(!in.eof()) {
857                         if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
858                         
859                         in >> first >> second >> dist; m->gobble(in);
860                                 
861                         if (first != currentRow) {
862                                 //print out last row
863                                 out << currentRow << '\t'; //print name
864
865                                 //print dists
866                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
867                                         out << it->second << '\t';
868                                 }
869                                 out << endl;
870                                 
871                                 //start new row
872                                 currentRow = first;
873                                 rowDists.clear();
874                                 rowDists[first] = 0.00;
875                                 rowDists[second] = dist;
876                         }else{
877                                 rowDists[second] = dist;
878                         }
879                 }
880                 //print out last row
881                 out << currentRow << '\t'; //print name
882                                 
883                 //print dists
884                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
885                         out << it->second << '\t';
886                 }
887                 out << endl;
888                 
889                 in.close();
890                 out.close();
891                 
892                 remove(outfile.c_str());
893                 
894                 return 1;
895                 
896         }
897         catch(exception& e) {
898                 m->errorOut(e, "DistanceCommand", "convertMatrix");
899                 exit(1);
900         }
901 }
902 /**************************************************************************************************
903 int DistanceCommand::convertToLowerTriangle(string outputFile) {
904         try{
905
906                 //sort file by first column so the distances for each row are together
907                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
908                 
909                 //use the unix sort 
910                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
911                         string command = "sort -n " + outputFile + " -o " + outfile;
912                         system(command.c_str());
913                 #else //sort using windows sort
914                         string command = "sort " + outputFile + " /O " + outfile;
915                         system(command.c_str());
916                 #endif
917                 
918
919                 //output to new file distance for each row and save positions in file where new row begins
920                 ifstream in;
921                 m->openInputFile(outfile, in);
922                 
923                 ofstream out;
924                 m->openOutputFile(outputFile, out);
925                 
926                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
927
928                 out << alignDB.getNumSeqs() << endl;
929                 
930                 //get first currentRow
931                 string first, currentRow, second;
932                 float dist;
933                 int i, j;
934                 i = 0; j = 0;
935                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
936                 map<string, float>::iterator it;
937                 
938                 in >> first;
939                 currentRow = first;
940                 
941                 rowDists[first] = 0.00; //distance to yourself is 0.0
942                 
943                 in.seekg(0);
944                 //m->openInputFile(outfile, in);
945                 
946                 while(!in.eof()) {
947                         if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
948                         
949                         in >> first >> second >> dist; m->gobble(in);
950                                 
951                         if (first != currentRow) {
952                                 //print out last row
953                                 out << currentRow << '\t'; //print name
954
955                                 //print dists
956                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
957                                         if (j >= i) { break; }
958                                         out << it->second << '\t';
959                                         j++;
960                                 }
961                                 out << endl;
962                                 
963                                 //start new row
964                                 currentRow = first;
965                                 rowDists.clear();
966                                 rowDists[first] = 0.00;
967                                 rowDists[second] = dist;
968                                 j = 0;
969                                 i++;
970                         }else{
971                                 rowDists[second] = dist;
972                         }
973                 }
974                 //print out last row
975                 out << currentRow << '\t'; //print name
976                                 
977                 //print dists
978                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
979                         out << it->second << '\t';
980                 }
981                 out << endl;
982                 
983                 in.close();
984                 out.close();
985                 
986                 remove(outfile.c_str());
987                 
988                 return 1;
989                 
990         }
991         catch(exception& e) {
992                 m->errorOut(e, "DistanceCommand", "convertToLowerTriangle");
993                 exit(1);
994         }
995 }
996 /**************************************************************************************************/
997 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
998 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
999 //also check to make sure the 2 files have the same alignment length.
1000 bool DistanceCommand::sanityCheck() {
1001         try{
1002                 bool good = true;
1003                 
1004                 //make sure the 2 fasta files have the same alignment length
1005                 ifstream in;
1006                 m->openInputFile(fastafile, in);
1007                 int fastaAlignLength = 0;
1008                 if (in) { 
1009                         Sequence tempIn(in);
1010                         fastaAlignLength = tempIn.getAligned().length();
1011                 }
1012                 in.close();
1013                 
1014                 ifstream in2;
1015                 m->openInputFile(oldfastafile, in2);
1016                 int oldfastaAlignLength = 0;
1017                 if (in2) { 
1018                         Sequence tempIn2(in2);
1019                         oldfastaAlignLength = tempIn2.getAligned().length();
1020                 }
1021                 in2.close();
1022                 
1023                 if (fastaAlignLength != oldfastaAlignLength) { m->mothurOut("fasta files do not have the same alignment length."); m->mothurOutEndLine(); return false;  }
1024                 
1025                 //read fasta file and save names as well as adding them to the alignDB
1026                 set<string> namesOldFasta;
1027                 
1028                 ifstream inFasta;
1029                 m->openInputFile(oldfastafile, inFasta);
1030                 
1031                 while (!inFasta.eof()) {
1032                         if (m->control_pressed) {  inFasta.close(); return good;  }
1033                 
1034                         Sequence temp(inFasta);
1035                         
1036                         if (temp.getName() != "") {
1037                                 namesOldFasta.insert(temp.getName());  //save name
1038                                 alignDB.push_back(temp);  //add to DB
1039                         }
1040                         
1041                         m->gobble(inFasta);
1042                 }
1043                 
1044                 inFasta.close();
1045                 
1046                 //read through the column file checking names and removing distances above the cutoff
1047                 ifstream inDist;
1048                 m->openInputFile(column, inDist);
1049                 
1050                 ofstream outDist;
1051                 string outputFile = column + ".temp";
1052                 m->openOutputFile(outputFile, outDist);
1053                 
1054                 string name1, name2;
1055                 float dist;
1056                 while (!inDist.eof()) {
1057                         if (m->control_pressed) {  inDist.close(); outDist.close(); remove(outputFile.c_str()); return good;  }
1058                 
1059                         inDist >> name1 >> name2 >> dist; m->gobble(inDist);
1060                         
1061                         //both names are in fasta file and distance is below cutoff
1062                         if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) {  good = false; break;  }
1063                         else{
1064                                 if (dist <= cutoff) {
1065                                         outDist << name1 << '\t' << name2 << '\t' << dist << endl;
1066                                 }
1067                         }
1068                 }
1069                 
1070                 inDist.close();
1071                 outDist.close();
1072                 
1073                 if (good) {
1074                         remove(column.c_str());
1075                         rename(outputFile.c_str(), column.c_str());
1076                 }else{
1077                         remove(outputFile.c_str()); //temp file is bad because file mismatch above
1078                 }
1079                 
1080         }
1081         catch(exception& e) {
1082                 m->errorOut(e, "DistanceCommand", "m->appendFiles");
1083                 exit(1);
1084         }
1085 }
1086 /**************************************************************************************************/
1087
1088
1089
1090