]> git.donarmstrong.com Git - mothur.git/blob - distancecommand.cpp
added pipeline commands which involved change to command factory and command class...
[mothur.git] / distancecommand.cpp
1 /*
2  *  distancecommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 5/7/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "distancecommand.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13 #include "eachgapignore.h"
14 #include "onegapdist.h"
15 #include "onegapignore.h"
16
17 //**********************************************************************************************************************
18 vector<string> DistanceCommand::getValidParameters(){   
19         try {
20                 string Array[] =  {"fasta","oldfasta","column", "output", "calc", "countends", "cutoff", "processors", "outputdir","inputdir","compress"};
21                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
22                 return myArray;
23         }
24         catch(exception& e) {
25                 m->errorOut(e, "DistanceCommand", "getValidParameters");
26                 exit(1);
27         }
28 }
29 //**********************************************************************************************************************
30 DistanceCommand::DistanceCommand(){     
31         try {
32                 //initialize outputTypes
33                 vector<string> tempOutNames;
34                 outputTypes["phylip"] = tempOutNames;
35                 outputTypes["column"] = tempOutNames;
36         }
37         catch(exception& e) {
38                 m->errorOut(e, "DistanceCommand", "DistanceCommand");
39                 exit(1);
40         }
41 }
42 //**********************************************************************************************************************
43 vector<string> DistanceCommand::getRequiredParameters(){        
44         try {
45                 string Array[] =  {"fasta"};
46                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
47                 return myArray;
48         }
49         catch(exception& e) {
50                 m->errorOut(e, "ChopSeqsCommand", "getRequiredParameters");
51                 exit(1);
52         }
53 }
54 //**********************************************************************************************************************
55 vector<string> DistanceCommand::getRequiredFiles(){     
56         try {
57                 vector<string> myArray;
58                 return myArray;
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "DistanceCommand", "getRequiredFiles");
62                 exit(1);
63         }
64 }
65 //**********************************************************************************************************************
66 DistanceCommand::DistanceCommand(string option) {
67         try {
68                 abort = false;
69                 Estimators.clear();
70                                 
71                 //allow user to run help
72                 if(option == "help") { help(); abort = true; }
73                 
74                 else {
75                         //valid paramters for this command
76                         string Array[] =  {"fasta","oldfasta","column", "output", "calc", "countends", "cutoff", "processors", "outputdir","inputdir","compress"};
77
78                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
79                         
80                         OptionParser parser(option);
81                         map<string, string> parameters = parser.getParameters();
82                         
83                         ValidParameters validParameter("dist.seqs");
84                         map<string, string>::iterator it2;
85                 
86                         //check to make sure all parameters are valid for command
87                         for (it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
88                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
89                         }
90                         
91                         //initialize outputTypes
92                         vector<string> tempOutNames;
93                         outputTypes["phylip"] = tempOutNames;
94                         outputTypes["column"] = tempOutNames;
95                 
96                         //if the user changes the input directory command factory will send this info to us in the output parameter 
97                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
98                         if (inputDir == "not found"){   inputDir = "";          }
99                         else {
100                                 string path;
101                                 it2 = parameters.find("fasta");
102                                 //user has given a template file
103                                 if(it2 != parameters.end()){ 
104                                         path = m->hasPath(it2->second);
105                                         //if the user has not given a path then, add inputdir. else leave path alone.
106                                         if (path == "") {       parameters["fasta"] = inputDir + it2->second;           }
107                                 }
108                                 
109                                 it2 = parameters.find("oldfasta");
110                                 //user has given a template file
111                                 if(it2 != parameters.end()){ 
112                                         path = m->hasPath(it2->second);
113                                         //if the user has not given a path then, add inputdir. else leave path alone.
114                                         if (path == "") {       parameters["oldfasta"] = inputDir + it2->second;                }
115                                 }
116                                 
117                                 it2 = parameters.find("column");
118                                 //user has given a template file
119                                 if(it2 != parameters.end()){ 
120                                         path = m->hasPath(it2->second);
121                                         //if the user has not given a path then, add inputdir. else leave path alone.
122                                         if (path == "") {       parameters["column"] = inputDir + it2->second;          }
123                                 }
124                         }
125
126                         //check for required parameters
127                         fastafile = validParameter.validFile(parameters, "fasta", true);
128                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the dist.seqs command."); m->mothurOutEndLine(); abort = true; }
129                         else if (fastafile == "not open") { abort = true; }     
130                         else{
131                                 ifstream inFASTA;
132                                 m->openInputFile(fastafile, inFASTA);
133                                 alignDB = SequenceDB(inFASTA); 
134                                 inFASTA.close();
135                         }
136                         
137                         oldfastafile = validParameter.validFile(parameters, "oldfasta", true);
138                         if (oldfastafile == "not found") { oldfastafile = ""; }
139                         else if (oldfastafile == "not open") { abort = true; }  
140                         
141                         column = validParameter.validFile(parameters, "column", true);
142                         if (column == "not found") { column = ""; }
143                         else if (column == "not open") { abort = true; }        
144                         
145                         //if the user changes the output directory command factory will send this info to us in the output parameter 
146                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
147                                 outputDir = ""; 
148                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
149                         }
150
151                         //check for optional parameter and set defaults
152                         // ...at some point should added some additional type checking...
153                         calc = validParameter.validFile(parameters, "calc", false);                     
154                         if (calc == "not found") { calc = "onegap";  }
155                         else { 
156                                  if (calc == "default")  {  calc = "onegap";  }
157                         }
158                         m->splitAtDash(calc, Estimators);
159
160                         string temp;
161                         temp = validParameter.validFile(parameters, "countends", false);        if(temp == "not found"){        temp = "T";     }
162                         convert(temp, countends); 
163                         
164                         temp = validParameter.validFile(parameters, "cutoff", false);           if(temp == "not found"){        temp = "1.0"; }
165                         convert(temp, cutoff); 
166                         
167                         temp = validParameter.validFile(parameters, "processors", false);       if(temp == "not found"){        temp = "1"; }
168                         convert(temp, processors); 
169                         
170                         temp = validParameter.validFile(parameters, "compress", false);         if(temp == "not found"){  temp = "F"; }
171                         convert(temp, compress);
172
173                         output = validParameter.validFile(parameters, "output", false);         if(output == "not found"){      output = "column"; }
174                         
175                         if (((column != "") && (oldfastafile == "")) || ((column == "") && (oldfastafile != ""))) { m->mothurOut("If you provide column or oldfasta, you must provide both."); m->mothurOutEndLine(); abort=true; }
176                         
177                         if ((column != "") && (oldfastafile != "") && (output != "column")) { m->mothurOut("You have provided column and oldfasta, indicating you want to append distances to your column file. Your output must be in column format to do so."); m->mothurOutEndLine(); abort=true; }
178                         
179                         if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column."); m->mothurOutEndLine(); output = "column"; }
180                         
181                         ValidCalculators validCalculator;
182                         
183                         if (m->isTrue(countends) == true) {
184                                 for (int i=0; i<Estimators.size(); i++) {
185                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
186                                                 if (Estimators[i] == "nogaps")                  {       distCalculator = new ignoreGaps();      }
187                                                 else if (Estimators[i] == "eachgap")    {       distCalculator = new eachGapDist();     }
188                                                 else if (Estimators[i] == "onegap")             {       distCalculator = new oneGapDist();      }
189                                         }
190                                 }
191                         }else {
192                                 for (int i=0; i<Estimators.size(); i++) {
193                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
194                                                 if (Estimators[i] == "nogaps")          {       distCalculator = new ignoreGaps();                                      }
195                                                 else if (Estimators[i] == "eachgap"){   distCalculator = new eachGapIgnoreTermGapDist();        }
196                                                 else if (Estimators[i] == "onegap")     {       distCalculator = new oneGapIgnoreTermGapDist();         }
197                                         }
198                                 }
199                         }
200
201                 }
202                                 
203         }
204         catch(exception& e) {
205                 m->errorOut(e, "DistanceCommand", "DistanceCommand");
206                 exit(1);
207         }
208 }
209
210 //**********************************************************************************************************************
211
212 DistanceCommand::~DistanceCommand(){
213         
214         for(int i=0;i<lines.size();i++){
215                 delete lines[i];
216         }
217         
218 }
219         
220 //**********************************************************************************************************************
221
222 void DistanceCommand::help(){
223         try {
224                 m->mothurOut("The dist.seqs command reads a file containing sequences and creates a distance file.\n");
225                 m->mothurOut("The dist.seqs command parameters are fasta, oldfasta, column, calc, countends, output, compress, cutoff and processors.  \n");
226                 m->mothurOut("The fasta parameter is required.\n");
227                 m->mothurOut("The oldfasta and column parameters allow you to append the distances calculated to the column file.\n");
228                 m->mothurOut("The calc parameter allows you to specify the method of calculating the distances.  Your options are: nogaps, onegap or eachgap. The default is onegap.\n");
229                 m->mothurOut("The countends parameter allows you to specify whether to include terminal gaps in distance.  Your options are: T or F. The default is T.\n");
230                 m->mothurOut("The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n");
231                 m->mothurOut("The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n");
232                 m->mothurOut("The processors parameter allows you to specify number of processors to use.  The default is 1.\n");
233                 m->mothurOut("The compress parameter allows you to indicate that you want the resulting distance file compressed.  The default is false.\n");
234                 m->mothurOut("The dist.seqs command should be in the following format: \n");
235                 m->mothurOut("dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n");
236                 m->mothurOut("Example dist.seqs(fasta=amazon.fasta, calc=eachgap, countends=F, cutoff= 2.0, processors=3).\n");
237                 m->mothurOut("Note: No spaces between parameter labels (i.e. calc), '=' and parameters (i.e.yourCalc).\n\n");
238         }
239         catch(exception& e) {
240                 m->errorOut(e, "DistanceCommand", "help");
241                 exit(1);
242         }
243 }
244 //**********************************************************************************************************************
245
246 int DistanceCommand::execute(){
247         try {
248                 
249                 if (abort == true) { return 0; }
250                 
251                 int startTime = time(NULL);
252                 
253                 //save number of new sequence
254                 numNewFasta = alignDB.getNumSeqs();
255                 
256                 //sanity check the oldfasta and column file as well as add oldfasta sequences to alignDB
257                 if ((oldfastafile != "") && (column != ""))  {  if (!(sanityCheck())) { return 0; }  }
258                 
259                 if (m->control_pressed) { return 0; }
260                 
261                 int numSeqs = alignDB.getNumSeqs();
262                 cutoff += 0.005;
263                 
264                 string outputFile;
265                                 
266                 if (output == "lt") { //does the user want lower triangle phylip formatted file 
267                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "phylip.dist";
268                         remove(outputFile.c_str()); outputTypes["phylip"].push_back(outputFile);
269                         
270                         //output numSeqs to phylip formatted dist file
271                 }else if (output == "column") { //user wants column format
272                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
273                         outputTypes["column"].push_back(outputFile);
274                         
275                         //so we don't accidentally overwrite
276                         if (outputFile == column) { 
277                                 string tempcolumn = column + ".old"; 
278                                 rename(column.c_str(), tempcolumn.c_str());
279                         }
280                         
281                         remove(outputFile.c_str());
282                 }else { //assume square
283                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "square.dist";
284                         remove(outputFile.c_str());
285                         outputTypes["phylip"].push_back(outputFile);
286                 }
287                 
288
289 #ifdef USE_MPI
290                 
291                 int pid, start, end; 
292                 int tag = 2001;
293                                 
294                 MPI_Status status; 
295                 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
296                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
297                 
298                 //each process gets where it should start and stop in the file
299                 if (output != "square") {
300                         start = int (sqrt(float(pid)/float(processors)) * numSeqs);
301                         end = int (sqrt(float(pid+1)/float(processors)) * numSeqs);
302                 }else{
303                         start = int ((float(pid)/float(processors)) * numSeqs);
304                         end = int ((float(pid+1)/float(processors)) * numSeqs);
305                 }
306                 
307                 if (output == "column") {
308                         MPI_File outMPI;
309                         int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
310
311                         //char* filename = new char[outputFile.length()];
312                         //memcpy(filename, outputFile.c_str(), outputFile.length());
313                         
314                         char filename[1024];
315                         strcpy(filename, outputFile.c_str());
316                         
317                         MPI_File_open(MPI_COMM_WORLD, filename, amode, MPI_INFO_NULL, &outMPI);
318                         //delete filename;
319
320                         if (pid == 0) { //you are the root process 
321                         
322                                 //do your part
323                                 string outputMyPart;
324                                 
325                                 driverMPI(start, end, outMPI, cutoff); 
326                                 
327                                 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
328                         
329                                 //wait on chidren
330                                 for(int i = 1; i < processors; i++) { 
331                                         if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
332                                         
333                                         char buf[5];
334                                         MPI_Recv(buf, 5, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
335                                 }
336                         }else { //you are a child process
337                                 //do your part
338                                 driverMPI(start, end, outMPI, cutoff); 
339                                 
340                                 if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
341                         
342                                 char buf[5];
343                                 strcpy(buf, "done"); 
344                                 //tell parent you are done.
345                                 MPI_Send(buf, 5, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
346                         }
347                         
348                         MPI_File_close(&outMPI);
349                         
350                 }else { //lower triangle format
351                         if (pid == 0) { //you are the root process 
352                         
353                                 //do your part
354                                 string outputMyPart;
355                                 unsigned long int mySize;
356                                 
357                                 if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
358                                 else { driverMPI(start, end, outputFile, mySize, output); }
359         
360                                 if (m->control_pressed) {  outputTypes.clear();  delete distCalculator;  return 0; }
361                                 
362                                 int amode=MPI_MODE_APPEND|MPI_MODE_WRONLY|MPI_MODE_CREATE; //
363                                 MPI_File outMPI;
364                                 MPI_File inMPI;
365
366                                 //char* filename = new char[outputFile.length()];
367                                 //memcpy(filename, outputFile.c_str(), outputFile.length());
368                                 
369                                 char filename[1024];
370                                 strcpy(filename, outputFile.c_str());
371
372                                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
373                                 //delete filename;
374
375                                 //wait on chidren
376                                 for(int b = 1; b < processors; b++) { 
377                                         unsigned long int fileSize;
378                                         
379                                         if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
380                                         
381                                         MPI_Recv(&fileSize, 1, MPI_LONG, b, tag, MPI_COMM_WORLD, &status); 
382                                         
383                                         string outTemp = outputFile + toString(b) + ".temp";
384
385                                         char* buf = new char[outTemp.length()];
386                                         memcpy(buf, outTemp.c_str(), outTemp.length());
387                                         
388                                         MPI_File_open(MPI_COMM_SELF, buf, MPI_MODE_DELETE_ON_CLOSE|MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
389                                         delete buf;
390
391                                         int count = 0;
392                                         while (count < fileSize) { 
393                                                 char buf2[1];
394                                                 MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status);
395                                                 MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status);
396                                                 count += 1;
397                                         }
398                                         
399                                         MPI_File_close(&inMPI); //deleted on close
400                                 }
401                                 
402                                 MPI_File_close(&outMPI);
403                         }else { //you are a child process
404                                 //do your part
405                                 unsigned long int size;
406                                 if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
407                                 else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
408                                 
409                                 if (m->control_pressed) { delete distCalculator;  return 0; }
410                         
411                                 //tell parent you are done.
412                                 MPI_Send(&size, 1, MPI_LONG, 0, tag, MPI_COMM_WORLD);
413                         }
414                 }
415                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
416 #else           
417                                 
418         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
419                 //if you don't need to fork anything
420                 if(processors == 1){
421                         if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
422                         else { driver(0, numSeqs, outputFile, "square");  }
423                 }else{ //you have multiple processors
424                         
425                         for (int i = 0; i < processors; i++) {
426                                 lines.push_back(new linePair());
427                                 if (output != "square") {
428                                         lines[i]->start = int (sqrt(float(i)/float(processors)) * numSeqs);
429                                         lines[i]->end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
430                                 }else{
431                                         lines[i]->start = int ((float(i)/float(processors)) * numSeqs);
432                                         lines[i]->end = int ((float(i+1)/float(processors)) * numSeqs);
433                                 }
434                         }
435
436                         createProcesses(outputFile); 
437                 
438                         map<int, int>::iterator it = processIDS.begin();
439                         rename((outputFile + toString(it->second) + ".temp").c_str(), outputFile.c_str());
440                         it++;
441                         
442                         //append and remove temp files
443                         for (; it != processIDS.end(); it++) {
444                                 m->appendFiles((outputFile + toString(it->second) + ".temp"), outputFile);
445                                 remove((outputFile + toString(it->second) + ".temp").c_str());
446                         }
447                 }
448         #else
449                 //ifstream inFASTA;
450                 if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
451                 else { driver(0, numSeqs, outputFile, "square");  }
452         #endif
453         
454 #endif
455                 if (m->control_pressed) { outputTypes.clear();  delete distCalculator; remove(outputFile.c_str()); return 0; }
456                 
457                 #ifdef USE_MPI
458                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
459                                         
460                         if (pid == 0) { //only one process should output to screen
461                 #endif
462                 
463                 //if (output == "square") {  convertMatrix(outputFile); }
464                 
465                 ifstream fileHandle;
466                 fileHandle.open(outputFile.c_str());
467                 if(fileHandle) {
468                         m->gobble(fileHandle);
469                         if (fileHandle.eof()) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff.");  m->mothurOutEndLine(); }
470                 }
471                 
472                 //append the old column file to the new one
473                 if ((oldfastafile != "") && (column != ""))  {
474                         //we had to rename the column file so we didnt overwrite above, but we want to keep old name
475                         if (outputFile == column) { 
476                                 string tempcolumn = column + ".old";
477                                 m->appendFiles(tempcolumn, outputFile);
478                                 remove(tempcolumn.c_str());
479                         }else{
480                                 m->appendFiles(outputFile, column);
481                                 remove(outputFile.c_str());
482                                 outputFile = column;
483                         }
484                         
485                         if (outputDir != "") { 
486                                 string newOutputName = outputDir + m->getSimpleName(outputFile);
487                                 rename(outputFile.c_str(), newOutputName.c_str());
488                                 remove(outputFile.c_str());
489                                 outputFile = newOutputName;
490                         }
491                 }
492
493                 
494                 #ifdef USE_MPI
495                         }
496                 #endif
497                 
498                 if (m->control_pressed) { outputTypes.clear();  delete distCalculator; remove(outputFile.c_str()); return 0; }
499                 
500                 delete distCalculator;
501                 
502                 m->mothurOutEndLine();
503                 m->mothurOut("Output File Name: "); m->mothurOutEndLine();
504                 m->mothurOut(outputFile); m->mothurOutEndLine();
505                 m->mothurOutEndLine();
506                 m->mothurOut("It took " + toString(time(NULL) - startTime) + " to calculate the distances for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
507
508
509                 if (m->isTrue(compress)) {
510                         m->mothurOut("Compressing..."); m->mothurOutEndLine();
511                         m->mothurOut("(Replacing " + outputFile + " with " + outputFile + ".gz)"); m->mothurOutEndLine();
512                         system(("gzip -v " + outputFile).c_str());
513                         outputNames.push_back(outputFile + ".gz");
514                 }else { outputNames.push_back(outputFile); }
515
516                 return 0;
517                 
518         }
519         catch(exception& e) {
520                 m->errorOut(e, "DistanceCommand", "execute");
521                 exit(1);
522         }
523 }
524 /**************************************************************************************************/
525 void DistanceCommand::createProcesses(string filename) {
526         try {
527 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
528                 int process = 0;
529                 processIDS.clear();
530                 
531                 //loop through and create all the processes you want
532                 while (process != processors) {
533                         int pid = fork();
534                         
535                         if (pid > 0) {
536                                 processIDS[lines[process]->end] = pid;  //create map from line number to pid so you can append files in correct order later
537                                 process++;
538                         }else if (pid == 0){
539                                 if (output != "square") {  driver(lines[process]->start, lines[process]->end, filename + toString(getpid()) + ".temp", cutoff); }
540                                 else { driver(lines[process]->start, lines[process]->end, filename + toString(getpid()) + ".temp", "square"); }
541                                 exit(0);
542                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
543                 }
544         
545                 //force parent to wait until all the processes are done
546                 for (map<int, int>::iterator it = processIDS.begin(); it != processIDS.end(); it++) { 
547                         int temp = it->second;
548                         wait(&temp);
549                 }
550 #endif
551         }
552         catch(exception& e) {
553                 m->errorOut(e, "DistanceCommand", "createProcesses");
554                 exit(1);
555         }
556 }
557
558 /**************************************************************************************************/
559 /////// need to fix to work with calcs and sequencedb
560 int DistanceCommand::driver(int startLine, int endLine, string dFileName, float cutoff){
561         try {
562
563                 int startTime = time(NULL);
564                 
565                 //column file
566                 ofstream outFile(dFileName.c_str(), ios::trunc);
567                 outFile.setf(ios::fixed, ios::showpoint);
568                 outFile << setprecision(4);
569                 
570                 if((output == "lt") && startLine == 0){ outFile << alignDB.getNumSeqs() << endl;        }
571                 
572                 for(int i=startLine;i<endLine;i++){
573                         if(output == "lt")      {       
574                                 string name = alignDB.get(i).getName();
575                                 if (name.length() < 10) { //pad with spaces to make compatible
576                                         while (name.length() < 10) {  name += " ";  }
577                                 }
578                                 outFile << name << '\t';        
579                         }
580                         for(int j=0;j<i;j++){
581                                 
582                                 if (m->control_pressed) { outFile.close(); return 0;  }
583                                 
584                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
585                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
586                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
587                                 
588                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
589                                 double dist = distCalculator->getDist();
590                                 
591                                 if(dist <= cutoff){
592                                         if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
593                                 }
594                                 if (output == "lt") {  outFile << dist << '\t'; }
595                         }
596                         
597                         if (output == "lt") { outFile << endl; }
598                         
599                         if(i % 100 == 0){
600                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
601                         }
602                         
603                 }
604                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
605                 
606                 outFile.close();
607                 
608                 return 1;
609         }
610         catch(exception& e) {
611                 m->errorOut(e, "DistanceCommand", "driver");
612                 exit(1);
613         }
614 }
615 /**************************************************************************************************/
616 /////// need to fix to work with calcs and sequencedb
617 int DistanceCommand::driver(int startLine, int endLine, string dFileName, string square){
618         try {
619
620                 int startTime = time(NULL);
621                 
622                 //column file
623                 ofstream outFile(dFileName.c_str(), ios::trunc);
624                 outFile.setf(ios::fixed, ios::showpoint);
625                 outFile << setprecision(4);
626                 
627                 if(startLine == 0){     outFile << alignDB.getNumSeqs() << endl;        }
628                 
629                 for(int i=startLine;i<endLine;i++){
630                                 
631                         string name = alignDB.get(i).getName();
632                         //pad with spaces to make compatible
633                         if (name.length() < 10) { while (name.length() < 10) {  name += " ";  } }
634                                 
635                         outFile << name << '\t';        
636                         
637                         for(int j=0;j<alignDB.getNumSeqs();j++){
638                                 
639                                 if (m->control_pressed) { outFile.close(); return 0;  }
640                                 
641                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
642                                 double dist = distCalculator->getDist();
643                                 
644                                 outFile << dist << '\t'; 
645                         }
646                         
647                         outFile << endl; 
648                         
649                         if(i % 100 == 0){
650                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
651                         }
652                         
653                 }
654                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
655                 
656                 outFile.close();
657                 
658                 return 1;
659         }
660         catch(exception& e) {
661                 m->errorOut(e, "DistanceCommand", "driver");
662                 exit(1);
663         }
664 }
665 #ifdef USE_MPI
666 /**************************************************************************************************/
667 /////// need to fix to work with calcs and sequencedb
668 int DistanceCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, float cutoff){
669         try {
670                 MPI_Status status;
671                 int startTime = time(NULL);
672                 
673                 string outputString = "";
674                 
675                 for(int i=startLine;i<endLine;i++){
676         
677                         for(int j=0;j<i;j++){
678                                 
679                                 if (m->control_pressed) {  return 0;  }
680                                 
681                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
682                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
683                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
684                                 
685                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
686                                 double dist = distCalculator->getDist();
687                                 
688                                 if(dist <= cutoff){
689                                          outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n'); 
690                                 }
691                         }
692                         
693                         if(i % 100 == 0){
694                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
695                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
696                         }
697                         
698                          
699                         //send results to parent
700                         int length = outputString.length();
701
702                         char* buf = new char[length];
703                         memcpy(buf, outputString.c_str(), length);
704                         
705                         MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status);
706                         outputString = "";
707                         delete buf;
708                         
709                 }
710                 
711                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
712                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;                
713                 return 1;
714         }
715         catch(exception& e) {
716                 m->errorOut(e, "DistanceCommand", "driverMPI");
717                 exit(1);
718         }
719 }
720 /**************************************************************************************************/
721 /////// need to fix to work with calcs and sequencedb
722 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
723         try {
724                 MPI_Status status;
725                 
726                 MPI_File outMPI;
727                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
728
729                 //char* filename = new char[file.length()];
730                 //memcpy(filename, file.c_str(), file.length());
731                 
732                 char filename[1024];
733                 strcpy(filename, file.c_str());
734
735                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
736                 //delete filename;
737
738                 int startTime = time(NULL);
739                 
740                 string outputString = "";
741                 size = 0;
742                 
743                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
744                 
745                 for(int i=startLine;i<endLine;i++){
746                                 
747                         string name = alignDB.get(i).getName();
748                         if (name.length() < 10) { //pad with spaces to make compatible
749                                 while (name.length() < 10) {  name += " ";  }
750                         }
751                         outputString += name + "\t";    
752                         
753                         for(int j=0;j<i;j++){
754                                 
755                                 if (m->control_pressed) {  return 0;  }
756                                 
757                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
758                                 double dist = distCalculator->getDist();
759                                 
760                                 outputString += toString(dist) + "\t"; 
761                         }
762                         
763                         outputString += "\n"; 
764
765                 
766                         if(i % 100 == 0){
767                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
768                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
769                         }
770                         
771                         
772                         //send results to parent
773                         int length = outputString.length();
774                         char* buf = new char[length];
775                         memcpy(buf, outputString.c_str(), length);
776                         
777                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
778                         size += outputString.length();
779                         outputString = "";
780                         delete buf;
781                 }
782                 
783                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
784                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
785                 MPI_File_close(&outMPI);
786                 
787                 return 1;
788         }
789         catch(exception& e) {
790                 m->errorOut(e, "DistanceCommand", "driverMPI");
791                 exit(1);
792         }
793 }
794 /**************************************************************************************************/
795 /////// need to fix to work with calcs and sequencedb
796 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
797         try {
798                 MPI_Status status;
799                 
800                 MPI_File outMPI;
801                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
802
803                 //char* filename = new char[file.length()];
804                 //memcpy(filename, file.c_str(), file.length());
805                 
806                 char filename[1024];
807                 strcpy(filename, file.c_str());
808
809                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
810                 //delete filename;
811
812                 int startTime = time(NULL);
813                 
814                 string outputString = "";
815                 size = 0;
816                 
817                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
818                 
819                 for(int i=startLine;i<endLine;i++){
820                                 
821                         string name = alignDB.get(i).getName();
822                         if (name.length() < 10) { //pad with spaces to make compatible
823                                 while (name.length() < 10) {  name += " ";  }
824                         }
825                         outputString += name + "\t";    
826                         
827                         for(int j=0;j<alignDB.getNumSeqs();j++){
828                                 
829                                 if (m->control_pressed) {  return 0;  }
830                                 
831                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
832                                 double dist = distCalculator->getDist();
833                                 
834                                 outputString += toString(dist) + "\t"; 
835                         }
836                         
837                         outputString += "\n"; 
838
839                 
840                         if(i % 100 == 0){
841                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
842                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
843                         }
844                         
845                         
846                         //send results to parent
847                         int length = outputString.length();
848                         char* buf = new char[length];
849                         memcpy(buf, outputString.c_str(), length);
850                         
851                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
852                         size += outputString.length();
853                         outputString = "";
854                         delete buf;
855                 }
856                 
857                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
858                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
859                 MPI_File_close(&outMPI);
860                 
861                 return 1;
862         }
863         catch(exception& e) {
864                 m->errorOut(e, "DistanceCommand", "driverMPI");
865                 exit(1);
866         }
867 }
868 #endif
869 /**************************************************************************************************
870 int DistanceCommand::convertMatrix(string outputFile) {
871         try{
872
873                 //sort file by first column so the distances for each row are together
874                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
875                 
876                 //use the unix sort 
877                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
878                         string command = "sort -n " + outputFile + " -o " + outfile;
879                         system(command.c_str());
880                 #else //sort using windows sort
881                         string command = "sort " + outputFile + " /O " + outfile;
882                         system(command.c_str());
883                 #endif
884                 
885
886                 //output to new file distance for each row and save positions in file where new row begins
887                 ifstream in;
888                 m->openInputFile(outfile, in);
889                 
890                 ofstream out;
891                 m->openOutputFile(outputFile, out);
892                 
893                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
894
895                 out << alignDB.getNumSeqs() << endl;
896                 
897                 //get first currentRow
898                 string first, currentRow, second;
899                 float dist;
900                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
901                 map<string, float>::iterator it;
902                 
903                 in >> first;
904                 currentRow = first;
905                 
906                 rowDists[first] = 0.00; //distance to yourself is 0.0
907                 
908                 in.seekg(0);
909                 //m->openInputFile(outfile, in);
910                 
911                 while(!in.eof()) {
912                         if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
913                         
914                         in >> first >> second >> dist; m->gobble(in);
915                                 
916                         if (first != currentRow) {
917                                 //print out last row
918                                 out << currentRow << '\t'; //print name
919
920                                 //print dists
921                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
922                                         out << it->second << '\t';
923                                 }
924                                 out << endl;
925                                 
926                                 //start new row
927                                 currentRow = first;
928                                 rowDists.clear();
929                                 rowDists[first] = 0.00;
930                                 rowDists[second] = dist;
931                         }else{
932                                 rowDists[second] = dist;
933                         }
934                 }
935                 //print out last row
936                 out << currentRow << '\t'; //print name
937                                 
938                 //print dists
939                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
940                         out << it->second << '\t';
941                 }
942                 out << endl;
943                 
944                 in.close();
945                 out.close();
946                 
947                 remove(outfile.c_str());
948                 
949                 return 1;
950                 
951         }
952         catch(exception& e) {
953                 m->errorOut(e, "DistanceCommand", "convertMatrix");
954                 exit(1);
955         }
956 }
957 /**************************************************************************************************
958 int DistanceCommand::convertToLowerTriangle(string outputFile) {
959         try{
960
961                 //sort file by first column so the distances for each row are together
962                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
963                 
964                 //use the unix sort 
965                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
966                         string command = "sort -n " + outputFile + " -o " + outfile;
967                         system(command.c_str());
968                 #else //sort using windows sort
969                         string command = "sort " + outputFile + " /O " + outfile;
970                         system(command.c_str());
971                 #endif
972                 
973
974                 //output to new file distance for each row and save positions in file where new row begins
975                 ifstream in;
976                 m->openInputFile(outfile, in);
977                 
978                 ofstream out;
979                 m->openOutputFile(outputFile, out);
980                 
981                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
982
983                 out << alignDB.getNumSeqs() << endl;
984                 
985                 //get first currentRow
986                 string first, currentRow, second;
987                 float dist;
988                 int i, j;
989                 i = 0; j = 0;
990                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
991                 map<string, float>::iterator it;
992                 
993                 in >> first;
994                 currentRow = first;
995                 
996                 rowDists[first] = 0.00; //distance to yourself is 0.0
997                 
998                 in.seekg(0);
999                 //m->openInputFile(outfile, in);
1000                 
1001                 while(!in.eof()) {
1002                         if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
1003                         
1004                         in >> first >> second >> dist; m->gobble(in);
1005                                 
1006                         if (first != currentRow) {
1007                                 //print out last row
1008                                 out << currentRow << '\t'; //print name
1009
1010                                 //print dists
1011                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
1012                                         if (j >= i) { break; }
1013                                         out << it->second << '\t';
1014                                         j++;
1015                                 }
1016                                 out << endl;
1017                                 
1018                                 //start new row
1019                                 currentRow = first;
1020                                 rowDists.clear();
1021                                 rowDists[first] = 0.00;
1022                                 rowDists[second] = dist;
1023                                 j = 0;
1024                                 i++;
1025                         }else{
1026                                 rowDists[second] = dist;
1027                         }
1028                 }
1029                 //print out last row
1030                 out << currentRow << '\t'; //print name
1031                                 
1032                 //print dists
1033                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
1034                         out << it->second << '\t';
1035                 }
1036                 out << endl;
1037                 
1038                 in.close();
1039                 out.close();
1040                 
1041                 remove(outfile.c_str());
1042                 
1043                 return 1;
1044                 
1045         }
1046         catch(exception& e) {
1047                 m->errorOut(e, "DistanceCommand", "convertToLowerTriangle");
1048                 exit(1);
1049         }
1050 }
1051 /**************************************************************************************************/
1052 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
1053 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
1054 //also check to make sure the 2 files have the same alignment length.
1055 bool DistanceCommand::sanityCheck() {
1056         try{
1057                 bool good = true;
1058                 
1059                 //make sure the 2 fasta files have the same alignment length
1060                 ifstream in;
1061                 m->openInputFile(fastafile, in);
1062                 int fastaAlignLength = 0;
1063                 if (in) { 
1064                         Sequence tempIn(in);
1065                         fastaAlignLength = tempIn.getAligned().length();
1066                 }
1067                 in.close();
1068                 
1069                 ifstream in2;
1070                 m->openInputFile(oldfastafile, in2);
1071                 int oldfastaAlignLength = 0;
1072                 if (in2) { 
1073                         Sequence tempIn2(in2);
1074                         oldfastaAlignLength = tempIn2.getAligned().length();
1075                 }
1076                 in2.close();
1077                 
1078                 if (fastaAlignLength != oldfastaAlignLength) { m->mothurOut("fasta files do not have the same alignment length."); m->mothurOutEndLine(); return false;  }
1079                 
1080                 //read fasta file and save names as well as adding them to the alignDB
1081                 set<string> namesOldFasta;
1082                 
1083                 ifstream inFasta;
1084                 m->openInputFile(oldfastafile, inFasta);
1085                 
1086                 while (!inFasta.eof()) {
1087                         if (m->control_pressed) {  inFasta.close(); return good;  }
1088                 
1089                         Sequence temp(inFasta);
1090                         
1091                         if (temp.getName() != "") {
1092                                 namesOldFasta.insert(temp.getName());  //save name
1093                                 alignDB.push_back(temp);  //add to DB
1094                         }
1095                         
1096                         m->gobble(inFasta);
1097                 }
1098                 
1099                 inFasta.close();
1100                 
1101                 //read through the column file checking names and removing distances above the cutoff
1102                 ifstream inDist;
1103                 m->openInputFile(column, inDist);
1104                 
1105                 ofstream outDist;
1106                 string outputFile = column + ".temp";
1107                 m->openOutputFile(outputFile, outDist);
1108                 
1109                 string name1, name2;
1110                 float dist;
1111                 while (!inDist.eof()) {
1112                         if (m->control_pressed) {  inDist.close(); outDist.close(); remove(outputFile.c_str()); return good;  }
1113                 
1114                         inDist >> name1 >> name2 >> dist; m->gobble(inDist);
1115                         
1116                         //both names are in fasta file and distance is below cutoff
1117                         if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) {  good = false; break;  }
1118                         else{
1119                                 if (dist <= cutoff) {
1120                                         outDist << name1 << '\t' << name2 << '\t' << dist << endl;
1121                                 }
1122                         }
1123                 }
1124                 
1125                 inDist.close();
1126                 outDist.close();
1127                 
1128                 if (good) {
1129                         remove(column.c_str());
1130                         rename(outputFile.c_str(), column.c_str());
1131                 }else{
1132                         remove(outputFile.c_str()); //temp file is bad because file mismatch above
1133                 }
1134                 
1135         }
1136         catch(exception& e) {
1137                 m->errorOut(e, "DistanceCommand", "m->appendFiles");
1138                 exit(1);
1139         }
1140 }
1141 /**************************************************************************************************/
1142
1143
1144
1145