]> git.donarmstrong.com Git - mothur.git/blob - distancecommand.cpp
unifrac parallelization done, changed dist.seqs output=square to calc all distance...
[mothur.git] / distancecommand.cpp
1 /*
2  *  distancecommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 5/7/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "distancecommand.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13 #include "eachgapignore.h"
14 #include "onegapdist.h"
15 #include "onegapignore.h"
16
17 //**********************************************************************************************************************
18
19 DistanceCommand::DistanceCommand(string option) {
20         try {
21                 abort = false;
22                 Estimators.clear();
23                                 
24                 //allow user to run help
25                 if(option == "help") { help(); abort = true; }
26                 
27                 else {
28                         //valid paramters for this command
29                         string Array[] =  {"fasta","oldfasta","column", "output", "calc", "countends", "cutoff", "processors", "outputdir","inputdir"};
30                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
31                         
32                         OptionParser parser(option);
33                         map<string, string> parameters = parser.getParameters();
34                         
35                         ValidParameters validParameter("dist.seqs");
36                         map<string, string>::iterator it2;
37                 
38                         //check to make sure all parameters are valid for command
39                         for (it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
40                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
41                         }
42                         
43                         //if the user changes the input directory command factory will send this info to us in the output parameter 
44                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
45                         if (inputDir == "not found"){   inputDir = "";          }
46                         else {
47                                 string path;
48                                 it2 = parameters.find("fasta");
49                                 //user has given a template file
50                                 if(it2 != parameters.end()){ 
51                                         path = m->hasPath(it2->second);
52                                         //if the user has not given a path then, add inputdir. else leave path alone.
53                                         if (path == "") {       parameters["fasta"] = inputDir + it2->second;           }
54                                 }
55                                 
56                                 it2 = parameters.find("oldfasta");
57                                 //user has given a template file
58                                 if(it2 != parameters.end()){ 
59                                         path = m->hasPath(it2->second);
60                                         //if the user has not given a path then, add inputdir. else leave path alone.
61                                         if (path == "") {       parameters["oldfasta"] = inputDir + it2->second;                }
62                                 }
63                                 
64                                 it2 = parameters.find("column");
65                                 //user has given a template file
66                                 if(it2 != parameters.end()){ 
67                                         path = m->hasPath(it2->second);
68                                         //if the user has not given a path then, add inputdir. else leave path alone.
69                                         if (path == "") {       parameters["column"] = inputDir + it2->second;          }
70                                 }
71                         }
72
73                         //check for required parameters
74                         fastafile = validParameter.validFile(parameters, "fasta", true);
75                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the dist.seqs command."); m->mothurOutEndLine(); abort = true; }
76                         else if (fastafile == "not open") { abort = true; }     
77                         else{
78                                 ifstream inFASTA;
79                                 m->openInputFile(fastafile, inFASTA);
80                                 alignDB = SequenceDB(inFASTA); 
81                                 inFASTA.close();
82                         }
83                         
84                         oldfastafile = validParameter.validFile(parameters, "oldfasta", true);
85                         if (oldfastafile == "not found") { oldfastafile = ""; }
86                         else if (oldfastafile == "not open") { abort = true; }  
87                         
88                         column = validParameter.validFile(parameters, "column", true);
89                         if (column == "not found") { column = ""; }
90                         else if (column == "not open") { abort = true; }        
91                         
92                         //if the user changes the output directory command factory will send this info to us in the output parameter 
93                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
94                                 outputDir = ""; 
95                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
96                         }
97
98                         //check for optional parameter and set defaults
99                         // ...at some point should added some additional type checking...
100                         calc = validParameter.validFile(parameters, "calc", false);                     
101                         if (calc == "not found") { calc = "onegap";  }
102                         else { 
103                                  if (calc == "default")  {  calc = "onegap";  }
104                         }
105                         m->splitAtDash(calc, Estimators);
106
107                         string temp;
108                         temp = validParameter.validFile(parameters, "countends", false);        if(temp == "not found"){        temp = "T";     }
109                         convert(temp, countends); 
110                         
111                         temp = validParameter.validFile(parameters, "cutoff", false);           if(temp == "not found"){        temp = "1.0"; }
112                         convert(temp, cutoff); 
113                         
114                         temp = validParameter.validFile(parameters, "processors", false);       if(temp == "not found"){        temp = "1"; }
115                         convert(temp, processors); 
116                         
117                         output = validParameter.validFile(parameters, "output", false);         if(output == "not found"){      output = "column"; }
118                         
119                         if (((column != "") && (oldfastafile == "")) || ((column == "") && (oldfastafile != ""))) { m->mothurOut("If you provide column or oldfasta, you must provide both."); m->mothurOutEndLine(); abort=true; }
120                         
121                         if ((column != "") && (oldfastafile != "") && (output != "column")) { m->mothurOut("You have provided column and oldfasta, indicating you want to append distances to your column file. Your output must be in column format to do so."); m->mothurOutEndLine(); abort=true; }
122                         
123                         if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column."); m->mothurOutEndLine(); output = "column"; }
124                         
125                         ValidCalculators validCalculator;
126                         
127                         if (m->isTrue(countends) == true) {
128                                 for (int i=0; i<Estimators.size(); i++) {
129                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
130                                                 if (Estimators[i] == "nogaps")                  {       distCalculator = new ignoreGaps();      }
131                                                 else if (Estimators[i] == "eachgap")    {       distCalculator = new eachGapDist();     }
132                                                 else if (Estimators[i] == "onegap")             {       distCalculator = new oneGapDist();      }
133                                         }
134                                 }
135                         }else {
136                                 for (int i=0; i<Estimators.size(); i++) {
137                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
138                                                 if (Estimators[i] == "nogaps")          {       distCalculator = new ignoreGaps();                                      }
139                                                 else if (Estimators[i] == "eachgap"){   distCalculator = new eachGapIgnoreTermGapDist();        }
140                                                 else if (Estimators[i] == "onegap")     {       distCalculator = new oneGapIgnoreTermGapDist();         }
141                                         }
142                                 }
143                         }
144
145                 }
146                                 
147         }
148         catch(exception& e) {
149                 m->errorOut(e, "DistanceCommand", "DistanceCommand");
150                 exit(1);
151         }
152 }
153
154 //**********************************************************************************************************************
155
156 DistanceCommand::~DistanceCommand(){
157         
158         for(int i=0;i<lines.size();i++){
159                 delete lines[i];
160         }
161         
162 }
163         
164 //**********************************************************************************************************************
165
166 void DistanceCommand::help(){
167         try {
168                 m->mothurOut("The dist.seqs command reads a file containing sequences and creates a distance file.\n");
169                 m->mothurOut("The dist.seqs command parameters are fasta, oldfasta, column, calc, countends, output, cutoff and processors.  \n");
170                 m->mothurOut("The fasta parameter is required.\n");
171                 m->mothurOut("The oldfasta and column parameters allow you to append the distances calculated to the column file.\n");
172                 m->mothurOut("The calc parameter allows you to specify the method of calculating the distances.  Your options are: nogaps, onegap or eachgap. The default is onegap.\n");
173                 m->mothurOut("The countends parameter allows you to specify whether to include terminal gaps in distance.  Your options are: T or F. The default is T.\n");
174                 m->mothurOut("The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n");
175                 m->mothurOut("The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n");
176                 m->mothurOut("The processors parameter allows you to specify number of processors to use.  The default is 1.\n");
177                 m->mothurOut("The dist.seqs command should be in the following format: \n");
178                 m->mothurOut("dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n");
179                 m->mothurOut("Example dist.seqs(fasta=amazon.fasta, calc=eachgap, countends=F, cutoff= 2.0, processors=3).\n");
180                 m->mothurOut("Note: No spaces between parameter labels (i.e. calc), '=' and parameters (i.e.yourCalc).\n\n");
181         }
182         catch(exception& e) {
183                 m->errorOut(e, "DistanceCommand", "help");
184                 exit(1);
185         }
186 }
187 //**********************************************************************************************************************
188
189 int DistanceCommand::execute(){
190         try {
191                 
192                 if (abort == true) { return 0; }
193                 
194                 int startTime = time(NULL);
195                 
196                 //save number of new sequence
197                 numNewFasta = alignDB.getNumSeqs();
198                 
199                 //sanity check the oldfasta and column file as well as add oldfasta sequences to alignDB
200                 if ((oldfastafile != "") && (column != ""))  {  if (!(sanityCheck())) { return 0; }  }
201                 
202                 if (m->control_pressed) { return 0; }
203                 
204                 int numSeqs = alignDB.getNumSeqs();
205                 cutoff += 0.005;
206                 
207                 string outputFile;
208                                 
209                 if (output == "lt") { //does the user want lower triangle phylip formatted file 
210                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "phylip.dist";
211                         remove(outputFile.c_str());
212                         
213                         //output numSeqs to phylip formatted dist file
214                 }else if (output == "column") { //user wants column format
215                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
216                         
217                         //so we don't accidentally overwrite
218                         if (outputFile == column) { 
219                                 string tempcolumn = column + ".old"; 
220                                 rename(column.c_str(), tempcolumn.c_str());
221                         }
222                         
223                         remove(outputFile.c_str());
224                 }else { //assume square
225                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "square.dist";
226                         remove(outputFile.c_str());
227                 }
228                 
229
230 #ifdef USE_MPI
231                 
232                 int pid, start, end; 
233                 int tag = 2001;
234                                 
235                 MPI_Status status; 
236                 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
237                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
238                 
239                 //each process gets where it should start and stop in the file
240                 if (output != "square") {
241                         start = int (sqrt(float(pid)/float(processors)) * numSeqs);
242                         end = int (sqrt(float(pid+1)/float(processors)) * numSeqs);
243                 }else{
244                         start = int ((float(pid)/float(processors)) * numSeqs);
245                         end = int ((float(pid+1)/float(processors)) * numSeqs);
246                 }
247                 
248                 if (output == "column") {
249                         MPI_File outMPI;
250                         int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
251
252                         //char* filename = new char[outputFile.length()];
253                         //memcpy(filename, outputFile.c_str(), outputFile.length());
254                         
255                         char filename[1024];
256                         strcpy(filename, outputFile.c_str());
257                         
258                         MPI_File_open(MPI_COMM_WORLD, filename, amode, MPI_INFO_NULL, &outMPI);
259                         //delete filename;
260
261                         if (pid == 0) { //you are the root process 
262                         
263                                 //do your part
264                                 string outputMyPart;
265                                 
266                                 driverMPI(start, end, outMPI, cutoff); 
267                                 
268                                 if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
269                         
270                                 //wait on chidren
271                                 for(int i = 1; i < processors; i++) { 
272                                         if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
273                                         
274                                         char buf[4];
275                                         MPI_Recv(buf, 4, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
276                                 }
277                         }else { //you are a child process
278                                 //do your part
279                                 driverMPI(start, end, outMPI, cutoff); 
280                                 
281                                 if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
282                         
283                                 char buf[4];
284                                 strcpy(buf, "done"); 
285                                 //tell parent you are done.
286                                 MPI_Send(buf, 4, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
287                         }
288                         
289                         MPI_File_close(&outMPI);
290                         
291                 }else { //lower triangle format
292                         if (pid == 0) { //you are the root process 
293                         
294                                 //do your part
295                                 string outputMyPart;
296                                 unsigned long int mySize;
297                                 
298                                 if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
299                                 else { driverMPI(start, end, outputFile, mySize, output); }
300         
301                                 if (m->control_pressed) {  delete distCalculator;  return 0; }
302                                 
303                                 int amode=MPI_MODE_APPEND|MPI_MODE_WRONLY|MPI_MODE_CREATE; //
304                                 MPI_File outMPI;
305                                 MPI_File inMPI;
306
307                                 //char* filename = new char[outputFile.length()];
308                                 //memcpy(filename, outputFile.c_str(), outputFile.length());
309                                 
310                                 char filename[1024];
311                                 strcpy(filename, outputFile.c_str());
312
313                                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
314                                 //delete filename;
315
316                                 //wait on chidren
317                                 for(int b = 1; b < processors; b++) { 
318                                         unsigned long int fileSize;
319                                         
320                                         if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
321                                         
322                                         MPI_Recv(&fileSize, 1, MPI_LONG, b, tag, MPI_COMM_WORLD, &status); 
323                                         
324                                         string outTemp = outputFile + toString(b) + ".temp";
325
326                                         char* buf = new char[outTemp.length()];
327                                         memcpy(buf, outTemp.c_str(), outTemp.length());
328                                         
329                                         MPI_File_open(MPI_COMM_SELF, buf, MPI_MODE_DELETE_ON_CLOSE|MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
330                                         delete buf;
331
332                                         int count = 0;
333                                         while (count < fileSize) { 
334                                                 char buf2[1];
335                                                 MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status);
336                                                 MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status);
337                                                 count += 1;
338                                         }
339                                         
340                                         MPI_File_close(&inMPI); //deleted on close
341                                 }
342                                 
343                                 MPI_File_close(&outMPI);
344                         }else { //you are a child process
345                                 //do your part
346                                 unsigned long int size;
347                                 if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
348                                 else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
349                                 
350                                 if (m->control_pressed) { delete distCalculator;  return 0; }
351                         
352                                 //tell parent you are done.
353                                 MPI_Send(&size, 1, MPI_LONG, 0, tag, MPI_COMM_WORLD);
354                         }
355                 }
356                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
357 #else           
358                                 
359         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
360                 //if you don't need to fork anything
361                 if(processors == 1){
362                         if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
363                         else { driver(0, numSeqs, outputFile, "square");  }
364                 }else{ //you have multiple processors
365                         
366                         for (int i = 0; i < processors; i++) {
367                                 lines.push_back(new linePair());
368                                 if (output != "square") {
369                                         lines[i]->start = int (sqrt(float(i)/float(processors)) * numSeqs);
370                                         lines[i]->end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
371                                 }else{
372                                         lines[i]->start = int ((float(i)/float(processors)) * numSeqs);
373                                         lines[i]->end = int ((float(i+1)/float(processors)) * numSeqs);
374                                 }
375                         }
376
377                         createProcesses(outputFile); 
378                 
379                         map<int, int>::iterator it = processIDS.begin();
380                         rename((outputFile + toString(it->second) + ".temp").c_str(), outputFile.c_str());
381                         it++;
382                         
383                         //append and remove temp files
384                         for (; it != processIDS.end(); it++) {
385                                 m->appendFiles((outputFile + toString(it->second) + ".temp"), outputFile);
386                                 remove((outputFile + toString(it->second) + ".temp").c_str());
387                         }
388                 }
389         #else
390                 //ifstream inFASTA;
391                 if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
392                 else { driver(0, numSeqs, outputFile, "square");  }
393         #endif
394         
395 #endif
396                 if (m->control_pressed) { delete distCalculator; remove(outputFile.c_str()); return 0; }
397                 
398                 #ifdef USE_MPI
399                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
400                                         
401                         if (pid == 0) { //only one process should output to screen
402                 #endif
403                 
404                 //if (output == "square") {  convertMatrix(outputFile); }
405                 
406                 ifstream fileHandle;
407                 fileHandle.open(outputFile.c_str());
408                 if(fileHandle) {
409                         m->gobble(fileHandle);
410                         if (fileHandle.eof()) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff.");  m->mothurOutEndLine(); }
411                 }
412                 
413                 //append the old column file to the new one
414                 if ((oldfastafile != "") && (column != ""))  {
415                         //we had to rename the column file so we didnt overwrite above, but we want to keep old name
416                         if (outputFile == column) { 
417                                 string tempcolumn = column + ".old";
418                                 m->appendFiles(tempcolumn, outputFile);
419                                 remove(tempcolumn.c_str());
420                         }else{
421                                 m->appendFiles(outputFile, column);
422                                 remove(outputFile.c_str());
423                                 outputFile = column;
424                         }
425                 }
426
427                 
428                 #ifdef USE_MPI
429                         }
430                 #endif
431                 
432                 if (m->control_pressed) { delete distCalculator; remove(outputFile.c_str()); return 0; }
433                 
434                 delete distCalculator;
435                 
436                 m->mothurOutEndLine();
437                 m->mothurOut("Output File Name: "); m->mothurOutEndLine();
438                 m->mothurOut(outputFile); m->mothurOutEndLine();
439                 m->mothurOutEndLine();
440                 m->mothurOut("It took " + toString(time(NULL) - startTime) + " to calculate the distances for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
441                 return 0;
442                 
443         }
444         catch(exception& e) {
445                 m->errorOut(e, "DistanceCommand", "execute");
446                 exit(1);
447         }
448 }
449 /**************************************************************************************************/
450 void DistanceCommand::createProcesses(string filename) {
451         try {
452 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
453                 int process = 0;
454                 processIDS.clear();
455                 
456                 //loop through and create all the processes you want
457                 while (process != processors) {
458                         int pid = fork();
459                         
460                         if (pid > 0) {
461                                 processIDS[lines[process]->end] = pid;  //create map from line number to pid so you can append files in correct order later
462                                 process++;
463                         }else if (pid == 0){
464                                 if (output != "square") {  driver(lines[process]->start, lines[process]->end, filename + toString(getpid()) + ".temp", cutoff); }
465                                 else { driver(lines[process]->start, lines[process]->end, filename + toString(getpid()) + ".temp", "square"); }
466                                 exit(0);
467                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
468                 }
469         
470                 //force parent to wait until all the processes are done
471                 for (map<int, int>::iterator it = processIDS.begin(); it != processIDS.end(); it++) { 
472                         int temp = it->second;
473                         wait(&temp);
474                 }
475 #endif
476         }
477         catch(exception& e) {
478                 m->errorOut(e, "DistanceCommand", "createProcesses");
479                 exit(1);
480         }
481 }
482
483 /**************************************************************************************************/
484 /////// need to fix to work with calcs and sequencedb
485 int DistanceCommand::driver(int startLine, int endLine, string dFileName, float cutoff){
486         try {
487
488                 int startTime = time(NULL);
489                 
490                 //column file
491                 ofstream outFile(dFileName.c_str(), ios::trunc);
492                 outFile.setf(ios::fixed, ios::showpoint);
493                 outFile << setprecision(4);
494                 
495                 if((output == "lt") && startLine == 0){ outFile << alignDB.getNumSeqs() << endl;        }
496                 
497                 for(int i=startLine;i<endLine;i++){
498                         if(output == "lt")      {       
499                                 string name = alignDB.get(i).getName();
500                                 if (name.length() < 10) { //pad with spaces to make compatible
501                                         while (name.length() < 10) {  name += " ";  }
502                                 }
503                                 outFile << name << '\t';        
504                         }
505                         for(int j=0;j<i;j++){
506                                 
507                                 if (m->control_pressed) { outFile.close(); return 0;  }
508                                 
509                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
510                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
511                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
512                                 
513                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
514                                 double dist = distCalculator->getDist();
515                                 
516                                 if(dist <= cutoff){
517                                         if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
518                                 }
519                                 if (output == "lt") {  outFile << dist << '\t'; }
520                         }
521                         
522                         if (output == "lt") { outFile << endl; }
523                         
524                         if(i % 100 == 0){
525                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
526                         }
527                         
528                 }
529                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
530                 
531                 outFile.close();
532                 
533                 return 1;
534         }
535         catch(exception& e) {
536                 m->errorOut(e, "DistanceCommand", "driver");
537                 exit(1);
538         }
539 }
540 /**************************************************************************************************/
541 /////// need to fix to work with calcs and sequencedb
542 int DistanceCommand::driver(int startLine, int endLine, string dFileName, string square){
543         try {
544
545                 int startTime = time(NULL);
546                 
547                 //column file
548                 ofstream outFile(dFileName.c_str(), ios::trunc);
549                 outFile.setf(ios::fixed, ios::showpoint);
550                 outFile << setprecision(4);
551                 
552                 if(startLine == 0){     outFile << alignDB.getNumSeqs() << endl;        }
553                 
554                 for(int i=startLine;i<endLine;i++){
555                                 
556                         string name = alignDB.get(i).getName();
557                         //pad with spaces to make compatible
558                         if (name.length() < 10) { while (name.length() < 10) {  name += " ";  } }
559                                 
560                         outFile << name << '\t';        
561                         
562                         for(int j=0;j<alignDB.getNumSeqs();j++){
563                                 
564                                 if (m->control_pressed) { outFile.close(); return 0;  }
565                                 
566                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
567                                 double dist = distCalculator->getDist();
568                                 
569                                 outFile << dist << '\t'; 
570                         }
571                         
572                         outFile << endl; 
573                         
574                         if(i % 100 == 0){
575                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
576                         }
577                         
578                 }
579                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
580                 
581                 outFile.close();
582                 
583                 return 1;
584         }
585         catch(exception& e) {
586                 m->errorOut(e, "DistanceCommand", "driver");
587                 exit(1);
588         }
589 }
590 #ifdef USE_MPI
591 /**************************************************************************************************/
592 /////// need to fix to work with calcs and sequencedb
593 int DistanceCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, float cutoff){
594         try {
595                 MPI_Status status;
596                 int startTime = time(NULL);
597                 
598                 string outputString = "";
599                 
600                 for(int i=startLine;i<endLine;i++){
601         
602                         for(int j=0;j<i;j++){
603                                 
604                                 if (m->control_pressed) {  return 0;  }
605                                 
606                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
607                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
608                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
609                                 
610                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
611                                 double dist = distCalculator->getDist();
612                                 
613                                 if(dist <= cutoff){
614                                          outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n'); 
615                                 }
616                         }
617                         
618                         if(i % 100 == 0){
619                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
620                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
621                         }
622                         
623                          
624                         //send results to parent
625                         int length = outputString.length();
626
627                         char* buf = new char[length];
628                         memcpy(buf, outputString.c_str(), length);
629                         
630                         MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status);
631                         outputString = "";
632                         delete buf;
633                         
634                 }
635                 
636                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
637                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;                
638                 return 1;
639         }
640         catch(exception& e) {
641                 m->errorOut(e, "DistanceCommand", "driverMPI");
642                 exit(1);
643         }
644 }
645 /**************************************************************************************************/
646 /////// need to fix to work with calcs and sequencedb
647 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
648         try {
649                 MPI_Status status;
650                 
651                 MPI_File outMPI;
652                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
653
654                 //char* filename = new char[file.length()];
655                 //memcpy(filename, file.c_str(), file.length());
656                 
657                 char filename[1024];
658                 strcpy(filename, file.c_str());
659
660                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
661                 //delete filename;
662
663                 int startTime = time(NULL);
664                 
665                 string outputString = "";
666                 size = 0;
667                 
668                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
669                 
670                 for(int i=startLine;i<endLine;i++){
671                                 
672                         string name = alignDB.get(i).getName();
673                         if (name.length() < 10) { //pad with spaces to make compatible
674                                 while (name.length() < 10) {  name += " ";  }
675                         }
676                         outputString += name + "\t";    
677                         
678                         for(int j=0;j<i;j++){
679                                 
680                                 if (m->control_pressed) {  return 0;  }
681                                 
682                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
683                                 double dist = distCalculator->getDist();
684                                 
685                                 outputString += toString(dist) + "\t"; 
686                         }
687                         
688                         outputString += "\n"; 
689
690                 
691                         if(i % 100 == 0){
692                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
693                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
694                         }
695                         
696                         
697                         //send results to parent
698                         int length = outputString.length();
699                         char* buf = new char[length];
700                         memcpy(buf, outputString.c_str(), length);
701                         
702                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
703                         size += outputString.length();
704                         outputString = "";
705                         delete buf;
706                 }
707                 
708                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
709                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
710                 MPI_File_close(&outMPI);
711                 
712                 return 1;
713         }
714         catch(exception& e) {
715                 m->errorOut(e, "DistanceCommand", "driverMPI");
716                 exit(1);
717         }
718 }
719 /**************************************************************************************************/
720 /////// need to fix to work with calcs and sequencedb
721 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
722         try {
723                 MPI_Status status;
724                 
725                 MPI_File outMPI;
726                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
727
728                 //char* filename = new char[file.length()];
729                 //memcpy(filename, file.c_str(), file.length());
730                 
731                 char filename[1024];
732                 strcpy(filename, file.c_str());
733
734                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
735                 //delete filename;
736
737                 int startTime = time(NULL);
738                 
739                 string outputString = "";
740                 size = 0;
741                 
742                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
743                 
744                 for(int i=startLine;i<endLine;i++){
745                                 
746                         string name = alignDB.get(i).getName();
747                         if (name.length() < 10) { //pad with spaces to make compatible
748                                 while (name.length() < 10) {  name += " ";  }
749                         }
750                         outputString += name + "\t";    
751                         
752                         for(int j=0;j<alignDB.getNumSeqs();j++){
753                                 
754                                 if (m->control_pressed) {  return 0;  }
755                                 
756                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
757                                 double dist = distCalculator->getDist();
758                                 
759                                 outputString += toString(dist) + "\t"; 
760                         }
761                         
762                         outputString += "\n"; 
763
764                 
765                         if(i % 100 == 0){
766                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
767                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
768                         }
769                         
770                         
771                         //send results to parent
772                         int length = outputString.length();
773                         char* buf = new char[length];
774                         memcpy(buf, outputString.c_str(), length);
775                         
776                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
777                         size += outputString.length();
778                         outputString = "";
779                         delete buf;
780                 }
781                 
782                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
783                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
784                 MPI_File_close(&outMPI);
785                 
786                 return 1;
787         }
788         catch(exception& e) {
789                 m->errorOut(e, "DistanceCommand", "driverMPI");
790                 exit(1);
791         }
792 }
793 #endif
794 /**************************************************************************************************
795 int DistanceCommand::convertMatrix(string outputFile) {
796         try{
797
798                 //sort file by first column so the distances for each row are together
799                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
800                 
801                 //use the unix sort 
802                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
803                         string command = "sort -n " + outputFile + " -o " + outfile;
804                         system(command.c_str());
805                 #else //sort using windows sort
806                         string command = "sort " + outputFile + " /O " + outfile;
807                         system(command.c_str());
808                 #endif
809                 
810
811                 //output to new file distance for each row and save positions in file where new row begins
812                 ifstream in;
813                 m->openInputFile(outfile, in);
814                 
815                 ofstream out;
816                 m->openOutputFile(outputFile, out);
817                 
818                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
819
820                 out << alignDB.getNumSeqs() << endl;
821                 
822                 //get first currentRow
823                 string first, currentRow, second;
824                 float dist;
825                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
826                 map<string, float>::iterator it;
827                 
828                 in >> first;
829                 currentRow = first;
830                 
831                 rowDists[first] = 0.00; //distance to yourself is 0.0
832                 
833                 in.seekg(0);
834                 //m->openInputFile(outfile, in);
835                 
836                 while(!in.eof()) {
837                         if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
838                         
839                         in >> first >> second >> dist; m->gobble(in);
840                                 
841                         if (first != currentRow) {
842                                 //print out last row
843                                 out << currentRow << '\t'; //print name
844
845                                 //print dists
846                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
847                                         out << it->second << '\t';
848                                 }
849                                 out << endl;
850                                 
851                                 //start new row
852                                 currentRow = first;
853                                 rowDists.clear();
854                                 rowDists[first] = 0.00;
855                                 rowDists[second] = dist;
856                         }else{
857                                 rowDists[second] = dist;
858                         }
859                 }
860                 //print out last row
861                 out << currentRow << '\t'; //print name
862                                 
863                 //print dists
864                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
865                         out << it->second << '\t';
866                 }
867                 out << endl;
868                 
869                 in.close();
870                 out.close();
871                 
872                 remove(outfile.c_str());
873                 
874                 return 1;
875                 
876         }
877         catch(exception& e) {
878                 m->errorOut(e, "DistanceCommand", "convertMatrix");
879                 exit(1);
880         }
881 }
882 /**************************************************************************************************
883 int DistanceCommand::convertToLowerTriangle(string outputFile) {
884         try{
885
886                 //sort file by first column so the distances for each row are together
887                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
888                 
889                 //use the unix sort 
890                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
891                         string command = "sort -n " + outputFile + " -o " + outfile;
892                         system(command.c_str());
893                 #else //sort using windows sort
894                         string command = "sort " + outputFile + " /O " + outfile;
895                         system(command.c_str());
896                 #endif
897                 
898
899                 //output to new file distance for each row and save positions in file where new row begins
900                 ifstream in;
901                 m->openInputFile(outfile, in);
902                 
903                 ofstream out;
904                 m->openOutputFile(outputFile, out);
905                 
906                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
907
908                 out << alignDB.getNumSeqs() << endl;
909                 
910                 //get first currentRow
911                 string first, currentRow, second;
912                 float dist;
913                 int i, j;
914                 i = 0; j = 0;
915                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
916                 map<string, float>::iterator it;
917                 
918                 in >> first;
919                 currentRow = first;
920                 
921                 rowDists[first] = 0.00; //distance to yourself is 0.0
922                 
923                 in.seekg(0);
924                 //m->openInputFile(outfile, in);
925                 
926                 while(!in.eof()) {
927                         if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
928                         
929                         in >> first >> second >> dist; m->gobble(in);
930                                 
931                         if (first != currentRow) {
932                                 //print out last row
933                                 out << currentRow << '\t'; //print name
934
935                                 //print dists
936                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
937                                         if (j >= i) { break; }
938                                         out << it->second << '\t';
939                                         j++;
940                                 }
941                                 out << endl;
942                                 
943                                 //start new row
944                                 currentRow = first;
945                                 rowDists.clear();
946                                 rowDists[first] = 0.00;
947                                 rowDists[second] = dist;
948                                 j = 0;
949                                 i++;
950                         }else{
951                                 rowDists[second] = dist;
952                         }
953                 }
954                 //print out last row
955                 out << currentRow << '\t'; //print name
956                                 
957                 //print dists
958                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
959                         out << it->second << '\t';
960                 }
961                 out << endl;
962                 
963                 in.close();
964                 out.close();
965                 
966                 remove(outfile.c_str());
967                 
968                 return 1;
969                 
970         }
971         catch(exception& e) {
972                 m->errorOut(e, "DistanceCommand", "convertToLowerTriangle");
973                 exit(1);
974         }
975 }
976 /**************************************************************************************************/
977 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
978 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
979 //also check to make sure the 2 files have the same alignment length.
980 bool DistanceCommand::sanityCheck() {
981         try{
982                 bool good = true;
983                 
984                 //make sure the 2 fasta files have the same alignment length
985                 ifstream in;
986                 m->openInputFile(fastafile, in);
987                 int fastaAlignLength = 0;
988                 if (in) { 
989                         Sequence tempIn(in);
990                         fastaAlignLength = tempIn.getAligned().length();
991                 }
992                 in.close();
993                 
994                 ifstream in2;
995                 m->openInputFile(oldfastafile, in2);
996                 int oldfastaAlignLength = 0;
997                 if (in2) { 
998                         Sequence tempIn2(in2);
999                         oldfastaAlignLength = tempIn2.getAligned().length();
1000                 }
1001                 in2.close();
1002                 
1003                 if (fastaAlignLength != oldfastaAlignLength) { m->mothurOut("fasta files do not have the same alignment length."); m->mothurOutEndLine(); return false;  }
1004                 
1005                 //read fasta file and save names as well as adding them to the alignDB
1006                 set<string> namesOldFasta;
1007                 
1008                 ifstream inFasta;
1009                 m->openInputFile(oldfastafile, inFasta);
1010                 
1011                 while (!inFasta.eof()) {
1012                         if (m->control_pressed) {  inFasta.close(); return good;  }
1013                 
1014                         Sequence temp(inFasta);
1015                         
1016                         if (temp.getName() != "") {
1017                                 namesOldFasta.insert(temp.getName());  //save name
1018                                 alignDB.push_back(temp);  //add to DB
1019                         }
1020                         
1021                         m->gobble(inFasta);
1022                 }
1023                 
1024                 inFasta.close();
1025                 
1026                 //read through the column file checking names and removing distances above the cutoff
1027                 ifstream inDist;
1028                 m->openInputFile(column, inDist);
1029                 
1030                 ofstream outDist;
1031                 string outputFile = column + ".temp";
1032                 m->openOutputFile(outputFile, outDist);
1033                 
1034                 string name1, name2;
1035                 float dist;
1036                 while (!inDist.eof()) {
1037                         if (m->control_pressed) {  inDist.close(); outDist.close(); remove(outputFile.c_str()); return good;  }
1038                 
1039                         inDist >> name1 >> name2 >> dist; m->gobble(inDist);
1040                         
1041                         //both names are in fasta file and distance is below cutoff
1042                         if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) {  good = false; break;  }
1043                         else{
1044                                 if (dist <= cutoff) {
1045                                         outDist << name1 << '\t' << name2 << '\t' << dist << endl;
1046                                 }
1047                         }
1048                 }
1049                 
1050                 inDist.close();
1051                 outDist.close();
1052                 
1053                 if (good) {
1054                         remove(column.c_str());
1055                         rename(outputFile.c_str(), column.c_str());
1056                 }else{
1057                         remove(outputFile.c_str()); //temp file is bad because file mismatch above
1058                 }
1059                 
1060         }
1061         catch(exception& e) {
1062                 m->errorOut(e, "DistanceCommand", "m->appendFiles");
1063                 exit(1);
1064         }
1065 }
1066 /**************************************************************************************************/
1067
1068
1069
1070