]> git.donarmstrong.com Git - mothur.git/blob - distancecommand.cpp
done testing 1.13.0
[mothur.git] / distancecommand.cpp
1 /*
2  *  distancecommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 5/7/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "distancecommand.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13 #include "eachgapignore.h"
14 #include "onegapdist.h"
15 #include "onegapignore.h"
16
17 //**********************************************************************************************************************
18
19 DistanceCommand::DistanceCommand(string option) {
20         try {
21                 abort = false;
22                 Estimators.clear();
23                                 
24                 //allow user to run help
25                 if(option == "help") { help(); abort = true; }
26                 
27                 else {
28                         //valid paramters for this command
29                         string Array[] =  {"fasta","oldfasta","column", "output", "calc", "countends", "cutoff", "processors", "outputdir","inputdir"};
30                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
31                         
32                         OptionParser parser(option);
33                         map<string, string> parameters = parser.getParameters();
34                         
35                         ValidParameters validParameter("dist.seqs");
36                         map<string, string>::iterator it2;
37                 
38                         //check to make sure all parameters are valid for command
39                         for (it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
40                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
41                         }
42                         
43                         //if the user changes the input directory command factory will send this info to us in the output parameter 
44                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
45                         if (inputDir == "not found"){   inputDir = "";          }
46                         else {
47                                 string path;
48                                 it2 = parameters.find("fasta");
49                                 //user has given a template file
50                                 if(it2 != parameters.end()){ 
51                                         path = m->hasPath(it2->second);
52                                         //if the user has not given a path then, add inputdir. else leave path alone.
53                                         if (path == "") {       parameters["fasta"] = inputDir + it2->second;           }
54                                 }
55                                 
56                                 it2 = parameters.find("oldfasta");
57                                 //user has given a template file
58                                 if(it2 != parameters.end()){ 
59                                         path = m->hasPath(it2->second);
60                                         //if the user has not given a path then, add inputdir. else leave path alone.
61                                         if (path == "") {       parameters["oldfasta"] = inputDir + it2->second;                }
62                                 }
63                                 
64                                 it2 = parameters.find("column");
65                                 //user has given a template file
66                                 if(it2 != parameters.end()){ 
67                                         path = m->hasPath(it2->second);
68                                         //if the user has not given a path then, add inputdir. else leave path alone.
69                                         if (path == "") {       parameters["column"] = inputDir + it2->second;          }
70                                 }
71                         }
72
73                         //check for required parameters
74                         fastafile = validParameter.validFile(parameters, "fasta", true);
75                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the dist.seqs command."); m->mothurOutEndLine(); abort = true; }
76                         else if (fastafile == "not open") { abort = true; }     
77                         else{
78                                 ifstream inFASTA;
79                                 m->openInputFile(fastafile, inFASTA);
80                                 alignDB = SequenceDB(inFASTA); 
81                                 inFASTA.close();
82                         }
83                         
84                         oldfastafile = validParameter.validFile(parameters, "oldfasta", true);
85                         if (oldfastafile == "not found") { oldfastafile = ""; }
86                         else if (oldfastafile == "not open") { abort = true; }  
87                         
88                         column = validParameter.validFile(parameters, "column", true);
89                         if (column == "not found") { column = ""; }
90                         else if (column == "not open") { abort = true; }        
91                         
92                         //if the user changes the output directory command factory will send this info to us in the output parameter 
93                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
94                                 outputDir = ""; 
95                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
96                         }
97
98                         //check for optional parameter and set defaults
99                         // ...at some point should added some additional type checking...
100                         calc = validParameter.validFile(parameters, "calc", false);                     
101                         if (calc == "not found") { calc = "onegap";  }
102                         else { 
103                                  if (calc == "default")  {  calc = "onegap";  }
104                         }
105                         m->splitAtDash(calc, Estimators);
106
107                         string temp;
108                         temp = validParameter.validFile(parameters, "countends", false);        if(temp == "not found"){        temp = "T";     }
109                         convert(temp, countends); 
110                         
111                         temp = validParameter.validFile(parameters, "cutoff", false);           if(temp == "not found"){        temp = "1.0"; }
112                         convert(temp, cutoff); 
113                         
114                         temp = validParameter.validFile(parameters, "processors", false);       if(temp == "not found"){        temp = "1"; }
115                         convert(temp, processors); 
116                         
117                         output = validParameter.validFile(parameters, "output", false);         if(output == "not found"){      output = "column"; }
118                         
119                         if (((column != "") && (oldfastafile == "")) || ((column == "") && (oldfastafile != ""))) { m->mothurOut("If you provide column or oldfasta, you must provide both."); m->mothurOutEndLine(); abort=true; }
120                         
121                         if ((column != "") && (oldfastafile != "") && (output != "column")) { m->mothurOut("You have provided column and oldfasta, indicating you want to append distances to your column file. Your output must be in column format to do so."); m->mothurOutEndLine(); abort=true; }
122                         
123                         if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column."); m->mothurOutEndLine(); output = "column"; }
124                         
125                         ValidCalculators validCalculator;
126                         
127                         if (m->isTrue(countends) == true) {
128                                 for (int i=0; i<Estimators.size(); i++) {
129                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
130                                                 if (Estimators[i] == "nogaps")                  {       distCalculator = new ignoreGaps();      }
131                                                 else if (Estimators[i] == "eachgap")    {       distCalculator = new eachGapDist();     }
132                                                 else if (Estimators[i] == "onegap")             {       distCalculator = new oneGapDist();      }
133                                         }
134                                 }
135                         }else {
136                                 for (int i=0; i<Estimators.size(); i++) {
137                                         if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) { 
138                                                 if (Estimators[i] == "nogaps")          {       distCalculator = new ignoreGaps();                                      }
139                                                 else if (Estimators[i] == "eachgap"){   distCalculator = new eachGapIgnoreTermGapDist();        }
140                                                 else if (Estimators[i] == "onegap")     {       distCalculator = new oneGapIgnoreTermGapDist();         }
141                                         }
142                                 }
143                         }
144
145                 }
146                                 
147         }
148         catch(exception& e) {
149                 m->errorOut(e, "DistanceCommand", "DistanceCommand");
150                 exit(1);
151         }
152 }
153
154 //**********************************************************************************************************************
155
156 DistanceCommand::~DistanceCommand(){
157         
158         for(int i=0;i<lines.size();i++){
159                 delete lines[i];
160         }
161         
162 }
163         
164 //**********************************************************************************************************************
165
166 void DistanceCommand::help(){
167         try {
168                 m->mothurOut("The dist.seqs command reads a file containing sequences and creates a distance file.\n");
169                 m->mothurOut("The dist.seqs command parameters are fasta, oldfasta, column, calc, countends, output, cutoff and processors.  \n");
170                 m->mothurOut("The fasta parameter is required.\n");
171                 m->mothurOut("The oldfasta and column parameters allow you to append the distances calculated to the column file.\n");
172                 m->mothurOut("The calc parameter allows you to specify the method of calculating the distances.  Your options are: nogaps, onegap or eachgap. The default is onegap.\n");
173                 m->mothurOut("The countends parameter allows you to specify whether to include terminal gaps in distance.  Your options are: T or F. The default is T.\n");
174                 m->mothurOut("The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n");
175                 m->mothurOut("The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n");
176                 m->mothurOut("The processors parameter allows you to specify number of processors to use.  The default is 1.\n");
177                 m->mothurOut("The dist.seqs command should be in the following format: \n");
178                 m->mothurOut("dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n");
179                 m->mothurOut("Example dist.seqs(fasta=amazon.fasta, calc=eachgap, countends=F, cutoff= 2.0, processors=3).\n");
180                 m->mothurOut("Note: No spaces between parameter labels (i.e. calc), '=' and parameters (i.e.yourCalc).\n\n");
181         }
182         catch(exception& e) {
183                 m->errorOut(e, "DistanceCommand", "help");
184                 exit(1);
185         }
186 }
187 //**********************************************************************************************************************
188
189 int DistanceCommand::execute(){
190         try {
191                 
192                 if (abort == true) { return 0; }
193                 
194                 int startTime = time(NULL);
195                 
196                 //save number of new sequence
197                 numNewFasta = alignDB.getNumSeqs();
198                 
199                 //sanity check the oldfasta and column file as well as add oldfasta sequences to alignDB
200                 if ((oldfastafile != "") && (column != ""))  {  if (!(sanityCheck())) { return 0; }  }
201                 
202                 if (m->control_pressed) { return 0; }
203                 
204                 int numSeqs = alignDB.getNumSeqs();
205                 cutoff += 0.005;
206                 
207                 string outputFile;
208                                 
209                 if (output == "lt") { //does the user want lower triangle phylip formatted file 
210                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "phylip.dist";
211                         remove(outputFile.c_str());
212                         
213                         //output numSeqs to phylip formatted dist file
214                 }else if (output == "column") { //user wants column format
215                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
216                         
217                         //so we don't accidentally overwrite
218                         if (outputFile == column) { 
219                                 string tempcolumn = column + ".old"; 
220                                 rename(column.c_str(), tempcolumn.c_str());
221                         }
222                         
223                         remove(outputFile.c_str());
224                 }else { //assume square
225                         outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "square.dist";
226                         remove(outputFile.c_str());
227                 }
228                 
229
230 #ifdef USE_MPI
231                 
232                 int pid, start, end; 
233                 int tag = 2001;
234                                 
235                 MPI_Status status; 
236                 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
237                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
238                 
239                 //each process gets where it should start and stop in the file
240                 if (output != "square") {
241                         start = int (sqrt(float(pid)/float(processors)) * numSeqs);
242                         end = int (sqrt(float(pid+1)/float(processors)) * numSeqs);
243                 }else{
244                         start = int ((float(pid)/float(processors)) * numSeqs);
245                         end = int ((float(pid+1)/float(processors)) * numSeqs);
246                 }
247                 
248                 if (output == "column") {
249                         MPI_File outMPI;
250                         int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
251
252                         //char* filename = new char[outputFile.length()];
253                         //memcpy(filename, outputFile.c_str(), outputFile.length());
254                         
255                         char filename[1024];
256                         strcpy(filename, outputFile.c_str());
257                         
258                         MPI_File_open(MPI_COMM_WORLD, filename, amode, MPI_INFO_NULL, &outMPI);
259                         //delete filename;
260
261                         if (pid == 0) { //you are the root process 
262                         
263                                 //do your part
264                                 string outputMyPart;
265                                 
266                                 driverMPI(start, end, outMPI, cutoff); 
267                                 
268                                 if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
269                         
270                                 //wait on chidren
271                                 for(int i = 1; i < processors; i++) { 
272                                         if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
273                                         
274                                         char buf[4];
275                                         MPI_Recv(buf, 4, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
276                                 }
277                         }else { //you are a child process
278                                 //do your part
279                                 driverMPI(start, end, outMPI, cutoff); 
280                                 
281                                 if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
282                         
283                                 char buf[4];
284                                 strcpy(buf, "done"); 
285                                 //tell parent you are done.
286                                 MPI_Send(buf, 4, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
287                         }
288                         
289                         MPI_File_close(&outMPI);
290                         
291                 }else { //lower triangle format
292                         if (pid == 0) { //you are the root process 
293                         
294                                 //do your part
295                                 string outputMyPart;
296                                 unsigned long int mySize;
297                                 
298                                 if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
299                                 else { driverMPI(start, end, outputFile, mySize, output); }
300         
301                                 if (m->control_pressed) {  delete distCalculator;  return 0; }
302                                 
303                                 int amode=MPI_MODE_APPEND|MPI_MODE_WRONLY|MPI_MODE_CREATE; //
304                                 MPI_File outMPI;
305                                 MPI_File inMPI;
306
307                                 //char* filename = new char[outputFile.length()];
308                                 //memcpy(filename, outputFile.c_str(), outputFile.length());
309                                 
310                                 char filename[1024];
311                                 strcpy(filename, outputFile.c_str());
312
313                                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
314                                 //delete filename;
315
316                                 //wait on chidren
317                                 for(int b = 1; b < processors; b++) { 
318                                         unsigned long int fileSize;
319                                         
320                                         if (m->control_pressed) { MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
321                                         
322                                         MPI_Recv(&fileSize, 1, MPI_LONG, b, tag, MPI_COMM_WORLD, &status); 
323                                         
324                                         string outTemp = outputFile + toString(b) + ".temp";
325
326                                         char* buf = new char[outTemp.length()];
327                                         memcpy(buf, outTemp.c_str(), outTemp.length());
328                                         
329                                         MPI_File_open(MPI_COMM_SELF, buf, MPI_MODE_DELETE_ON_CLOSE|MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
330                                         delete buf;
331
332                                         int count = 0;
333                                         while (count < fileSize) { 
334                                                 char buf2[1];
335                                                 MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status);
336                                                 MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status);
337                                                 count += 1;
338                                         }
339                                         
340                                         MPI_File_close(&inMPI); //deleted on close
341                                 }
342                                 
343                                 MPI_File_close(&outMPI);
344                         }else { //you are a child process
345                                 //do your part
346                                 unsigned long int size;
347                                 if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
348                                 else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
349                                 
350                                 if (m->control_pressed) { delete distCalculator;  return 0; }
351                         
352                                 //tell parent you are done.
353                                 MPI_Send(&size, 1, MPI_LONG, 0, tag, MPI_COMM_WORLD);
354                         }
355                 }
356                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
357 #else           
358                                 
359         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
360                 //if you don't need to fork anything
361                 if(processors == 1){
362                         if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
363                         else { driver(0, numSeqs, outputFile, "square");  }
364                 }else{ //you have multiple processors
365                         
366                         for (int i = 0; i < processors; i++) {
367                                 lines.push_back(new linePair());
368                                 if (output != "square") {
369                                         lines[i]->start = int (sqrt(float(i)/float(processors)) * numSeqs);
370                                         lines[i]->end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
371                                 }else{
372                                         lines[i]->start = int ((float(i)/float(processors)) * numSeqs);
373                                         lines[i]->end = int ((float(i+1)/float(processors)) * numSeqs);
374                                 }
375                         }
376
377                         createProcesses(outputFile); 
378                 
379                         map<int, int>::iterator it = processIDS.begin();
380                         rename((outputFile + toString(it->second) + ".temp").c_str(), outputFile.c_str());
381                         it++;
382                         
383                         //append and remove temp files
384                         for (; it != processIDS.end(); it++) {
385                                 m->appendFiles((outputFile + toString(it->second) + ".temp"), outputFile);
386                                 remove((outputFile + toString(it->second) + ".temp").c_str());
387                         }
388                 }
389         #else
390                 //ifstream inFASTA;
391                 if (output != "square") {  driver(0, numSeqs, outputFile, cutoff); }
392                 else { driver(0, numSeqs, outputFile, "square");  }
393         #endif
394         
395 #endif
396                 if (m->control_pressed) { delete distCalculator; remove(outputFile.c_str()); return 0; }
397                 
398                 #ifdef USE_MPI
399                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
400                                         
401                         if (pid == 0) { //only one process should output to screen
402                 #endif
403                 
404                 //if (output == "square") {  convertMatrix(outputFile); }
405                 
406                 ifstream fileHandle;
407                 fileHandle.open(outputFile.c_str());
408                 if(fileHandle) {
409                         m->gobble(fileHandle);
410                         if (fileHandle.eof()) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff.");  m->mothurOutEndLine(); }
411                 }
412                 
413                 //append the old column file to the new one
414                 if ((oldfastafile != "") && (column != ""))  {
415                         //we had to rename the column file so we didnt overwrite above, but we want to keep old name
416                         if (outputFile == column) { 
417                                 string tempcolumn = column + ".old";
418                                 m->appendFiles(tempcolumn, outputFile);
419                                 remove(tempcolumn.c_str());
420                         }else{
421                                 m->appendFiles(outputFile, column);
422                                 remove(outputFile.c_str());
423                                 outputFile = column;
424                         }
425                         
426                         if (outputDir != "") { 
427                                 string newOutputName = outputDir + m->getSimpleName(outputFile);
428                                 rename(outputFile.c_str(), newOutputName.c_str());
429                                 remove(outputFile.c_str());
430                                 outputFile = newOutputName;
431                         }
432                 }
433
434                 
435                 #ifdef USE_MPI
436                         }
437                 #endif
438                 
439                 if (m->control_pressed) { delete distCalculator; remove(outputFile.c_str()); return 0; }
440                 
441                 delete distCalculator;
442                 
443                 m->mothurOutEndLine();
444                 m->mothurOut("Output File Name: "); m->mothurOutEndLine();
445                 m->mothurOut(outputFile); m->mothurOutEndLine();
446                 m->mothurOutEndLine();
447                 m->mothurOut("It took " + toString(time(NULL) - startTime) + " to calculate the distances for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
448                 return 0;
449                 
450         }
451         catch(exception& e) {
452                 m->errorOut(e, "DistanceCommand", "execute");
453                 exit(1);
454         }
455 }
456 /**************************************************************************************************/
457 void DistanceCommand::createProcesses(string filename) {
458         try {
459 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
460                 int process = 0;
461                 processIDS.clear();
462                 
463                 //loop through and create all the processes you want
464                 while (process != processors) {
465                         int pid = fork();
466                         
467                         if (pid > 0) {
468                                 processIDS[lines[process]->end] = pid;  //create map from line number to pid so you can append files in correct order later
469                                 process++;
470                         }else if (pid == 0){
471                                 if (output != "square") {  driver(lines[process]->start, lines[process]->end, filename + toString(getpid()) + ".temp", cutoff); }
472                                 else { driver(lines[process]->start, lines[process]->end, filename + toString(getpid()) + ".temp", "square"); }
473                                 exit(0);
474                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
475                 }
476         
477                 //force parent to wait until all the processes are done
478                 for (map<int, int>::iterator it = processIDS.begin(); it != processIDS.end(); it++) { 
479                         int temp = it->second;
480                         wait(&temp);
481                 }
482 #endif
483         }
484         catch(exception& e) {
485                 m->errorOut(e, "DistanceCommand", "createProcesses");
486                 exit(1);
487         }
488 }
489
490 /**************************************************************************************************/
491 /////// need to fix to work with calcs and sequencedb
492 int DistanceCommand::driver(int startLine, int endLine, string dFileName, float cutoff){
493         try {
494
495                 int startTime = time(NULL);
496                 
497                 //column file
498                 ofstream outFile(dFileName.c_str(), ios::trunc);
499                 outFile.setf(ios::fixed, ios::showpoint);
500                 outFile << setprecision(4);
501                 
502                 if((output == "lt") && startLine == 0){ outFile << alignDB.getNumSeqs() << endl;        }
503                 
504                 for(int i=startLine;i<endLine;i++){
505                         if(output == "lt")      {       
506                                 string name = alignDB.get(i).getName();
507                                 if (name.length() < 10) { //pad with spaces to make compatible
508                                         while (name.length() < 10) {  name += " ";  }
509                                 }
510                                 outFile << name << '\t';        
511                         }
512                         for(int j=0;j<i;j++){
513                                 
514                                 if (m->control_pressed) { outFile.close(); return 0;  }
515                                 
516                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
517                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
518                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
519                                 
520                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
521                                 double dist = distCalculator->getDist();
522                                 
523                                 if(dist <= cutoff){
524                                         if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
525                                 }
526                                 if (output == "lt") {  outFile << dist << '\t'; }
527                         }
528                         
529                         if (output == "lt") { outFile << endl; }
530                         
531                         if(i % 100 == 0){
532                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
533                         }
534                         
535                 }
536                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
537                 
538                 outFile.close();
539                 
540                 return 1;
541         }
542         catch(exception& e) {
543                 m->errorOut(e, "DistanceCommand", "driver");
544                 exit(1);
545         }
546 }
547 /**************************************************************************************************/
548 /////// need to fix to work with calcs and sequencedb
549 int DistanceCommand::driver(int startLine, int endLine, string dFileName, string square){
550         try {
551
552                 int startTime = time(NULL);
553                 
554                 //column file
555                 ofstream outFile(dFileName.c_str(), ios::trunc);
556                 outFile.setf(ios::fixed, ios::showpoint);
557                 outFile << setprecision(4);
558                 
559                 if(startLine == 0){     outFile << alignDB.getNumSeqs() << endl;        }
560                 
561                 for(int i=startLine;i<endLine;i++){
562                                 
563                         string name = alignDB.get(i).getName();
564                         //pad with spaces to make compatible
565                         if (name.length() < 10) { while (name.length() < 10) {  name += " ";  } }
566                                 
567                         outFile << name << '\t';        
568                         
569                         for(int j=0;j<alignDB.getNumSeqs();j++){
570                                 
571                                 if (m->control_pressed) { outFile.close(); return 0;  }
572                                 
573                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
574                                 double dist = distCalculator->getDist();
575                                 
576                                 outFile << dist << '\t'; 
577                         }
578                         
579                         outFile << endl; 
580                         
581                         if(i % 100 == 0){
582                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
583                         }
584                         
585                 }
586                 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
587                 
588                 outFile.close();
589                 
590                 return 1;
591         }
592         catch(exception& e) {
593                 m->errorOut(e, "DistanceCommand", "driver");
594                 exit(1);
595         }
596 }
597 #ifdef USE_MPI
598 /**************************************************************************************************/
599 /////// need to fix to work with calcs and sequencedb
600 int DistanceCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, float cutoff){
601         try {
602                 MPI_Status status;
603                 int startTime = time(NULL);
604                 
605                 string outputString = "";
606                 
607                 for(int i=startLine;i<endLine;i++){
608         
609                         for(int j=0;j<i;j++){
610                                 
611                                 if (m->control_pressed) {  return 0;  }
612                                 
613                                 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
614                                 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
615                                 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
616                                 
617                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
618                                 double dist = distCalculator->getDist();
619                                 
620                                 if(dist <= cutoff){
621                                          outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n'); 
622                                 }
623                         }
624                         
625                         if(i % 100 == 0){
626                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
627                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
628                         }
629                         
630                          
631                         //send results to parent
632                         int length = outputString.length();
633
634                         char* buf = new char[length];
635                         memcpy(buf, outputString.c_str(), length);
636                         
637                         MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status);
638                         outputString = "";
639                         delete buf;
640                         
641                 }
642                 
643                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
644                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;                
645                 return 1;
646         }
647         catch(exception& e) {
648                 m->errorOut(e, "DistanceCommand", "driverMPI");
649                 exit(1);
650         }
651 }
652 /**************************************************************************************************/
653 /////// need to fix to work with calcs and sequencedb
654 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
655         try {
656                 MPI_Status status;
657                 
658                 MPI_File outMPI;
659                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
660
661                 //char* filename = new char[file.length()];
662                 //memcpy(filename, file.c_str(), file.length());
663                 
664                 char filename[1024];
665                 strcpy(filename, file.c_str());
666
667                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
668                 //delete filename;
669
670                 int startTime = time(NULL);
671                 
672                 string outputString = "";
673                 size = 0;
674                 
675                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
676                 
677                 for(int i=startLine;i<endLine;i++){
678                                 
679                         string name = alignDB.get(i).getName();
680                         if (name.length() < 10) { //pad with spaces to make compatible
681                                 while (name.length() < 10) {  name += " ";  }
682                         }
683                         outputString += name + "\t";    
684                         
685                         for(int j=0;j<i;j++){
686                                 
687                                 if (m->control_pressed) {  return 0;  }
688                                 
689                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
690                                 double dist = distCalculator->getDist();
691                                 
692                                 outputString += toString(dist) + "\t"; 
693                         }
694                         
695                         outputString += "\n"; 
696
697                 
698                         if(i % 100 == 0){
699                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
700                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
701                         }
702                         
703                         
704                         //send results to parent
705                         int length = outputString.length();
706                         char* buf = new char[length];
707                         memcpy(buf, outputString.c_str(), length);
708                         
709                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
710                         size += outputString.length();
711                         outputString = "";
712                         delete buf;
713                 }
714                 
715                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
716                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
717                 MPI_File_close(&outMPI);
718                 
719                 return 1;
720         }
721         catch(exception& e) {
722                 m->errorOut(e, "DistanceCommand", "driverMPI");
723                 exit(1);
724         }
725 }
726 /**************************************************************************************************/
727 /////// need to fix to work with calcs and sequencedb
728 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
729         try {
730                 MPI_Status status;
731                 
732                 MPI_File outMPI;
733                 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
734
735                 //char* filename = new char[file.length()];
736                 //memcpy(filename, file.c_str(), file.length());
737                 
738                 char filename[1024];
739                 strcpy(filename, file.c_str());
740
741                 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
742                 //delete filename;
743
744                 int startTime = time(NULL);
745                 
746                 string outputString = "";
747                 size = 0;
748                 
749                 if(startLine == 0){     outputString += toString(alignDB.getNumSeqs()) + "\n";  }
750                 
751                 for(int i=startLine;i<endLine;i++){
752                                 
753                         string name = alignDB.get(i).getName();
754                         if (name.length() < 10) { //pad with spaces to make compatible
755                                 while (name.length() < 10) {  name += " ";  }
756                         }
757                         outputString += name + "\t";    
758                         
759                         for(int j=0;j<alignDB.getNumSeqs();j++){
760                                 
761                                 if (m->control_pressed) {  return 0;  }
762                                 
763                                 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
764                                 double dist = distCalculator->getDist();
765                                 
766                                 outputString += toString(dist) + "\t"; 
767                         }
768                         
769                         outputString += "\n"; 
770
771                 
772                         if(i % 100 == 0){
773                                 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
774                                 cout << i << '\t' << (time(NULL) - startTime) << endl;
775                         }
776                         
777                         
778                         //send results to parent
779                         int length = outputString.length();
780                         char* buf = new char[length];
781                         memcpy(buf, outputString.c_str(), length);
782                         
783                         MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
784                         size += outputString.length();
785                         outputString = "";
786                         delete buf;
787                 }
788                 
789                 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
790                 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
791                 MPI_File_close(&outMPI);
792                 
793                 return 1;
794         }
795         catch(exception& e) {
796                 m->errorOut(e, "DistanceCommand", "driverMPI");
797                 exit(1);
798         }
799 }
800 #endif
801 /**************************************************************************************************
802 int DistanceCommand::convertMatrix(string outputFile) {
803         try{
804
805                 //sort file by first column so the distances for each row are together
806                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
807                 
808                 //use the unix sort 
809                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
810                         string command = "sort -n " + outputFile + " -o " + outfile;
811                         system(command.c_str());
812                 #else //sort using windows sort
813                         string command = "sort " + outputFile + " /O " + outfile;
814                         system(command.c_str());
815                 #endif
816                 
817
818                 //output to new file distance for each row and save positions in file where new row begins
819                 ifstream in;
820                 m->openInputFile(outfile, in);
821                 
822                 ofstream out;
823                 m->openOutputFile(outputFile, out);
824                 
825                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
826
827                 out << alignDB.getNumSeqs() << endl;
828                 
829                 //get first currentRow
830                 string first, currentRow, second;
831                 float dist;
832                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
833                 map<string, float>::iterator it;
834                 
835                 in >> first;
836                 currentRow = first;
837                 
838                 rowDists[first] = 0.00; //distance to yourself is 0.0
839                 
840                 in.seekg(0);
841                 //m->openInputFile(outfile, in);
842                 
843                 while(!in.eof()) {
844                         if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
845                         
846                         in >> first >> second >> dist; m->gobble(in);
847                                 
848                         if (first != currentRow) {
849                                 //print out last row
850                                 out << currentRow << '\t'; //print name
851
852                                 //print dists
853                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
854                                         out << it->second << '\t';
855                                 }
856                                 out << endl;
857                                 
858                                 //start new row
859                                 currentRow = first;
860                                 rowDists.clear();
861                                 rowDists[first] = 0.00;
862                                 rowDists[second] = dist;
863                         }else{
864                                 rowDists[second] = dist;
865                         }
866                 }
867                 //print out last row
868                 out << currentRow << '\t'; //print name
869                                 
870                 //print dists
871                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
872                         out << it->second << '\t';
873                 }
874                 out << endl;
875                 
876                 in.close();
877                 out.close();
878                 
879                 remove(outfile.c_str());
880                 
881                 return 1;
882                 
883         }
884         catch(exception& e) {
885                 m->errorOut(e, "DistanceCommand", "convertMatrix");
886                 exit(1);
887         }
888 }
889 /**************************************************************************************************
890 int DistanceCommand::convertToLowerTriangle(string outputFile) {
891         try{
892
893                 //sort file by first column so the distances for each row are together
894                 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
895                 
896                 //use the unix sort 
897                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
898                         string command = "sort -n " + outputFile + " -o " + outfile;
899                         system(command.c_str());
900                 #else //sort using windows sort
901                         string command = "sort " + outputFile + " /O " + outfile;
902                         system(command.c_str());
903                 #endif
904                 
905
906                 //output to new file distance for each row and save positions in file where new row begins
907                 ifstream in;
908                 m->openInputFile(outfile, in);
909                 
910                 ofstream out;
911                 m->openOutputFile(outputFile, out);
912                 
913                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
914
915                 out << alignDB.getNumSeqs() << endl;
916                 
917                 //get first currentRow
918                 string first, currentRow, second;
919                 float dist;
920                 int i, j;
921                 i = 0; j = 0;
922                 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
923                 map<string, float>::iterator it;
924                 
925                 in >> first;
926                 currentRow = first;
927                 
928                 rowDists[first] = 0.00; //distance to yourself is 0.0
929                 
930                 in.seekg(0);
931                 //m->openInputFile(outfile, in);
932                 
933                 while(!in.eof()) {
934                         if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
935                         
936                         in >> first >> second >> dist; m->gobble(in);
937                                 
938                         if (first != currentRow) {
939                                 //print out last row
940                                 out << currentRow << '\t'; //print name
941
942                                 //print dists
943                                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
944                                         if (j >= i) { break; }
945                                         out << it->second << '\t';
946                                         j++;
947                                 }
948                                 out << endl;
949                                 
950                                 //start new row
951                                 currentRow = first;
952                                 rowDists.clear();
953                                 rowDists[first] = 0.00;
954                                 rowDists[second] = dist;
955                                 j = 0;
956                                 i++;
957                         }else{
958                                 rowDists[second] = dist;
959                         }
960                 }
961                 //print out last row
962                 out << currentRow << '\t'; //print name
963                                 
964                 //print dists
965                 for (it = rowDists.begin(); it != rowDists.end(); it++) {
966                         out << it->second << '\t';
967                 }
968                 out << endl;
969                 
970                 in.close();
971                 out.close();
972                 
973                 remove(outfile.c_str());
974                 
975                 return 1;
976                 
977         }
978         catch(exception& e) {
979                 m->errorOut(e, "DistanceCommand", "convertToLowerTriangle");
980                 exit(1);
981         }
982 }
983 /**************************************************************************************************/
984 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
985 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
986 //also check to make sure the 2 files have the same alignment length.
987 bool DistanceCommand::sanityCheck() {
988         try{
989                 bool good = true;
990                 
991                 //make sure the 2 fasta files have the same alignment length
992                 ifstream in;
993                 m->openInputFile(fastafile, in);
994                 int fastaAlignLength = 0;
995                 if (in) { 
996                         Sequence tempIn(in);
997                         fastaAlignLength = tempIn.getAligned().length();
998                 }
999                 in.close();
1000                 
1001                 ifstream in2;
1002                 m->openInputFile(oldfastafile, in2);
1003                 int oldfastaAlignLength = 0;
1004                 if (in2) { 
1005                         Sequence tempIn2(in2);
1006                         oldfastaAlignLength = tempIn2.getAligned().length();
1007                 }
1008                 in2.close();
1009                 
1010                 if (fastaAlignLength != oldfastaAlignLength) { m->mothurOut("fasta files do not have the same alignment length."); m->mothurOutEndLine(); return false;  }
1011                 
1012                 //read fasta file and save names as well as adding them to the alignDB
1013                 set<string> namesOldFasta;
1014                 
1015                 ifstream inFasta;
1016                 m->openInputFile(oldfastafile, inFasta);
1017                 
1018                 while (!inFasta.eof()) {
1019                         if (m->control_pressed) {  inFasta.close(); return good;  }
1020                 
1021                         Sequence temp(inFasta);
1022                         
1023                         if (temp.getName() != "") {
1024                                 namesOldFasta.insert(temp.getName());  //save name
1025                                 alignDB.push_back(temp);  //add to DB
1026                         }
1027                         
1028                         m->gobble(inFasta);
1029                 }
1030                 
1031                 inFasta.close();
1032                 
1033                 //read through the column file checking names and removing distances above the cutoff
1034                 ifstream inDist;
1035                 m->openInputFile(column, inDist);
1036                 
1037                 ofstream outDist;
1038                 string outputFile = column + ".temp";
1039                 m->openOutputFile(outputFile, outDist);
1040                 
1041                 string name1, name2;
1042                 float dist;
1043                 while (!inDist.eof()) {
1044                         if (m->control_pressed) {  inDist.close(); outDist.close(); remove(outputFile.c_str()); return good;  }
1045                 
1046                         inDist >> name1 >> name2 >> dist; m->gobble(inDist);
1047                         
1048                         //both names are in fasta file and distance is below cutoff
1049                         if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) {  good = false; break;  }
1050                         else{
1051                                 if (dist <= cutoff) {
1052                                         outDist << name1 << '\t' << name2 << '\t' << dist << endl;
1053                                 }
1054                         }
1055                 }
1056                 
1057                 inDist.close();
1058                 outDist.close();
1059                 
1060                 if (good) {
1061                         remove(column.c_str());
1062                         rename(outputFile.c_str(), column.c_str());
1063                 }else{
1064                         remove(outputFile.c_str()); //temp file is bad because file mismatch above
1065                 }
1066                 
1067         }
1068         catch(exception& e) {
1069                 m->errorOut(e, "DistanceCommand", "m->appendFiles");
1070                 exit(1);
1071         }
1072 }
1073 /**************************************************************************************************/
1074
1075
1076
1077