5 * Created by Sarah Westcott on 5/7/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "distancecommand.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13 #include "eachgapignore.h"
14 #include "onegapdist.h"
15 #include "onegapignore.h"
17 //**********************************************************************************************************************
18 vector<string> DistanceCommand::getValidParameters(){
20 string Array[] = {"fasta","oldfasta","column", "output", "calc", "countends", "cutoff", "processors", "outputdir","inputdir","compress"};
21 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25 m->errorOut(e, "DistanceCommand", "getValidParameters");
29 //**********************************************************************************************************************
30 DistanceCommand::DistanceCommand(){
32 //initialize outputTypes
33 vector<string> tempOutNames;
34 outputTypes["phylip"] = tempOutNames;
35 outputTypes["column"] = tempOutNames;
38 m->errorOut(e, "DistanceCommand", "DistanceCommand");
42 //**********************************************************************************************************************
43 vector<string> DistanceCommand::getRequiredParameters(){
45 string Array[] = {"fasta"};
46 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
50 m->errorOut(e, "ChopSeqsCommand", "getRequiredParameters");
54 //**********************************************************************************************************************
55 vector<string> DistanceCommand::getRequiredFiles(){
57 vector<string> myArray;
61 m->errorOut(e, "DistanceCommand", "getRequiredFiles");
65 //**********************************************************************************************************************
66 DistanceCommand::DistanceCommand(string option) {
71 //allow user to run help
72 if(option == "help") { help(); abort = true; }
75 //valid paramters for this command
76 string Array[] = {"fasta","oldfasta","column", "output", "calc", "countends", "cutoff", "processors", "outputdir","inputdir","compress"};
78 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
80 OptionParser parser(option);
81 map<string, string> parameters = parser.getParameters();
83 ValidParameters validParameter("dist.seqs");
84 map<string, string>::iterator it2;
86 //check to make sure all parameters are valid for command
87 for (it2 = parameters.begin(); it2 != parameters.end(); it2++) {
88 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) { abort = true; }
91 //initialize outputTypes
92 vector<string> tempOutNames;
93 outputTypes["phylip"] = tempOutNames;
94 outputTypes["column"] = tempOutNames;
96 //if the user changes the input directory command factory will send this info to us in the output parameter
97 string inputDir = validParameter.validFile(parameters, "inputdir", false);
98 if (inputDir == "not found"){ inputDir = ""; }
101 it2 = parameters.find("fasta");
102 //user has given a template file
103 if(it2 != parameters.end()){
104 path = m->hasPath(it2->second);
105 //if the user has not given a path then, add inputdir. else leave path alone.
106 if (path == "") { parameters["fasta"] = inputDir + it2->second; }
109 it2 = parameters.find("oldfasta");
110 //user has given a template file
111 if(it2 != parameters.end()){
112 path = m->hasPath(it2->second);
113 //if the user has not given a path then, add inputdir. else leave path alone.
114 if (path == "") { parameters["oldfasta"] = inputDir + it2->second; }
117 it2 = parameters.find("column");
118 //user has given a template file
119 if(it2 != parameters.end()){
120 path = m->hasPath(it2->second);
121 //if the user has not given a path then, add inputdir. else leave path alone.
122 if (path == "") { parameters["column"] = inputDir + it2->second; }
126 //check for required parameters
127 fastafile = validParameter.validFile(parameters, "fasta", true);
128 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the dist.seqs command."); m->mothurOutEndLine(); abort = true; }
129 else if (fastafile == "not open") { abort = true; }
132 m->openInputFile(fastafile, inFASTA);
133 alignDB = SequenceDB(inFASTA);
137 oldfastafile = validParameter.validFile(parameters, "oldfasta", true);
138 if (oldfastafile == "not found") { oldfastafile = ""; }
139 else if (oldfastafile == "not open") { abort = true; }
141 column = validParameter.validFile(parameters, "column", true);
142 if (column == "not found") { column = ""; }
143 else if (column == "not open") { abort = true; }
145 //if the user changes the output directory command factory will send this info to us in the output parameter
146 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
148 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it
151 //check for optional parameter and set defaults
152 // ...at some point should added some additional type checking...
153 calc = validParameter.validFile(parameters, "calc", false);
154 if (calc == "not found") { calc = "onegap"; }
156 if (calc == "default") { calc = "onegap"; }
158 m->splitAtDash(calc, Estimators);
161 temp = validParameter.validFile(parameters, "countends", false); if(temp == "not found"){ temp = "T"; }
162 convert(temp, countends);
164 temp = validParameter.validFile(parameters, "cutoff", false); if(temp == "not found"){ temp = "1.0"; }
165 convert(temp, cutoff);
167 temp = validParameter.validFile(parameters, "processors", false); if(temp == "not found"){ temp = "1"; }
168 convert(temp, processors);
170 temp = validParameter.validFile(parameters, "compress", false); if(temp == "not found"){ temp = "F"; }
171 convert(temp, compress);
173 output = validParameter.validFile(parameters, "output", false); if(output == "not found"){ output = "column"; }
175 if (((column != "") && (oldfastafile == "")) || ((column == "") && (oldfastafile != ""))) { m->mothurOut("If you provide column or oldfasta, you must provide both."); m->mothurOutEndLine(); abort=true; }
177 if ((column != "") && (oldfastafile != "") && (output != "column")) { m->mothurOut("You have provided column and oldfasta, indicating you want to append distances to your column file. Your output must be in column format to do so."); m->mothurOutEndLine(); abort=true; }
179 if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column."); m->mothurOutEndLine(); output = "column"; }
181 ValidCalculators validCalculator;
183 if (m->isTrue(countends) == true) {
184 for (int i=0; i<Estimators.size(); i++) {
185 if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) {
186 if (Estimators[i] == "nogaps") { distCalculator = new ignoreGaps(); }
187 else if (Estimators[i] == "eachgap") { distCalculator = new eachGapDist(); }
188 else if (Estimators[i] == "onegap") { distCalculator = new oneGapDist(); }
192 for (int i=0; i<Estimators.size(); i++) {
193 if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) {
194 if (Estimators[i] == "nogaps") { distCalculator = new ignoreGaps(); }
195 else if (Estimators[i] == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
196 else if (Estimators[i] == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
204 catch(exception& e) {
205 m->errorOut(e, "DistanceCommand", "DistanceCommand");
210 //**********************************************************************************************************************
212 DistanceCommand::~DistanceCommand(){
214 for(int i=0;i<lines.size();i++){
220 //**********************************************************************************************************************
222 void DistanceCommand::help(){
224 m->mothurOut("The dist.seqs command reads a file containing sequences and creates a distance file.\n");
225 m->mothurOut("The dist.seqs command parameters are fasta, oldfasta, column, calc, countends, output, compress, cutoff and processors. \n");
226 m->mothurOut("The fasta parameter is required.\n");
227 m->mothurOut("The oldfasta and column parameters allow you to append the distances calculated to the column file.\n");
228 m->mothurOut("The calc parameter allows you to specify the method of calculating the distances. Your options are: nogaps, onegap or eachgap. The default is onegap.\n");
229 m->mothurOut("The countends parameter allows you to specify whether to include terminal gaps in distance. Your options are: T or F. The default is T.\n");
230 m->mothurOut("The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n");
231 m->mothurOut("The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n");
232 m->mothurOut("The processors parameter allows you to specify number of processors to use. The default is 1.\n");
233 m->mothurOut("The compress parameter allows you to indicate that you want the resulting distance file compressed. The default is false.\n");
234 m->mothurOut("The dist.seqs command should be in the following format: \n");
235 m->mothurOut("dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n");
236 m->mothurOut("Example dist.seqs(fasta=amazon.fasta, calc=eachgap, countends=F, cutoff= 2.0, processors=3).\n");
237 m->mothurOut("Note: No spaces between parameter labels (i.e. calc), '=' and parameters (i.e.yourCalc).\n\n");
239 catch(exception& e) {
240 m->errorOut(e, "DistanceCommand", "help");
244 //**********************************************************************************************************************
246 int DistanceCommand::execute(){
249 if (abort == true) { return 0; }
251 int startTime = time(NULL);
253 //save number of new sequence
254 numNewFasta = alignDB.getNumSeqs();
256 //sanity check the oldfasta and column file as well as add oldfasta sequences to alignDB
257 if ((oldfastafile != "") && (column != "")) { if (!(sanityCheck())) { return 0; } }
259 if (m->control_pressed) { return 0; }
261 int numSeqs = alignDB.getNumSeqs();
266 if (output == "lt") { //does the user want lower triangle phylip formatted file
267 outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "phylip.dist";
268 remove(outputFile.c_str()); outputTypes["phylip"].push_back(outputFile);
270 //output numSeqs to phylip formatted dist file
271 }else if (output == "column") { //user wants column format
272 outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
273 outputTypes["column"].push_back(outputFile);
275 //so we don't accidentally overwrite
276 if (outputFile == column) {
277 string tempcolumn = column + ".old";
278 rename(column.c_str(), tempcolumn.c_str());
281 remove(outputFile.c_str());
282 }else { //assume square
283 outputFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "square.dist";
284 remove(outputFile.c_str());
285 outputTypes["phylip"].push_back(outputFile);
295 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
296 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
298 //each process gets where it should start and stop in the file
299 if (output != "square") {
300 start = int (sqrt(float(pid)/float(processors)) * numSeqs);
301 end = int (sqrt(float(pid+1)/float(processors)) * numSeqs);
303 start = int ((float(pid)/float(processors)) * numSeqs);
304 end = int ((float(pid+1)/float(processors)) * numSeqs);
307 if (output == "column") {
309 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
311 //char* filename = new char[outputFile.length()];
312 //memcpy(filename, outputFile.c_str(), outputFile.length());
315 strcpy(filename, outputFile.c_str());
317 MPI_File_open(MPI_COMM_WORLD, filename, amode, MPI_INFO_NULL, &outMPI);
320 if (pid == 0) { //you are the root process
325 driverMPI(start, end, outMPI, cutoff);
327 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); delete distCalculator; return 0; }
330 for(int i = 1; i < processors; i++) {
331 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); delete distCalculator; return 0; }
334 MPI_Recv(buf, 5, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
336 }else { //you are a child process
338 driverMPI(start, end, outMPI, cutoff);
340 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); delete distCalculator; return 0; }
344 //tell parent you are done.
345 MPI_Send(buf, 5, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
348 MPI_File_close(&outMPI);
350 }else { //lower triangle format
351 if (pid == 0) { //you are the root process
355 unsigned long int mySize;
357 if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
358 else { driverMPI(start, end, outputFile, mySize, output); }
360 if (m->control_pressed) { outputTypes.clear(); delete distCalculator; return 0; }
362 int amode=MPI_MODE_APPEND|MPI_MODE_WRONLY|MPI_MODE_CREATE; //
366 //char* filename = new char[outputFile.length()];
367 //memcpy(filename, outputFile.c_str(), outputFile.length());
370 strcpy(filename, outputFile.c_str());
372 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
376 for(int b = 1; b < processors; b++) {
377 unsigned long int fileSize;
379 if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); delete distCalculator; return 0; }
381 MPI_Recv(&fileSize, 1, MPI_LONG, b, tag, MPI_COMM_WORLD, &status);
383 string outTemp = outputFile + toString(b) + ".temp";
385 char* buf = new char[outTemp.length()];
386 memcpy(buf, outTemp.c_str(), outTemp.length());
388 MPI_File_open(MPI_COMM_SELF, buf, MPI_MODE_DELETE_ON_CLOSE|MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
392 while (count < fileSize) {
394 MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status);
395 MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status);
399 MPI_File_close(&inMPI); //deleted on close
402 MPI_File_close(&outMPI);
403 }else { //you are a child process
405 unsigned long int size;
406 if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
407 else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
409 if (m->control_pressed) { delete distCalculator; return 0; }
411 //tell parent you are done.
412 MPI_Send(&size, 1, MPI_LONG, 0, tag, MPI_COMM_WORLD);
415 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
418 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
419 //if you don't need to fork anything
421 if (output != "square") { driver(0, numSeqs, outputFile, cutoff); }
422 else { driver(0, numSeqs, outputFile, "square"); }
423 }else{ //you have multiple processors
425 for (int i = 0; i < processors; i++) {
426 lines.push_back(new linePair());
427 if (output != "square") {
428 lines[i]->start = int (sqrt(float(i)/float(processors)) * numSeqs);
429 lines[i]->end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
431 lines[i]->start = int ((float(i)/float(processors)) * numSeqs);
432 lines[i]->end = int ((float(i+1)/float(processors)) * numSeqs);
436 createProcesses(outputFile);
438 map<int, int>::iterator it = processIDS.begin();
439 rename((outputFile + toString(it->second) + ".temp").c_str(), outputFile.c_str());
442 //append and remove temp files
443 for (; it != processIDS.end(); it++) {
444 m->appendFiles((outputFile + toString(it->second) + ".temp"), outputFile);
445 remove((outputFile + toString(it->second) + ".temp").c_str());
450 if (output != "square") { driver(0, numSeqs, outputFile, cutoff); }
451 else { driver(0, numSeqs, outputFile, "square"); }
455 if (m->control_pressed) { outputTypes.clear(); delete distCalculator; remove(outputFile.c_str()); return 0; }
458 MPI_Comm_rank(MPI_COMM_WORLD, &pid);
460 if (pid == 0) { //only one process should output to screen
463 //if (output == "square") { convertMatrix(outputFile); }
466 fileHandle.open(outputFile.c_str());
468 m->gobble(fileHandle);
469 if (fileHandle.eof()) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff."); m->mothurOutEndLine(); }
472 //append the old column file to the new one
473 if ((oldfastafile != "") && (column != "")) {
474 //we had to rename the column file so we didnt overwrite above, but we want to keep old name
475 if (outputFile == column) {
476 string tempcolumn = column + ".old";
477 m->appendFiles(tempcolumn, outputFile);
478 remove(tempcolumn.c_str());
480 m->appendFiles(outputFile, column);
481 remove(outputFile.c_str());
485 if (outputDir != "") {
486 string newOutputName = outputDir + m->getSimpleName(outputFile);
487 rename(outputFile.c_str(), newOutputName.c_str());
488 remove(outputFile.c_str());
489 outputFile = newOutputName;
498 if (m->control_pressed) { outputTypes.clear(); delete distCalculator; remove(outputFile.c_str()); return 0; }
500 delete distCalculator;
502 m->mothurOutEndLine();
503 m->mothurOut("Output File Name: "); m->mothurOutEndLine();
504 m->mothurOut(outputFile); m->mothurOutEndLine();
505 m->mothurOutEndLine();
506 m->mothurOut("It took " + toString(time(NULL) - startTime) + " to calculate the distances for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
509 if (m->isTrue(compress)) {
510 m->mothurOut("Compressing..."); m->mothurOutEndLine();
511 m->mothurOut("(Replacing " + outputFile + " with " + outputFile + ".gz)"); m->mothurOutEndLine();
512 system(("gzip -v " + outputFile).c_str());
513 outputNames.push_back(outputFile + ".gz");
514 }else { outputNames.push_back(outputFile); }
519 catch(exception& e) {
520 m->errorOut(e, "DistanceCommand", "execute");
524 /**************************************************************************************************/
525 void DistanceCommand::createProcesses(string filename) {
527 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
531 //loop through and create all the processes you want
532 while (process != processors) {
536 processIDS[lines[process]->end] = pid; //create map from line number to pid so you can append files in correct order later
539 if (output != "square") { driver(lines[process]->start, lines[process]->end, filename + toString(getpid()) + ".temp", cutoff); }
540 else { driver(lines[process]->start, lines[process]->end, filename + toString(getpid()) + ".temp", "square"); }
542 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
545 //force parent to wait until all the processes are done
546 for (map<int, int>::iterator it = processIDS.begin(); it != processIDS.end(); it++) {
547 int temp = it->second;
552 catch(exception& e) {
553 m->errorOut(e, "DistanceCommand", "createProcesses");
558 /**************************************************************************************************/
559 /////// need to fix to work with calcs and sequencedb
560 int DistanceCommand::driver(int startLine, int endLine, string dFileName, float cutoff){
563 int startTime = time(NULL);
566 ofstream outFile(dFileName.c_str(), ios::trunc);
567 outFile.setf(ios::fixed, ios::showpoint);
568 outFile << setprecision(4);
570 if((output == "lt") && startLine == 0){ outFile << alignDB.getNumSeqs() << endl; }
572 for(int i=startLine;i<endLine;i++){
574 string name = alignDB.get(i).getName();
575 if (name.length() < 10) { //pad with spaces to make compatible
576 while (name.length() < 10) { name += " "; }
578 outFile << name << '\t';
580 for(int j=0;j<i;j++){
582 if (m->control_pressed) { outFile.close(); return 0; }
584 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
585 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
586 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
588 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
589 double dist = distCalculator->getDist();
592 if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
594 if (output == "lt") { outFile << dist << '\t'; }
597 if (output == "lt") { outFile << endl; }
600 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
604 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
610 catch(exception& e) {
611 m->errorOut(e, "DistanceCommand", "driver");
615 /**************************************************************************************************/
616 /////// need to fix to work with calcs and sequencedb
617 int DistanceCommand::driver(int startLine, int endLine, string dFileName, string square){
620 int startTime = time(NULL);
623 ofstream outFile(dFileName.c_str(), ios::trunc);
624 outFile.setf(ios::fixed, ios::showpoint);
625 outFile << setprecision(4);
627 if(startLine == 0){ outFile << alignDB.getNumSeqs() << endl; }
629 for(int i=startLine;i<endLine;i++){
631 string name = alignDB.get(i).getName();
632 //pad with spaces to make compatible
633 if (name.length() < 10) { while (name.length() < 10) { name += " "; } }
635 outFile << name << '\t';
637 for(int j=0;j<alignDB.getNumSeqs();j++){
639 if (m->control_pressed) { outFile.close(); return 0; }
641 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
642 double dist = distCalculator->getDist();
644 outFile << dist << '\t';
650 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
654 m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
660 catch(exception& e) {
661 m->errorOut(e, "DistanceCommand", "driver");
666 /**************************************************************************************************/
667 /////// need to fix to work with calcs and sequencedb
668 int DistanceCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, float cutoff){
671 int startTime = time(NULL);
673 string outputString = "";
675 for(int i=startLine;i<endLine;i++){
677 for(int j=0;j<i;j++){
679 if (m->control_pressed) { return 0; }
681 //if there was a column file given and we are appending, we don't want to calculate the distances that are already in the column file
682 //the alignDB contains the new sequences and then the old, so if i an oldsequence and j is an old sequence then break out of this loop
683 if ((i >= numNewFasta) && (j >= numNewFasta)) { break; }
685 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
686 double dist = distCalculator->getDist();
689 outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n');
694 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
695 cout << i << '\t' << (time(NULL) - startTime) << endl;
699 //send results to parent
700 int length = outputString.length();
702 char* buf = new char[length];
703 memcpy(buf, outputString.c_str(), length);
705 MPI_File_write_shared(outMPI, buf, length, MPI_CHAR, &status);
711 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
712 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
715 catch(exception& e) {
716 m->errorOut(e, "DistanceCommand", "driverMPI");
720 /**************************************************************************************************/
721 /////// need to fix to work with calcs and sequencedb
722 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
727 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
729 //char* filename = new char[file.length()];
730 //memcpy(filename, file.c_str(), file.length());
733 strcpy(filename, file.c_str());
735 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
738 int startTime = time(NULL);
740 string outputString = "";
743 if(startLine == 0){ outputString += toString(alignDB.getNumSeqs()) + "\n"; }
745 for(int i=startLine;i<endLine;i++){
747 string name = alignDB.get(i).getName();
748 if (name.length() < 10) { //pad with spaces to make compatible
749 while (name.length() < 10) { name += " "; }
751 outputString += name + "\t";
753 for(int j=0;j<i;j++){
755 if (m->control_pressed) { return 0; }
757 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
758 double dist = distCalculator->getDist();
760 outputString += toString(dist) + "\t";
763 outputString += "\n";
767 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
768 cout << i << '\t' << (time(NULL) - startTime) << endl;
772 //send results to parent
773 int length = outputString.length();
774 char* buf = new char[length];
775 memcpy(buf, outputString.c_str(), length);
777 MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
778 size += outputString.length();
783 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
784 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
785 MPI_File_close(&outMPI);
789 catch(exception& e) {
790 m->errorOut(e, "DistanceCommand", "driverMPI");
794 /**************************************************************************************************/
795 /////// need to fix to work with calcs and sequencedb
796 int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
801 int amode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
803 //char* filename = new char[file.length()];
804 //memcpy(filename, file.c_str(), file.length());
807 strcpy(filename, file.c_str());
809 MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
812 int startTime = time(NULL);
814 string outputString = "";
817 if(startLine == 0){ outputString += toString(alignDB.getNumSeqs()) + "\n"; }
819 for(int i=startLine;i<endLine;i++){
821 string name = alignDB.get(i).getName();
822 if (name.length() < 10) { //pad with spaces to make compatible
823 while (name.length() < 10) { name += " "; }
825 outputString += name + "\t";
827 for(int j=0;j<alignDB.getNumSeqs();j++){
829 if (m->control_pressed) { return 0; }
831 distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
832 double dist = distCalculator->getDist();
834 outputString += toString(dist) + "\t";
837 outputString += "\n";
841 //m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
842 cout << i << '\t' << (time(NULL) - startTime) << endl;
846 //send results to parent
847 int length = outputString.length();
848 char* buf = new char[length];
849 memcpy(buf, outputString.c_str(), length);
851 MPI_File_write(outMPI, buf, length, MPI_CHAR, &status);
852 size += outputString.length();
857 //m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
858 cout << (endLine-1) << '\t' << (time(NULL) - startTime) << endl;
859 MPI_File_close(&outMPI);
863 catch(exception& e) {
864 m->errorOut(e, "DistanceCommand", "driverMPI");
869 /**************************************************************************************************
870 int DistanceCommand::convertMatrix(string outputFile) {
873 //sort file by first column so the distances for each row are together
874 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
877 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
878 string command = "sort -n " + outputFile + " -o " + outfile;
879 system(command.c_str());
880 #else //sort using windows sort
881 string command = "sort " + outputFile + " /O " + outfile;
882 system(command.c_str());
886 //output to new file distance for each row and save positions in file where new row begins
888 m->openInputFile(outfile, in);
891 m->openOutputFile(outputFile, out);
893 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
895 out << alignDB.getNumSeqs() << endl;
897 //get first currentRow
898 string first, currentRow, second;
900 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key
901 map<string, float>::iterator it;
906 rowDists[first] = 0.00; //distance to yourself is 0.0
909 //m->openInputFile(outfile, in);
912 if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
914 in >> first >> second >> dist; m->gobble(in);
916 if (first != currentRow) {
918 out << currentRow << '\t'; //print name
921 for (it = rowDists.begin(); it != rowDists.end(); it++) {
922 out << it->second << '\t';
929 rowDists[first] = 0.00;
930 rowDists[second] = dist;
932 rowDists[second] = dist;
936 out << currentRow << '\t'; //print name
939 for (it = rowDists.begin(); it != rowDists.end(); it++) {
940 out << it->second << '\t';
947 remove(outfile.c_str());
952 catch(exception& e) {
953 m->errorOut(e, "DistanceCommand", "convertMatrix");
957 /**************************************************************************************************
958 int DistanceCommand::convertToLowerTriangle(string outputFile) {
961 //sort file by first column so the distances for each row are together
962 string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
965 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
966 string command = "sort -n " + outputFile + " -o " + outfile;
967 system(command.c_str());
968 #else //sort using windows sort
969 string command = "sort " + outputFile + " /O " + outfile;
970 system(command.c_str());
974 //output to new file distance for each row and save positions in file where new row begins
976 m->openInputFile(outfile, in);
979 m->openOutputFile(outputFile, out);
981 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
983 out << alignDB.getNumSeqs() << endl;
985 //get first currentRow
986 string first, currentRow, second;
990 map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key
991 map<string, float>::iterator it;
996 rowDists[first] = 0.00; //distance to yourself is 0.0
999 //m->openInputFile(outfile, in);
1002 if (m->control_pressed) { in.close(); remove(outfile.c_str()); out.close(); return 0; }
1004 in >> first >> second >> dist; m->gobble(in);
1006 if (first != currentRow) {
1007 //print out last row
1008 out << currentRow << '\t'; //print name
1011 for (it = rowDists.begin(); it != rowDists.end(); it++) {
1012 if (j >= i) { break; }
1013 out << it->second << '\t';
1021 rowDists[first] = 0.00;
1022 rowDists[second] = dist;
1026 rowDists[second] = dist;
1029 //print out last row
1030 out << currentRow << '\t'; //print name
1033 for (it = rowDists.begin(); it != rowDists.end(); it++) {
1034 out << it->second << '\t';
1041 remove(outfile.c_str());
1046 catch(exception& e) {
1047 m->errorOut(e, "DistanceCommand", "convertToLowerTriangle");
1051 /**************************************************************************************************/
1052 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
1053 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
1054 //also check to make sure the 2 files have the same alignment length.
1055 bool DistanceCommand::sanityCheck() {
1059 //make sure the 2 fasta files have the same alignment length
1061 m->openInputFile(fastafile, in);
1062 int fastaAlignLength = 0;
1064 Sequence tempIn(in);
1065 fastaAlignLength = tempIn.getAligned().length();
1070 m->openInputFile(oldfastafile, in2);
1071 int oldfastaAlignLength = 0;
1073 Sequence tempIn2(in2);
1074 oldfastaAlignLength = tempIn2.getAligned().length();
1078 if (fastaAlignLength != oldfastaAlignLength) { m->mothurOut("fasta files do not have the same alignment length."); m->mothurOutEndLine(); return false; }
1080 //read fasta file and save names as well as adding them to the alignDB
1081 set<string> namesOldFasta;
1084 m->openInputFile(oldfastafile, inFasta);
1086 while (!inFasta.eof()) {
1087 if (m->control_pressed) { inFasta.close(); return good; }
1089 Sequence temp(inFasta);
1091 if (temp.getName() != "") {
1092 namesOldFasta.insert(temp.getName()); //save name
1093 alignDB.push_back(temp); //add to DB
1101 //read through the column file checking names and removing distances above the cutoff
1103 m->openInputFile(column, inDist);
1106 string outputFile = column + ".temp";
1107 m->openOutputFile(outputFile, outDist);
1109 string name1, name2;
1111 while (!inDist.eof()) {
1112 if (m->control_pressed) { inDist.close(); outDist.close(); remove(outputFile.c_str()); return good; }
1114 inDist >> name1 >> name2 >> dist; m->gobble(inDist);
1116 //both names are in fasta file and distance is below cutoff
1117 if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) { good = false; break; }
1119 if (dist <= cutoff) {
1120 outDist << name1 << '\t' << name2 << '\t' << dist << endl;
1129 remove(column.c_str());
1130 rename(outputFile.c_str(), column.c_str());
1132 remove(outputFile.c_str()); //temp file is bad because file mismatch above
1136 catch(exception& e) {
1137 m->errorOut(e, "DistanceCommand", "m->appendFiles");
1141 /**************************************************************************************************/