2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){
22 string AlignArray[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){
35 //initialize outputTypes
36 vector<string> tempOutNames;
37 outputTypes["list"] = tempOutNames;
38 outputTypes["rabund"] = tempOutNames;
39 outputTypes["sabund"] = tempOutNames;
42 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
46 //**********************************************************************************************************************
47 vector<string> ClusterSplitCommand::getRequiredParameters(){
49 string Array[] = {"fasta","phylip","column","or"};
50 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
58 //**********************************************************************************************************************
59 vector<string> ClusterSplitCommand::getRequiredFiles(){
61 vector<string> myArray;
65 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
69 //**********************************************************************************************************************
70 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
71 ClusterSplitCommand::ClusterSplitCommand(string option) {
73 globaldata = GlobalData::getInstance();
77 //allow user to run help
78 if(option == "help") { help(); abort = true; }
81 //valid paramters for this command
82 string Array[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
83 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
85 OptionParser parser(option);
86 map<string,string> parameters = parser.getParameters();
88 ValidParameters validParameter("cluster.split");
90 //check to make sure all parameters are valid for command
91 map<string,string>::iterator it;
92 for (it = parameters.begin(); it != parameters.end(); it++) {
93 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
98 //initialize outputTypes
99 vector<string> tempOutNames;
100 outputTypes["list"] = tempOutNames;
101 outputTypes["rabund"] = tempOutNames;
102 outputTypes["sabund"] = tempOutNames;
104 globaldata->newRead();
106 //if the user changes the output directory command factory will send this info to us in the output parameter
107 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
109 //if the user changes the input directory command factory will send this info to us in the output parameter
110 string inputDir = validParameter.validFile(parameters, "inputdir", false);
111 if (inputDir == "not found"){ inputDir = ""; }
114 it = parameters.find("phylip");
115 //user has given a template file
116 if(it != parameters.end()){
117 path = m->hasPath(it->second);
118 //if the user has not given a path then, add inputdir. else leave path alone.
119 if (path == "") { parameters["phylip"] = inputDir + it->second; }
122 it = parameters.find("column");
123 //user has given a template file
124 if(it != parameters.end()){
125 path = m->hasPath(it->second);
126 //if the user has not given a path then, add inputdir. else leave path alone.
127 if (path == "") { parameters["column"] = inputDir + it->second; }
130 it = parameters.find("name");
131 //user has given a template file
132 if(it != parameters.end()){
133 path = m->hasPath(it->second);
134 //if the user has not given a path then, add inputdir. else leave path alone.
135 if (path == "") { parameters["name"] = inputDir + it->second; }
138 it = parameters.find("taxonomy");
139 //user has given a template file
140 if(it != parameters.end()){
141 path = m->hasPath(it->second);
142 //if the user has not given a path then, add inputdir. else leave path alone.
143 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
146 it = parameters.find("fasta");
147 //user has given a template file
148 if(it != parameters.end()){
149 path = m->hasPath(it->second);
150 //if the user has not given a path then, add inputdir. else leave path alone.
151 if (path == "") { parameters["fasta"] = inputDir + it->second; }
155 //check for required parameters
156 phylipfile = validParameter.validFile(parameters, "phylip", true);
157 if (phylipfile == "not open") { abort = true; }
158 else if (phylipfile == "not found") { phylipfile = ""; }
159 else { distfile = phylipfile; format = "phylip"; }
161 columnfile = validParameter.validFile(parameters, "column", true);
162 if (columnfile == "not open") { abort = true; }
163 else if (columnfile == "not found") { columnfile = ""; }
164 else { distfile = columnfile; format = "column"; }
166 namefile = validParameter.validFile(parameters, "name", true);
167 if (namefile == "not open") { abort = true; }
168 else if (namefile == "not found") { namefile = ""; }
170 fastafile = validParameter.validFile(parameters, "fasta", true);
171 if (fastafile == "not open") { abort = true; }
172 else if (fastafile == "not found") { fastafile = ""; }
173 else { distfile = fastafile; splitmethod = "fasta"; }
175 taxFile = validParameter.validFile(parameters, "taxonomy", true);
176 if (taxFile == "not open") { abort = true; }
177 else if (taxFile == "not found") { taxFile = ""; }
179 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
180 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
182 if (columnfile != "") {
183 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
186 if (fastafile != "") {
187 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
188 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
191 //check for optional parameter and set defaults
192 // ...at some point should added some additional type checking...
193 //get user cutoff and precision or use defaults
195 temp = validParameter.validFile(parameters, "precision", false);
196 if (temp == "not found") { temp = "100"; }
197 //saves precision legnth for formatting below
198 length = temp.length();
199 convert(temp, precision);
201 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
202 hard = m->isTrue(temp);
204 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
205 large = m->isTrue(temp);
207 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
208 convert(temp, processors);
210 temp = validParameter.validFile(parameters, "splitmethod", false);
211 if (splitmethod != "fasta") {
212 if (temp == "not found") { splitmethod = "distance"; }
213 else { splitmethod = temp; }
216 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
217 convert(temp, cutoff);
218 cutoff += (5 / (precision * 10.0));
220 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
221 convert(temp, taxLevelCutoff);
223 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
225 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
226 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
228 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
229 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
231 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
233 showabund = validParameter.validFile(parameters, "showabund", false);
234 if (showabund == "not found") { showabund = "T"; }
236 timing = validParameter.validFile(parameters, "timing", false);
237 if (timing == "not found") { timing = "F"; }
241 catch(exception& e) {
242 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
247 //**********************************************************************************************************************
249 void ClusterSplitCommand::help(){
251 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
252 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
253 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
254 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n");
255 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
256 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n");
257 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
258 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
259 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
260 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
261 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
262 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
263 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
264 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
265 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
266 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
267 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
269 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
271 m->mothurOut("The cluster.split command should be in the following format: \n");
272 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
273 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
276 catch(exception& e) {
277 m->errorOut(e, "ClusterSplitCommand", "help");
282 //**********************************************************************************************************************
284 ClusterSplitCommand::~ClusterSplitCommand(){}
286 //**********************************************************************************************************************
288 int ClusterSplitCommand::execute(){
291 if (abort == true) { return 0; }
294 vector<string> listFileNames;
296 string singletonName = "";
297 double saveCutoff = cutoff;
299 //****************** file prep work ******************************//
304 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
305 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
307 if (pid == 0) { //only process 0 converts and splits
311 //if user gave a phylip file convert to column file
312 if (format == "phylip") {
314 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
316 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
318 NameAssignment* nameMap = NULL;
319 convert->setFormat("phylip");
320 convert->read(nameMap);
322 if (m->control_pressed) { delete convert; return 0; }
324 distfile = convert->getOutputFile();
326 //if no names file given with phylip file, create it
327 ListVector* listToMakeNameFile = convert->getListVector();
328 if (namefile == "") { //you need to make a namefile for split matrix
330 namefile = phylipfile + ".names";
331 m->openOutputFile(namefile, out);
332 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
333 string bin = listToMakeNameFile->get(i);
334 out << bin << '\t' << bin << endl;
338 delete listToMakeNameFile;
341 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
343 if (m->control_pressed) { return 0; }
346 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
348 //split matrix into non-overlapping groups
350 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
351 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
352 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
353 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
357 if (m->control_pressed) { delete split; return 0; }
359 singletonName = split->getSingletonNames();
360 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
363 if (m->control_pressed) { return 0; }
365 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
368 //****************** break up files between processes and cluster each file set ******************************//
370 ////you are process 0 from above////
372 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
373 dividedNames.resize(processors);
375 //for each file group figure out which process will complete it
376 //want to divide the load intelligently so the big files are spread between processes
378 for (int i = 0; i < distName.size(); i++) {
379 int processToAssign = (i+1) % processors;
380 if (processToAssign == 0) { processToAssign = processors; }
382 dividedNames[(processToAssign-1)].push_back(distName[i]);
385 //not lets reverse the order of ever other process, so we balance big files running with little ones
386 for (int i = 0; i < processors; i++) {
387 int remainder = ((i+1) % processors);
388 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
392 //send each child the list of files it needs to process
393 for(int i = 1; i < processors; i++) {
394 //send number of file pairs
395 int num = dividedNames[i].size();
396 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
398 for (int j = 0; j < num; j++) { //send filenames to process i
399 char tempDistFileName[1024];
400 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
401 int lengthDist = (dividedNames[i][j].begin()->first).length();
403 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
404 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
406 char tempNameFileName[1024];
407 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
408 int lengthName = (dividedNames[i][j].begin()->second).length();
410 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
411 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
416 listFileNames = cluster(dividedNames[0], labels);
418 //receive the other processes info
419 for(int i = 1; i < processors; i++) {
420 int num = dividedNames[i].size();
423 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
424 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
426 //send list filenames to root process
427 for (int j = 0; j < num; j++) {
429 char tempListFileName[1024];
431 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
432 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
434 string myListFileName = tempListFileName;
435 myListFileName = myListFileName.substr(0, lengthList);
437 listFileNames.push_back(myListFileName);
440 //send Labels to root process
442 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
444 for (int j = 0; j < numLabels; j++) {
448 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
449 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
451 string myLabel = tempLabel;
452 myLabel = myLabel.substr(0, lengthLabel);
454 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
458 }else { //you are a child process
459 vector < map<string, string> > myNames;
461 //recieve the files you need to process
462 //receive number of file pairs
464 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
468 for (int j = 0; j < num; j++) { //receive filenames to process
470 char tempDistFileName[1024];
472 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
473 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
475 string myDistFileName = tempDistFileName;
476 myDistFileName = myDistFileName.substr(0, lengthDist);
479 char tempNameFileName[1024];
481 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
482 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
484 string myNameFileName = tempNameFileName;
485 myNameFileName = myNameFileName.substr(0, lengthName);
488 myNames[j][myDistFileName] = myNameFileName;
492 listFileNames = cluster(myNames, labels);
495 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
497 //send list filenames to root process
498 for (int j = 0; j < num; j++) {
499 char tempListFileName[1024];
500 strcpy(tempListFileName, listFileNames[j].c_str());
501 int lengthList = listFileNames[j].length();
503 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
504 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
507 //send Labels to root process
508 int numLabels = labels.size();
509 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
511 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
513 strcpy(tempLabel, (*it).c_str());
514 int lengthLabel = (*it).length();
516 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
517 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
522 MPI_Barrier(MPI_COMM_WORLD);
526 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
528 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
530 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
531 dividedNames.resize(processors);
533 //for each file group figure out which process will complete it
534 //want to divide the load intelligently so the big files are spread between processes
536 for (int i = 0; i < distName.size(); i++) {
537 int processToAssign = (i+1) % processors;
538 if (processToAssign == 0) { processToAssign = processors; }
540 dividedNames[(processToAssign-1)].push_back(distName[i]);
543 //not lets reverse the order of ever other process, so we balance big files running with little ones
544 for (int i = 0; i < processors; i++) {
545 int remainder = ((i+1) % processors);
546 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
549 createProcesses(dividedNames);
551 if (m->control_pressed) { return 0; }
553 //get list of list file names from each process
554 for(int i=0;i<processors;i++){
555 string filename = toString(processIDS[i]) + ".temp";
557 m->openInputFile(filename, in);
559 in >> tag; m->gobble(in);
563 in >> tempName; m->gobble(in);
564 listFileNames.push_back(tempName);
567 remove((toString(processIDS[i]) + ".temp").c_str());
570 filename = toString(processIDS[i]) + ".temp.labels";
572 m->openInputFile(filename, in2);
575 in2 >> tempCutoff; m->gobble(in2);
576 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
580 in2 >> tempName; m->gobble(in2);
581 if (labels.count(tempName) == 0) { labels.insert(tempName); }
584 remove((toString(processIDS[i]) + ".temp.labels").c_str());
588 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
591 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
593 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
595 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
597 //****************** merge list file and create rabund and sabund files ******************************//
599 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
602 if (pid == 0) { //only process 0 merges
605 ListVector* listSingle;
606 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
608 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
610 mergeLists(listFileNames, labelBins, listSingle);
612 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
614 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
616 m->mothurOutEndLine();
617 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
618 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
619 m->mothurOutEndLine();
622 } //only process 0 merges
625 MPI_Barrier(MPI_COMM_WORLD);
630 catch(exception& e) {
631 m->errorOut(e, "ClusterSplitCommand", "execute");
635 //**********************************************************************************************************************
636 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
639 map<float, int> labelBin;
640 vector<float> orderFloat;
644 if (singleton != "none") {
646 m->openInputFile(singleton, in);
648 string firstCol, secondCol;
649 listSingle = new ListVector();
651 in >> firstCol >> secondCol; m->gobble(in);
652 listSingle->push_back(secondCol);
655 remove(singleton.c_str());
657 numSingleBins = listSingle->getNumBins();
658 }else{ listSingle = NULL; numSingleBins = 0; }
660 //go through users set and make them floats so we can sort them
661 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
664 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
665 else if (*it == "unique") { temp = -1.0; }
667 if (temp <= cutoff) {
668 orderFloat.push_back(temp);
669 labelBin[temp] = numSingleBins; //initialize numbins
674 sort(orderFloat.begin(), orderFloat.end());
677 //get the list info from each file
678 for (int k = 0; k < listNames.size(); k++) {
680 if (m->control_pressed) {
681 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
682 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
686 InputData* input = new InputData(listNames[k], "list");
687 ListVector* list = input->getListVector();
688 string lastLabel = list->getLabel();
690 string filledInList = listNames[k] + "filledInTemp";
692 m->openOutputFile(filledInList, outFilled);
694 //for each label needed
695 for(int l = 0; l < orderFloat.size(); l++){
698 if (orderFloat[l] == -1) { thisLabel = "unique"; }
699 else { thisLabel = toString(orderFloat[l], length-1); }
701 //this file has reached the end
703 list = input->getListVector(lastLabel, true);
704 }else{ //do you have the distance, or do you need to fill in
707 if (list->getLabel() == "unique") { labelFloat = -1.0; }
708 else { convert(list->getLabel(), labelFloat); }
710 //check for missing labels
711 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
712 //if its bigger get last label, otherwise keep it
714 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
716 lastLabel = list->getLabel();
720 list->setLabel(thisLabel);
721 list->print(outFilled);
724 labelBin[orderFloat[l]] += list->getNumBins();
728 list = input->getListVector();
731 if (list != NULL) { delete list; }
735 remove(listNames[k].c_str());
736 rename(filledInList.c_str(), listNames[k].c_str());
741 catch(exception& e) {
742 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
746 //**********************************************************************************************************************
747 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
749 if (outputDir == "") { outputDir += m->hasPath(distfile); }
750 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
752 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
753 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
754 m->openOutputFile(fileroot+ tag + ".list", outList);
756 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
757 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
758 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
760 map<float, int>::iterator itLabel;
762 //for each label needed
763 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
766 if (itLabel->first == -1) { thisLabel = "unique"; }
767 else { thisLabel = toString(itLabel->first, length-1); }
769 outList << thisLabel << '\t' << itLabel->second << '\t';
771 RAbundVector* rabund = new RAbundVector();
772 rabund->setLabel(thisLabel);
775 if (listSingle != NULL) {
776 for (int j = 0; j < listSingle->getNumBins(); j++) {
777 outList << listSingle->get(j) << '\t';
778 rabund->push_back(m->getNumNames(listSingle->get(j)));
782 //get the list info from each file
783 for (int k = 0; k < listNames.size(); k++) {
785 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
787 InputData* input = new InputData(listNames[k], "list");
788 ListVector* list = input->getListVector(thisLabel);
790 //this file has reached the end
791 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
793 for (int j = 0; j < list->getNumBins(); j++) {
794 outList << list->get(j) << '\t';
795 rabund->push_back(m->getNumNames(list->get(j)));
802 SAbundVector sabund = rabund->getSAbundVector();
804 sabund.print(outSabund);
805 rabund->print(outRabund);
815 if (listSingle != NULL) { delete listSingle; }
817 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
821 catch(exception& e) {
822 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
827 //**********************************************************************************************************************
829 void ClusterSplitCommand::printData(ListVector* oldList){
831 string label = oldList->getLabel();
832 RAbundVector oldRAbund = oldList->getRAbundVector();
834 oldRAbund.setLabel(label);
835 if (m->isTrue(showabund)) {
836 oldRAbund.getSAbundVector().print(cout);
838 oldRAbund.print(outRabund);
839 oldRAbund.getSAbundVector().print(outSabund);
841 oldList->print(outList);
843 catch(exception& e) {
844 m->errorOut(e, "ClusterSplitCommand", "printData");
848 //**********************************************************************************************************************
849 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
852 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
857 //loop through and create all the processes you want
858 while (process != processors) {
862 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
866 vector<string> listFileNames = cluster(dividedNames[process], labels);
868 //write out names to file
869 string filename = toString(getpid()) + ".temp";
871 m->openOutputFile(filename, out);
873 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
878 filename = toString(getpid()) + ".temp.labels";
879 m->openOutputFile(filename, outLabels);
881 outLabels << cutoff << endl;
882 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
883 outLabels << (*it) << endl;
889 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
890 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
895 //force parent to wait until all the processes are done
896 for (int i=0;i<processors;i++) {
897 int temp = processIDS[i];
905 catch(exception& e) {
906 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
910 //**********************************************************************************************************************
912 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
915 SparseMatrix* matrix;
918 RAbundVector* rabund;
920 vector<string> listFileNames;
922 double smallestCutoff = cutoff;
924 //cluster each distance file
925 for (int i = 0; i < distNames.size(); i++) {
926 if (m->control_pressed) { return listFileNames; }
928 string thisNamefile = distNames[i].begin()->second;
929 string thisDistFile = distNames[i].begin()->first;
931 //read in distance file
932 globaldata->setNameFile(thisNamefile);
933 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
937 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
939 //output your files too
941 cout << endl << "Reading " << thisDistFile << endl;
945 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
947 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
948 read->setCutoff(cutoff);
950 NameAssignment* nameMap = new NameAssignment(thisNamefile);
954 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
956 list = read->getListVector();
958 matrix = read->getMatrix();
965 //output your files too
967 cout << endl << "Clustering " << thisDistFile << endl;
971 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
973 rabund = new RAbundVector(list->getRAbundVector());
976 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
977 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
978 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
979 tag = cluster->getTag();
981 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
982 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
985 m->openOutputFile(fileroot+ tag + ".list", listFile);
987 listFileNames.push_back(fileroot+ tag + ".list");
989 time_t estart = time(NULL);
991 float previousDist = 0.00000;
992 float rndPreviousDist = 0.00000;
998 double saveCutoff = cutoff;
1000 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
1002 if (m->control_pressed) { //clean up
1003 delete matrix; delete list; delete cluster; delete rabund;
1005 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1006 listFileNames.clear(); return listFileNames;
1009 cluster->update(saveCutoff);
1011 float dist = matrix->getSmallDist();
1014 rndDist = m->ceilDist(dist, precision);
1016 rndDist = m->roundDist(dist, precision);
1019 if(previousDist <= 0.0000 && dist != previousDist){
1020 oldList.setLabel("unique");
1021 oldList.print(listFile);
1022 if (labels.count("unique") == 0) { labels.insert("unique"); }
1024 else if(rndDist != rndPreviousDist){
1025 oldList.setLabel(toString(rndPreviousDist, length-1));
1026 oldList.print(listFile);
1027 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1030 previousDist = dist;
1031 rndPreviousDist = rndDist;
1036 if(previousDist <= 0.0000){
1037 oldList.setLabel("unique");
1038 oldList.print(listFile);
1039 if (labels.count("unique") == 0) { labels.insert("unique"); }
1041 else if(rndPreviousDist<cutoff){
1042 oldList.setLabel(toString(rndPreviousDist, length-1));
1043 oldList.print(listFile);
1044 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1047 delete matrix; delete list; delete cluster; delete rabund;
1050 if (m->control_pressed) { //clean up
1051 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1052 listFileNames.clear(); return listFileNames;
1055 remove(thisDistFile.c_str());
1056 remove(thisNamefile.c_str());
1058 if (saveCutoff != cutoff) {
1059 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1060 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1062 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1065 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1068 cutoff = smallestCutoff;
1070 return listFileNames;
1073 catch(exception& e) {
1074 m->errorOut(e, "ClusterSplitCommand", "cluster");
1081 //**********************************************************************************************************************