2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
19 //**********************************************************************************************************************
20 vector<string> ClusterSplitCommand::getValidParameters(){
22 string AlignArray[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
23 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
27 m->errorOut(e, "ClusterSplitCommand", "getValidParameters");
31 //**********************************************************************************************************************
32 ClusterSplitCommand::ClusterSplitCommand(){
34 //initialize outputTypes
35 vector<string> tempOutNames;
36 outputTypes["list"] = tempOutNames;
37 outputTypes["rabund"] = tempOutNames;
38 outputTypes["sabund"] = tempOutNames;
41 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
45 //**********************************************************************************************************************
46 vector<string> ClusterSplitCommand::getRequiredParameters(){
48 string Array[] = {"fasta","phylip","column","or"};
49 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
53 m->errorOut(e, "ClusterSplitCommand", "getRequiredParameters");
57 //**********************************************************************************************************************
58 vector<string> ClusterSplitCommand::getRequiredFiles(){
60 vector<string> myArray;
64 m->errorOut(e, "ClusterSplitCommand", "getRequiredFiles");
68 //**********************************************************************************************************************
69 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
70 ClusterSplitCommand::ClusterSplitCommand(string option) {
72 globaldata = GlobalData::getInstance();
76 //allow user to run help
77 if(option == "help") { help(); abort = true; }
80 //valid paramters for this command
81 string Array[] = {"fasta","phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
82 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
84 OptionParser parser(option);
85 map<string,string> parameters = parser.getParameters();
87 ValidParameters validParameter("cluster.split");
89 //check to make sure all parameters are valid for command
90 map<string,string>::iterator it;
91 for (it = parameters.begin(); it != parameters.end(); it++) {
92 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
97 //initialize outputTypes
98 vector<string> tempOutNames;
99 outputTypes["list"] = tempOutNames;
100 outputTypes["rabund"] = tempOutNames;
101 outputTypes["sabund"] = tempOutNames;
103 globaldata->newRead();
105 //if the user changes the output directory command factory will send this info to us in the output parameter
106 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
108 //if the user changes the input directory command factory will send this info to us in the output parameter
109 string inputDir = validParameter.validFile(parameters, "inputdir", false);
110 if (inputDir == "not found"){ inputDir = ""; }
113 it = parameters.find("phylip");
114 //user has given a template file
115 if(it != parameters.end()){
116 path = m->hasPath(it->second);
117 //if the user has not given a path then, add inputdir. else leave path alone.
118 if (path == "") { parameters["phylip"] = inputDir + it->second; }
121 it = parameters.find("column");
122 //user has given a template file
123 if(it != parameters.end()){
124 path = m->hasPath(it->second);
125 //if the user has not given a path then, add inputdir. else leave path alone.
126 if (path == "") { parameters["column"] = inputDir + it->second; }
129 it = parameters.find("name");
130 //user has given a template file
131 if(it != parameters.end()){
132 path = m->hasPath(it->second);
133 //if the user has not given a path then, add inputdir. else leave path alone.
134 if (path == "") { parameters["name"] = inputDir + it->second; }
137 it = parameters.find("taxonomy");
138 //user has given a template file
139 if(it != parameters.end()){
140 path = m->hasPath(it->second);
141 //if the user has not given a path then, add inputdir. else leave path alone.
142 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
145 it = parameters.find("fasta");
146 //user has given a template file
147 if(it != parameters.end()){
148 path = m->hasPath(it->second);
149 //if the user has not given a path then, add inputdir. else leave path alone.
150 if (path == "") { parameters["fasta"] = inputDir + it->second; }
154 //check for required parameters
155 phylipfile = validParameter.validFile(parameters, "phylip", true);
156 if (phylipfile == "not open") { abort = true; }
157 else if (phylipfile == "not found") { phylipfile = ""; }
158 else { distfile = phylipfile; format = "phylip"; }
160 columnfile = validParameter.validFile(parameters, "column", true);
161 if (columnfile == "not open") { abort = true; }
162 else if (columnfile == "not found") { columnfile = ""; }
163 else { distfile = columnfile; format = "column"; }
165 namefile = validParameter.validFile(parameters, "name", true);
166 if (namefile == "not open") { abort = true; }
167 else if (namefile == "not found") { namefile = ""; }
169 fastafile = validParameter.validFile(parameters, "fasta", true);
170 if (fastafile == "not open") { abort = true; }
171 else if (fastafile == "not found") { fastafile = ""; }
172 else { distfile = fastafile; splitmethod = "fasta"; }
174 taxFile = validParameter.validFile(parameters, "taxonomy", true);
175 if (taxFile == "not open") { abort = true; }
176 else if (taxFile == "not found") { taxFile = ""; }
178 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column or fastafile."); m->mothurOutEndLine(); abort = true; }
179 else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
181 if (columnfile != "") {
182 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
185 if (fastafile != "") {
186 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
187 if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
190 //check for optional parameter and set defaults
191 // ...at some point should added some additional type checking...
192 //get user cutoff and precision or use defaults
194 temp = validParameter.validFile(parameters, "precision", false);
195 if (temp == "not found") { temp = "100"; }
196 //saves precision legnth for formatting below
197 length = temp.length();
198 convert(temp, precision);
200 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
201 hard = m->isTrue(temp);
203 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
204 large = m->isTrue(temp);
206 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
207 convert(temp, processors);
209 temp = validParameter.validFile(parameters, "splitmethod", false);
210 if (splitmethod != "fasta") {
211 if (temp == "not found") { splitmethod = "distance"; }
212 else { splitmethod = temp; }
215 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
216 convert(temp, cutoff);
217 cutoff += (5 / (precision * 10.0));
219 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
220 convert(temp, taxLevelCutoff);
222 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
224 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
225 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
227 if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
228 else { m->mothurOut(splitmethod + " is not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); abort = true; }
230 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
232 showabund = validParameter.validFile(parameters, "showabund", false);
233 if (showabund == "not found") { showabund = "T"; }
235 timing = validParameter.validFile(parameters, "timing", false);
236 if (timing == "not found") { timing = "F"; }
240 catch(exception& e) {
241 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
246 //**********************************************************************************************************************
248 void ClusterSplitCommand::help(){
250 m->mothurOut("The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n");
251 m->mothurOut("The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n");
252 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
253 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n");
254 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequences into distinct taxonomy groups, and split the distance file based on those groups. \n");
255 m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n");
256 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
257 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
258 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
259 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
260 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
261 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
262 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
263 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n");
264 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
265 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n");
266 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
268 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
270 m->mothurOut("The cluster.split command should be in the following format: \n");
271 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
272 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
275 catch(exception& e) {
276 m->errorOut(e, "ClusterSplitCommand", "help");
281 //**********************************************************************************************************************
283 ClusterSplitCommand::~ClusterSplitCommand(){}
285 //**********************************************************************************************************************
287 int ClusterSplitCommand::execute(){
290 if (abort == true) { return 0; }
293 vector<string> listFileNames;
295 string singletonName = "";
296 double saveCutoff = cutoff;
298 //****************** file prep work ******************************//
303 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
304 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
306 if (pid == 0) { //only process 0 converts and splits
310 //if user gave a phylip file convert to column file
311 if (format == "phylip") {
313 m->mothurOut("Converting to column format..."); m->mothurOutEndLine();
315 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
317 NameAssignment* nameMap = NULL;
318 convert->setFormat("phylip");
319 convert->read(nameMap);
321 if (m->control_pressed) { delete convert; return 0; }
323 distfile = convert->getOutputFile();
325 //if no names file given with phylip file, create it
326 ListVector* listToMakeNameFile = convert->getListVector();
327 if (namefile == "") { //you need to make a namefile for split matrix
329 namefile = phylipfile + ".names";
330 m->openOutputFile(namefile, out);
331 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
332 string bin = listToMakeNameFile->get(i);
333 out << bin << '\t' << bin << endl;
337 delete listToMakeNameFile;
340 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to convert the distance file."); m->mothurOutEndLine();
342 if (m->control_pressed) { return 0; }
345 m->mothurOut("Splitting the file..."); m->mothurOutEndLine();
347 //split matrix into non-overlapping groups
349 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
350 else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
351 else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
352 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
356 if (m->control_pressed) { delete split; return 0; }
358 singletonName = split->getSingletonNames();
359 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
362 if (m->control_pressed) { return 0; }
364 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
367 //****************** break up files between processes and cluster each file set ******************************//
369 ////you are process 0 from above////
371 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
372 dividedNames.resize(processors);
374 //for each file group figure out which process will complete it
375 //want to divide the load intelligently so the big files are spread between processes
377 for (int i = 0; i < distName.size(); i++) {
378 int processToAssign = (i+1) % processors;
379 if (processToAssign == 0) { processToAssign = processors; }
381 dividedNames[(processToAssign-1)].push_back(distName[i]);
384 //not lets reverse the order of ever other process, so we balance big files running with little ones
385 for (int i = 0; i < processors; i++) {
386 int remainder = ((i+1) % processors);
387 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
391 //send each child the list of files it needs to process
392 for(int i = 1; i < processors; i++) {
393 //send number of file pairs
394 int num = dividedNames[i].size();
395 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
397 for (int j = 0; j < num; j++) { //send filenames to process i
398 char tempDistFileName[1024];
399 strcpy(tempDistFileName, (dividedNames[i][j].begin()->first).c_str());
400 int lengthDist = (dividedNames[i][j].begin()->first).length();
402 MPI_Send(&lengthDist, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
403 MPI_Send(tempDistFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
405 char tempNameFileName[1024];
406 strcpy(tempNameFileName, (dividedNames[i][j].begin()->second).c_str());
407 int lengthName = (dividedNames[i][j].begin()->second).length();
409 MPI_Send(&lengthName, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
410 MPI_Send(tempNameFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
415 listFileNames = cluster(dividedNames[0], labels);
417 //receive the other processes info
418 for(int i = 1; i < processors; i++) {
419 int num = dividedNames[i].size();
422 MPI_Recv(&tempCutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
423 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
425 //send list filenames to root process
426 for (int j = 0; j < num; j++) {
428 char tempListFileName[1024];
430 MPI_Recv(&lengthList, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
431 MPI_Recv(tempListFileName, 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
433 string myListFileName = tempListFileName;
434 myListFileName = myListFileName.substr(0, lengthList);
436 listFileNames.push_back(myListFileName);
439 //send Labels to root process
441 MPI_Recv(&numLabels, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
443 for (int j = 0; j < numLabels; j++) {
447 MPI_Recv(&lengthLabel, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
448 MPI_Recv(tempLabel, 100, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
450 string myLabel = tempLabel;
451 myLabel = myLabel.substr(0, lengthLabel);
453 if (labels.count(myLabel) == 0) { labels.insert(myLabel); }
457 }else { //you are a child process
458 vector < map<string, string> > myNames;
460 //recieve the files you need to process
461 //receive number of file pairs
463 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
467 for (int j = 0; j < num; j++) { //receive filenames to process
469 char tempDistFileName[1024];
471 MPI_Recv(&lengthDist, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
472 MPI_Recv(tempDistFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
474 string myDistFileName = tempDistFileName;
475 myDistFileName = myDistFileName.substr(0, lengthDist);
478 char tempNameFileName[1024];
480 MPI_Recv(&lengthName, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
481 MPI_Recv(tempNameFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
483 string myNameFileName = tempNameFileName;
484 myNameFileName = myNameFileName.substr(0, lengthName);
487 myNames[j][myDistFileName] = myNameFileName;
491 listFileNames = cluster(myNames, labels);
494 MPI_Send(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
496 //send list filenames to root process
497 for (int j = 0; j < num; j++) {
498 char tempListFileName[1024];
499 strcpy(tempListFileName, listFileNames[j].c_str());
500 int lengthList = listFileNames[j].length();
502 MPI_Send(&lengthList, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
503 MPI_Send(tempListFileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
506 //send Labels to root process
507 int numLabels = labels.size();
508 MPI_Send(&numLabels, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
510 for(set<string>::iterator it = labels.begin(); it != labels.end(); ++it) {
512 strcpy(tempLabel, (*it).c_str());
513 int lengthLabel = (*it).length();
515 MPI_Send(&lengthLabel, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
516 MPI_Send(tempLabel, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
521 MPI_Barrier(MPI_COMM_WORLD);
525 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
527 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
529 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
530 dividedNames.resize(processors);
532 //for each file group figure out which process will complete it
533 //want to divide the load intelligently so the big files are spread between processes
535 for (int i = 0; i < distName.size(); i++) {
536 int processToAssign = (i+1) % processors;
537 if (processToAssign == 0) { processToAssign = processors; }
539 dividedNames[(processToAssign-1)].push_back(distName[i]);
542 //not lets reverse the order of ever other process, so we balance big files running with little ones
543 for (int i = 0; i < processors; i++) {
544 int remainder = ((i+1) % processors);
545 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
548 createProcesses(dividedNames);
550 if (m->control_pressed) { return 0; }
552 //get list of list file names from each process
553 for(int i=0;i<processors;i++){
554 string filename = toString(processIDS[i]) + ".temp";
556 m->openInputFile(filename, in);
558 in >> tag; m->gobble(in);
562 in >> tempName; m->gobble(in);
563 listFileNames.push_back(tempName);
566 remove((toString(processIDS[i]) + ".temp").c_str());
569 filename = toString(processIDS[i]) + ".temp.labels";
571 m->openInputFile(filename, in2);
574 in2 >> tempCutoff; m->gobble(in2);
575 if (tempCutoff < cutoff) { cutoff = tempCutoff; }
579 in2 >> tempName; m->gobble(in2);
580 if (labels.count(tempName) == 0) { labels.insert(tempName); }
583 remove((toString(processIDS[i]) + ".temp.labels").c_str());
587 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
590 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
592 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
594 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
596 //****************** merge list file and create rabund and sabund files ******************************//
598 m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
601 if (pid == 0) { //only process 0 merges
604 ListVector* listSingle;
605 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
607 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
609 mergeLists(listFileNames, labelBins, listSingle);
611 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
613 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
615 m->mothurOutEndLine();
616 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
617 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
618 m->mothurOutEndLine();
621 } //only process 0 merges
624 MPI_Barrier(MPI_COMM_WORLD);
629 catch(exception& e) {
630 m->errorOut(e, "ClusterSplitCommand", "execute");
634 //**********************************************************************************************************************
635 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
638 map<float, int> labelBin;
639 vector<float> orderFloat;
643 if (singleton != "none") {
645 m->openInputFile(singleton, in);
647 string firstCol, secondCol;
648 listSingle = new ListVector();
650 in >> firstCol >> secondCol; m->gobble(in);
651 listSingle->push_back(secondCol);
654 remove(singleton.c_str());
656 numSingleBins = listSingle->getNumBins();
657 }else{ listSingle = NULL; numSingleBins = 0; }
659 //go through users set and make them floats so we can sort them
660 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
663 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
664 else if (*it == "unique") { temp = -1.0; }
666 if (temp <= cutoff) {
667 orderFloat.push_back(temp);
668 labelBin[temp] = numSingleBins; //initialize numbins
673 sort(orderFloat.begin(), orderFloat.end());
676 //get the list info from each file
677 for (int k = 0; k < listNames.size(); k++) {
679 if (m->control_pressed) {
680 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
681 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
685 InputData* input = new InputData(listNames[k], "list");
686 ListVector* list = input->getListVector();
687 string lastLabel = list->getLabel();
689 string filledInList = listNames[k] + "filledInTemp";
691 m->openOutputFile(filledInList, outFilled);
693 //for each label needed
694 for(int l = 0; l < orderFloat.size(); l++){
697 if (orderFloat[l] == -1) { thisLabel = "unique"; }
698 else { thisLabel = toString(orderFloat[l], length-1); }
700 //this file has reached the end
702 list = input->getListVector(lastLabel, true);
703 }else{ //do you have the distance, or do you need to fill in
706 if (list->getLabel() == "unique") { labelFloat = -1.0; }
707 else { convert(list->getLabel(), labelFloat); }
709 //check for missing labels
710 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
711 //if its bigger get last label, otherwise keep it
713 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
715 lastLabel = list->getLabel();
719 list->setLabel(thisLabel);
720 list->print(outFilled);
723 labelBin[orderFloat[l]] += list->getNumBins();
727 list = input->getListVector();
730 if (list != NULL) { delete list; }
734 remove(listNames[k].c_str());
735 rename(filledInList.c_str(), listNames[k].c_str());
740 catch(exception& e) {
741 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
745 //**********************************************************************************************************************
746 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
748 if (outputDir == "") { outputDir += m->hasPath(distfile); }
749 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
751 m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
752 m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
753 m->openOutputFile(fileroot+ tag + ".list", outList);
755 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
756 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
757 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
759 map<float, int>::iterator itLabel;
761 //for each label needed
762 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
765 if (itLabel->first == -1) { thisLabel = "unique"; }
766 else { thisLabel = toString(itLabel->first, length-1); }
768 outList << thisLabel << '\t' << itLabel->second << '\t';
770 RAbundVector* rabund = new RAbundVector();
771 rabund->setLabel(thisLabel);
774 if (listSingle != NULL) {
775 for (int j = 0; j < listSingle->getNumBins(); j++) {
776 outList << listSingle->get(j) << '\t';
777 rabund->push_back(m->getNumNames(listSingle->get(j)));
781 //get the list info from each file
782 for (int k = 0; k < listNames.size(); k++) {
784 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
786 InputData* input = new InputData(listNames[k], "list");
787 ListVector* list = input->getListVector(thisLabel);
789 //this file has reached the end
790 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
792 for (int j = 0; j < list->getNumBins(); j++) {
793 outList << list->get(j) << '\t';
794 rabund->push_back(m->getNumNames(list->get(j)));
801 SAbundVector sabund = rabund->getSAbundVector();
803 sabund.print(outSabund);
804 rabund->print(outRabund);
814 if (listSingle != NULL) { delete listSingle; }
816 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
820 catch(exception& e) {
821 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
826 //**********************************************************************************************************************
828 void ClusterSplitCommand::printData(ListVector* oldList){
830 string label = oldList->getLabel();
831 RAbundVector oldRAbund = oldList->getRAbundVector();
833 oldRAbund.setLabel(label);
834 if (m->isTrue(showabund)) {
835 oldRAbund.getSAbundVector().print(cout);
837 oldRAbund.print(outRabund);
838 oldRAbund.getSAbundVector().print(outSabund);
840 oldList->print(outList);
842 catch(exception& e) {
843 m->errorOut(e, "ClusterSplitCommand", "printData");
847 //**********************************************************************************************************************
848 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
851 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
856 //loop through and create all the processes you want
857 while (process != processors) {
861 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
865 vector<string> listFileNames = cluster(dividedNames[process], labels);
867 //write out names to file
868 string filename = toString(getpid()) + ".temp";
870 m->openOutputFile(filename, out);
872 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
877 filename = toString(getpid()) + ".temp.labels";
878 m->openOutputFile(filename, outLabels);
880 outLabels << cutoff << endl;
881 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
882 outLabels << (*it) << endl;
887 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
890 //force parent to wait until all the processes are done
891 for (int i=0;i<processors;i++) {
892 int temp = processIDS[i];
900 catch(exception& e) {
901 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
905 //**********************************************************************************************************************
907 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
910 SparseMatrix* matrix;
913 RAbundVector* rabund;
915 vector<string> listFileNames;
917 double smallestCutoff = cutoff;
919 //cluster each distance file
920 for (int i = 0; i < distNames.size(); i++) {
921 if (m->control_pressed) { return listFileNames; }
923 string thisNamefile = distNames[i].begin()->second;
924 string thisDistFile = distNames[i].begin()->first;
926 //read in distance file
927 globaldata->setNameFile(thisNamefile);
928 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
932 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
934 //output your files too
936 cout << endl << "Reading " << thisDistFile << endl;
940 m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
942 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
943 read->setCutoff(cutoff);
945 NameAssignment* nameMap = new NameAssignment(thisNamefile);
949 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
951 list = read->getListVector();
953 matrix = read->getMatrix();
960 //output your files too
962 cout << endl << "Clustering " << thisDistFile << endl;
966 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
968 rabund = new RAbundVector(list->getRAbundVector());
971 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
972 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
973 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
974 tag = cluster->getTag();
976 if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
977 fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
980 m->openOutputFile(fileroot+ tag + ".list", listFile);
982 listFileNames.push_back(fileroot+ tag + ".list");
984 time_t estart = time(NULL);
986 float previousDist = 0.00000;
987 float rndPreviousDist = 0.00000;
993 double saveCutoff = cutoff;
995 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
997 if (m->control_pressed) { //clean up
998 delete matrix; delete list; delete cluster; delete rabund;
1000 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1001 listFileNames.clear(); return listFileNames;
1004 cluster->update(saveCutoff);
1006 float dist = matrix->getSmallDist();
1009 rndDist = m->ceilDist(dist, precision);
1011 rndDist = m->roundDist(dist, precision);
1014 if(previousDist <= 0.0000 && dist != previousDist){
1015 oldList.setLabel("unique");
1016 oldList.print(listFile);
1017 if (labels.count("unique") == 0) { labels.insert("unique"); }
1019 else if(rndDist != rndPreviousDist){
1020 oldList.setLabel(toString(rndPreviousDist, length-1));
1021 oldList.print(listFile);
1022 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1025 previousDist = dist;
1026 rndPreviousDist = rndDist;
1031 if(previousDist <= 0.0000){
1032 oldList.setLabel("unique");
1033 oldList.print(listFile);
1034 if (labels.count("unique") == 0) { labels.insert("unique"); }
1036 else if(rndPreviousDist<cutoff){
1037 oldList.setLabel(toString(rndPreviousDist, length-1));
1038 oldList.print(listFile);
1039 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
1042 delete matrix; delete list; delete cluster; delete rabund;
1045 if (m->control_pressed) { //clean up
1046 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
1047 listFileNames.clear(); return listFileNames;
1050 remove(thisDistFile.c_str());
1051 remove(thisNamefile.c_str());
1053 if (saveCutoff != cutoff) {
1054 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
1055 else { saveCutoff = m->roundDist(saveCutoff, precision); }
1057 m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
1060 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1063 cutoff = smallestCutoff;
1065 return listFileNames;
1068 catch(exception& e) {
1069 m->errorOut(e, "ClusterSplitCommand", "cluster");
1076 //**********************************************************************************************************************