2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option) {
22 globaldata = GlobalData::getInstance();
25 //allow user to run help
26 if(option == "help") { help(); abort = true; }
29 //valid paramters for this command
30 string Array[] = {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","showabund","timing","hard","processors","outputdir","inputdir"};
31 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33 OptionParser parser(option);
34 map<string,string> parameters = parser.getParameters();
36 ValidParameters validParameter;
38 //check to make sure all parameters are valid for command
39 map<string,string>::iterator it;
40 for (it = parameters.begin(); it != parameters.end(); it++) {
41 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
46 globaldata->newRead();
48 //if the user changes the output directory command factory will send this info to us in the output parameter
49 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
51 //if the user changes the input directory command factory will send this info to us in the output parameter
52 string inputDir = validParameter.validFile(parameters, "inputdir", false);
53 if (inputDir == "not found"){ inputDir = ""; }
56 it = parameters.find("phylip");
57 //user has given a template file
58 if(it != parameters.end()){
59 path = hasPath(it->second);
60 //if the user has not given a path then, add inputdir. else leave path alone.
61 if (path == "") { parameters["phylip"] = inputDir + it->second; }
64 it = parameters.find("column");
65 //user has given a template file
66 if(it != parameters.end()){
67 path = hasPath(it->second);
68 //if the user has not given a path then, add inputdir. else leave path alone.
69 if (path == "") { parameters["column"] = inputDir + it->second; }
72 it = parameters.find("name");
73 //user has given a template file
74 if(it != parameters.end()){
75 path = hasPath(it->second);
76 //if the user has not given a path then, add inputdir. else leave path alone.
77 if (path == "") { parameters["name"] = inputDir + it->second; }
80 it = parameters.find("taxonomy");
81 //user has given a template file
82 if(it != parameters.end()){
83 path = hasPath(it->second);
84 //if the user has not given a path then, add inputdir. else leave path alone.
85 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
89 //check for required parameters
90 phylipfile = validParameter.validFile(parameters, "phylip", true);
91 if (phylipfile == "not open") { abort = true; }
92 else if (phylipfile == "not found") { phylipfile = ""; }
93 else { distfile = phylipfile; format = "phylip"; }
95 columnfile = validParameter.validFile(parameters, "column", true);
96 if (columnfile == "not open") { abort = true; }
97 else if (columnfile == "not found") { columnfile = ""; }
98 else { distfile = columnfile; format = "column"; }
100 namefile = validParameter.validFile(parameters, "name", true);
101 if (namefile == "not open") { abort = true; }
102 else if (namefile == "not found") { namefile = ""; }
104 taxFile = validParameter.validFile(parameters, "taxonomy", true);
105 if (taxFile == "not open") { abort = true; }
106 else if (taxFile == "not found") { taxFile = ""; }
108 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
109 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
111 if (columnfile != "") {
112 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
115 //check for optional parameter and set defaults
116 // ...at some point should added some additional type checking...
117 //get user cutoff and precision or use defaults
119 temp = validParameter.validFile(parameters, "precision", false);
120 if (temp == "not found") { temp = "100"; }
121 //saves precision legnth for formatting below
122 length = temp.length();
123 convert(temp, precision);
125 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
128 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
129 convert(temp, processors);
131 temp = validParameter.validFile(parameters, "cutoff", false);
132 if (temp == "not found") { temp = "10"; }
133 convert(temp, cutoff);
134 cutoff += (5 / (precision * 10.0));
136 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
138 splitmethod = validParameter.validFile(parameters, "splitmethod", false); if (splitmethod == "not found") { method = "distance"; }
140 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
141 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
143 if ((splitmethod == "distance") || (splitmethod == "classify")) { }
144 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance or classify."); m->mothurOutEndLine(); abort = true; }
146 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
148 showabund = validParameter.validFile(parameters, "showabund", false);
149 if (showabund == "not found") { showabund = "T"; }
151 timing = validParameter.validFile(parameters, "timing", false);
152 if (timing == "not found") { timing = "F"; }
156 catch(exception& e) {
157 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
162 //**********************************************************************************************************************
164 void ClusterSplitCommand::help(){
166 m->mothurOut("The cluster.split command parameter options are cutoff, splitcutoff, precision, method, splitmethod, phylip, column, name, showabund, timing. Phylip or column and name are required.\n");
167 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
168 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
169 m->mothurOut("The cluster.split command should be in the following format: \n");
170 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
171 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify) \n");
174 catch(exception& e) {
175 m->errorOut(e, "ClusterSplitCommand", "help");
180 //**********************************************************************************************************************
182 ClusterSplitCommand::~ClusterSplitCommand(){}
184 //**********************************************************************************************************************
186 int ClusterSplitCommand::execute(){
189 if (abort == true) { return 0; }
191 //****************** file prep work ******************************//
193 //if user gave a phylip file convert to column file
194 if (format == "phylip") {
196 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
198 NameAssignment* nameMap = NULL;
199 convert->setFormat("phylip");
200 convert->read(nameMap);
202 if (m->control_pressed) { delete convert; return 0; }
204 distfile = convert->getOutputFile();
206 //if no names file given with phylip file, create it
207 ListVector* listToMakeNameFile = convert->getListVector();
208 if (namefile == "") { //you need to make a namefile for split matrix
210 namefile = phylipfile + ".names";
211 openOutputFile(namefile, out);
212 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
213 string bin = listToMakeNameFile->get(i);
214 out << bin << '\t' << bin << endl;
218 delete listToMakeNameFile;
221 if (m->control_pressed) { return 0; }
223 time_t estart = time(NULL);
225 //split matrix into non-overlapping groups
226 SplitMatrix* split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod);
229 if (m->control_pressed) { delete split; return 0; }
231 string singletonName = split->getSingletonNames();
232 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
235 if (m->control_pressed) { return 0; }
237 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
240 //****************** break up files between processes and cluster each file set ******************************//
241 vector<string> listFileNames;
243 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
245 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
247 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
248 dividedNames.resize(processors);
250 //for each file group figure out which process will complete it
251 //want to divide the load intelligently so the big files are spread between processes
253 for (int i = 0; i < distName.size(); i++) {
254 int processToAssign = (i+1) % processors;
255 if (processToAssign == 0) { processToAssign = processors; }
257 dividedNames[(processToAssign-1)].push_back(distName[i]);
260 //not lets reverse the order of ever other process, so we balance big files running with little ones
261 for (int i = 0; i < processors; i++) {
262 int remainder = ((i+1) % processors);
263 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
266 createProcesses(dividedNames);
268 if (m->control_pressed) { return 0; }
270 //get list of list file names from each process
271 for(int i=0;i<processors;i++){
272 string filename = toString(processIDS[i]) + ".temp";
274 openInputFile(filename, in);
278 in >> tempName; gobble(in);
279 listFileNames.push_back(tempName);
282 remove((toString(processIDS[i]) + ".temp").c_str());
285 filename = toString(processIDS[i]) + ".temp.labels";
287 openInputFile(filename, in2);
291 in2 >> tempName; gobble(in);
292 if (labels.count(tempName) == 0) { labels.insert(tempName); }
295 remove((toString(processIDS[i]) + ".temp.labels").c_str());
299 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
302 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
304 //****************** merge list file and create rabund and sabund files ******************************//
306 mergeLists(listFileNames, singletonName, labels);
308 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
310 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
312 m->mothurOutEndLine();
313 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
314 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
315 m->mothurOutEndLine();
319 catch(exception& e) {
320 m->errorOut(e, "ClusterSplitCommand", "execute");
324 //**********************************************************************************************************************
325 int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton, set<string> userLabels){
327 if (outputDir == "") { outputDir += hasPath(distfile); }
328 fileroot = outputDir + getRootName(getSimpleName(distfile));
330 openOutputFile(fileroot+ tag + ".sabund", outSabund);
331 openOutputFile(fileroot+ tag + ".rabund", outRabund);
332 openOutputFile(fileroot+ tag + ".list", outList);
334 outputNames.push_back(fileroot+ tag + ".sabund");
335 outputNames.push_back(fileroot+ tag + ".rabund");
336 outputNames.push_back(fileroot+ tag + ".list");
339 ListVector* listSingle = NULL;
340 if (singleton != "none") {
342 openInputFile(singleton, in);
344 string firstCol, secondCol;
345 listSingle = new ListVector();
347 in >> firstCol >> secondCol; gobble(in);
348 listSingle->push_back(secondCol);
353 vector<float> orderFloat;
355 //go through users set and make them floats so we can sort them
356 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
359 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)){
361 orderFloat.push_back(temp);
362 }else if (*it == "unique") { orderFloat.push_back(-1.0); }
364 userLabels.erase(*it);
370 sort(orderFloat.begin(), orderFloat.end());
372 vector<InputData*> inputs;
373 vector<string> lastLabels;
374 for (int i = 0; i < listNames.size(); i++) {
375 InputData* input = new InputData(listNames[i], "list");
376 inputs.push_back(input);
379 openInputFile(listNames[i], in);
380 ListVector tempList(in);
381 lastLabels.push_back(tempList.getLabel());
385 ListVector* merged = NULL;
387 //for each label needed
388 for(int l = 0; l < orderFloat.size(); l++){
391 if (orderFloat[l] == -1) { thisLabel = "unique"; }
392 else { thisLabel = toString(orderFloat[l], length-1); }
394 //get the list info from each file
395 for (int k = 0; k < listNames.size(); k++) {
397 if (m->control_pressed) {
398 if (listSingle != NULL) { delete listSingle; remove(singleton.c_str()); }
399 for (int i = 0; i < listNames.size(); i++) { delete inputs[i]; remove(listNames[i].c_str()); }
400 delete merged; merged = NULL;
404 ListVector* list = inputs[k]->getListVector();
406 //this file has reached the end
407 if (list == NULL) { list = inputs[k]->getListVector(lastLabels[k], true); }
410 if (list->getLabel() == "unique") { labelFloat = -1.0; }
411 else { convert(list->getLabel(), labelFloat); }
413 //check for missing labels
414 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
415 //if its bigger get last label, otherwise keep it
417 list = inputs[k]->getListVector(lastLabels[k], true); //get last list vector to use, you actually want to move back in the file
419 lastLabels[k] = list->getLabel();
421 //is this the first file
422 if (merged == NULL) { merged = new ListVector(); merged->setLabel(thisLabel); }
424 for (int j = 0; j < list->getNumBins(); j++) {
425 merged->push_back(list->get(j));
432 for (int j = 0; j < listSingle->getNumBins(); j++) {
433 merged->push_back(listSingle->get(j));
439 delete merged; merged = NULL;
442 if (listSingle != NULL) { delete listSingle; remove(singleton.c_str()); }
444 for (int i = 0; i < listNames.size(); i++) { delete inputs[i]; remove(listNames[i].c_str()); }
448 catch(exception& e) {
449 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
453 //**********************************************************************************************************************
455 void ClusterSplitCommand::printData(ListVector* oldList){
457 string label = oldList->getLabel();
458 RAbundVector oldRAbund = oldList->getRAbundVector();
460 oldRAbund.setLabel(label);
461 if (isTrue(showabund)) {
462 oldRAbund.getSAbundVector().print(cout);
464 oldRAbund.print(outRabund);
465 oldRAbund.getSAbundVector().print(outSabund);
467 oldList->print(outList);
469 catch(exception& e) {
470 m->errorOut(e, "ClusterSplitCommand", "printData");
474 //**********************************************************************************************************************
475 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
478 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
483 //loop through and create all the processes you want
484 while (process != processors) {
488 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
492 vector<string> listFileNames = cluster(dividedNames[process], labels);
494 //write out names to file
495 string filename = toString(getpid()) + ".temp";
497 openOutputFile(filename, out);
498 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
503 filename = toString(getpid()) + ".temp.labels";
504 openOutputFile(filename, outLabels);
506 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
507 outLabels << (*it) << endl;
512 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
515 //force parent to wait until all the processes are done
516 for (int i=0;i<processors;i++) {
517 int temp = processIDS[i];
525 catch(exception& e) {
526 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
530 //**********************************************************************************************************************
532 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
535 SparseMatrix* matrix;
538 RAbundVector* rabund;
540 vector<string> listFileNames;
542 //cluster each distance file
543 for (int i = 0; i < distNames.size(); i++) {
545 string thisNamefile = distNames[i].begin()->second;
546 string thisDistFile = distNames[i].begin()->first;
548 //read in distance file
549 globaldata->setNameFile(thisNamefile);
550 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
552 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
553 read->setCutoff(cutoff);
555 NameAssignment* nameMap = new NameAssignment(thisNamefile);
559 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
561 list = read->getListVector();
563 matrix = read->getMatrix();
568 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
570 rabund = new RAbundVector(list->getRAbundVector());
573 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
574 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
575 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
576 tag = cluster->getTag();
578 if (outputDir == "") { outputDir += hasPath(thisDistFile); }
579 fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
582 openOutputFile(fileroot+ tag + ".list", listFile);
584 listFileNames.push_back(fileroot+ tag + ".list");
586 time_t estart = time(NULL);
588 float previousDist = 0.00000;
589 float rndPreviousDist = 0.00000;
595 double saveCutoff = cutoff;
597 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
599 if (m->control_pressed) { //clean up
600 delete matrix; delete list; delete cluster; delete rabund;
602 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
603 listFileNames.clear(); return listFileNames;
606 cluster->update(cutoff);
608 float dist = matrix->getSmallDist();
611 rndDist = ceilDist(dist, precision);
613 rndDist = roundDist(dist, precision);
616 if(previousDist <= 0.0000 && dist != previousDist){
617 oldList.setLabel("unique");
618 oldList.print(listFile);
619 if (labels.count("unique") == 0) { labels.insert("unique"); }
621 else if(rndDist != rndPreviousDist){
622 oldList.setLabel(toString(rndPreviousDist, length-1));
623 oldList.print(listFile);
624 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
628 rndPreviousDist = rndDist;
633 if(previousDist <= 0.0000){
634 oldList.setLabel("unique");
635 oldList.print(listFile);
636 if (labels.count("unique") == 0) { labels.insert("unique"); }
638 else if(rndPreviousDist<cutoff){
639 oldList.setLabel(toString(rndPreviousDist, length-1));
640 oldList.print(listFile);
641 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
644 delete matrix; delete list; delete cluster; delete rabund;
647 if (m->control_pressed) { //clean up
648 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
649 listFileNames.clear(); return listFileNames;
652 remove(thisDistFile.c_str());
653 remove(thisNamefile.c_str());
657 return listFileNames;
660 catch(exception& e) {
661 m->errorOut(e, "ClusterSplitCommand", "cluster");
668 //**********************************************************************************************************************