2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option) {
22 globaldata = GlobalData::getInstance();
25 //allow user to run help
26 if(option == "help") { help(); abort = true; }
29 //valid paramters for this command
30 string Array[] = {"phylip","column","name","cutoff","precision","method","showabund","timing","hard","processors","splitcutoff","outputdir","inputdir"};
31 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33 OptionParser parser(option);
34 map<string,string> parameters = parser.getParameters();
36 ValidParameters validParameter;
38 //check to make sure all parameters are valid for command
39 map<string,string>::iterator it;
40 for (it = parameters.begin(); it != parameters.end(); it++) {
41 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
46 globaldata->newRead();
48 //if the user changes the output directory command factory will send this info to us in the output parameter
49 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
51 //if the user changes the input directory command factory will send this info to us in the output parameter
52 string inputDir = validParameter.validFile(parameters, "inputdir", false);
53 if (inputDir == "not found"){ inputDir = ""; }
56 it = parameters.find("phylip");
57 //user has given a template file
58 if(it != parameters.end()){
59 path = hasPath(it->second);
60 //if the user has not given a path then, add inputdir. else leave path alone.
61 if (path == "") { parameters["phylip"] = inputDir + it->second; }
64 it = parameters.find("column");
65 //user has given a template file
66 if(it != parameters.end()){
67 path = hasPath(it->second);
68 //if the user has not given a path then, add inputdir. else leave path alone.
69 if (path == "") { parameters["column"] = inputDir + it->second; }
72 it = parameters.find("name");
73 //user has given a template file
74 if(it != parameters.end()){
75 path = hasPath(it->second);
76 //if the user has not given a path then, add inputdir. else leave path alone.
77 if (path == "") { parameters["name"] = inputDir + it->second; }
81 //check for required parameters
82 phylipfile = validParameter.validFile(parameters, "phylip", true);
83 if (phylipfile == "not open") { abort = true; }
84 else if (phylipfile == "not found") { phylipfile = ""; }
85 else { distfile = phylipfile; format = "phylip"; }
87 columnfile = validParameter.validFile(parameters, "column", true);
88 if (columnfile == "not open") { abort = true; }
89 else if (columnfile == "not found") { columnfile = ""; }
90 else { distfile = columnfile; format = "column"; }
92 namefile = validParameter.validFile(parameters, "name", true);
93 if (namefile == "not open") { abort = true; }
94 else if (namefile == "not found") { namefile = ""; }
96 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a hcluster command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
97 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a hcluster command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
99 if (columnfile != "") {
100 if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
103 //check for optional parameter and set defaults
104 // ...at some point should added some additional type checking...
105 //get user cutoff and precision or use defaults
107 temp = validParameter.validFile(parameters, "precision", false);
108 if (temp == "not found") { temp = "100"; }
109 //saves precision legnth for formatting below
110 length = temp.length();
111 convert(temp, precision);
113 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
116 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
117 convert(temp, processors);
119 temp = validParameter.validFile(parameters, "cutoff", false);
120 if (temp == "not found") { temp = "10"; }
121 convert(temp, cutoff);
122 if (!hard) { cutoff += (5 / (precision * 10.0)); }
124 temp = validParameter.validFile(parameters, "splitcutoff", false);
125 if (temp == "not found") { temp = "0.10"; }
126 convert(temp, splitcutoff);
127 if (!hard) { splitcutoff += (5 / (precision * 10.0)); }
129 method = validParameter.validFile(parameters, "method", false);
130 if (method == "not found") { method = "furthest"; }
132 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
133 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
135 showabund = validParameter.validFile(parameters, "showabund", false);
136 if (showabund == "not found") { showabund = "T"; }
138 timing = validParameter.validFile(parameters, "timing", false);
139 if (timing == "not found") { timing = "F"; }
143 catch(exception& e) {
144 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
149 //**********************************************************************************************************************
151 void ClusterSplitCommand::help(){
153 m->mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
154 m->mothurOut("The cluster command parameter options are method, cuttoff, hard, precision, showabund and timing. No parameters are required.\n");
155 m->mothurOut("The cluster command should be in the following format: \n");
156 m->mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
157 m->mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
159 catch(exception& e) {
160 m->errorOut(e, "ClusterSplitCommand", "help");
165 //**********************************************************************************************************************
167 ClusterSplitCommand::~ClusterSplitCommand(){}
169 //**********************************************************************************************************************
171 int ClusterSplitCommand::execute(){
174 if (abort == true) { return 0; }
176 //****************** file prep work ******************************//
178 //if user gave a phylip file convert to column file
179 if (format == "phylip") {
181 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
183 NameAssignment* nameMap = NULL;
184 convert->setFormat("phylip");
185 convert->read(nameMap);
187 if (m->control_pressed) { delete convert; return 0; }
189 distfile = convert->getOutputFile();
191 //if no names file given with phylip file, create it
192 ListVector* listToMakeNameFile = convert->getListVector();
193 if (namefile == "") { //you need to make a namefile for split matrix
195 namefile = phylipfile + ".names";
196 openOutputFile(namefile, out);
197 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
198 string bin = listToMakeNameFile->get(i);
199 out << bin << '\t' << bin << endl;
203 delete listToMakeNameFile;
206 if (m->control_pressed) { return 0; }
208 time_t estart = time(NULL);
210 //split matrix into non-overlapping groups
211 SplitMatrix* split = new SplitMatrix(distfile, namefile, splitcutoff);
214 if (m->control_pressed) { delete split; return 0; }
216 string singletonName = split->getSingletonNames();
217 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
220 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
223 if (m->control_pressed) { return 0; }
225 //****************** break up files between processes and cluster each file set ******************************//
226 vector<string> listFileNames;
228 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
230 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
232 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
233 dividedNames.resize(processors);
235 //for each file group figure out which process will complete it
236 //want to divide the load intelligently so the big files are spread between processes
238 for (int i = 0; i < distName.size(); i++) {
239 int processToAssign = (i+1) % processors;
240 if (processToAssign == 0) { processToAssign = processors; }
242 dividedNames[(processToAssign-1)].push_back(distName[i]);
245 createProcesses(dividedNames);
247 if (m->control_pressed) { return 0; }
249 //get list of list file names from each process
250 for(int i=0;i<processors;i++){
251 string filename = toString(processIDS[i]) + ".temp";
253 openInputFile(filename, in);
257 in >> tempName; gobble(in);
258 listFileNames.push_back(tempName);
261 remove((toString(processIDS[i]) + ".temp").c_str());
264 filename = toString(processIDS[i]) + ".temp.labels";
266 openInputFile(filename, in2);
270 in2 >> tempName; gobble(in);
271 if (labels.count(tempName) == 0) { labels.insert(tempName); }
274 remove((toString(processIDS[i]) + ".temp.labels").c_str());
278 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
281 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
283 //****************** merge list file and create rabund and sabund files ******************************//
285 mergeLists(listFileNames, singletonName, labels);
287 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
289 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
291 m->mothurOutEndLine();
292 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
293 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
294 m->mothurOutEndLine();
298 catch(exception& e) {
299 m->errorOut(e, "ClusterSplitCommand", "execute");
303 //**********************************************************************************************************************
304 int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton, set<string> userLabels){
306 if (outputDir == "") { outputDir += hasPath(distfile); }
307 fileroot = outputDir + getRootName(getSimpleName(distfile));
309 openOutputFile(fileroot+ tag + ".sabund", outSabund);
310 openOutputFile(fileroot+ tag + ".rabund", outRabund);
311 openOutputFile(fileroot+ tag + ".list", outList);
313 outputNames.push_back(fileroot+ tag + ".sabund");
314 outputNames.push_back(fileroot+ tag + ".rabund");
315 outputNames.push_back(fileroot+ tag + ".list");
318 ListVector* listSingle = NULL;
319 if (singleton != "none") {
321 openInputFile(singleton, in);
323 string firstCol, secondCol;
324 listSingle = new ListVector();
326 in >> firstCol >> secondCol; gobble(in);
327 listSingle->push_back(secondCol);
332 vector<float> orderFloat;
334 //go through users set and make them floats so we can sort them
335 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
338 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)){
340 orderFloat.push_back(temp);
341 }else if (*it == "unique") { orderFloat.push_back(-1.0); }
343 userLabels.erase(*it);
349 sort(orderFloat.begin(), orderFloat.end());
351 vector<InputData*> inputs;
352 vector<string> lastLabels;
353 for (int i = 0; i < listNames.size(); i++) {
354 InputData* input = new InputData(listNames[i], "list");
355 inputs.push_back(input);
358 openInputFile(listNames[i], in);
359 ListVector tempList(in);
360 lastLabels.push_back(tempList.getLabel());
364 ListVector* merged = NULL;
366 //for each label needed
367 for(int l = 0; l < orderFloat.size(); l++){
370 if (orderFloat[l] == -1) { thisLabel = "unique"; }
371 else { thisLabel = toString(orderFloat[l], length-1); }
373 //get the list info from each file
374 for (int k = 0; k < listNames.size(); k++) {
376 if (m->control_pressed) {
377 if (listSingle != NULL) { delete listSingle; remove(singleton.c_str()); }
378 for (int i = 0; i < listNames.size(); i++) { delete inputs[i]; remove(listNames[i].c_str()); }
379 delete merged; merged = NULL;
383 ListVector* list = inputs[k]->getListVector();
385 //this file has reached the end
386 if (list == NULL) { list = inputs[k]->getListVector(lastLabels[k], true); }
389 if (list->getLabel() == "unique") { labelFloat = -1.0; }
390 else { convert(list->getLabel(), labelFloat); }
392 //check for missing labels
393 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
394 //if its bigger get last label, otherwise keep it
396 list = inputs[k]->getListVector(lastLabels[k], true); //get last list vector to use, you actually want to move back in the file
398 lastLabels[k] = list->getLabel();
400 //is this the first file
401 if (merged == NULL) { merged = new ListVector(); merged->setLabel(thisLabel); }
403 for (int j = 0; j < list->getNumBins(); j++) {
404 merged->push_back(list->get(j));
411 for (int j = 0; j < listSingle->getNumBins(); j++) {
412 merged->push_back(listSingle->get(j));
418 delete merged; merged = NULL;
421 if (listSingle != NULL) { delete listSingle; remove(singleton.c_str()); }
423 for (int i = 0; i < listNames.size(); i++) { delete inputs[i]; remove(listNames[i].c_str()); }
427 catch(exception& e) {
428 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
432 //**********************************************************************************************************************
434 void ClusterSplitCommand::printData(ListVector* oldList){
436 string label = oldList->getLabel();
437 RAbundVector oldRAbund = oldList->getRAbundVector();
439 oldRAbund.setLabel(label);
440 if (isTrue(showabund)) {
441 oldRAbund.getSAbundVector().print(cout);
443 oldRAbund.print(outRabund);
444 oldRAbund.getSAbundVector().print(outSabund);
446 oldList->print(outList);
448 catch(exception& e) {
449 m->errorOut(e, "ClusterSplitCommand", "printData");
453 //**********************************************************************************************************************
454 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
457 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
462 //loop through and create all the processes you want
463 while (process != processors) {
467 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
471 vector<string> listFileNames = cluster(dividedNames[process], labels);
473 //write out names to file
474 string filename = toString(getpid()) + ".temp";
476 openOutputFile(filename, out);
477 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
482 filename = toString(getpid()) + ".temp.labels";
483 openOutputFile(filename, outLabels);
485 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
486 outLabels << (*it) << endl;
491 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
494 //force parent to wait until all the processes are done
495 for (int i=0;i<processors;i++) {
496 int temp = processIDS[i];
504 catch(exception& e) {
505 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
509 //**********************************************************************************************************************
511 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
514 SparseMatrix* matrix;
517 RAbundVector* rabund;
519 vector<string> listFileNames;
521 //cluster each distance file
522 for (int i = 0; i < distNames.size(); i++) {
524 string thisNamefile = distNames[i].begin()->second;
525 string thisDistFile = distNames[i].begin()->first;
527 //read in distance file
528 globaldata->setNameFile(thisNamefile);
529 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
531 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
532 read->setCutoff(cutoff);
534 NameAssignment* nameMap = new NameAssignment(thisNamefile);
538 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
540 list = read->getListVector();
542 matrix = read->getMatrix();
547 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
549 rabund = new RAbundVector(list->getRAbundVector());
552 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
553 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
554 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
555 tag = cluster->getTag();
557 if (outputDir == "") { outputDir += hasPath(thisDistFile); }
558 fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
561 openOutputFile(fileroot+ tag + ".list", listFile);
563 listFileNames.push_back(fileroot+ tag + ".list");
565 time_t estart = time(NULL);
567 float previousDist = 0.00000;
568 float rndPreviousDist = 0.00000;
574 double saveCutoff = cutoff;
576 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
578 if (m->control_pressed) { //clean up
579 delete matrix; delete list; delete cluster; delete rabund;
581 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
582 listFileNames.clear(); return listFileNames;
585 cluster->update(cutoff);
587 float dist = matrix->getSmallDist();
588 float rndDist = roundDist(dist, precision);
590 if(previousDist <= 0.0000 && dist != previousDist){
591 oldList.setLabel("unique");
592 oldList.print(listFile);
593 if (labels.count("unique") == 0) { labels.insert("unique"); }
595 else if(rndDist != rndPreviousDist){
596 oldList.setLabel(toString(rndPreviousDist, length-1));
597 oldList.print(listFile);
598 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
602 rndPreviousDist = rndDist;
607 if(previousDist <= 0.0000){
608 oldList.setLabel("unique");
609 oldList.print(listFile);
610 if (labels.count("unique") == 0) { labels.insert("unique"); }
612 else if(rndPreviousDist<cutoff){
613 oldList.setLabel(toString(rndPreviousDist, length-1));
614 oldList.print(listFile);
615 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
618 delete matrix; delete list; delete cluster; delete rabund;
621 if (m->control_pressed) { //clean up
622 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
623 listFileNames.clear(); return listFileNames;
626 remove(thisDistFile.c_str());
627 remove(thisNamefile.c_str());
631 return listFileNames;
634 catch(exception& e) {
635 m->errorOut(e, "ClusterSplitCommand", "cluster");
642 //**********************************************************************************************************************