2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option) {
22 globaldata = GlobalData::getInstance();
25 //allow user to run help
26 if(option == "help") { help(); abort = true; }
29 //valid paramters for this command
30 string Array[] = {"phylip","column","name","cutoff","precision","method","showabund","timing","hard","processors","splitcutoff","outputdir","inputdir"};
31 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33 OptionParser parser(option);
34 map<string,string> parameters = parser.getParameters();
36 ValidParameters validParameter;
38 //check to make sure all parameters are valid for command
39 map<string,string>::iterator it;
40 for (it = parameters.begin(); it != parameters.end(); it++) {
41 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
46 globaldata->newRead();
48 //if the user changes the output directory command factory will send this info to us in the output parameter
49 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
51 //if the user changes the input directory command factory will send this info to us in the output parameter
52 string inputDir = validParameter.validFile(parameters, "inputdir", false);
53 if (inputDir == "not found"){ inputDir = ""; }
56 it = parameters.find("phylip");
57 //user has given a template file
58 if(it != parameters.end()){
59 path = hasPath(it->second);
60 //if the user has not given a path then, add inputdir. else leave path alone.
61 if (path == "") { parameters["phylip"] = inputDir + it->second; }
64 it = parameters.find("column");
65 //user has given a template file
66 if(it != parameters.end()){
67 path = hasPath(it->second);
68 //if the user has not given a path then, add inputdir. else leave path alone.
69 if (path == "") { parameters["column"] = inputDir + it->second; }
72 it = parameters.find("name");
73 //user has given a template file
74 if(it != parameters.end()){
75 path = hasPath(it->second);
76 //if the user has not given a path then, add inputdir. else leave path alone.
77 if (path == "") { parameters["name"] = inputDir + it->second; }
81 //check for required parameters
82 phylipfile = validParameter.validFile(parameters, "phylip", true);
83 if (phylipfile == "not open") { abort = true; }
84 else if (phylipfile == "not found") { phylipfile = ""; }
85 else { distfile = phylipfile; format = "phylip"; }
87 columnfile = validParameter.validFile(parameters, "column", true);
88 if (columnfile == "not open") { abort = true; }
89 else if (columnfile == "not found") { columnfile = ""; }
90 else { distfile = columnfile; format = "column"; }
92 namefile = validParameter.validFile(parameters, "name", true);
93 if (namefile == "not open") { abort = true; }
94 else if (namefile == "not found") { namefile = ""; }
96 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a hcluster command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
97 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a hcluster command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
99 if (columnfile != "") {
100 if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
103 //check for optional parameter and set defaults
104 // ...at some point should added some additional type checking...
105 //get user cutoff and precision or use defaults
107 temp = validParameter.validFile(parameters, "precision", false);
108 if (temp == "not found") { temp = "100"; }
109 //saves precision legnth for formatting below
110 length = temp.length();
111 convert(temp, precision);
113 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
116 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
117 convert(temp, processors);
119 temp = validParameter.validFile(parameters, "cutoff", false);
120 if (temp == "not found") { temp = "10"; }
121 convert(temp, cutoff);
122 if (!hard) { cutoff += (5 / (precision * 10.0)); }
124 temp = validParameter.validFile(parameters, "splitcutoff", false);
125 if (temp == "not found") { temp = "0.10"; }
126 convert(temp, splitcutoff);
127 if (!hard) { splitcutoff += (5 / (precision * 10.0)); }
129 method = validParameter.validFile(parameters, "method", false);
130 if (method == "not found") { method = "furthest"; }
132 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
133 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
135 showabund = validParameter.validFile(parameters, "showabund", false);
136 if (showabund == "not found") { showabund = "T"; }
138 timing = validParameter.validFile(parameters, "timing", false);
139 if (timing == "not found") { timing = "F"; }
143 catch(exception& e) {
144 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
149 //**********************************************************************************************************************
151 void ClusterSplitCommand::help(){
153 m->mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
154 m->mothurOut("The cluster command parameter options are method, cuttoff, hard, precision, showabund and timing. No parameters are required.\n");
155 m->mothurOut("The cluster command should be in the following format: \n");
156 m->mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
157 m->mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
159 catch(exception& e) {
160 m->errorOut(e, "ClusterSplitCommand", "help");
165 //**********************************************************************************************************************
167 ClusterSplitCommand::~ClusterSplitCommand(){}
169 //**********************************************************************************************************************
171 int ClusterSplitCommand::execute(){
174 if (abort == true) { return 0; }
176 //****************** file prep work ******************************//
178 //if user gave a phylip file convert to column file
179 if (format == "phylip") {
180 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
182 NameAssignment* nameMap = NULL;
183 convert->read(nameMap);
185 if (m->control_pressed) { delete convert; return 0; }
187 string distfile = convert->getOutputFile();
189 //if no names file given with phylip file, create it
190 ListVector* listToMakeNameFile = convert->getListVector();
191 if (namefile == "") { //you need to make a namefile for split matrix
193 namefile = phylipfile + ".names";
194 openOutputFile(namefile, out);
195 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
196 string bin = listToMakeNameFile->get(i);
197 out << bin << '\t' << bin << endl;
201 delete listToMakeNameFile;
205 if (m->control_pressed) { return 0; }
207 time_t estart = time(NULL);
209 //split matrix into non-overlapping groups
210 SplitMatrix* split = new SplitMatrix(distfile, namefile, splitcutoff);
213 if (m->control_pressed) { delete split; return 0; }
215 string singletonName = split->getSingletonNames();
216 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
219 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
222 if (m->control_pressed) { return 0; }
224 //****************** break up files between processes and cluster each file set ******************************//
225 vector<string> listFileNames;
227 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
229 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
231 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
232 dividedNames.resize(processors);
234 //for each file group figure out which process will complete it
235 //want to divide the load intelligently so the big files are spread between processes
237 for (int i = 0; i < distName.size(); i++) {
238 int processToAssign = (i+1) % processors;
239 if (processToAssign == 0) { processToAssign = processors; }
241 dividedNames[(processToAssign-1)].push_back(distName[i]);
244 createProcesses(dividedNames);
246 if (m->control_pressed) { return 0; }
248 //get list of list file names from each process
249 for(int i=0;i<processors;i++){
250 string filename = toString(processIDS[i]) + ".temp";
252 openInputFile(filename, in);
256 in >> tempName; gobble(in);
257 listFileNames.push_back(tempName);
260 remove((toString(processIDS[i]) + ".temp").c_str());
263 filename = toString(processIDS[i]) + ".temp.labels";
265 openInputFile(filename, in2);
269 in2 >> tempName; gobble(in);
270 if (labels.count(tempName) == 0) { labels.insert(tempName); }
273 remove((toString(processIDS[i]) + ".temp.labels").c_str());
277 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
280 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
282 //****************** merge list file and create rabund and sabund files ******************************//
284 mergeLists(listFileNames, singletonName, labels);
286 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
288 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
290 m->mothurOutEndLine();
291 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
292 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
293 m->mothurOutEndLine();
297 catch(exception& e) {
298 m->errorOut(e, "ClusterSplitCommand", "execute");
302 //**********************************************************************************************************************
303 int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton, set<string> userLabels){
305 if (outputDir == "") { outputDir += hasPath(distfile); }
306 fileroot = outputDir + getRootName(getSimpleName(distfile));
308 openOutputFile(fileroot+ tag + ".sabund", outSabund);
309 openOutputFile(fileroot+ tag + ".rabund", outRabund);
310 openOutputFile(fileroot+ tag + ".list", outList);
312 outputNames.push_back(fileroot+ tag + ".sabund");
313 outputNames.push_back(fileroot+ tag + ".rabund");
314 outputNames.push_back(fileroot+ tag + ".list");
317 ListVector* listSingle = NULL;
318 if (singleton != "none") {
320 openInputFile(singleton, in);
322 string firstCol, secondCol;
323 listSingle = new ListVector();
325 in >> firstCol >> secondCol; gobble(in);
326 listSingle->push_back(secondCol);
331 vector<float> orderFloat;
333 //go through users set and make them floats so we can sort them
334 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
337 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)){
339 orderFloat.push_back(temp);
340 }else if (*it == "unique") { orderFloat.push_back(-1.0); }
342 userLabels.erase(*it);
348 sort(orderFloat.begin(), orderFloat.end());
350 vector<InputData*> inputs;
351 vector<string> lastLabels;
352 for (int i = 0; i < listNames.size(); i++) {
353 InputData* input = new InputData(listNames[i], "list");
354 inputs.push_back(input);
357 openInputFile(listNames[i], in);
358 ListVector tempList(in);
359 lastLabels.push_back(tempList.getLabel());
363 ListVector* merged = NULL;
365 //for each label needed
366 for(int l = 0; l < orderFloat.size(); l++){
369 if (orderFloat[l] == -1) { thisLabel = "unique"; }
370 else { thisLabel = toString(orderFloat[l], length-1); }
372 //get the list info from each file
373 for (int k = 0; k < listNames.size(); k++) {
375 ListVector* list = inputs[k]->getListVector();
377 //this file has reached the end
378 if (list == NULL) { list = inputs[k]->getListVector(lastLabels[k], true); }
381 if (list->getLabel() == "unique") { labelFloat = -1.0; }
382 else { convert(list->getLabel(), labelFloat); }
384 //check for missing labels
385 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
386 //if its bigger get last label, otherwise keep it
388 list = inputs[k]->getListVector(lastLabels[k], true); //get last list vector to use, you actually want to move back in the file
390 lastLabels[k] = list->getLabel();
392 //is this the first file
393 if (merged == NULL) { merged = new ListVector(); merged->setLabel(thisLabel); }
395 for (int j = 0; j < list->getNumBins(); j++) {
396 merged->push_back(list->get(j));
403 for (int j = 0; j < listSingle->getNumBins(); j++) {
404 merged->push_back(listSingle->get(j));
410 delete merged; merged = NULL;
413 if (listSingle != NULL) { delete listSingle; remove(singleton.c_str()); }
415 for (int i = 0; i < listNames.size(); i++) { delete inputs[i]; remove(listNames[i].c_str()); }
417 catch(exception& e) {
418 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
422 //**********************************************************************************************************************
424 void ClusterSplitCommand::printData(ListVector* oldList){
426 string label = oldList->getLabel();
427 RAbundVector oldRAbund = oldList->getRAbundVector();
429 oldRAbund.setLabel(label);
430 if (isTrue(showabund)) {
431 oldRAbund.getSAbundVector().print(cout);
433 oldRAbund.print(outRabund);
434 oldRAbund.getSAbundVector().print(outSabund);
436 oldList->print(outList);
438 catch(exception& e) {
439 m->errorOut(e, "ClusterSplitCommand", "printData");
443 //**********************************************************************************************************************
444 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
447 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
452 //loop through and create all the processes you want
453 while (process != processors) {
457 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
461 vector<string> listFileNames = cluster(dividedNames[process], labels);
463 //write out names to file
464 string filename = toString(getpid()) + ".temp";
466 openOutputFile(filename, out);
467 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
472 filename = toString(getpid()) + ".temp.labels";
473 openOutputFile(filename, outLabels);
475 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
476 outLabels << (*it) << endl;
481 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
484 //force parent to wait until all the processes are done
485 for (int i=0;i<processors;i++) {
486 int temp = processIDS[i];
494 catch(exception& e) {
495 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
499 //**********************************************************************************************************************
501 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
504 SparseMatrix* matrix;
507 RAbundVector* rabund;
509 vector<string> listFileNames;
511 //cluster each distance file
512 for (int i = 0; i < distNames.size(); i++) {
514 string thisNamefile = distNames[i].begin()->second;
515 string thisDistFile = distNames[i].begin()->first;
517 //read in distance file
518 globaldata->setNameFile(thisNamefile);
519 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
521 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
522 read->setCutoff(cutoff);
524 NameAssignment* nameMap = new NameAssignment(thisNamefile);
528 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
530 list = read->getListVector();
532 matrix = read->getMatrix();
537 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
539 rabund = new RAbundVector(list->getRAbundVector());
542 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
543 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
544 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
545 tag = cluster->getTag();
547 if (outputDir == "") { outputDir += hasPath(thisDistFile); }
548 fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
551 openOutputFile(fileroot+ tag + ".list", listFile);
553 listFileNames.push_back(fileroot+ tag + ".list");
555 time_t estart = time(NULL);
557 float previousDist = 0.00000;
558 float rndPreviousDist = 0.00000;
564 double saveCutoff = cutoff;
566 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
568 if (m->control_pressed) { //clean up
569 delete matrix; delete list; delete cluster; delete rabund;
571 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
572 listFileNames.clear(); return listFileNames;
575 cluster->update(cutoff);
577 float dist = matrix->getSmallDist();
578 float rndDist = roundDist(dist, precision);
580 if(previousDist <= 0.0000 && dist != previousDist){
581 oldList.setLabel("unique");
582 oldList.print(listFile);
583 if (labels.count("unique") == 0) { labels.insert("unique"); }
585 else if(rndDist != rndPreviousDist){
586 oldList.setLabel(toString(rndPreviousDist, length-1));
587 oldList.print(listFile);
588 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
592 rndPreviousDist = rndDist;
597 if(previousDist <= 0.0000){
598 oldList.setLabel("unique");
599 oldList.print(listFile);
600 if (labels.count("unique") == 0) { labels.insert("unique"); }
602 else if(rndPreviousDist<cutoff){
603 oldList.setLabel(toString(rndPreviousDist, length-1));
604 oldList.print(listFile);
605 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
608 delete matrix; delete list; delete cluster; delete rabund;
611 if (m->control_pressed) { //clean up
612 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
613 listFileNames.clear(); return listFileNames;
616 remove(thisDistFile.c_str());
617 remove(thisNamefile.c_str());
621 return listFileNames;
624 catch(exception& e) {
625 m->errorOut(e, "ClusterSplitCommand", "cluster");
632 //**********************************************************************************************************************