2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
18 //**********************************************************************************************************************
19 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
20 ClusterSplitCommand::ClusterSplitCommand(string option) {
22 globaldata = GlobalData::getInstance();
25 //allow user to run help
26 if(option == "help") { help(); abort = true; }
29 //valid paramters for this command
30 string Array[] = {"phylip","column","name","cutoff","precision","method","showabund","timing","hard","processors","splitcutoff","outputdir","inputdir"};
31 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33 OptionParser parser(option);
34 map<string,string> parameters = parser.getParameters();
36 ValidParameters validParameter;
38 //check to make sure all parameters are valid for command
39 map<string,string>::iterator it;
40 for (it = parameters.begin(); it != parameters.end(); it++) {
41 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
46 globaldata->newRead();
48 //if the user changes the output directory command factory will send this info to us in the output parameter
49 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
51 //if the user changes the input directory command factory will send this info to us in the output parameter
52 string inputDir = validParameter.validFile(parameters, "inputdir", false);
53 if (inputDir == "not found"){ inputDir = ""; }
56 it = parameters.find("phylip");
57 //user has given a template file
58 if(it != parameters.end()){
59 path = hasPath(it->second);
60 //if the user has not given a path then, add inputdir. else leave path alone.
61 if (path == "") { parameters["phylip"] = inputDir + it->second; }
64 it = parameters.find("column");
65 //user has given a template file
66 if(it != parameters.end()){
67 path = hasPath(it->second);
68 //if the user has not given a path then, add inputdir. else leave path alone.
69 if (path == "") { parameters["column"] = inputDir + it->second; }
72 it = parameters.find("name");
73 //user has given a template file
74 if(it != parameters.end()){
75 path = hasPath(it->second);
76 //if the user has not given a path then, add inputdir. else leave path alone.
77 if (path == "") { parameters["name"] = inputDir + it->second; }
81 //check for required parameters
82 phylipfile = validParameter.validFile(parameters, "phylip", true);
83 if (phylipfile == "not open") { abort = true; }
84 else if (phylipfile == "not found") { phylipfile = ""; }
85 else { distfile = phylipfile; format = "phylip"; }
87 columnfile = validParameter.validFile(parameters, "column", true);
88 if (columnfile == "not open") { abort = true; }
89 else if (columnfile == "not found") { columnfile = ""; }
90 else { distfile = columnfile; format = "column"; }
92 namefile = validParameter.validFile(parameters, "name", true);
93 if (namefile == "not open") { abort = true; }
94 else if (namefile == "not found") { namefile = ""; }
96 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a hcluster command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
97 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a hcluster command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
99 if (columnfile != "") {
100 if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
103 //check for optional parameter and set defaults
104 // ...at some point should added some additional type checking...
105 //get user cutoff and precision or use defaults
107 temp = validParameter.validFile(parameters, "precision", false);
108 if (temp == "not found") { temp = "100"; }
109 //saves precision legnth for formatting below
110 length = temp.length();
111 convert(temp, precision);
113 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
116 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
117 convert(temp, processors);
119 temp = validParameter.validFile(parameters, "cutoff", false);
120 if (temp == "not found") { temp = "10"; }
121 convert(temp, cutoff);
122 if (!hard) { cutoff += (5 / (precision * 10.0)); }
124 temp = validParameter.validFile(parameters, "splitcutoff", false);
125 if (temp == "not found") { temp = "0.10"; }
126 convert(temp, splitcutoff);
127 if (!hard) { splitcutoff += (5 / (precision * 10.0)); }
129 method = validParameter.validFile(parameters, "method", false);
130 if (method == "not found") { method = "furthest"; }
132 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
133 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
135 showabund = validParameter.validFile(parameters, "showabund", false);
136 if (showabund == "not found") { showabund = "T"; }
138 timing = validParameter.validFile(parameters, "timing", false);
139 if (timing == "not found") { timing = "F"; }
143 catch(exception& e) {
144 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
149 //**********************************************************************************************************************
151 void ClusterSplitCommand::help(){
153 m->mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
154 m->mothurOut("The cluster command parameter options are method, cuttoff, hard, precision, showabund and timing. No parameters are required.\n");
155 m->mothurOut("The cluster command should be in the following format: \n");
156 m->mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
157 m->mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
159 catch(exception& e) {
160 m->errorOut(e, "ClusterSplitCommand", "help");
165 //**********************************************************************************************************************
167 ClusterSplitCommand::~ClusterSplitCommand(){}
169 //**********************************************************************************************************************
171 int ClusterSplitCommand::execute(){
174 if (abort == true) { return 0; }
176 //****************** file prep work ******************************//
178 //if user gave a phylip file convert to column file
179 if (format == "phylip") {
181 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
183 NameAssignment* nameMap = NULL;
184 convert->setFormat("phylip");
185 convert->read(nameMap);
187 if (m->control_pressed) { delete convert; return 0; }
189 distfile = convert->getOutputFile();
191 //if no names file given with phylip file, create it
192 ListVector* listToMakeNameFile = convert->getListVector();
193 if (namefile == "") { //you need to make a namefile for split matrix
195 namefile = phylipfile + ".names";
196 openOutputFile(namefile, out);
197 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
198 string bin = listToMakeNameFile->get(i);
199 out << bin << '\t' << bin << endl;
203 delete listToMakeNameFile;
206 if (m->control_pressed) { return 0; }
208 time_t estart = time(NULL);
210 //split matrix into non-overlapping groups
211 SplitMatrix* split = new SplitMatrix(distfile, namefile, splitcutoff);
214 if (m->control_pressed) { delete split; return 0; }
216 string singletonName = split->getSingletonNames();
217 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
220 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
223 if (m->control_pressed) { return 0; }
225 //****************** break up files between processes and cluster each file set ******************************//
226 vector<string> listFileNames;
228 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
230 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
232 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
233 dividedNames.resize(processors);
235 //for each file group figure out which process will complete it
236 //want to divide the load intelligently so the big files are spread between processes
238 for (int i = 0; i < distName.size(); i++) {
239 int processToAssign = (i+1) % processors;
240 if (processToAssign == 0) { processToAssign = processors; }
242 dividedNames[(processToAssign-1)].push_back(distName[i]);
245 //not lets reverse the order of ever other process, so we balance big files running with little ones
246 for (int i = 0; i < processors; i++) {
247 int remainder = ((i+1) % processors);
248 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
251 createProcesses(dividedNames);
253 if (m->control_pressed) { return 0; }
255 //get list of list file names from each process
256 for(int i=0;i<processors;i++){
257 string filename = toString(processIDS[i]) + ".temp";
259 openInputFile(filename, in);
263 in >> tempName; gobble(in);
264 listFileNames.push_back(tempName);
267 remove((toString(processIDS[i]) + ".temp").c_str());
270 filename = toString(processIDS[i]) + ".temp.labels";
272 openInputFile(filename, in2);
276 in2 >> tempName; gobble(in);
277 if (labels.count(tempName) == 0) { labels.insert(tempName); }
280 remove((toString(processIDS[i]) + ".temp.labels").c_str());
284 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
287 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
289 //****************** merge list file and create rabund and sabund files ******************************//
291 mergeLists(listFileNames, singletonName, labels);
293 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
295 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
297 m->mothurOutEndLine();
298 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
299 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
300 m->mothurOutEndLine();
304 catch(exception& e) {
305 m->errorOut(e, "ClusterSplitCommand", "execute");
309 //**********************************************************************************************************************
310 int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton, set<string> userLabels){
312 if (outputDir == "") { outputDir += hasPath(distfile); }
313 fileroot = outputDir + getRootName(getSimpleName(distfile));
315 openOutputFile(fileroot+ tag + ".sabund", outSabund);
316 openOutputFile(fileroot+ tag + ".rabund", outRabund);
317 openOutputFile(fileroot+ tag + ".list", outList);
319 outputNames.push_back(fileroot+ tag + ".sabund");
320 outputNames.push_back(fileroot+ tag + ".rabund");
321 outputNames.push_back(fileroot+ tag + ".list");
324 ListVector* listSingle = NULL;
325 if (singleton != "none") {
327 openInputFile(singleton, in);
329 string firstCol, secondCol;
330 listSingle = new ListVector();
332 in >> firstCol >> secondCol; gobble(in);
333 listSingle->push_back(secondCol);
338 vector<float> orderFloat;
340 //go through users set and make them floats so we can sort them
341 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
344 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)){
346 orderFloat.push_back(temp);
347 }else if (*it == "unique") { orderFloat.push_back(-1.0); }
349 userLabels.erase(*it);
355 sort(orderFloat.begin(), orderFloat.end());
357 vector<InputData*> inputs;
358 vector<string> lastLabels;
359 for (int i = 0; i < listNames.size(); i++) {
360 InputData* input = new InputData(listNames[i], "list");
361 inputs.push_back(input);
364 openInputFile(listNames[i], in);
365 ListVector tempList(in);
366 lastLabels.push_back(tempList.getLabel());
370 ListVector* merged = NULL;
372 //for each label needed
373 for(int l = 0; l < orderFloat.size(); l++){
376 if (orderFloat[l] == -1) { thisLabel = "unique"; }
377 else { thisLabel = toString(orderFloat[l], length-1); }
379 //get the list info from each file
380 for (int k = 0; k < listNames.size(); k++) {
382 if (m->control_pressed) {
383 if (listSingle != NULL) { delete listSingle; remove(singleton.c_str()); }
384 for (int i = 0; i < listNames.size(); i++) { delete inputs[i]; remove(listNames[i].c_str()); }
385 delete merged; merged = NULL;
389 ListVector* list = inputs[k]->getListVector();
391 //this file has reached the end
392 if (list == NULL) { list = inputs[k]->getListVector(lastLabels[k], true); }
395 if (list->getLabel() == "unique") { labelFloat = -1.0; }
396 else { convert(list->getLabel(), labelFloat); }
398 //check for missing labels
399 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
400 //if its bigger get last label, otherwise keep it
402 list = inputs[k]->getListVector(lastLabels[k], true); //get last list vector to use, you actually want to move back in the file
404 lastLabels[k] = list->getLabel();
406 //is this the first file
407 if (merged == NULL) { merged = new ListVector(); merged->setLabel(thisLabel); }
409 for (int j = 0; j < list->getNumBins(); j++) {
410 merged->push_back(list->get(j));
417 for (int j = 0; j < listSingle->getNumBins(); j++) {
418 merged->push_back(listSingle->get(j));
424 delete merged; merged = NULL;
427 if (listSingle != NULL) { delete listSingle; remove(singleton.c_str()); }
429 for (int i = 0; i < listNames.size(); i++) { delete inputs[i]; remove(listNames[i].c_str()); }
433 catch(exception& e) {
434 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
438 //**********************************************************************************************************************
440 void ClusterSplitCommand::printData(ListVector* oldList){
442 string label = oldList->getLabel();
443 RAbundVector oldRAbund = oldList->getRAbundVector();
445 oldRAbund.setLabel(label);
446 if (isTrue(showabund)) {
447 oldRAbund.getSAbundVector().print(cout);
449 oldRAbund.print(outRabund);
450 oldRAbund.getSAbundVector().print(outSabund);
452 oldList->print(outList);
454 catch(exception& e) {
455 m->errorOut(e, "ClusterSplitCommand", "printData");
459 //**********************************************************************************************************************
460 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
463 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
468 //loop through and create all the processes you want
469 while (process != processors) {
473 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
477 vector<string> listFileNames = cluster(dividedNames[process], labels);
479 //write out names to file
480 string filename = toString(getpid()) + ".temp";
482 openOutputFile(filename, out);
483 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
488 filename = toString(getpid()) + ".temp.labels";
489 openOutputFile(filename, outLabels);
491 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
492 outLabels << (*it) << endl;
497 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
500 //force parent to wait until all the processes are done
501 for (int i=0;i<processors;i++) {
502 int temp = processIDS[i];
510 catch(exception& e) {
511 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
515 //**********************************************************************************************************************
517 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
520 SparseMatrix* matrix;
523 RAbundVector* rabund;
525 vector<string> listFileNames;
527 //cluster each distance file
528 for (int i = 0; i < distNames.size(); i++) {
530 string thisNamefile = distNames[i].begin()->second;
531 string thisDistFile = distNames[i].begin()->first;
533 //read in distance file
534 globaldata->setNameFile(thisNamefile);
535 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
537 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
538 read->setCutoff(cutoff);
540 NameAssignment* nameMap = new NameAssignment(thisNamefile);
544 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
546 list = read->getListVector();
548 matrix = read->getMatrix();
553 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
555 rabund = new RAbundVector(list->getRAbundVector());
558 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
559 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
560 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
561 tag = cluster->getTag();
563 if (outputDir == "") { outputDir += hasPath(thisDistFile); }
564 fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
567 openOutputFile(fileroot+ tag + ".list", listFile);
569 listFileNames.push_back(fileroot+ tag + ".list");
571 time_t estart = time(NULL);
573 float previousDist = 0.00000;
574 float rndPreviousDist = 0.00000;
580 double saveCutoff = cutoff;
582 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
584 if (m->control_pressed) { //clean up
585 delete matrix; delete list; delete cluster; delete rabund;
587 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
588 listFileNames.clear(); return listFileNames;
591 cluster->update(cutoff);
593 float dist = matrix->getSmallDist();
594 float rndDist = roundDist(dist, precision);
596 if(previousDist <= 0.0000 && dist != previousDist){
597 oldList.setLabel("unique");
598 oldList.print(listFile);
599 if (labels.count("unique") == 0) { labels.insert("unique"); }
601 else if(rndDist != rndPreviousDist){
602 oldList.setLabel(toString(rndPreviousDist, length-1));
603 oldList.print(listFile);
604 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
608 rndPreviousDist = rndDist;
613 if(previousDist <= 0.0000){
614 oldList.setLabel("unique");
615 oldList.print(listFile);
616 if (labels.count("unique") == 0) { labels.insert("unique"); }
618 else if(rndPreviousDist<cutoff){
619 oldList.setLabel(toString(rndPreviousDist, length-1));
620 oldList.print(listFile);
621 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
624 delete matrix; delete list; delete cluster; delete rabund;
627 if (m->control_pressed) { //clean up
628 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
629 listFileNames.clear(); return listFileNames;
632 remove(thisDistFile.c_str());
633 remove(thisNamefile.c_str());
637 return listFileNames;
640 catch(exception& e) {
641 m->errorOut(e, "ClusterSplitCommand", "cluster");
648 //**********************************************************************************************************************