2 * clustersplitcommand.cpp
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "clustersplitcommand.h"
11 #include "readcluster.h"
12 #include "splitmatrix.h"
13 #include "readphylip.h"
14 #include "readcolumn.h"
15 #include "readmatrix.hpp"
16 #include "inputdata.h"
18 //**********************************************************************************************************************
19 //Constructor: parses the option string, checks the cluster.split parameters for errors and stores the settings; the clustering itself happens in execute().
// Sets abort = true when help was requested, when validFile() reports "not open"
// for an input file, or when a required/valid parameter combination is missing.
// @param option  raw option string; the literal "help" prints the usage text.
20 ClusterSplitCommand::ClusterSplitCommand(string option) {
22 globaldata = GlobalData::getInstance();
25 //allow user to run help
26 if(option == "help") { help(); abort = true; }
29 //valid parameters for this command
30 string Array[] = {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","showabund","timing","hard","processors","outputdir","inputdir"};
31 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33 OptionParser parser(option);
34 map<string,string> parameters = parser.getParameters();
36 ValidParameters validParameter;
38 //check to make sure all parameters are valid for command
39 map<string,string>::iterator it;
40 for (it = parameters.begin(); it != parameters.end(); it++) {
41 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
46 globaldata->newRead();
48 //if the user changes the output directory command factory will send this info to us in the output parameter
49 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
51 //if the user changes the input directory command factory will send this info to us in the inputdir parameter
52 string inputDir = validParameter.validFile(parameters, "inputdir", false);
53 if (inputDir == "not found"){ inputDir = ""; }
// For each file parameter below: when the value carries no path of its own,
// the parameter is rewritten as inputDir + value.
56 it = parameters.find("phylip");
57 //user has given a phylip file
58 if(it != parameters.end()){
59 path = hasPath(it->second);
60 //if the user has not given a path then, add inputdir. else leave path alone.
61 if (path == "") { parameters["phylip"] = inputDir + it->second; }
64 it = parameters.find("column");
65 //user has given a column file
66 if(it != parameters.end()){
67 path = hasPath(it->second);
68 //if the user has not given a path then, add inputdir. else leave path alone.
69 if (path == "") { parameters["column"] = inputDir + it->second; }
72 it = parameters.find("name");
73 //user has given a name file
74 if(it != parameters.end()){
75 path = hasPath(it->second);
76 //if the user has not given a path then, add inputdir. else leave path alone.
77 if (path == "") { parameters["name"] = inputDir + it->second; }
80 it = parameters.find("taxonomy");
81 //user has given a taxonomy file
82 if(it != parameters.end()){
83 path = hasPath(it->second);
84 //if the user has not given a path then, add inputdir. else leave path alone.
85 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
89 //check for required parameters
// For each file: "not open" aborts the command, "not found" is stored as an
// empty string. A phylip or column file that is found also becomes distfile
// and fixes the value of format.
90 phylipfile = validParameter.validFile(parameters, "phylip", true);
91 if (phylipfile == "not open") { abort = true; }
92 else if (phylipfile == "not found") { phylipfile = ""; }
93 else { distfile = phylipfile; format = "phylip"; }
95 columnfile = validParameter.validFile(parameters, "column", true);
96 if (columnfile == "not open") { abort = true; }
97 else if (columnfile == "not found") { columnfile = ""; }
98 else { distfile = columnfile; format = "column"; }
100 namefile = validParameter.validFile(parameters, "name", true);
101 if (namefile == "not open") { abort = true; }
102 else if (namefile == "not found") { namefile = ""; }
104 taxFile = validParameter.validFile(parameters, "taxonomy", true);
105 if (taxFile == "not open") { abort = true; }
106 else if (taxFile == "not found") { taxFile = ""; }
// Exactly one of phylip / column must be supplied.
108 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
109 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
// The column format additionally requires a name file.
111 if (columnfile != "") {
112 if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
115 //check for optional parameter and set defaults
116 // ...at some point should added some additional type checking...
117 //get user cutoff and precision or use defaults
119 temp = validParameter.validFile(parameters, "precision", false);
120 if (temp == "not found") { temp = "100"; }
121 //saves precision length for formatting below
// length is later passed (as length-1) to toString() when labels are built.
122 length = temp.length();
123 convert(temp, precision);
// NOTE(review): only the default "F" for hard is visible here; where temp is
// converted into a setting is not shown in this listing - confirm.
125 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
128 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
129 convert(temp, processors);
131 splitmethod = validParameter.validFile(parameters, "splitmethod", false); if (splitmethod == "not found") { splitmethod = "distance"; }
133 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
134 convert(temp, cutoff);
// Pads the cutoff by 5/(precision*10), i.e. half of one unit at the chosen
// precision - presumably so distances that round to the cutoff are kept; confirm.
135 cutoff += (5 / (precision * 10.0));
137 temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
138 convert(temp, taxLevelCutoff);
140 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
// Reject anything other than the three supported clustering methods.
142 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
143 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
// Reject anything other than the two supported split methods.
145 if ((splitmethod == "distance") || (splitmethod == "classify")) { }
146 else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance or classify."); m->mothurOutEndLine(); abort = true; }
// splitmethod=classify cannot work without a taxonomy file.
148 if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
150 showabund = validParameter.validFile(parameters, "showabund", false);
151 if (showabund == "not found") { showabund = "T"; }
153 timing = validParameter.validFile(parameters, "timing", false);
154 if (timing == "not found") { timing = "F"; }
158 catch(exception& e) {
159 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
164 //**********************************************************************************************************************
// Prints the usage text for cluster.split through m->mothurOut(): the list of
// parameters, a description and default for each, the call format and an example.
// NOTE(review): the constructor also accepts "outputdir" and "inputdir", which
// are not mentioned in this text.
166 void ClusterSplitCommand::help(){
168 m->mothurOut("The cluster.split command parameter options are phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, processors. Phylip or column and name are required.\n");
169 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
170 m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
171 m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
172 m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
173 m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
174 m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance or classify. \n");
175 m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
176 m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
177 m->mothurOut("The cluster.split command should be in the following format: \n");
178 m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
179 m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
182 catch(exception& e) {
183 m->errorOut(e, "ClusterSplitCommand", "help");
188 //**********************************************************************************************************************
// Destructor; the body is empty, so nothing is released here.
190 ClusterSplitCommand::~ClusterSplitCommand(){}
192 //**********************************************************************************************************************
// Runs the cluster.split command:
//   1. a phylip distance file is converted through ReadCluster, and a name file
//      is generated when none was supplied;
//   2. the distance file is split into groups with SplitMatrix;
//   3. each group is clustered, either directly through cluster() or through
//      createProcesses() (the conditions choosing between the two are not fully
//      visible in this listing);
//   4. the per-group list files are completed and merged into the final
//      sabund/rabund/list output files.
// Returns 0 immediately when abort is set, and 0 whenever m->control_pressed is
// observed between stages.
194 int ClusterSplitCommand::execute(){
197 if (abort == true) { return 0; }
199 //****************** file prep work ******************************//
201 //if user gave a phylip file convert to column file
202 if (format == "phylip") {
204 ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
206 NameAssignment* nameMap = NULL;
207 convert->setFormat("phylip");
208 convert->read(nameMap);
210 if (m->control_pressed) { delete convert; return 0; }
// From here on distfile refers to the file ReadCluster produced.
212 distfile = convert->getOutputFile();
214 //if no names file given with phylip file, create it
215 ListVector* listToMakeNameFile = convert->getListVector();
216 if (namefile == "") { //you need to make a namefile for split matrix
// Each bin is written as both columns of the generated name file.
218 namefile = phylipfile + ".names";
219 openOutputFile(namefile, out);
220 for (int i = 0; i < listToMakeNameFile->getNumBins(); i++) {
221 string bin = listToMakeNameFile->get(i);
222 out << bin << '\t' << bin << endl;
226 delete listToMakeNameFile;
229 if (m->control_pressed) { return 0; }
// estart is the reference point for both timing messages printed below.
231 time_t estart = time(NULL);
233 //split matrix into non-overlapping groups
// The fourth SplitMatrix argument is the distance cutoff for
// splitmethod=distance and the taxonomy level otherwise.
235 if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod); }
236 else { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod); }
240 if (m->control_pressed) { delete split; return 0; }
242 string singletonName = split->getSingletonNames();
243 vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
246 if (m->control_pressed) { return 0; }
248 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
251 //****************** break up files between processes and cluster each file set ******************************//
252 vector<string> listFileNames;
254 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
256 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
258 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
259 dividedNames.resize(processors);
261 //for each file group figure out which process will complete it
262 //want to divide the load intelligently so the big files are spread between processes
// Round-robin: file i ends up in dividedNames[i % processors].
264 for (int i = 0; i < distName.size(); i++) {
265 int processToAssign = (i+1) % processors;
266 if (processToAssign == 0) { processToAssign = processors; }
268 dividedNames[(processToAssign-1)].push_back(distName[i]);
271 //now let's reverse the order of every other process, so we balance big files running with little ones
// NOTE(review): (i+1) % processors is non-zero for every i except
// processors-1, so as written every list but the last one is reversed, not
// every other one - confirm whether "% 2" was intended.
272 for (int i = 0; i < processors; i++) {
273 int remainder = ((i+1) % processors);
274 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
277 createProcesses(dividedNames);
279 if (m->control_pressed) { return 0; }
281 //get list of list file names from each process
// Each process left its results in <pid>.temp (list file names) and
// <pid>.temp.labels (labels); both files are removed after being read.
282 for(int i=0;i<processors;i++){
283 string filename = toString(processIDS[i]) + ".temp";
285 openInputFile(filename, in);
289 in >> tempName; gobble(in);
290 listFileNames.push_back(tempName);
293 remove((toString(processIDS[i]) + ".temp").c_str());
296 filename = toString(processIDS[i]) + ".temp.labels";
298 openInputFile(filename, in2);
// NOTE(review): the value is read from in2 but gobble() is called on in -
// this looks like it should be gobble(in2); confirm.
302 in2 >> tempName; gobble(in);
303 if (labels.count(tempName) == 0) { labels.insert(tempName); }
306 remove((toString(processIDS[i]) + ".temp.labels").c_str());
310 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
// On interrupt, delete the list files produced so far.
313 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
315 //****************** merge list file and create rabund and sabund files ******************************//
// listSingle is filled in by completeListFile() (a new ListVector, or NULL
// when there is no singleton file).
316 ListVector* listSingle;
317 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
319 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
// mergeLists() deletes listSingle and removes the per-group list files.
321 mergeLists(listFileNames, labelBins, listSingle);
323 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
325 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
327 m->mothurOutEndLine();
328 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
329 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
330 m->mothurOutEndLine();
334 catch(exception& e) {
335 m->errorOut(e, "ClusterSplitCommand", "execute");
339 //**********************************************************************************************************************
// Loads the singleton names (if any) into listSingle, then rewrites every list
// file in listNames so that it contains an entry for each label in userLabels,
// reusing the previous label's list where a file has no entry of its own.
// @param listNames   per-group list files; each is replaced by its filled-in copy.
// @param singleton   singleton names file, or "none" when there is no such file;
//                    the file is removed once read.
// @param userLabels  labels collected while clustering ("unique" or numeric text).
// @param listSingle  out: a new ListVector holding the second column of every
//                    singleton line, or NULL when singleton == "none".
// @return map from label (as float, "unique" stored as -1) to the number of
//         bins: the singleton bins plus the bins each list file has at that label.
340 map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string> userLabels, ListVector*& listSingle){
343 map<float, int> labelBin;
344 vector<float> orderFloat;
348 if (singleton != "none") {
350 openInputFile(singleton, in);
352 string firstCol, secondCol;
353 listSingle = new ListVector();
// Only the second column of each line becomes a bin.
355 in >> firstCol >> secondCol; gobble(in);
356 listSingle->push_back(secondCol);
359 remove(singleton.c_str());
361 numSingleBins = listSingle->getNumBins();
362 }else{ listSingle = NULL; numSingleBins = 0; }
364 //go through users set and make them floats so we can sort them
365 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
// NOTE(review): temp's declaration is not visible here; a label that is
// neither "unique" nor convertible would leave it unassigned by these two
// branches - confirm how that case is handled.
368 if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
369 else if (*it == "unique") { temp = -1.0; }
371 orderFloat.push_back(temp);
372 labelBin[temp] = numSingleBins; //initialize numbins
// Ascending order; "unique" (-1) sorts before any non-negative distance.
376 sort(orderFloat.begin(), orderFloat.end());
379 //get the list info from each file
380 for (int k = 0; k < listNames.size(); k++) {
// On interrupt: release listSingle and remove all list files.
382 if (m->control_pressed) {
383 if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
384 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
388 InputData* input = new InputData(listNames[k], "list");
389 ListVector* list = input->getListVector();
390 string lastLabel = list->getLabel();
// The completed list is written to a temporary file next to the original.
392 string filledInList = listNames[k] + "filledInTemp";
394 openOutputFile(filledInList, outFilled);
396 //for each label needed
397 for(int l = 0; l < orderFloat.size(); l++){
// Turn the float back into label text; length was derived from the
// precision string in the constructor.
400 if (orderFloat[l] == -1) { thisLabel = "unique"; }
401 else { thisLabel = toString(orderFloat[l], length-1); }
403 //this file has reached the end
405 list = input->getListVector(lastLabel, true);
406 }else{ //do you have the distance, or do you need to fill in
409 if (list->getLabel() == "unique") { labelFloat = -1.0; }
410 else { convert(list->getLabel(), labelFloat); }
412 //check for missing labels
413 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
414 //if its bigger get last label, otherwise keep it
416 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
418 lastLabel = list->getLabel();
// Emit the chosen list under the requested label and count its bins.
422 list->setLabel(thisLabel);
423 list->print(outFilled);
426 labelBin[orderFloat[l]] += list->getNumBins();
430 list = input->getListVector();
433 if (list != NULL) { delete list; }
// Replace the original list file with the filled-in version.
437 remove(listNames[k].c_str());
438 rename(filledInList.c_str(), listNames[k].c_str());
443 catch(exception& e) {
444 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
448 //**********************************************************************************************************************
// Writes the final <fileroot><tag>.sabund, .rabund and .list files. For each
// label in userLabels one list line is produced: the label, its bin count, the
// singleton bins, then the bins every file in listNames has at that label; the
// matching rabund and sabund vectors are printed alongside.
// @param listNames   per-group list files; removed at the end.
// @param userLabels  label (float, -1 meaning "unique") -> total number of bins.
// @param listSingle  singleton bins or NULL; deleted by this function.
// @return 0 when interrupted through m->control_pressed; the normal return is
//         not visible in this listing.
449 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
// When no output directory was given, fall back to the distance file's path;
// note that this modifies outputDir itself.
451 if (outputDir == "") { outputDir += hasPath(distfile); }
452 fileroot = outputDir + getRootName(getSimpleName(distfile));
454 openOutputFile(fileroot+ tag + ".sabund", outSabund);
455 openOutputFile(fileroot+ tag + ".rabund", outRabund);
456 openOutputFile(fileroot+ tag + ".list", outList);
// Record the output files for the summary (and for cleanup on interrupt).
458 outputNames.push_back(fileroot+ tag + ".sabund");
459 outputNames.push_back(fileroot+ tag + ".rabund");
460 outputNames.push_back(fileroot+ tag + ".list");
462 map<float, int>::iterator itLabel;
464 //for each label needed
465 for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
468 if (itLabel->first == -1) { thisLabel = "unique"; }
469 else { thisLabel = toString(itLabel->first, length-1); }
// A list line starts with the label and the total bin count.
471 outList << thisLabel << '\t' << itLabel->second << '\t';
473 RAbundVector* rabund = new RAbundVector();
474 rabund->setLabel(thisLabel);
// Singleton bins come first at every label.
477 if (listSingle != NULL) {
478 for (int j = 0; j < listSingle->getNumBins(); j++) {
479 outList << listSingle->get(j) << '\t';
480 rabund->push_back(getNumNames(listSingle->get(j)));
484 //get the list info from each file
485 for (int k = 0; k < listNames.size(); k++) {
487 if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
489 InputData* input = new InputData(listNames[k], "list");
490 ListVector* list = input->getListVector(thisLabel);
492 //this file has reached the end
493 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
// Append this file's bins to the list line and their sizes to rabund.
495 for (int j = 0; j < list->getNumBins(); j++) {
496 outList << list->get(j) << '\t';
497 rabund->push_back(getNumNames(list->get(j)));
// The sabund is derived from the rabund accumulated for this label.
504 SAbundVector sabund = rabund->getSAbundVector();
506 sabund.print(outSabund);
507 rabund->print(outRabund);
517 if (listSingle != NULL) { delete listSingle; }
519 for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
523 catch(exception& e) {
524 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
529 //**********************************************************************************************************************
// Prints one list vector and the abundance vectors derived from it: the rabund
// to outRabund, the sabund to outSabund and the list itself to outList (the
// declarations of these streams are not visible in this listing). When
// showabund is true the sabund is also printed to cout.
// @param oldList  list vector to print; its label is copied onto the rabund.
531 void ClusterSplitCommand::printData(ListVector* oldList){
533 string label = oldList->getLabel();
534 RAbundVector oldRAbund = oldList->getRAbundVector();
536 oldRAbund.setLabel(label);
537 if (isTrue(showabund)) {
538 oldRAbund.getSAbundVector().print(cout);
540 oldRAbund.print(outRabund);
541 oldRAbund.getSAbundVector().print(outSabund);
543 oldList->print(outList);
545 catch(exception& e) {
546 m->errorOut(e, "ClusterSplitCommand", "printData");
550 //**********************************************************************************************************************
// Distributes the clustering work over `processors` processes (only inside the
// platform #if below). The visible code records each pid in processIDS,
// clusters dividedNames[process], and writes the resulting list file names to
// "<pid>.temp" and the labels to "<pid>.temp.labels"; execute() reads those
// files back. The calls that create and wait for the processes are not visible
// in this listing.
// @param dividedNames  dividedNames[p] holds the (distance file -> name file)
//                      maps assigned to process p.
551 int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
554 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
559 //loop through and create all the processes you want
560 while (process != processors) {
564 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
568 vector<string> listFileNames = cluster(dividedNames[process], labels);
570 //write out names to file
571 string filename = toString(getpid()) + ".temp";
573 openOutputFile(filename, out);
574 for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
// Labels seen by this process go to a second file, one per line.
579 filename = toString(getpid()) + ".temp.labels";
580 openOutputFile(filename, outLabels);
582 for (set<string>::iterator it = labels.begin(); it != labels.end(); it++) {
583 outLabels << (*it) << endl;
// NOTE(review): this failure path exits with status 0, which reports success
// to the caller - confirm whether a non-zero status was intended.
588 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
591 //force parent to wait until all the processes are done
592 for (int i=0;i<processors;i++) {
593 int temp = processIDS[i];
601 catch(exception& e) {
602 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
606 //**********************************************************************************************************************
// Clusters every (distance file, name file) pair in distNames with the linkage
// selected by `method` and writes one <fileroot><tag>.list file per pair. The
// distance and name files are removed once their pair has been clustered.
// @param distNames  one map per group; only the first entry of each map is
//                   used (key = distance file, value = name file).
// @param labels     in/out: every label written to a list file is inserted.
// @return names of the list files written. When m->control_pressed is seen
//         during or after clustering, the list files are removed and an empty
//         vector is returned; when it is seen before clustering of a pair
//         starts, the names collected so far are returned.
608 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
611 SparseMatrix* matrix;
614 RAbundVector* rabund;
616 vector<string> listFileNames;
618 //cluster each distance file
619 for (int i = 0; i < distNames.size(); i++) {
621 string thisNamefile = distNames[i].begin()->second;
622 string thisDistFile = distNames[i].begin()->first;
624 //read in distance file
625 globaldata->setNameFile(thisNamefile);
626 globaldata->setColumnFile(thisDistFile); globaldata->setFormat("column");
628 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
629 read->setCutoff(cutoff);
631 NameAssignment* nameMap = new NameAssignment(thisNamefile);
635 if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
637 list = read->getListVector();
639 matrix = read->getMatrix();
644 m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
646 rabund = new RAbundVector(list->getRAbundVector());
// Pick the linkage implementation; the constructor only lets these three
// method names through.
649 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
650 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
651 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
652 tag = cluster->getTag();
// outputDir is modified here: when it starts out empty it takes the path of
// the first distance file and keeps that value for later iterations.
654 if (outputDir == "") { outputDir += hasPath(thisDistFile); }
655 fileroot = outputDir + getRootName(getSimpleName(thisDistFile));
658 openOutputFile(fileroot+ tag + ".list", listFile);
660 listFileNames.push_back(fileroot+ tag + ".list");
662 time_t estart = time(NULL);
664 float previousDist = 0.00000;
665 float rndPreviousDist = 0.00000;
671 double saveCutoff = cutoff;
// Keep merging while the smallest remaining distance is below the cutoff
// and the matrix still has nodes.
673 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
675 if (m->control_pressed) { //clean up
676 delete matrix; delete list; delete cluster; delete rabund;
// Note: this inner i shadows the outer loop index i.
678 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
679 listFileNames.clear(); return listFileNames;
682 cluster->update(cutoff);
684 float dist = matrix->getSmallDist();
// NOTE(review): which of ceilDist / roundDist applies is decided on lines not
// visible here (presumably the "hard" option) - confirm.
687 rndDist = ceilDist(dist, precision);
689 rndDist = roundDist(dist, precision);
// The first time the distance moves off zero, the previous list is
// written under the label "unique".
692 if(previousDist <= 0.0000 && dist != previousDist){
693 oldList.setLabel("unique");
694 oldList.print(listFile);
695 if (labels.count("unique") == 0) { labels.insert("unique"); }
// When the rounded distance changes, the previous list is written under
// the previous rounded distance.
697 else if(rndDist != rndPreviousDist){
698 oldList.setLabel(toString(rndPreviousDist, length-1));
699 oldList.print(listFile);
700 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
704 rndPreviousDist = rndDist;
// After the loop: write out the last list that has not been printed yet.
709 if(previousDist <= 0.0000){
710 oldList.setLabel("unique");
711 oldList.print(listFile);
712 if (labels.count("unique") == 0) { labels.insert("unique"); }
714 else if(rndPreviousDist<cutoff){
715 oldList.setLabel(toString(rndPreviousDist, length-1));
716 oldList.print(listFile);
717 if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
720 delete matrix; delete list; delete cluster; delete rabund;
723 if (m->control_pressed) { //clean up
724 for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
725 listFileNames.clear(); return listFileNames;
// The split distance and name files are no longer needed once clustered.
728 remove(thisDistFile.c_str());
729 remove(thisNamefile.c_str());
733 return listFileNames;
736 catch(exception& e) {
737 m->errorOut(e, "ClusterSplitCommand", "cluster");
744 //**********************************************************************************************************************