]> git.donarmstrong.com Git - mothur.git/blob - clustercommand.cpp
working on windows paralellization, added trimOligos class to be used by trim.flows...
[mothur.git] / clustercommand.cpp
1 /*
2  *  clustercommand.cpp
3  *  Dotur
4  *
5  *  Created by Sarah Westcott on 1/2/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "clustercommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14
15 //**********************************************************************************************************************
16 vector<string> ClusterCommand::setParameters(){ 
17         try {
18                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
19                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
20                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);              
21                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
22                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
23                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
24                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
25                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
26                 CommandParameter psim("sim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psim);
27                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
28                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
29                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
30                 
31                 vector<string> myArray;
32                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
33                 return myArray;
34         }
35         catch(exception& e) {
36                 m->errorOut(e, "ClusterCommand", "setParameters");
37                 exit(1);
38         }
39 }
40 //**********************************************************************************************************************
41 string ClusterCommand::getHelpString(){ 
42         try {
43                 string helpString = "";
44                 helpString += "The cluster command parameter options are phylip, column, name, method, cuttoff, hard, precision, sim, showabund and timing. Phylip or column and name are required, unless you have a valid current file.\n";
45                 helpString += "The cluster command should be in the following format: \n";
46                 helpString += "cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n";
47                 helpString += "The acceptable cluster methods are furthest, nearest, average and weighted.  If no method is provided then average is assumed.\n";       
48                 return helpString;
49         }
50         catch(exception& e) {
51                 m->errorOut(e, "ClusterCommand", "getHelpString");
52                 exit(1);
53         }
54 }
55 //**********************************************************************************************************************
56 ClusterCommand::ClusterCommand(){       
57         try {
58                 abort = true; calledHelp = true; 
59                 setParameters();
60                 vector<string> tempOutNames;
61                 outputTypes["list"] = tempOutNames;
62                 outputTypes["rabund"] = tempOutNames;
63                 outputTypes["sabund"] = tempOutNames;
64         }
65         catch(exception& e) {
66                 m->errorOut(e, "ClusterCommand", "ClusterCommand");
67                 exit(1);
68         }
69 }
70 //**********************************************************************************************************************
71 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
72 ClusterCommand::ClusterCommand(string option)  {
73         try{
74                 abort = false; calledHelp = false;   
75                 
76                 //allow user to run help
77                 if(option == "help") { help(); abort = true; calledHelp = true; }
78                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
79                 
80                 else {
81                         vector<string> myArray = setParameters();
82                         
83                         OptionParser parser(option);
84                         map<string,string> parameters = parser.getParameters();
85                         map<string,string>::iterator it;
86                         
87                         ValidParameters validParameter;
88                 
89                         //check to make sure all parameters are valid for command
90                         for (it = parameters.begin(); it != parameters.end(); it++) { 
91                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
92                                         abort = true;
93                                 }
94                         }
95                         
96                         //initialize outputTypes
97                         vector<string> tempOutNames;
98                         outputTypes["list"] = tempOutNames;
99                         outputTypes["rabund"] = tempOutNames;
100                         outputTypes["sabund"] = tempOutNames;
101                 
102                         //if the user changes the output directory command factory will send this info to us in the output parameter 
103                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
104                         
105                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
106                         if (inputDir == "not found"){   inputDir = "";          }
107                         else {
108                                 string path;
109                                 it = parameters.find("phylip");
110                                 //user has given a template file
111                                 if(it != parameters.end()){ 
112                                         path = m->hasPath(it->second);
113                                         //if the user has not given a path then, add inputdir. else leave path alone.
114                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
115                                 }
116                                 
117                                 it = parameters.find("column");
118                                 //user has given a template file
119                                 if(it != parameters.end()){ 
120                                         path = m->hasPath(it->second);
121                                         //if the user has not given a path then, add inputdir. else leave path alone.
122                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
123                                 }
124                                 
125                                 it = parameters.find("name");
126                                 //user has given a template file
127                                 if(it != parameters.end()){ 
128                                         path = m->hasPath(it->second);
129                                         //if the user has not given a path then, add inputdir. else leave path alone.
130                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
131                                 }
132                         }
133                         
134                         //check for required parameters
135                         phylipfile = validParameter.validFile(parameters, "phylip", true);
136                         if (phylipfile == "not open") { phylipfile = ""; abort = true; }
137                         else if (phylipfile == "not found") { phylipfile = ""; }        
138                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
139                         
140                         columnfile = validParameter.validFile(parameters, "column", true);
141                         if (columnfile == "not open") { columnfile = ""; abort = true; }        
142                         else if (columnfile == "not found") { columnfile = ""; }
143                         else {  distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
144                         
145                         namefile = validParameter.validFile(parameters, "name", true);
146                         if (namefile == "not open") { abort = true; }   
147                         else if (namefile == "not found") { namefile = ""; }
148                         else { m->setNameFile(namefile); }
149                         
150                         if ((phylipfile == "") && (columnfile == "")) { 
151                                 //is there are current file available for either of these?
152                                 //give priority to column, then phylip
153                                 columnfile = m->getColumnFile(); 
154                                 if (columnfile != "") {  distfile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
155                                 else { 
156                                         phylipfile = m->getPhylipFile(); 
157                                         if (phylipfile != "") { distfile = phylipfile;  format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
158                                         else { 
159                                                 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the cluster command."); m->mothurOutEndLine(); 
160                                                 abort = true;
161                                         }
162                                 }
163                         }
164                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
165                         
166                         if (columnfile != "") {
167                                 if (namefile == "") { 
168                                         namefile = m->getNameFile(); 
169                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
170                                         else { 
171                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
172                                                 abort = true; 
173                                         }       
174                                 }
175                         }
176                         
177                         //check for optional parameter and set defaults
178                         // ...at some point should added some additional type checking...
179                         //get user cutoff and precision or use defaults
180                         string temp;
181                         temp = validParameter.validFile(parameters, "precision", false);
182                         if (temp == "not found") { temp = "100"; }
183                         //saves precision legnth for formatting below
184                         length = temp.length();
185                         convert(temp, precision); 
186                         
187                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
188                         hard = m->isTrue(temp);
189                         
190                         temp = validParameter.validFile(parameters, "sim", false);                              if (temp == "not found") { temp = "F"; }
191                         sim = m->isTrue(temp); 
192                         
193                         temp = validParameter.validFile(parameters, "cutoff", false);
194                         if (temp == "not found") { temp = "10"; }
195                         convert(temp, cutoff); 
196                         cutoff += (5 / (precision * 10.0));  
197                         
198                         method = validParameter.validFile(parameters, "method", false);
199                         if (method == "not found") { method = "average"; }
200                         
201                         if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted")) { }
202                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest, average, and weighted."); m->mothurOutEndLine(); abort = true; }
203
204                         showabund = validParameter.validFile(parameters, "showabund", false);
205                         if (showabund == "not found") { showabund = "T"; }
206
207                         timing = validParameter.validFile(parameters, "timing", false);
208                         if (timing == "not found") { timing = "F"; }
209                         
210                         }
211         }
212         catch(exception& e) {
213                 m->errorOut(e, "ClusterCommand", "ClusterCommand");
214                 exit(1);
215         }
216 }
217 //**********************************************************************************************************************
218 ClusterCommand::~ClusterCommand(){}
219 //**********************************************************************************************************************
220
221 int ClusterCommand::execute(){
222         try {
223         
224                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
225                 
226                 ReadMatrix* read;
227                 if (format == "column") { read = new ReadColumnMatrix(columnfile, sim); }       //sim indicates whether its a similarity matrix
228                 else if (format == "phylip") { read = new ReadPhylipMatrix(phylipfile, sim); }
229                 
230                 read->setCutoff(cutoff);
231                 
232                 NameAssignment* nameMap = NULL;
233                 if(namefile != ""){     
234                         nameMap = new NameAssignment(namefile);
235                         nameMap->readMap();
236                 }
237                 
238                 read->read(nameMap);
239                 list = read->getListVector();
240                 matrix = read->getMatrix();
241                 rabund = new RAbundVector(list->getRAbundVector());
242                 delete read;
243                 
244                 if (m->control_pressed) { //clean up
245                         delete list; delete matrix; delete rabund;
246                         sabundFile.close();rabundFile.close();listFile.close();
247                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        } outputTypes.clear();
248                         return 0;
249                 }
250                 
251                 //create cluster
252                 if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
253                 else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
254                 else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
255                 else if(method == "weighted"){  cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method);    }
256                 tag = cluster->getTag();
257                 
258                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
259                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
260                 
261                 m->openOutputFile(fileroot+ tag + ".sabund",    sabundFile);
262                 m->openOutputFile(fileroot+ tag + ".rabund",    rabundFile);
263                 m->openOutputFile(fileroot+ tag + ".list",              listFile);
264                 
265                 outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
266                 outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
267                 outputNames.push_back(fileroot+ tag + ".list"); outputTypes["list"].push_back(fileroot+ tag + ".list");
268                 
269                 
270                 time_t estart = time(NULL);
271                 float previousDist = 0.00000;
272                 float rndPreviousDist = 0.00000;
273                 oldRAbund = *rabund;
274                 oldList = *list;
275
276                 print_start = true;
277                 start = time(NULL);
278                 loops = 0;
279                 double saveCutoff = cutoff;
280                 
281                 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
282                 
283                         if (m->control_pressed) { //clean up
284                                 delete list; delete matrix; delete rabund; delete cluster;
285                                 sabundFile.close();rabundFile.close();listFile.close();
286                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        } outputTypes.clear();
287                                 return 0;
288                         }
289                 
290                         if (print_start && m->isTrue(timing)) {
291                                 m->mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/" 
292                                         + toString(m->roundDist(matrix->getSmallDist(), precision)) 
293                                         + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
294                                 cout.flush();
295                                 print_start = false;
296                         }
297
298                         loops++;
299
300                         cluster->update(cutoff);
301         
302                         float dist = matrix->getSmallDist();
303                         float rndDist;
304                         if (hard) {
305                                 rndDist = m->ceilDist(dist, precision); 
306                         }else{
307                                 rndDist = m->roundDist(dist, precision); 
308                         }
309
310                         if(previousDist <= 0.0000 && dist != previousDist){
311                                 printData("unique");
312                         }
313                         else if(rndDist != rndPreviousDist){
314                                 printData(toString(rndPreviousDist,  length-1));
315                         }
316                 
317                         previousDist = dist;
318                         rndPreviousDist = rndDist;
319                         oldRAbund = *rabund;
320                         oldList = *list;
321                 }
322
323                 if (print_start && m->isTrue(timing)) {
324                         m->mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist) 
325                                          + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
326                         cout.flush();
327                         print_start = false;
328                 }
329                 
330                 if(previousDist <= 0.0000){
331                         printData("unique");
332                 }
333                 else if(rndPreviousDist<cutoff){
334                         printData(toString(rndPreviousDist, length-1));
335                 }
336                 
337                 delete matrix;
338                 delete list;
339                 delete rabund;
340                 delete cluster;
341                 
342                 sabundFile.close();
343                 rabundFile.close();
344                 listFile.close();
345         
346                 if (saveCutoff != cutoff) { 
347                         if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
348                         else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
349
350                         m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); 
351                 }
352                 
353                 //set list file as new current listfile
354                 string current = "";
355                 itTypes = outputTypes.find("list");
356                 if (itTypes != outputTypes.end()) {
357                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
358                 }
359                 
360                 //set rabund file as new current rabundfile
361                 itTypes = outputTypes.find("rabund");
362                 if (itTypes != outputTypes.end()) {
363                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
364                 }
365                 
366                 //set sabund file as new current sabundfile
367                 itTypes = outputTypes.find("sabund");
368                 if (itTypes != outputTypes.end()) {
369                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
370                 }
371                 
372                 m->mothurOutEndLine();
373                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
374                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
375                 m->mothurOutEndLine();
376
377                 
378                 //if (m->isTrue(timing)) {
379                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
380                 //}
381                 
382                 
383                 return 0;
384         }
385         catch(exception& e) {
386                 m->errorOut(e, "ClusterCommand", "execute");
387                 exit(1);
388         }
389 }
390
391 //**********************************************************************************************************************
392
393 void ClusterCommand::printData(string label){
394         try {
395                 if (m->isTrue(timing)) {
396                         m->mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins()) 
397                      + "\tclusters. Updates: " + toString(loops)); m->mothurOutEndLine();
398                 }
399                 print_start = true;
400                 loops = 0;
401                 start = time(NULL);
402
403                 oldRAbund.setLabel(label);
404                 if (m->isTrue(showabund)) {
405                         oldRAbund.getSAbundVector().print(cout);
406                 }
407                 oldRAbund.print(rabundFile);
408                 oldRAbund.getSAbundVector().print(sabundFile);
409         
410                 oldList.setLabel(label);
411                 oldList.print(listFile);
412         }
413         catch(exception& e) {
414                 m->errorOut(e, "ClusterCommand", "printData");
415                 exit(1);
416         }
417
418
419 }
420 //**********************************************************************************************************************