]> git.donarmstrong.com Git - mothur.git/blob - clustercommand.cpp
restructuring random forest implementation
[mothur.git] / clustercommand.cpp
1 /*
2  *  clustercommand.cpp
3  *  Dotur
4  *
5  *  Created by Sarah Westcott on 1/2/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "clustercommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "clusterdoturcommand.h"
15
16
17 //**********************************************************************************************************************
18 vector<string> ClusterCommand::setParameters(){ 
19         try {
20                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
21                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname);
22                 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pcount);
23         CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);              
24                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
25                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
26                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
27                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
28                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
29                 CommandParameter psim("sim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psim);
30                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
31                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
32                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
33                 
34                 vector<string> myArray;
35                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
36                 return myArray;
37         }
38         catch(exception& e) {
39                 m->errorOut(e, "ClusterCommand", "setParameters");
40                 exit(1);
41         }
42 }
43 //**********************************************************************************************************************
44 string ClusterCommand::getHelpString(){ 
45         try {
46                 string helpString = "";
47                 helpString += "The cluster command parameter options are phylip, column, name, count, method, cuttoff, hard, precision, sim, showabund and timing. Phylip or column and name are required, unless you have a valid current file.\n";
48                 helpString += "The cluster command should be in the following format: \n";
49                 helpString += "cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n";
50                 helpString += "The acceptable cluster methods are furthest, nearest, average and weighted.  If no method is provided then average is assumed.\n";       
51                 return helpString;
52         }
53         catch(exception& e) {
54                 m->errorOut(e, "ClusterCommand", "getHelpString");
55                 exit(1);
56         }
57 }
58 //**********************************************************************************************************************
59 string ClusterCommand::getOutputFileNameTag(string type, string inputName=""){  
60         try {
61         string outputFileName = "";
62                 map<string, vector<string> >::iterator it;
63         
64         //is this a type this command creates
65         it = outputTypes.find(type);
66         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
67         else {
68             if (type == "list") {  outputFileName =  "list"; }
69             else if (type == "rabund") {  outputFileName =  "rabund"; }
70             else if (type == "sabund") {  outputFileName =  "sabund"; }
71             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
72         }
73         return outputFileName;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "ClusterCommand", "getOutputFileNameTag");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81 ClusterCommand::ClusterCommand(){       
82         try {
83                 abort = true; calledHelp = true; 
84                 setParameters();
85                 vector<string> tempOutNames;
86                 outputTypes["list"] = tempOutNames;
87                 outputTypes["rabund"] = tempOutNames;
88                 outputTypes["sabund"] = tempOutNames;
89         }
90         catch(exception& e) {
91                 m->errorOut(e, "ClusterCommand", "ClusterCommand");
92                 exit(1);
93         }
94 }
95 //**********************************************************************************************************************
96 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
97 ClusterCommand::ClusterCommand(string option)  {
98         try{
99                 abort = false; calledHelp = false;   
100                 
101                 //allow user to run help
102                 if(option == "help") { help(); abort = true; calledHelp = true; }
103                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
104                 
105                 else {
106                         vector<string> myArray = setParameters();
107                         
108                         OptionParser parser(option);
109                         map<string,string> parameters = parser.getParameters();
110                         map<string,string>::iterator it;
111                         
112                         ValidParameters validParameter;
113                 
114                         //check to make sure all parameters are valid for command
115                         for (it = parameters.begin(); it != parameters.end(); it++) { 
116                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
117                                         abort = true;
118                                 }
119                         }
120                         
121                         //initialize outputTypes
122                         vector<string> tempOutNames;
123                         outputTypes["list"] = tempOutNames;
124                         outputTypes["rabund"] = tempOutNames;
125                         outputTypes["sabund"] = tempOutNames;
126                 
127                         //if the user changes the output directory command factory will send this info to us in the output parameter 
128                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
129                         
130                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
131                         if (inputDir == "not found"){   inputDir = "";          }
132                         else {
133                                 string path;
134                                 it = parameters.find("phylip");
135                                 //user has given a template file
136                                 if(it != parameters.end()){ 
137                                         path = m->hasPath(it->second);
138                                         //if the user has not given a path then, add inputdir. else leave path alone.
139                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
140                                 }
141                                 
142                                 it = parameters.find("column");
143                                 //user has given a template file
144                                 if(it != parameters.end()){ 
145                                         path = m->hasPath(it->second);
146                                         //if the user has not given a path then, add inputdir. else leave path alone.
147                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
148                                 }
149                                 
150                                 it = parameters.find("name");
151                                 //user has given a template file
152                                 if(it != parameters.end()){ 
153                                         path = m->hasPath(it->second);
154                                         //if the user has not given a path then, add inputdir. else leave path alone.
155                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
156                                 }
157                 
158                 it = parameters.find("count");
159                                 //user has given a template file
160                                 if(it != parameters.end()){ 
161                                         path = m->hasPath(it->second);
162                                         //if the user has not given a path then, add inputdir. else leave path alone.
163                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
164                                 }
165                         }
166                         
167                         //check for required parameters
168                         phylipfile = validParameter.validFile(parameters, "phylip", true);
169                         if (phylipfile == "not open") { phylipfile = ""; abort = true; }
170                         else if (phylipfile == "not found") { phylipfile = ""; }        
171                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
172                         
173                         columnfile = validParameter.validFile(parameters, "column", true);
174                         if (columnfile == "not open") { columnfile = ""; abort = true; }        
175                         else if (columnfile == "not found") { columnfile = ""; }
176                         else {  distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
177                         
178                         namefile = validParameter.validFile(parameters, "name", true);
179                         if (namefile == "not open") { abort = true; }   
180                         else if (namefile == "not found") { namefile = ""; }
181                         else { m->setNameFile(namefile); }
182             
183             countfile = validParameter.validFile(parameters, "count", true);
184                         if (countfile == "not open") { abort = true; countfile = ""; }  
185                         else if (countfile == "not found") { countfile = ""; }
186                         else { m->setCountTableFile(countfile); }
187                         
188                         if ((phylipfile == "") && (columnfile == "")) { 
189                                 //is there are current file available for either of these?
190                                 //give priority to column, then phylip
191                                 columnfile = m->getColumnFile(); 
192                                 if (columnfile != "") {  distfile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
193                                 else { 
194                                         phylipfile = m->getPhylipFile(); 
195                                         if (phylipfile != "") { distfile = phylipfile;  format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
196                                         else { 
197                                                 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the cluster command."); m->mothurOutEndLine(); 
198                                                 abort = true;
199                                         }
200                                 }
201                         }
202                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
203                         
204                         if (columnfile != "") {
205                                 if ((namefile == "") && (countfile == "")){ 
206                                         namefile = m->getNameFile(); 
207                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
208                                         else { 
209                                                 countfile = m->getCountTableFile();
210                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
211                         else { 
212                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
213                             abort = true; 
214                         }       
215                                         }       
216                                 }
217                         }
218                         
219             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
220             
221                         //check for optional parameter and set defaults
222                         // ...at some point should added some additional type checking...
223                         //get user cutoff and precision or use defaults
224                         string temp;
225                         temp = validParameter.validFile(parameters, "precision", false);
226                         if (temp == "not found") { temp = "100"; }
227                         //saves precision legnth for formatting below
228                         length = temp.length();
229                         m->mothurConvert(temp, precision); 
230                         
231                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
232                         hard = m->isTrue(temp);
233                         
234                         temp = validParameter.validFile(parameters, "sim", false);                              if (temp == "not found") { temp = "F"; }
235                         sim = m->isTrue(temp); 
236                         
237                         temp = validParameter.validFile(parameters, "cutoff", false);
238                         if (temp == "not found") { temp = "10"; }
239                         m->mothurConvert(temp, cutoff); 
240                         cutoff += (5 / (precision * 10.0));  
241                         
242                         method = validParameter.validFile(parameters, "method", false);
243                         if (method == "not found") { method = "average"; }
244                         
245                         if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted")) { }
246                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest, average, and weighted."); m->mothurOutEndLine(); abort = true; }
247
248                         showabund = validParameter.validFile(parameters, "showabund", false);
249                         if (showabund == "not found") { showabund = "T"; }
250
251                         timing = validParameter.validFile(parameters, "timing", false);
252                         if (timing == "not found") { timing = "F"; }
253                         
254                 }
255         }
256         catch(exception& e) {
257                 m->errorOut(e, "ClusterCommand", "ClusterCommand");
258                 exit(1);
259         }
260 }
261 //**********************************************************************************************************************
262 ClusterCommand::~ClusterCommand(){}
263 //**********************************************************************************************************************
264
265 int ClusterCommand::execute(){
266         try {
267         
268                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
269                 
270                 //phylip file given and cutoff not given - use cluster.classic because it uses less memory and is faster
271                 if ((format == "phylip") && (cutoff > 10.0)) {
272                         m->mothurOutEndLine(); m->mothurOut("You are using a phylip file and no cutoff.  I will run cluster.classic to save memory and time."); m->mothurOutEndLine();
273                         
274                         //run unique.seqs for deconvolute results
275                         string inputString = "phylip=" + distfile;
276                         if (namefile != "") { inputString += ", name=" + namefile; }
277             else if (countfile != "") { inputString += ", count=" + countfile; }
278                         inputString += ", precision=" + toString(precision);
279                         inputString += ", method=" + method;
280                         if (hard)       { inputString += ", hard=T";    }
281                         else            { inputString += ", hard=F";    }
282                         if (sim)        { inputString += ", sim=T";             }
283                         else            { inputString += ", sim=F";             }
284
285                         
286                         m->mothurOutEndLine(); 
287                         m->mothurOut("/------------------------------------------------------------/"); m->mothurOutEndLine(); 
288                         m->mothurOut("Running command: cluster.classic(" + inputString + ")"); m->mothurOutEndLine(); 
289                         
290                         Command* clusterClassicCommand = new ClusterDoturCommand(inputString);
291                         clusterClassicCommand->execute();
292                         delete clusterClassicCommand;
293                         
294                         m->mothurOut("/------------------------------------------------------------/"); m->mothurOutEndLine();  
295
296                         return 0;
297                 }
298                 
299                 ReadMatrix* read;
300                 if (format == "column") { read = new ReadColumnMatrix(columnfile, sim); }       //sim indicates whether its a similarity matrix
301                 else if (format == "phylip") { read = new ReadPhylipMatrix(phylipfile, sim); }
302                 
303                 read->setCutoff(cutoff);
304                 
305                 NameAssignment* nameMap = NULL;
306         CountTable* ct = NULL;
307                 if(namefile != ""){     
308                         nameMap = new NameAssignment(namefile);
309                         nameMap->readMap();
310             read->read(nameMap);
311                 }else if (countfile != "") {
312             ct = new CountTable();
313             ct->readTable(countfile);
314             read->read(ct);
315         }
316                 
317                 list = read->getListVector();
318                 matrix = read->getDMatrix();
319         
320                 if(countfile != "") {
321             rabund = new RAbundVector();
322             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
323             delete ct;
324         }else { rabund = new RAbundVector(list->getRAbundVector()); }
325                 delete read;
326                 
327                 if (m->control_pressed) { //clean up
328                         delete list; delete matrix; delete rabund; if(countfile == ""){rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); }
329                         listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0;
330                 }
331                 
332                 //create cluster
333                 if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
334                 else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
335                 else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
336                 else if(method == "weighted"){  cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method);    }
337                 tag = cluster->getTag();
338                 
339                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
340                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
341                 
342         string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
343         string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
344         string listFileName = fileroot+ tag + ".";
345         if (countfile != "") { listFileName += "unique_"; }
346         listFileName += getOutputFileNameTag("list");
347         
348         if (countfile == "") {
349             m->openOutputFile(sabundFileName,   sabundFile);
350             m->openOutputFile(rabundFileName,   rabundFile);
351             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
352             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
353
354         }
355                 m->openOutputFile(listFileName, listFile);
356         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
357                 
358                 
359                 time_t estart = time(NULL);
360                 float previousDist = 0.00000;
361                 float rndPreviousDist = 0.00000;
362                 oldRAbund = *rabund;
363                 oldList = *list;
364
365                 print_start = true;
366                 start = time(NULL);
367                 loops = 0;
368                 double saveCutoff = cutoff;
369                 
370                 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){  
371                 
372                         if (m->control_pressed) { //clean up
373                                 delete list; delete matrix; delete rabund; delete cluster;
374                                 if(countfile == "") {rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); }
375                 listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0;
376                         }
377                 
378                         if (print_start && m->isTrue(timing)) {
379                                 m->mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/" 
380                                         + toString(m->roundDist(matrix->getSmallDist(), precision)) 
381                                         + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
382                                 cout.flush();
383                                 print_start = false;
384                         }
385
386                         loops++;
387
388                         cluster->update(cutoff);
389             
390             float dist = matrix->getSmallDist();
391                         float rndDist;
392                         if (hard) {
393                                 rndDist = m->ceilDist(dist, precision); 
394                         }else{
395                                 rndDist = m->roundDist(dist, precision); 
396                         }
397
398                         if(previousDist <= 0.0000 && dist != previousDist){
399                                 printData("unique");
400                         }
401                         else if(rndDist != rndPreviousDist){
402                                 printData(toString(rndPreviousDist,  length-1));
403                         }
404                 
405                         previousDist = dist;
406                         rndPreviousDist = rndDist;
407                         oldRAbund = *rabund;
408                         oldList = *list;
409                 }
410
411                 if (print_start && m->isTrue(timing)) {
412                         m->mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist) 
413                                          + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
414                         cout.flush();
415                         print_start = false;
416                 }
417                 
418                 if(previousDist <= 0.0000){
419                         printData("unique");
420                 }
421                 else if(rndPreviousDist<cutoff){
422                         printData(toString(rndPreviousDist, length-1));
423                 }
424                 
425                 delete matrix;
426                 delete list;
427                 delete rabund;
428                 delete cluster;
429         if (countfile == "") {
430             sabundFile.close();
431             rabundFile.close();
432         }
433                 listFile.close();
434         
435                 if (saveCutoff != cutoff) { 
436                         if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
437                         else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
438
439                         m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); 
440                 }
441                 
442                 //set list file as new current listfile
443                 string current = "";
444                 itTypes = outputTypes.find("list");
445                 if (itTypes != outputTypes.end()) {
446                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
447                 }
448                 
449                 //set rabund file as new current rabundfile
450                 itTypes = outputTypes.find("rabund");
451                 if (itTypes != outputTypes.end()) {
452                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
453                 }
454                 
455                 //set sabund file as new current sabundfile
456                 itTypes = outputTypes.find("sabund");
457                 if (itTypes != outputTypes.end()) {
458                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
459                 }
460                 
461                 m->mothurOutEndLine();
462                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
463                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
464                 m->mothurOutEndLine();
465
466                 
467                 //if (m->isTrue(timing)) {
468                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
469                 //}
470                 
471                 
472                 return 0;
473         }
474         catch(exception& e) {
475                 m->errorOut(e, "ClusterCommand", "execute");
476                 exit(1);
477         }
478 }
479
480 //**********************************************************************************************************************
481
482 void ClusterCommand::printData(string label){
483         try {
484                 if (m->isTrue(timing)) {
485                         m->mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins()) 
486                      + "\tclusters. Updates: " + toString(loops)); m->mothurOutEndLine();
487                 }
488                 print_start = true;
489                 loops = 0;
490                 start = time(NULL);
491         
492         oldRAbund.setLabel(label);
493         if (countfile == "") {
494             oldRAbund.print(rabundFile);
495             oldRAbund.getSAbundVector().print(sabundFile);
496         }
497        
498         if (m->isTrue(showabund)) {
499             oldRAbund.getSAbundVector().print(cout);
500         }
501         
502                 oldList.setLabel(label);
503                 oldList.print(listFile);
504         }
505         catch(exception& e) {
506                 m->errorOut(e, "ClusterCommand", "printData");
507                 exit(1);
508         }
509
510
511 }
512 //**********************************************************************************************************************
513
514 int ClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
515     try {
516         rabund->setLabel(list->getLabel());        
517         for(int i = 0; i < list->getNumBins(); i++) { 
518             if (m->control_pressed) { break; }
519             vector<string> binNames;
520             string bin = list->get(i);
521             m->splitAtComma(bin, binNames);
522             int total = 0;
523             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
524             rabund->push_back(total);   
525         }
526         return 0;
527     }
528     catch(exception& e) {
529                 m->errorOut(e, "ClusterCommand", "createRabund");
530                 exit(1);
531         }
532     
533 }
534 //**********************************************************************************************************************