]> git.donarmstrong.com Git - mothur.git/blob - clustercommand.cpp
sffinfo bug with flow grams right index when clipQualRight=0
[mothur.git] / clustercommand.cpp
1 /*
2  *  clustercommand.cpp
3  *  Dotur
4  *
5  *  Created by Sarah Westcott on 1/2/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "clustercommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "clusterdoturcommand.h"
15
16
17 //**********************************************************************************************************************
18 vector<string> ClusterCommand::setParameters(){ 
19         try {
20                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none","list",false,false,true); parameters.push_back(pphylip);
21                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName","rabund-sabund",false,false,true); parameters.push_back(pname);
22                 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pcount);
23         CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);          
24                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "","",false,false,true); parameters.push_back(pcutoff);
25                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
26                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false,true); parameters.push_back(pmethod);
27                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
28                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
29                 CommandParameter psim("sim", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psim);
30                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
31                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
32                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
33                 
34                 vector<string> myArray;
35                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
36                 return myArray;
37         }
38         catch(exception& e) {
39                 m->errorOut(e, "ClusterCommand", "setParameters");
40                 exit(1);
41         }
42 }
43 //**********************************************************************************************************************
44 string ClusterCommand::getHelpString(){ 
45         try {
46                 string helpString = "";
47                 helpString += "The cluster command parameter options are phylip, column, name, count, method, cuttoff, hard, precision, sim, showabund and timing. Phylip or column and name are required, unless you have a valid current file.\n";
48                 helpString += "The cluster command should be in the following format: \n";
49                 helpString += "cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n";
50                 helpString += "The acceptable cluster methods are furthest, nearest, average and weighted.  If no method is provided then average is assumed.\n";       
51                 return helpString;
52         }
53         catch(exception& e) {
54                 m->errorOut(e, "ClusterCommand", "getHelpString");
55                 exit(1);
56         }
57 }
58 //**********************************************************************************************************************
59 string ClusterCommand::getOutputPattern(string type) {
60     try {
61         string pattern = "";
62         
63         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; } 
64         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; } 
65         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
66         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
67         
68         return pattern;
69     }
70     catch(exception& e) {
71         m->errorOut(e, "ClusterCommand", "getOutputPattern");
72         exit(1);
73     }
74 }
75 //**********************************************************************************************************************
76 ClusterCommand::ClusterCommand(){       
77         try {
78                 abort = true; calledHelp = true; 
79                 setParameters();
80                 vector<string> tempOutNames;
81                 outputTypes["list"] = tempOutNames;
82                 outputTypes["rabund"] = tempOutNames;
83                 outputTypes["sabund"] = tempOutNames;
84         }
85         catch(exception& e) {
86                 m->errorOut(e, "ClusterCommand", "ClusterCommand");
87                 exit(1);
88         }
89 }
90 //**********************************************************************************************************************
91 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
92 ClusterCommand::ClusterCommand(string option)  {
93         try{
94                 abort = false; calledHelp = false;   
95                 
96                 //allow user to run help
97                 if(option == "help") { help(); abort = true; calledHelp = true; }
98                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
99                 
100                 else {
101                         vector<string> myArray = setParameters();
102                         
103                         OptionParser parser(option);
104                         map<string,string> parameters = parser.getParameters();
105                         map<string,string>::iterator it;
106                         
107                         ValidParameters validParameter;
108                 
109                         //check to make sure all parameters are valid for command
110                         for (it = parameters.begin(); it != parameters.end(); it++) { 
111                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
112                                         abort = true;
113                                 }
114                         }
115                         
116                         //initialize outputTypes
117                         vector<string> tempOutNames;
118                         outputTypes["list"] = tempOutNames;
119                         outputTypes["rabund"] = tempOutNames;
120                         outputTypes["sabund"] = tempOutNames;
121                 
122                         //if the user changes the output directory command factory will send this info to us in the output parameter 
123                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
124                         
125                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
126                         if (inputDir == "not found"){   inputDir = "";          }
127                         else {
128                                 string path;
129                                 it = parameters.find("phylip");
130                                 //user has given a template file
131                                 if(it != parameters.end()){ 
132                                         path = m->hasPath(it->second);
133                                         //if the user has not given a path then, add inputdir. else leave path alone.
134                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
135                                 }
136                                 
137                                 it = parameters.find("column");
138                                 //user has given a template file
139                                 if(it != parameters.end()){ 
140                                         path = m->hasPath(it->second);
141                                         //if the user has not given a path then, add inputdir. else leave path alone.
142                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
143                                 }
144                                 
145                                 it = parameters.find("name");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
151                                 }
152                 
153                 it = parameters.find("count");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
159                                 }
160                         }
161                         
162                         //check for required parameters
163                         phylipfile = validParameter.validFile(parameters, "phylip", true);
164                         if (phylipfile == "not open") { phylipfile = ""; abort = true; }
165                         else if (phylipfile == "not found") { phylipfile = ""; }        
166                         else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
167                         
168                         columnfile = validParameter.validFile(parameters, "column", true);
169                         if (columnfile == "not open") { columnfile = ""; abort = true; }        
170                         else if (columnfile == "not found") { columnfile = ""; }
171                         else {  distfile = columnfile; format = "column"; m->setColumnFile(columnfile); }
172                         
173                         namefile = validParameter.validFile(parameters, "name", true);
174                         if (namefile == "not open") { abort = true; }   
175                         else if (namefile == "not found") { namefile = ""; }
176                         else { m->setNameFile(namefile); }
177             
178             countfile = validParameter.validFile(parameters, "count", true);
179                         if (countfile == "not open") { abort = true; countfile = ""; }  
180                         else if (countfile == "not found") { countfile = ""; }
181                         else { m->setCountTableFile(countfile); }
182                         
183                         if ((phylipfile == "") && (columnfile == "")) { 
184                                 //is there are current file available for either of these?
185                                 //give priority to column, then phylip
186                                 columnfile = m->getColumnFile(); 
187                                 if (columnfile != "") {  distfile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
188                                 else { 
189                                         phylipfile = m->getPhylipFile(); 
190                                         if (phylipfile != "") { distfile = phylipfile;  format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
191                                         else { 
192                                                 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the cluster command."); m->mothurOutEndLine(); 
193                                                 abort = true;
194                                         }
195                                 }
196                         }
197                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
198                         
199                         if (columnfile != "") {
200                                 if ((namefile == "") && (countfile == "")){ 
201                                         namefile = m->getNameFile(); 
202                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
203                                         else { 
204                                                 countfile = m->getCountTableFile();
205                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
206                         else { 
207                             m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
208                             abort = true; 
209                         }       
210                                         }       
211                                 }
212                         }
213                         
214             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
215             
216                         //check for optional parameter and set defaults
217                         // ...at some point should added some additional type checking...
218                         //get user cutoff and precision or use defaults
219                         string temp;
220                         temp = validParameter.validFile(parameters, "precision", false);
221                         if (temp == "not found") { temp = "100"; }
222                         //saves precision legnth for formatting below
223                         length = temp.length();
224                         m->mothurConvert(temp, precision); 
225                         
226                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
227                         hard = m->isTrue(temp);
228                         
229                         temp = validParameter.validFile(parameters, "sim", false);                              if (temp == "not found") { temp = "F"; }
230                         sim = m->isTrue(temp); 
231                         
232                         temp = validParameter.validFile(parameters, "cutoff", false);
233                         if (temp == "not found") { temp = "10"; }
234                         m->mothurConvert(temp, cutoff); 
235                         cutoff += (5 / (precision * 10.0));  
236                         
237                         method = validParameter.validFile(parameters, "method", false);
238                         if (method == "not found") { method = "average"; }
239                         
240                         if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted")) { }
241                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest, average, and weighted."); m->mothurOutEndLine(); abort = true; }
242
243                         showabund = validParameter.validFile(parameters, "showabund", false);
244                         if (showabund == "not found") { showabund = "T"; }
245
246                         timing = validParameter.validFile(parameters, "timing", false);
247                         if (timing == "not found") { timing = "F"; }
248                         
249                 }
250         }
251         catch(exception& e) {
252                 m->errorOut(e, "ClusterCommand", "ClusterCommand");
253                 exit(1);
254         }
255 }
256 //**********************************************************************************************************************
257 ClusterCommand::~ClusterCommand(){}
258 //**********************************************************************************************************************
259
260 int ClusterCommand::execute(){
261         try {
262         
263                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
264                 
265                 //phylip file given and cutoff not given - use cluster.classic because it uses less memory and is faster
266                 if ((format == "phylip") && (cutoff > 10.0)) {
267                         m->mothurOutEndLine(); m->mothurOut("You are using a phylip file and no cutoff.  I will run cluster.classic to save memory and time."); m->mothurOutEndLine();
268                         
269                         //run unique.seqs for deconvolute results
270                         string inputString = "phylip=" + distfile;
271                         if (namefile != "") { inputString += ", name=" + namefile; }
272             else if (countfile != "") { inputString += ", count=" + countfile; }
273                         inputString += ", precision=" + toString(precision);
274                         inputString += ", method=" + method;
275                         if (hard)       { inputString += ", hard=T";    }
276                         else            { inputString += ", hard=F";    }
277                         if (sim)        { inputString += ", sim=T";             }
278                         else            { inputString += ", sim=F";             }
279
280                         
281                         m->mothurOutEndLine(); 
282                         m->mothurOut("/------------------------------------------------------------/"); m->mothurOutEndLine(); 
283                         m->mothurOut("Running command: cluster.classic(" + inputString + ")"); m->mothurOutEndLine(); 
284                         
285                         Command* clusterClassicCommand = new ClusterDoturCommand(inputString);
286                         clusterClassicCommand->execute();
287                         delete clusterClassicCommand;
288                         
289                         m->mothurOut("/------------------------------------------------------------/"); m->mothurOutEndLine();  
290
291                         return 0;
292                 }
293                 
294                 ReadMatrix* read;
295                 if (format == "column") { read = new ReadColumnMatrix(columnfile, sim); }       //sim indicates whether its a similarity matrix
296                 else if (format == "phylip") { read = new ReadPhylipMatrix(phylipfile, sim); }
297                 
298                 read->setCutoff(cutoff);
299                 
300                 NameAssignment* nameMap = NULL;
301         CountTable* ct = NULL;
302                 if(namefile != ""){     
303                         nameMap = new NameAssignment(namefile);
304                         nameMap->readMap();
305             read->read(nameMap);
306                 }else if (countfile != "") {
307             ct = new CountTable();
308             ct->readTable(countfile);
309             read->read(ct);
310         }else { read->read(nameMap); }
311                 
312                 list = read->getListVector();
313                 matrix = read->getDMatrix();
314         
315                 if(countfile != "") {
316             rabund = new RAbundVector();
317             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
318             delete ct;
319         }else { rabund = new RAbundVector(list->getRAbundVector()); }
320                 delete read;
321                 
322                 if (m->control_pressed) { //clean up
323                         delete list; delete matrix; delete rabund; if(countfile == ""){rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); }
324                         listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0;
325                 }
326                 
327                 //create cluster
328                 if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
329                 else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
330                 else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);     }
331                 else if(method == "weighted"){  cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method);    }
332                 tag = cluster->getTag();
333                 
334                 if (outputDir == "") { outputDir += m->hasPath(distfile); }
335                 fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
336                 
337         map<string, string> variables; 
338         variables["[filename]"] = fileroot;
339         variables["[clustertag]"] = tag;
340         string sabundFileName = getOutputFileName("sabund", variables);
341         string rabundFileName = getOutputFileName("rabund", variables);
342         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
343         string listFileName = getOutputFileName("list", variables);
344         
345         if (countfile == "") {
346             m->openOutputFile(sabundFileName,   sabundFile);
347             m->openOutputFile(rabundFileName,   rabundFile);
348             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
349             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
350
351         }
352                 m->openOutputFile(listFileName, listFile);
353         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
354                 
355                 
356                 time_t estart = time(NULL);
357                 float previousDist = 0.00000;
358                 float rndPreviousDist = 0.00000;
359                 oldRAbund = *rabund;
360                 oldList = *list;
361
362                 print_start = true;
363                 start = time(NULL);
364                 loops = 0;
365                 double saveCutoff = cutoff;
366                 
367                 while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){  
368                 
369                         if (m->control_pressed) { //clean up
370                                 delete list; delete matrix; delete rabund; delete cluster;
371                                 if(countfile == "") {rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); }
372                 listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0;
373                         }
374                 
375                         if (print_start && m->isTrue(timing)) {
376                                 m->mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/" 
377                                         + toString(m->roundDist(matrix->getSmallDist(), precision)) 
378                                         + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
379                                 cout.flush();
380                                 print_start = false;
381                         }
382
383                         loops++;
384
385                         cluster->update(cutoff);
386             
387             float dist = matrix->getSmallDist();
388                         float rndDist;
389                         if (hard) {
390                                 rndDist = m->ceilDist(dist, precision); 
391                         }else{
392                                 rndDist = m->roundDist(dist, precision); 
393                         }
394
395                         if(previousDist <= 0.0000 && dist != previousDist){
396                                 printData("unique");
397                         }
398                         else if(rndDist != rndPreviousDist){
399                                 printData(toString(rndPreviousDist,  length-1));
400                         }
401                 
402                         previousDist = dist;
403                         rndPreviousDist = rndDist;
404                         oldRAbund = *rabund;
405                         oldList = *list;
406                 }
407
408                 if (print_start && m->isTrue(timing)) {
409                         m->mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist) 
410                                          + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
411                         cout.flush();
412                         print_start = false;
413                 }
414                 
415                 if(previousDist <= 0.0000){
416                         printData("unique");
417                 }
418                 else if(rndPreviousDist<cutoff){
419                         printData(toString(rndPreviousDist, length-1));
420                 }
421                 
422                 delete matrix;
423                 delete list;
424                 delete rabund;
425                 delete cluster;
426         if (countfile == "") {
427             sabundFile.close();
428             rabundFile.close();
429         }
430                 listFile.close();
431         
432                 if (saveCutoff != cutoff) { 
433                         if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
434                         else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
435
436                         m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); 
437                 }
438                 
439                 //set list file as new current listfile
440                 string current = "";
441                 itTypes = outputTypes.find("list");
442                 if (itTypes != outputTypes.end()) {
443                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
444                 }
445                 
446                 //set rabund file as new current rabundfile
447                 itTypes = outputTypes.find("rabund");
448                 if (itTypes != outputTypes.end()) {
449                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
450                 }
451                 
452                 //set sabund file as new current sabundfile
453                 itTypes = outputTypes.find("sabund");
454                 if (itTypes != outputTypes.end()) {
455                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
456                 }
457                 
458                 m->mothurOutEndLine();
459                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
460                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
461                 m->mothurOutEndLine();
462
463                 
464                 //if (m->isTrue(timing)) {
465                         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
466                 //}
467                 
468                 
469                 return 0;
470         }
471         catch(exception& e) {
472                 m->errorOut(e, "ClusterCommand", "execute");
473                 exit(1);
474         }
475 }
476
477 //**********************************************************************************************************************
478
479 void ClusterCommand::printData(string label){
480         try {
481                 if (m->isTrue(timing)) {
482                         m->mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins()) 
483                      + "\tclusters. Updates: " + toString(loops)); m->mothurOutEndLine();
484                 }
485                 print_start = true;
486                 loops = 0;
487                 start = time(NULL);
488         
489         oldRAbund.setLabel(label);
490         if (countfile == "") {
491             oldRAbund.print(rabundFile);
492             oldRAbund.getSAbundVector().print(sabundFile);
493         }
494        
495         if (m->isTrue(showabund)) {
496             oldRAbund.getSAbundVector().print(cout);
497         }
498         
499                 oldList.setLabel(label);
500                 oldList.print(listFile);
501         }
502         catch(exception& e) {
503                 m->errorOut(e, "ClusterCommand", "printData");
504                 exit(1);
505         }
506
507
508 }
509 //**********************************************************************************************************************
510
511 int ClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
512     try {
513         rabund->setLabel(list->getLabel());        
514         for(int i = 0; i < list->getNumBins(); i++) { 
515             if (m->control_pressed) { break; }
516             vector<string> binNames;
517             string bin = list->get(i);
518             m->splitAtComma(bin, binNames);
519             int total = 0;
520             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
521             rabund->push_back(total);   
522         }
523         return 0;
524     }
525     catch(exception& e) {
526                 m->errorOut(e, "ClusterCommand", "createRabund");
527                 exit(1);
528         }
529     
530 }
531 //**********************************************************************************************************************