]> git.donarmstrong.com Git - mothur.git/blob - classifyotucommand.cpp
added pipeline commands which involved change to command factory and command class...
[mothur.git] / classifyotucommand.cpp
1 /*
2  *  classifyotucommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12
13 //**********************************************************************************************************************
14 vector<string> ClassifyOtuCommand::getValidParameters(){        
15         try {
16                 string AlignArray[] =  {"list","label","name","taxonomy","cutoff","probs","outputdir","inputdir"};
17                 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
18                 return myArray;
19         }
20         catch(exception& e) {
21                 m->errorOut(e, "ClassifyOtuCommand", "getValidParameters");
22                 exit(1);
23         }
24 }
25 //**********************************************************************************************************************
26 ClassifyOtuCommand::ClassifyOtuCommand(){       
27         try {
28                 //initialize outputTypes
29                 vector<string> tempOutNames;
30                 outputTypes["constaxonomy"] = tempOutNames;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 vector<string> ClassifyOtuCommand::getRequiredParameters(){     
39         try {
40                 string Array[] =  {"list","taxonomy"};
41                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
42                 return myArray;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ClassifyOtuCommand", "getRequiredParameters");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50 vector<string> ClassifyOtuCommand::getRequiredFiles(){  
51         try {
52                 vector<string> myArray;
53                 return myArray;
54         }
55         catch(exception& e) {
56                 m->errorOut(e, "ClassifyOtuCommand", "getRequiredFiles");
57                 exit(1);
58         }
59 }
60 //**********************************************************************************************************************
61 ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
62         try{
63                 abort = false;
64                 allLines = 1;
65                 labels.clear();
66                                 
67                 //allow user to run help
68                 if (option == "help") { 
69                         help(); abort = true;
70                 } else {
71                         //valid paramters for this command
72                         string Array[] =  {"list","label","name","taxonomy","cutoff","probs","outputdir","inputdir"};
73                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
74                         
75                         OptionParser parser(option);
76                         map<string, string> parameters = parser.getParameters();
77                         
78                         ValidParameters validParameter;
79                         map<string, string>::iterator it;
80                 
81                         //check to make sure all parameters are valid for command
82                         for (it = parameters.begin(); it != parameters.end(); it++) { 
83                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
84                         }
85                         
86                         //initialize outputTypes
87                         vector<string> tempOutNames;
88                         outputTypes["constaxonomy"] = tempOutNames;
89                 
90                         //if the user changes the input directory command factory will send this info to us in the output parameter 
91                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
92                         if (inputDir == "not found"){   inputDir = "";          }
93                         else {
94                                 string path;
95                                 it = parameters.find("list");
96                                 //user has given a template file
97                                 if(it != parameters.end()){ 
98                                         path = m->hasPath(it->second);
99                                         //if the user has not given a path then, add inputdir. else leave path alone.
100                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
101                                 }
102                                 
103                                 it = parameters.find("name");
104                                 //user has given a template file
105                                 if(it != parameters.end()){ 
106                                         path = m->hasPath(it->second);
107                                         //if the user has not given a path then, add inputdir. else leave path alone.
108                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
109                                 }
110                                 
111                                 it = parameters.find("taxonomy");
112                                 //user has given a template file
113                                 if(it != parameters.end()){ 
114                                         path = m->hasPath(it->second);
115                                         //if the user has not given a path then, add inputdir. else leave path alone.
116                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
117                                 }
118                         }
119
120                         
121                         //if the user changes the output directory command factory will send this info to us in the output parameter 
122                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
123                         
124                         //check for required parameters
125                         listfile = validParameter.validFile(parameters, "list", true);
126                         if (listfile == "not found") { m->mothurOut("list is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
127                         else if (listfile == "not open") { abort = true; }      
128                         
129                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
130                         if (taxfile == "not found") {  m->mothurOut("taxonomy is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
131                         else if (taxfile == "not open") { abort = true; }       
132         
133                         namefile = validParameter.validFile(parameters, "name", true);
134                         if (namefile == "not open") { abort = true; }   
135                         else if (namefile == "not found") { namefile = ""; }
136                         
137                         //check for optional parameter and set defaults
138                         // ...at some point should added some additional type checking...
139                         label = validParameter.validFile(parameters, "label", false);                   
140                         if (label == "not found") { label = ""; allLines = 1;  }
141                         else { 
142                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
143                                 else { allLines = 1;  }
144                         }
145                         
146                         string temp = validParameter.validFile(parameters, "cutoff", false);                    if (temp == "not found") { temp = "51"; }
147                         convert(temp, cutoff); 
148                         
149                         temp = validParameter.validFile(parameters, "probs", false);                                    if (temp == "not found"){       temp = "true";                  }
150                         probs = m->isTrue(temp);
151                         
152                         
153                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
154                         
155                 }
156         }
157         catch(exception& e) {
158                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
159                 exit(1);
160         }
161 }
162
163 //**********************************************************************************************************************
164
165 void ClassifyOtuCommand::help(){
166         try {
167                 m->mothurOut("The classify.otu command parameters are list, taxonomy, name, cutoff, label and probs.  The taxonomy and list parameters are required.\n");
168                 m->mothurOut("The name parameter allows you add a names file with your taxonomy file.\n");
169                 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
170                 m->mothurOut("The default value for label is all labels in your inputfile.\n");
171                 m->mothurOut("The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n");
172                 m->mothurOut("The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n");
173                 m->mothurOut("The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n");
174                 m->mothurOut("Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n");
175                 m->mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n\n");
176         }
177         catch(exception& e) {
178                 m->errorOut(e, "ClassifyOtuCommand", "help");
179                 exit(1);
180         }
181 }
182
183 //**********************************************************************************************************************
184
185 ClassifyOtuCommand::~ClassifyOtuCommand(){}
186
187 //**********************************************************************************************************************
188
189 int ClassifyOtuCommand::execute(){
190         try {
191         
192                 if (abort == true) { return 0; }
193                 
194                 //if user gave a namesfile then use it
195                 if (namefile != "") {   readNamesFile();        }
196                 
197                 //read taxonomy file and save in map for easy access in building bin trees
198                 readTaxonomyFile();
199                 
200                 if (m->control_pressed) { return 0; }
201                 
202                 input = new InputData(listfile, "list");
203                 list = input->getListVector();
204                 string lastLabel = list->getLabel();
205
206                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
207                 set<string> processedLabels;
208                 set<string> userLabels = labels;
209                 
210                 if (m->control_pressed) { outputTypes.clear(); delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());  }  return 0; }
211         
212                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
213                         
214                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
215                         
216                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
217                                         process(list);
218                                         if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
219                                                                                 
220                                         processedLabels.insert(list->getLabel());
221                                         userLabels.erase(list->getLabel());
222                         }
223                         
224                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
225                                         string saveLabel = list->getLabel();
226                                         
227                                         delete list;
228                                         list = input->getListVector(lastLabel);
229                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
230                                         process(list);
231                                 
232                                         
233                                         if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
234                                                                                 
235                                         processedLabels.insert(list->getLabel());
236                                         userLabels.erase(list->getLabel());
237                                         
238                                         //restore real lastlabel to save below
239                                         list->setLabel(saveLabel);
240                         }
241                         
242                         lastLabel = list->getLabel();
243         
244                         delete list;
245                         list = input->getListVector();
246                 }
247                 
248                 //output error messages about any remaining user labels
249                 bool needToRun = false;
250                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
251                         m->mothurOut("Your file does not include the label " + (*it)); 
252                         if (processedLabels.count(lastLabel) != 1) {
253                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
254                                 needToRun = true;
255                         }else {
256                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
257                         }
258                 }
259                 
260                 //run last label if you need to
261                 if (needToRun == true)  {
262                         if (list != NULL) {     delete list;    }
263                         list = input->getListVector(lastLabel);
264                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
265                         
266                         process(list);
267                         delete list;
268                         
269                         if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
270                 }
271                 
272                 delete input;  
273                                 
274                 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   remove(outputNames[i].c_str());  } return 0; }
275                 
276                 m->mothurOutEndLine();
277                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
278                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
279                 m->mothurOutEndLine();
280                 
281                 return 0;
282         }
283         catch(exception& e) {
284                 m->errorOut(e, "ClassifyOtuCommand", "execute");
285                 exit(1);
286         }
287 }
288
289 //**********************************************************************************************************************
290 int ClassifyOtuCommand::readNamesFile() {
291         try {
292                 
293                 ifstream inNames;
294                 m->openInputFile(namefile, inNames);
295                 
296                 string name, names;
297         
298                 while(!inNames.eof()){
299                         inNames >> name;                        //read from first column  A
300                         inNames >> names;               //read from second column  A,B,C,D
301                         m->gobble(inNames);
302                         
303                         //parse names into vector
304                         vector<string> theseNames;
305                         m->splitAtComma(names, theseNames);
306
307                         for (int i = 0; i < theseNames.size(); i++) {  nameMap[theseNames[i]] = name;  }
308                         
309                         if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
310                 }
311                 inNames.close();
312                 
313                 return 0;
314         }
315         catch(exception& e) {
316                 m->errorOut(e, "ClassifyOtuCommand", "readNamesFile");
317                 exit(1);
318         }
319 }
320 //**********************************************************************************************************************
321 int ClassifyOtuCommand::readTaxonomyFile() {
322         try {
323                 
324                 ifstream in;
325                 m->openInputFile(taxfile, in);
326                 
327                 string name, tax;
328         
329                 while(!in.eof()){
330                         in >> name >> tax;              
331                         m->gobble(in);
332                         
333                         //are there confidence scores, if so remove them
334                         if (tax.find_first_of('(') != -1) {  removeConfidences(tax);    }
335                         
336                         taxMap[name] = tax;
337                         
338                         if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
339                 }
340                 in.close();
341                 
342                 return 0;
343         }
344         catch(exception& e) {
345                 m->errorOut(e, "ClassifyOtuCommand", "readTaxonomyFile");
346                 exit(1);
347         }
348 }
349 //**********************************************************************************************************************
350 string ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size) {
351         try{
352                 string conTax = "";
353                 vector<string> names;
354                 map<string, string>::iterator it;
355                 map<string, string>::iterator it2;
356
357                 //parse names into vector
358                 string binnames = thisList->get(bin);
359                 m->splitAtComma(binnames, names);
360
361                 //create a tree containing sequences from this bin
362                 PhyloTree* phylo = new PhyloTree();
363                 
364                 size = 0;
365                 for (int i = 0; i < names.size(); i++) {
366         
367                         //if namesfile include the names
368                         if (namefile != "") {
369         
370                                 //is this sequence in the name file - namemap maps seqName -> repSeqName
371                                 it2 = nameMap.find(names[i]);
372                                 
373                                 if (it2 == nameMap.end()) { //this name is not in name file, skip it
374                                         m->mothurOut(names[i] + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
375                                 }else{
376                                         
377                                         //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
378                                         it = taxMap.find(it2->second);
379                         
380                                         if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
381                                         
382                                                 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
383                                                 else {  m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
384                                         }else{
385                                 
386                                                 //add seq to tree
387                                                 phylo->addSeqToTree(names[i], it->second);
388                                                 size++;
389                                         }
390                                 }
391                                 
392                         }else{
393                                 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
394                                 it = taxMap.find(names[i]);
395                 
396                                 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
397                                         m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
398                                 }else{
399                                         //add seq to tree
400                                         phylo->addSeqToTree(names[i], it->second);
401                                         size++;
402                                 }
403                         }
404
405                         
406                         if (m->control_pressed) { delete phylo; return conTax; }
407                         
408                 }
409                 
410                 //build tree
411                 phylo->assignHeirarchyIDs(0);
412                 
413                 TaxNode currentNode = phylo->get(0);
414                 
415                 //at each level
416                 while (currentNode.children.size() != 0) { //you still have more to explore
417                 
418                         TaxNode bestChild;
419                         int bestChildSize = 0;
420                         
421                         //go through children
422                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
423                                 
424                                 TaxNode temp = phylo->get(itChild->second);
425                                 
426                                 //select child with largest accesions - most seqs assigned to it
427                                 if (temp.accessions.size() > bestChildSize) {
428                                         bestChild = phylo->get(itChild->second);
429                                         bestChildSize = temp.accessions.size();
430                                 }
431                                 
432                         }
433                                 
434                         //is this taxonomy above cutoff
435                         int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
436                         
437                         if (consensusConfidence >= cutoff) { //if yes, add it
438                                 if (probs) {
439                                         conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
440                                 }else{
441                                         conTax += bestChild.name + ";";
442                                 }
443                         }else{ //if no, quit
444                                 break;
445                         }
446                         
447                         //move down a level
448                         currentNode = bestChild;
449                 }
450                 
451                                 
452                 if (conTax == "") {  conTax = "no_consensus;";  }
453                 
454                 delete phylo;   
455                 
456                 return conTax;
457                         
458         }
459         catch(exception& e) {
460                 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
461                 exit(1);
462         }
463 }
464
465 //**********************************************************************************************************************
466 int ClassifyOtuCommand::process(ListVector* processList) {
467         try{
468                 string conTax;
469                 int size;
470                 
471                 //create output file
472                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
473                                 
474                 ofstream out;
475                 string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".cons.taxonomy";
476                 m->openOutputFile(outputFile, out);
477                 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
478                 
479                 out << "OTU\tSize\tTaxonomy" << endl;
480                 
481                 //for each bin in the list vector
482                 for (int i = 0; i < processList->getNumBins(); i++) {
483         
484                         conTax  = findConsensusTaxonomy(i, processList, size);
485                 
486                         if (m->control_pressed) { out.close();  return 0; }
487                         
488                         //output to new names file
489                         out << (i+1) << '\t' << size << '\t' << conTax << endl;
490                 }
491
492                 out.close();
493                 
494                 return 0;
495
496         }
497         catch(exception& e) {
498                 m->errorOut(e, "ClassifyOtuCommand", "process");
499                 exit(1);
500         }
501 }
502 /**************************************************************************************************/
503 void ClassifyOtuCommand::removeConfidences(string& tax) {
504         try {
505                 
506                 string taxon;
507                 string newTax = "";
508                 
509                 while (tax.find_first_of(';') != -1) {
510                         //get taxon
511                         taxon = tax.substr(0,tax.find_first_of(';'));
512                         
513                         int pos = taxon.find_first_of('(');
514                         if (pos != -1) {
515                                 taxon = taxon.substr(0, pos); //rip off confidence 
516                         }
517                         
518                         taxon += ";";
519                         
520                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
521                         newTax += taxon;
522                 }
523                 
524                 tax = newTax;
525         }
526         catch(exception& e) {
527                 m->errorOut(e, "ClassifyOtuCommand", "removeConfidences");
528                 exit(1);
529         }
530 }
531 //**********************************************************************************************************************
532
533