]> git.donarmstrong.com Git - mothur.git/blob - classifyotucommand.cpp
fixes while testing
[mothur.git] / classifyotucommand.cpp
1 /*
2  *  classifyotucommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12
13 //**********************************************************************************************************************
14 vector<string> ClassifyOtuCommand::getValidParameters(){        
15         try {
16                 string AlignArray[] =  {"list","label","name","taxonomy","cutoff","probs","outputdir","inputdir"};
17                 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
18                 return myArray;
19         }
20         catch(exception& e) {
21                 m->errorOut(e, "ClassifyOtuCommand", "getValidParameters");
22                 exit(1);
23         }
24 }
25 //**********************************************************************************************************************
26 ClassifyOtuCommand::ClassifyOtuCommand(){       
27         try {
28                 abort = true;
29                 //initialize outputTypes
30                 vector<string> tempOutNames;
31                 outputTypes["constaxonomy"] = tempOutNames;
32         }
33         catch(exception& e) {
34                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
35                 exit(1);
36         }
37 }
38 //**********************************************************************************************************************
39 vector<string> ClassifyOtuCommand::getRequiredParameters(){     
40         try {
41                 string Array[] =  {"list","taxonomy"};
42                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43                 return myArray;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "ClassifyOtuCommand", "getRequiredParameters");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 vector<string> ClassifyOtuCommand::getRequiredFiles(){  
52         try {
53                 vector<string> myArray;
54                 return myArray;
55         }
56         catch(exception& e) {
57                 m->errorOut(e, "ClassifyOtuCommand", "getRequiredFiles");
58                 exit(1);
59         }
60 }
61 //**********************************************************************************************************************
62 ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
63         try{
64                 abort = false;
65                 allLines = 1;
66                 labels.clear();
67                                 
68                 //allow user to run help
69                 if (option == "help") { 
70                         help(); abort = true;
71                 } else {
72                         //valid paramters for this command
73                         string Array[] =  {"list","label","name","taxonomy","cutoff","probs","outputdir","inputdir"};
74                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
75                         
76                         OptionParser parser(option);
77                         map<string, string> parameters = parser.getParameters();
78                         
79                         ValidParameters validParameter;
80                         map<string, string>::iterator it;
81                 
82                         //check to make sure all parameters are valid for command
83                         for (it = parameters.begin(); it != parameters.end(); it++) { 
84                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
85                         }
86                         
87                         //initialize outputTypes
88                         vector<string> tempOutNames;
89                         outputTypes["constaxonomy"] = tempOutNames;
90                 
91                         //if the user changes the input directory command factory will send this info to us in the output parameter 
92                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
93                         if (inputDir == "not found"){   inputDir = "";          }
94                         else {
95                                 string path;
96                                 it = parameters.find("list");
97                                 //user has given a template file
98                                 if(it != parameters.end()){ 
99                                         path = m->hasPath(it->second);
100                                         //if the user has not given a path then, add inputdir. else leave path alone.
101                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
102                                 }
103                                 
104                                 it = parameters.find("name");
105                                 //user has given a template file
106                                 if(it != parameters.end()){ 
107                                         path = m->hasPath(it->second);
108                                         //if the user has not given a path then, add inputdir. else leave path alone.
109                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
110                                 }
111                                 
112                                 it = parameters.find("taxonomy");
113                                 //user has given a template file
114                                 if(it != parameters.end()){ 
115                                         path = m->hasPath(it->second);
116                                         //if the user has not given a path then, add inputdir. else leave path alone.
117                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
118                                 }
119                         }
120
121                         
122                         //if the user changes the output directory command factory will send this info to us in the output parameter 
123                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
124                         
125                         //check for required parameters
126                         listfile = validParameter.validFile(parameters, "list", true);
127                         if (listfile == "not found") { m->mothurOut("list is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
128                         else if (listfile == "not open") { abort = true; }      
129                         
130                         taxfile = validParameter.validFile(parameters, "taxonomy", true);
131                         if (taxfile == "not found") {  m->mothurOut("taxonomy is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
132                         else if (taxfile == "not open") { abort = true; }       
133         
134                         namefile = validParameter.validFile(parameters, "name", true);
135                         if (namefile == "not open") { abort = true; }   
136                         else if (namefile == "not found") { namefile = ""; }
137                         
138                         //check for optional parameter and set defaults
139                         // ...at some point should added some additional type checking...
140                         label = validParameter.validFile(parameters, "label", false);                   
141                         if (label == "not found") { label = ""; allLines = 1;  }
142                         else { 
143                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
144                                 else { allLines = 1;  }
145                         }
146                         
147                         string temp = validParameter.validFile(parameters, "cutoff", false);                    if (temp == "not found") { temp = "51"; }
148                         convert(temp, cutoff); 
149                         
150                         temp = validParameter.validFile(parameters, "probs", false);                                    if (temp == "not found"){       temp = "true";                  }
151                         probs = m->isTrue(temp);
152                         
153                         
154                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
155                         
156                 }
157         }
158         catch(exception& e) {
159                 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
160                 exit(1);
161         }
162 }
163
164 //**********************************************************************************************************************
165
166 void ClassifyOtuCommand::help(){
167         try {
168                 m->mothurOut("The classify.otu command parameters are list, taxonomy, name, cutoff, label and probs.  The taxonomy and list parameters are required.\n");
169                 m->mothurOut("The name parameter allows you add a names file with your taxonomy file.\n");
170                 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
171                 m->mothurOut("The default value for label is all labels in your inputfile.\n");
172                 m->mothurOut("The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n");
173                 m->mothurOut("The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n");
174                 m->mothurOut("The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n");
175                 m->mothurOut("Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n");
176                 m->mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n\n");
177         }
178         catch(exception& e) {
179                 m->errorOut(e, "ClassifyOtuCommand", "help");
180                 exit(1);
181         }
182 }
183
184 //**********************************************************************************************************************
185
186 ClassifyOtuCommand::~ClassifyOtuCommand(){}
187
188 //**********************************************************************************************************************
189
190 int ClassifyOtuCommand::execute(){
191         try {
192         
193                 if (abort == true) { return 0; }
194                 
195                 //if user gave a namesfile then use it
196                 if (namefile != "") {   readNamesFile();        }
197                 
198                 //read taxonomy file and save in map for easy access in building bin trees
199                 readTaxonomyFile();
200                 
201                 if (m->control_pressed) { return 0; }
202                 
203                 input = new InputData(listfile, "list");
204                 list = input->getListVector();
205                 string lastLabel = list->getLabel();
206
207                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
208                 set<string> processedLabels;
209                 set<string> userLabels = labels;
210                 
211                 if (m->control_pressed) { outputTypes.clear(); delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());  }  return 0; }
212         
213                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
214                         
215                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
216                         
217                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
218                                         process(list);
219                                         if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
220                                                                                 
221                                         processedLabels.insert(list->getLabel());
222                                         userLabels.erase(list->getLabel());
223                         }
224                         
225                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
226                                         string saveLabel = list->getLabel();
227                                         
228                                         delete list;
229                                         list = input->getListVector(lastLabel);
230                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
231                                         process(list);
232                                 
233                                         
234                                         if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
235                                                                                 
236                                         processedLabels.insert(list->getLabel());
237                                         userLabels.erase(list->getLabel());
238                                         
239                                         //restore real lastlabel to save below
240                                         list->setLabel(saveLabel);
241                         }
242                         
243                         lastLabel = list->getLabel();
244         
245                         delete list;
246                         list = input->getListVector();
247                 }
248                 
249                 //output error messages about any remaining user labels
250                 bool needToRun = false;
251                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
252                         m->mothurOut("Your file does not include the label " + (*it)); 
253                         if (processedLabels.count(lastLabel) != 1) {
254                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
255                                 needToRun = true;
256                         }else {
257                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
258                         }
259                 }
260                 
261                 //run last label if you need to
262                 if (needToRun == true)  {
263                         if (list != NULL) {     delete list;    }
264                         list = input->getListVector(lastLabel);
265                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
266                         
267                         process(list);
268                         delete list;
269                         
270                         if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } delete input; delete list; return 0; }
271                 }
272                 
273                 delete input;  
274                                 
275                 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   remove(outputNames[i].c_str());  } return 0; }
276                 
277                 m->mothurOutEndLine();
278                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
279                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
280                 m->mothurOutEndLine();
281                 
282                 return 0;
283         }
284         catch(exception& e) {
285                 m->errorOut(e, "ClassifyOtuCommand", "execute");
286                 exit(1);
287         }
288 }
289
290 //**********************************************************************************************************************
291 int ClassifyOtuCommand::readNamesFile() {
292         try {
293                 
294                 ifstream inNames;
295                 m->openInputFile(namefile, inNames);
296                 
297                 string name, names;
298         
299                 while(!inNames.eof()){
300                         inNames >> name;                        //read from first column  A
301                         inNames >> names;               //read from second column  A,B,C,D
302                         m->gobble(inNames);
303                         
304                         //parse names into vector
305                         vector<string> theseNames;
306                         m->splitAtComma(names, theseNames);
307
308                         for (int i = 0; i < theseNames.size(); i++) {  nameMap[theseNames[i]] = name;  }
309                         
310                         if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
311                 }
312                 inNames.close();
313                 
314                 return 0;
315         }
316         catch(exception& e) {
317                 m->errorOut(e, "ClassifyOtuCommand", "readNamesFile");
318                 exit(1);
319         }
320 }
321 //**********************************************************************************************************************
322 int ClassifyOtuCommand::readTaxonomyFile() {
323         try {
324                 
325                 ifstream in;
326                 m->openInputFile(taxfile, in);
327                 
328                 string name, tax;
329         
330                 while(!in.eof()){
331                         in >> name >> tax;              
332                         m->gobble(in);
333                         
334                         //are there confidence scores, if so remove them
335                         if (tax.find_first_of('(') != -1) {  removeConfidences(tax);    }
336                         
337                         taxMap[name] = tax;
338                         
339                         if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
340                 }
341                 in.close();
342                 
343                 return 0;
344         }
345         catch(exception& e) {
346                 m->errorOut(e, "ClassifyOtuCommand", "readTaxonomyFile");
347                 exit(1);
348         }
349 }
350 //**********************************************************************************************************************
351 string ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size) {
352         try{
353                 string conTax = "";
354                 vector<string> names;
355                 map<string, string>::iterator it;
356                 map<string, string>::iterator it2;
357
358                 //parse names into vector
359                 string binnames = thisList->get(bin);
360                 m->splitAtComma(binnames, names);
361
362                 //create a tree containing sequences from this bin
363                 PhyloTree* phylo = new PhyloTree();
364                 
365                 size = 0;
366                 for (int i = 0; i < names.size(); i++) {
367         
368                         //if namesfile include the names
369                         if (namefile != "") {
370         
371                                 //is this sequence in the name file - namemap maps seqName -> repSeqName
372                                 it2 = nameMap.find(names[i]);
373                                 
374                                 if (it2 == nameMap.end()) { //this name is not in name file, skip it
375                                         m->mothurOut(names[i] + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
376                                 }else{
377                                         
378                                         //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
379                                         it = taxMap.find(it2->second);
380                         
381                                         if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
382                                         
383                                                 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
384                                                 else {  m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
385                                         }else{
386                                 
387                                                 //add seq to tree
388                                                 phylo->addSeqToTree(names[i], it->second);
389                                                 size++;
390                                         }
391                                 }
392                                 
393                         }else{
394                                 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
395                                 it = taxMap.find(names[i]);
396                 
397                                 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
398                                         m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
399                                 }else{
400                                         //add seq to tree
401                                         phylo->addSeqToTree(names[i], it->second);
402                                         size++;
403                                 }
404                         }
405
406                         
407                         if (m->control_pressed) { delete phylo; return conTax; }
408                         
409                 }
410                 
411                 //build tree
412                 phylo->assignHeirarchyIDs(0);
413                 
414                 TaxNode currentNode = phylo->get(0);
415                 
416                 //at each level
417                 while (currentNode.children.size() != 0) { //you still have more to explore
418                 
419                         TaxNode bestChild;
420                         int bestChildSize = 0;
421                         
422                         //go through children
423                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
424                                 
425                                 TaxNode temp = phylo->get(itChild->second);
426                                 
427                                 //select child with largest accesions - most seqs assigned to it
428                                 if (temp.accessions.size() > bestChildSize) {
429                                         bestChild = phylo->get(itChild->second);
430                                         bestChildSize = temp.accessions.size();
431                                 }
432                                 
433                         }
434                                 
435                         //is this taxonomy above cutoff
436                         int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
437                         
438                         if (consensusConfidence >= cutoff) { //if yes, add it
439                                 if (probs) {
440                                         conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
441                                 }else{
442                                         conTax += bestChild.name + ";";
443                                 }
444                         }else{ //if no, quit
445                                 break;
446                         }
447                         
448                         //move down a level
449                         currentNode = bestChild;
450                 }
451                 
452                                 
453                 if (conTax == "") {  conTax = "no_consensus;";  }
454                 
455                 delete phylo;   
456                 
457                 return conTax;
458                         
459         }
460         catch(exception& e) {
461                 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
462                 exit(1);
463         }
464 }
465
466 //**********************************************************************************************************************
467 int ClassifyOtuCommand::process(ListVector* processList) {
468         try{
469                 string conTax;
470                 int size;
471                 
472                 //create output file
473                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
474                                 
475                 ofstream out;
476                 string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".cons.taxonomy";
477                 m->openOutputFile(outputFile, out);
478                 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
479                 
480                 out << "OTU\tSize\tTaxonomy" << endl;
481                 
482                 //for each bin in the list vector
483                 for (int i = 0; i < processList->getNumBins(); i++) {
484         
485                         conTax  = findConsensusTaxonomy(i, processList, size);
486                 
487                         if (m->control_pressed) { out.close();  return 0; }
488                         
489                         //output to new names file
490                         out << (i+1) << '\t' << size << '\t' << conTax << endl;
491                 }
492
493                 out.close();
494                 
495                 return 0;
496
497         }
498         catch(exception& e) {
499                 m->errorOut(e, "ClassifyOtuCommand", "process");
500                 exit(1);
501         }
502 }
503 /**************************************************************************************************/
504 void ClassifyOtuCommand::removeConfidences(string& tax) {
505         try {
506                 
507                 string taxon;
508                 string newTax = "";
509                 
510                 while (tax.find_first_of(';') != -1) {
511                         //get taxon
512                         taxon = tax.substr(0,tax.find_first_of(';'));
513                         
514                         int pos = taxon.find_first_of('(');
515                         if (pos != -1) {
516                                 taxon = taxon.substr(0, pos); //rip off confidence 
517                         }
518                         
519                         taxon += ";";
520                         
521                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
522                         newTax += taxon;
523                 }
524                 
525                 tax = newTax;
526         }
527         catch(exception& e) {
528                 m->errorOut(e, "ClassifyOtuCommand", "removeConfidences");
529                 exit(1);
530         }
531 }
532 //**********************************************************************************************************************
533
534