2 * classifyotucommand.cpp
5 * Created by westcott on 6/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "classifyotucommand.h"
11 #include "phylotree.h"
12 #include "phylosummary.h"
14 //**********************************************************************************************************************
15 vector<string> ClassifyOtuCommand::getValidParameters(){
17 string AlignArray[] = {"list","label","name","taxonomy","basis","cutoff","probs","group","reftaxonomy","outputdir","inputdir"};
18 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
22 m->errorOut(e, "ClassifyOtuCommand", "getValidParameters");
26 //**********************************************************************************************************************
27 ClassifyOtuCommand::ClassifyOtuCommand(){
29 abort = true; calledHelp = true;
30 vector<string> tempOutNames;
31 outputTypes["constaxonomy"] = tempOutNames;
32 outputTypes["taxsummary"] = tempOutNames;
35 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
39 //**********************************************************************************************************************
40 vector<string> ClassifyOtuCommand::getRequiredParameters(){
42 string Array[] = {"list","taxonomy"};
43 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
47 m->errorOut(e, "ClassifyOtuCommand", "getRequiredParameters");
51 //**********************************************************************************************************************
52 vector<string> ClassifyOtuCommand::getRequiredFiles(){
54 vector<string> myArray;
58 m->errorOut(e, "ClassifyOtuCommand", "getRequiredFiles");
62 //**********************************************************************************************************************
63 ClassifyOtuCommand::ClassifyOtuCommand(string option) {
65 abort = false; calledHelp = false;
69 //allow user to run help
70 if (option == "help") {
71 help(); abort = true; calledHelp = true;
73 //valid paramters for this command
74 string Array[] = {"list","label","name","taxonomy","cutoff","probs","basis","reftaxonomy","group","outputdir","inputdir"};
75 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
77 OptionParser parser(option);
78 map<string, string> parameters = parser.getParameters();
80 ValidParameters validParameter;
81 map<string, string>::iterator it;
83 //check to make sure all parameters are valid for command
84 for (it = parameters.begin(); it != parameters.end(); it++) {
85 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
88 //initialize outputTypes
89 vector<string> tempOutNames;
90 outputTypes["constaxonomy"] = tempOutNames;
91 outputTypes["taxsummary"] = tempOutNames;
93 //if the user changes the input directory command factory will send this info to us in the output parameter
94 string inputDir = validParameter.validFile(parameters, "inputdir", false);
95 if (inputDir == "not found"){ inputDir = ""; }
98 it = parameters.find("list");
99 //user has given a template file
100 if(it != parameters.end()){
101 path = m->hasPath(it->second);
102 //if the user has not given a path then, add inputdir. else leave path alone.
103 if (path == "") { parameters["list"] = inputDir + it->second; }
106 it = parameters.find("name");
107 //user has given a template file
108 if(it != parameters.end()){
109 path = m->hasPath(it->second);
110 //if the user has not given a path then, add inputdir. else leave path alone.
111 if (path == "") { parameters["name"] = inputDir + it->second; }
114 it = parameters.find("taxonomy");
115 //user has given a template file
116 if(it != parameters.end()){
117 path = m->hasPath(it->second);
118 //if the user has not given a path then, add inputdir. else leave path alone.
119 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
122 it = parameters.find("reftaxonomy");
123 //user has given a template file
124 if(it != parameters.end()){
125 path = m->hasPath(it->second);
126 //if the user has not given a path then, add inputdir. else leave path alone.
127 if (path == "") { parameters["reftaxonomy"] = inputDir + it->second; }
130 it = parameters.find("group");
131 //user has given a template file
132 if(it != parameters.end()){
133 path = m->hasPath(it->second);
134 //if the user has not given a path then, add inputdir. else leave path alone.
135 if (path == "") { parameters["group"] = inputDir + it->second; }
140 //if the user changes the output directory command factory will send this info to us in the output parameter
141 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
143 //check for required parameters
144 listfile = validParameter.validFile(parameters, "list", true);
145 if (listfile == "not found") { m->mothurOut("list is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
146 else if (listfile == "not open") { abort = true; }
148 taxfile = validParameter.validFile(parameters, "taxonomy", true);
149 if (taxfile == "not found") { m->mothurOut("taxonomy is a required parameter for the classify.otu command."); m->mothurOutEndLine(); abort = true; }
150 else if (taxfile == "not open") { abort = true; }
152 refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true);
153 if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); }
154 else if (refTaxonomy == "not open") { abort = true; }
156 namefile = validParameter.validFile(parameters, "name", true);
157 if (namefile == "not open") { abort = true; }
158 else if (namefile == "not found") { namefile = ""; }
160 groupfile = validParameter.validFile(parameters, "group", true);
161 if (groupfile == "not open") { abort = true; }
162 else if (groupfile == "not found") { groupfile = ""; }
164 //check for optional parameter and set defaults
165 // ...at some point should added some additional type checking...
166 label = validParameter.validFile(parameters, "label", false);
167 if (label == "not found") { label = ""; allLines = 1; }
169 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
170 else { allLines = 1; }
173 basis = validParameter.validFile(parameters, "basis", false);
174 if (basis == "not found") { basis = "otu"; }
176 if ((basis != "otu") && (basis != "sequence")) { m->mothurOut("Invalid option for basis. basis options are otu and sequence, using otu."); m->mothurOutEndLine(); }
178 string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; }
179 convert(temp, cutoff);
181 temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; }
182 probs = m->isTrue(temp);
185 if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; }
189 catch(exception& e) {
190 m->errorOut(e, "ClassifyOtuCommand", "ClassifyOtuCommand");
195 //**********************************************************************************************************************
197 void ClassifyOtuCommand::help(){
199 m->mothurOut("The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, cutoff, label, basis and probs. The taxonomy and list parameters are required.\n");
200 m->mothurOut("The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n");
201 m->mothurOut("The name parameter allows you add a names file with your taxonomy file.\n");
202 m->mothurOut("The group parameter allows you provide a group file to use in creating the summary file breakdown.\n");
203 m->mothurOut("The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n");
204 m->mothurOut("For example consider the following basis=sequence could give Clostridiales 3 105 16 43 46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n");
205 m->mothurOut("16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n");
206 m->mothurOut("Now for basis=otu could give Clostridiales 3 7 6 1 2, where 7 is the number of otus that classified to Clostridiales.\n");
207 m->mothurOut("6 is the number of otus containing sequences from groupA, 1 is the number of otus containing sequences from groupB, and 2 is the number of otus containing sequences from groupC.\n");
208 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
209 m->mothurOut("The default value for label is all labels in your inputfile.\n");
210 m->mothurOut("The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n");
211 m->mothurOut("The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n");
212 m->mothurOut("The classify.otu command should be in the following format: classify.otu(taxonomy=yourTaxonomyFile, list=yourListFile, name=yourNamesFile, label=yourLabels).\n");
213 m->mothurOut("Example classify.otu(taxonomy=abrecovery.silva.full.taxonomy, list=abrecovery.fn.list, label=0.10).\n");
214 m->mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n\n");
216 catch(exception& e) {
217 m->errorOut(e, "ClassifyOtuCommand", "help");
222 //**********************************************************************************************************************
224 ClassifyOtuCommand::~ClassifyOtuCommand(){}
226 //**********************************************************************************************************************
228 int ClassifyOtuCommand::execute(){
231 if (abort == true) { if (calledHelp) { return 0; } return 2; }
233 //if user gave a namesfile then use it
234 if (namefile != "") { readNamesFile(); }
236 //read taxonomy file and save in map for easy access in building bin trees
239 if (m->control_pressed) { return 0; }
241 input = new InputData(listfile, "list");
242 list = input->getListVector();
243 string lastLabel = list->getLabel();
245 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
246 set<string> processedLabels;
247 set<string> userLabels = labels;
249 if (m->control_pressed) { outputTypes.clear(); delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
251 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
253 if (allLines == 1 || labels.count(list->getLabel()) == 1){
255 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
257 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
259 processedLabels.insert(list->getLabel());
260 userLabels.erase(list->getLabel());
263 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
264 string saveLabel = list->getLabel();
267 list = input->getListVector(lastLabel);
268 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
272 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
274 processedLabels.insert(list->getLabel());
275 userLabels.erase(list->getLabel());
277 //restore real lastlabel to save below
278 list->setLabel(saveLabel);
281 lastLabel = list->getLabel();
284 list = input->getListVector();
287 //output error messages about any remaining user labels
288 bool needToRun = false;
289 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
290 m->mothurOut("Your file does not include the label " + (*it));
291 if (processedLabels.count(lastLabel) != 1) {
292 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
295 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
299 //run last label if you need to
300 if (needToRun == true) {
301 if (list != NULL) { delete list; }
302 list = input->getListVector(lastLabel);
303 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
308 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete input; delete list; return 0; }
313 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
315 m->mothurOutEndLine();
316 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
317 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
318 m->mothurOutEndLine();
322 catch(exception& e) {
323 m->errorOut(e, "ClassifyOtuCommand", "execute");
328 //**********************************************************************************************************************
329 int ClassifyOtuCommand::readNamesFile() {
333 m->openInputFile(namefile, inNames);
337 while(!inNames.eof()){
338 inNames >> name; //read from first column A
339 inNames >> names; //read from second column A,B,C,D
342 //parse names into vector
343 vector<string> theseNames;
344 m->splitAtComma(names, theseNames);
346 for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = name; }
348 if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
354 catch(exception& e) {
355 m->errorOut(e, "ClassifyOtuCommand", "readNamesFile");
359 //**********************************************************************************************************************
360 int ClassifyOtuCommand::readTaxonomyFile() {
364 m->openInputFile(taxfile, in);
372 //are there confidence scores, if so remove them
373 if (tax.find_first_of('(') != -1) { removeConfidences(tax); }
377 if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
383 catch(exception& e) {
384 m->errorOut(e, "ClassifyOtuCommand", "readTaxonomyFile");
388 //**********************************************************************************************************************
389 vector<string> ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size, string& conTax) {
392 vector<string> names;
393 vector<string> allNames;
394 map<string, string>::iterator it;
395 map<string, string>::iterator it2;
397 //parse names into vector
398 string binnames = thisList->get(bin);
399 m->splitAtComma(binnames, names);
401 //create a tree containing sequences from this bin
402 PhyloTree* phylo = new PhyloTree();
405 for (int i = 0; i < names.size(); i++) {
407 //if namesfile include the names
408 if (namefile != "") {
410 //is this sequence in the name file - namemap maps seqName -> repSeqName
411 it2 = nameMap.find(names[i]);
413 if (it2 == nameMap.end()) { //this name is not in name file, skip it
414 m->mothurOut(names[i] + " is not in your name file. I will not include it in the consensus."); m->mothurOutEndLine();
417 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
418 it = taxMap.find(it2->second);
420 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
422 if (names[i] != it2->second) { m->mothurOut(names[i] + " is represented by " + it2->second + " and is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
423 else { m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
427 phylo->addSeqToTree(names[i], it->second);
429 allNames.push_back(names[i]);
434 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
435 it = taxMap.find(names[i]);
437 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
438 m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine();
441 phylo->addSeqToTree(names[i], it->second);
443 allNames.push_back(names[i]);
448 if (m->control_pressed) { delete phylo; return allNames; }
453 phylo->assignHeirarchyIDs(0);
455 TaxNode currentNode = phylo->get(0);
458 while (currentNode.children.size() != 0) { //you still have more to explore
461 int bestChildSize = 0;
463 //go through children
464 for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
466 TaxNode temp = phylo->get(itChild->second);
468 //select child with largest accesions - most seqs assigned to it
469 if (temp.accessions.size() > bestChildSize) {
470 bestChild = phylo->get(itChild->second);
471 bestChildSize = temp.accessions.size();
476 //is this taxonomy above cutoff
477 int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
479 if (consensusConfidence >= cutoff) { //if yes, add it
481 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
483 conTax += bestChild.name + ";";
490 currentNode = bestChild;
494 if (conTax == "") { conTax = "no_consensus;"; }
501 catch(exception& e) {
502 m->errorOut(e, "ClassifyOtuCommand", "findConsensusTaxonomy");
507 //**********************************************************************************************************************
508 int ClassifyOtuCommand::process(ListVector* processList) {
514 if (outputDir == "") { outputDir += m->hasPath(listfile); }
517 string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".cons.taxonomy";
518 m->openOutputFile(outputFile, out);
519 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
522 string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".cons.tax.summary";
523 m->openOutputFile(outputSumFile, outSum);
524 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
526 out << "OTU\tSize\tTaxonomy" << endl;
528 PhyloSummary* taxaSum;
529 if (refTaxonomy != "") {
530 taxaSum = new PhyloSummary(refTaxonomy, groupfile);
532 taxaSum = new PhyloSummary(groupfile);
535 //for each bin in the list vector
536 for (int i = 0; i < processList->getNumBins(); i++) {
538 if (m->control_pressed) { break; }
540 vector<string> names;
541 names = findConsensusTaxonomy(i, processList, size, conTax);
543 if (m->control_pressed) { out.close(); return 0; }
545 //output to new names file
546 out << (i+1) << '\t' << size << '\t' << conTax << endl;
548 string noConfidenceConTax = conTax;
549 removeConfidences(noConfidenceConTax);
551 //add this bins taxonomy to summary
552 if (basis == "sequence") {
553 for(int j = 0; j < names.size(); j++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); }
555 taxaSum->addSeqToTree(noConfidenceConTax, names);
562 taxaSum->print(outSum);
570 catch(exception& e) {
571 m->errorOut(e, "ClassifyOtuCommand", "process");
575 /**************************************************************************************************/
576 void ClassifyOtuCommand::removeConfidences(string& tax) {
582 while (tax.find_first_of(';') != -1) {
584 taxon = tax.substr(0,tax.find_first_of(';'));
586 int pos = taxon.find_first_of('(');
588 taxon = taxon.substr(0, pos); //rip off confidence
593 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
599 catch(exception& e) {
600 m->errorOut(e, "ClassifyOtuCommand", "removeConfidences");
604 //**********************************************************************************************************************