X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=classifyotucommand.cpp;h=170c234ac245e6c32e3174ad80145fbcc359f2f2;hp=660d53c4bdf05c3249b74d84b7a2ba1eb7e5e50e;hb=a8e2df1b96a57f5f29576b08361b86a96a8eff4f;hpb=6c2b1e530a5c0bb87040e58a3e410097acdfcc3d diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp index 660d53c..170c234 100644 --- a/classifyotucommand.cpp +++ b/classifyotucommand.cpp @@ -10,22 +10,24 @@ #include "classifyotucommand.h" #include "phylotree.h" #include "phylosummary.h" +#include "sharedutilities.h" //********************************************************************************************************************** vector ClassifyOtuCommand::setParameters(){ try { - CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); - CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy); - CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy); - CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); - CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); - CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "",false,false); parameters.push_back(pbasis); - CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff); - CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(plist); + CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none","constaxonomy",false,true,true); parameters.push_back(ptaxonomy); + CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(preftaxonomy); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup); + CommandParameter ppersample("persample", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ppersample); + CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); + CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "","",false,false); parameters.push_back(pbasis); + CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "","",false,true); parameters.push_back(pcutoff); + CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pprobs); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -40,7 +42,7 @@ vector ClassifyOtuCommand::setParameters(){ string ClassifyOtuCommand::getHelpString(){ try { string helpString = ""; - helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, cutoff, label, basis and probs. The taxonomy and list parameters are required unless you have a valid current file.\n"; + helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, persample, cutoff, label, basis and probs. The taxonomy and list parameters are required unless you have a valid current file.\n"; helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n"; helpString += "The name parameter allows you add a names file with your taxonomy file.\n"; helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n"; @@ -51,6 +53,7 @@ string ClassifyOtuCommand::getHelpString(){ helpString += "Now for basis=otu could give Clostridiales 3 7 6 1 2, where 7 is the number of otus that classified to Clostridiales.\n"; helpString += "6 is the number of otus containing sequences from groupA, 1 is the number of otus containing sequences from groupB, and 2 is the number of otus containing sequences from groupC.\n"; helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n"; + helpString += "The persample parameter allows you to find a consensus taxonomy for each group. Default=f\n"; helpString += "The default value for label is all labels in your inputfile.\n"; helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n"; helpString += "The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n"; @@ -65,25 +68,20 @@ string ClassifyOtuCommand::getHelpString(){ } } //********************************************************************************************************************** -string ClassifyOtuCommand::getOutputFileNameTag(string type, string inputName=""){ - try { - string outputFileName = ""; - map >::iterator it; +string ClassifyOtuCommand::getOutputPattern(string type) { + try { + string pattern = ""; - //is this a type this command creates - it = outputTypes.find(type); - if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } - else { - if (type == "constaxonomy") { outputFileName = "cons.taxonomy"; } - else if (type == "taxsummary") { outputFileName = "cons.tax.summary"; } - else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } - } - return outputFileName; - } - catch(exception& e) { - m->errorOut(e, "ClassifyOtuCommand", "getOutputFileNameTag"); - exit(1); - } + if (type == "constaxonomy") { pattern = "[filename],[distance],cons.taxonomy"; } + else if (type == "taxsummary") { pattern = "[filename],[distance],cons.tax.summary"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "ClassifyOtuCommand", "getOutputPattern"); + exit(1); + } } //********************************************************************************************************************** ClassifyOtuCommand::ClassifyOtuCommand(){ @@ -244,7 +242,7 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { if(label != "all") { m->splitAtDash(label, labels); allLines = 0; } else { allLines = 1; } } - + basis = validParameter.validFile(parameters, "basis", false); if (basis == "not found") { basis = "otu"; } @@ -255,7 +253,18 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; } probs = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "persample", false); if (temp == "not found"){ temp = "f"; } + persample = m->isTrue(temp); + if ((groupfile == "") && (countfile == "")) { if (persample) { m->mothurOut("persample is only valid with a group file, or count file with group information. Setting persample=f.\n"); persample = false; } + } + if (countfile != "") { + CountTable cts; + if (!cts.testGroups(countfile)) { + if (persample) { m->mothurOut("persample is only valid with a group file, or count file with group information. Setting persample=f.\n"); persample = false; } + } + } if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; } @@ -282,11 +291,11 @@ int ClassifyOtuCommand::execute(){ //if user gave a namesfile then use it if (namefile != "") { m->readNames(namefile, nameMap, true); } - if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); } + if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); groups = groupMap->getNamesOfGroups(); } else { groupMap = NULL; } - if (countfile != "") { ct = new CountTable(); ct->readTable(countfile); } + if (countfile != "") { ct = new CountTable(); ct->readTable(countfile, true); if (ct->hasGroupInfo()) { groups = ct->getNamesOfGroups(); } } else { ct = NULL; } - + //read taxonomy file and save in map for easy access in building bin trees m->readTax(taxfile, taxMap); @@ -381,18 +390,13 @@ int ClassifyOtuCommand::execute(){ } } //********************************************************************************************************************** -vector ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size, string& conTax) { +vector ClassifyOtuCommand::findConsensusTaxonomy(vector names, int& size, string& conTax) { try{ conTax = ""; - vector names; vector allNames; map::iterator it; map::iterator it2; - //parse names into vector - string binnames = thisList->get(bin); - m->splitAtComma(binnames, names); - //create a tree containing sequences from this bin PhyloTree* phylo = new PhyloTree(); @@ -524,12 +528,15 @@ int ClassifyOtuCommand::process(ListVector* processList) { if (outputDir == "") { outputDir += m->hasPath(listfile); } ofstream out; - string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("constaxonomy"); + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile)); + variables["[distance]"] = processList->getLabel(); + string outputFile = getOutputFileName("constaxonomy", variables); m->openOutputFile(outputFile, out); outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile); ofstream outSum; - string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("taxsummary"); + string outputSumFile = getOutputFileName("taxsummary", variables); m->openOutputFile(outputSumFile, outSum); outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile); @@ -543,7 +550,40 @@ int ClassifyOtuCommand::process(ListVector* processList) { if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap); } else { taxaSum = new PhyloSummary(groupMap); } } - + + vector outSums; + vector outs; + vector taxaSums; + map groupIndex; + if (persample) { + for (int i = 0; i < groups.size(); i++) { + groupIndex[groups[i]] = i; + ofstream* temp = new ofstream(); + variables["[distance]"] = processList->getLabel() + "." + groups[i]; + string outputFile = getOutputFileName("constaxonomy", variables); + m->openOutputFile(outputFile, *temp); + (*temp) << "OTU\tSize\tTaxonomy" << endl; + outs.push_back(temp); + outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile); + + ofstream* tempSum = new ofstream(); + string outputSumFile = getOutputFileName("taxsummary", variables); + m->openOutputFile(outputSumFile, *tempSum); + outSums.push_back(tempSum); + outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile); + + PhyloSummary* taxaSumt; + if (countfile != "") { + if (refTaxonomy != "") { taxaSumt = new PhyloSummary(refTaxonomy, ct); } + else { taxaSumt = new PhyloSummary(ct); } + }else { + if (refTaxonomy != "") { taxaSumt = new PhyloSummary(refTaxonomy, groupMap); } + else { taxaSumt = new PhyloSummary(groupMap); } + } + taxaSums.push_back(taxaSumt); + } + } + //for each bin in the list vector string snumBins = toString(processList->getNumBins()); for (int i = 0; i < processList->getNumBins(); i++) { @@ -551,9 +591,13 @@ int ClassifyOtuCommand::process(ListVector* processList) { if (m->control_pressed) { break; } vector names; - names = findConsensusTaxonomy(i, processList, size, conTax); + string binnames = processList->get(i); + vector thisNames; + m->splitAtComma(binnames, thisNames); + + names = findConsensusTaxonomy(thisNames, size, conTax); - if (m->control_pressed) { out.close(); return 0; } + if (m->control_pressed) { break; } //output to new names file string binLabel = "Otu"; @@ -571,7 +615,11 @@ int ClassifyOtuCommand::process(ListVector* processList) { //add this bins taxonomy to summary if (basis == "sequence") { - for(int j = 0; j < names.size(); j++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); } + for(int j = 0; j < names.size(); j++) { + int numReps = 1; + if (countfile != "") { numReps = ct->getNumSeqs(names[j]); } + for(int k = 0; k < numReps; k++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); } + } }else { //otu map containsGroup; if (countfile != "") { @@ -602,6 +650,66 @@ int ClassifyOtuCommand::process(ListVector* processList) { } taxaSum->addSeqToTree(noConfidenceConTax, containsGroup); } + + + if (persample) { + //divide names by group + map > parsedNames; + map >::iterator itParsed; + + //parse names by group + for (int j = 0; j < names.size(); j++) { + if (groupfile != "") { + string group = groupMap->getGroup(names[j]); + itParsed = parsedNames.find(group); + + if (itParsed != parsedNames.end()) { itParsed->second.push_back(names[j]); } + else { vector tempNames; tempNames.push_back(names[j]); parsedNames[group] = tempNames; } + }else { //count file was used + vector thisSeqsGroups = ct->getGroups(names[j]); + for (int k = 0; k < thisSeqsGroups.size(); k++) { + string group = thisSeqsGroups[k]; + itParsed = parsedNames.find(group); + + if (itParsed != parsedNames.end()) { itParsed->second.push_back(names[j]); } + else { vector tempNames; tempNames.push_back(names[j]); parsedNames[group] = tempNames; } + } + } + } + + for (itParsed = parsedNames.begin(); itParsed != parsedNames.end(); itParsed++) { + vector theseNames = findConsensusTaxonomy(itParsed->second, size, conTax); + + if (m->control_pressed) { break; } + + //output to new names file + string binLabel = "Otu"; + string sbinNumber = toString(i+1); + if (sbinNumber.length() < snumBins.length()) { + int diff = snumBins.length() - sbinNumber.length(); + for (int h = 0; h < diff; h++) { binLabel += "0"; } + } + binLabel += sbinNumber; + + (*outs[groupIndex[itParsed->first]]) << binLabel << '\t' << size << '\t' << conTax << endl; + + string noConfidenceConTax = conTax; + m->removeConfidences(noConfidenceConTax); + + //add this bins taxonomy to summary + if (basis == "sequence") { + for(int j = 0; j < theseNames.size(); j++) { + int numReps = 1; + if (countfile != "") { numReps = ct->getGroupCount(theseNames[j], itParsed->first); } //get num seqs for this seq from this group + for(int k = 0; k < numReps; k++) { (taxaSums[groupIndex[itParsed->first]])->addSeqToTree(theseNames[j], noConfidenceConTax); } + } + }else { //otu + map containsGroup; + containsGroup[itParsed->first] = true; + (taxaSums[groupIndex[itParsed->first]])->addSeqToTree(noConfidenceConTax, containsGroup); + } + } + } } out.close(); @@ -609,6 +717,17 @@ int ClassifyOtuCommand::process(ListVector* processList) { //print summary file taxaSum->print(outSum); outSum.close(); + + if (persample) { + for (int i = 0; i < groups.size(); i++) { + (*outs[i]).close(); + taxaSums[i]->print(*outSums[i]); + (*outSums[i]).close(); + delete outs[i]; + delete outSums[i]; + delete taxaSums[i]; + } + } delete taxaSum;