]> git.donarmstrong.com Git - mothur.git/blobdiff - classifyotucommand.cpp
working on pam
[mothur.git] / classifyotucommand.cpp
index 660d53c4bdf05c3249b74d84b7a2ba1eb7e5e50e..160928f3aa46906900ac5262f198112fb70fda36 100644 (file)
 #include "classifyotucommand.h"
 #include "phylotree.h"
 #include "phylosummary.h"
+#include "sharedutilities.h"
 
 //**********************************************************************************************************************
 vector<string> ClassifyOtuCommand::setParameters(){    
        try {
-               CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
-               CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
-               CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy);
-        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
-        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
-               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
-               CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
-               CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "",false,false); parameters.push_back(pbasis);
-               CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
-               CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
-               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
-               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+               CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(plist);
+               CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none","constaxonomy",false,true,true); parameters.push_back(ptaxonomy);
+               CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(preftaxonomy);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
+        CommandParameter ppersample("persample", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ppersample);
+        CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
+               CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "","",false,false); parameters.push_back(pbasis);
+               CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "","",false,true); parameters.push_back(pcutoff);
+               CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pprobs);
+               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
                
                vector<string> myArray;
                for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
@@ -40,7 +42,7 @@ vector<string> ClassifyOtuCommand::setParameters(){
 string ClassifyOtuCommand::getHelpString(){    
        try {
                string helpString = "";
-               helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, cutoff, label, basis and probs.  The taxonomy and list parameters are required unless you have a valid current file.\n";
+               helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, persample, cutoff, label, basis and probs.  The taxonomy and list parameters are required unless you have a valid current file.\n";
                helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n";
                helpString += "The name parameter allows you add a names file with your taxonomy file.\n";
                helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n";
@@ -51,6 +53,7 @@ string ClassifyOtuCommand::getHelpString(){
                helpString += "Now for basis=otu could give Clostridiales       3       7       6       1       2, where 7 is the number of otus that classified to Clostridiales.\n";
                helpString += "6 is the number of otus containing sequences from groupA, 1 is the number of otus containing sequences from groupB, and 2 is the number of otus containing sequences from groupC.\n";
                helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
+        helpString += "The persample parameter allows you to find a consensus taxonomy for each group. Default=f\n";
                helpString += "The default value for label is all labels in your inputfile.\n";
                helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
                helpString += "The probs parameter shuts off the outputting of the consensus confidence results. The default is true, meaning you want the confidence to be shown.\n";
@@ -65,25 +68,20 @@ string ClassifyOtuCommand::getHelpString(){
        }
 }
 //**********************************************************************************************************************
-string ClassifyOtuCommand::getOutputFileNameTag(string type, string inputName=""){     
-       try {
-        string outputFileName = "";
-               map<string, vector<string> >::iterator it;
+string ClassifyOtuCommand::getOutputPattern(string type) {
+    try {
+        string pattern = "";
         
-        //is this a type this command creates
-        it = outputTypes.find(type);
-        if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
-        else {
-            if (type == "constaxonomy") {  outputFileName =  "cons.taxonomy"; }
-            else if (type == "taxsummary") {  outputFileName =  "cons.tax.summary"; }
-            else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
-        }
-        return outputFileName;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ClassifyOtuCommand", "getOutputFileNameTag");
-               exit(1);
-       }
+        if (type == "constaxonomy") {  pattern = "[filename],[distance],cons.taxonomy"; } 
+        else if (type == "taxsummary") {  pattern = "[filename],[distance],cons.tax.summary"; } 
+        else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
+        
+        return pattern;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyOtuCommand", "getOutputPattern");
+        exit(1);
+    }
 }
 //**********************************************************************************************************************
 ClassifyOtuCommand::ClassifyOtuCommand(){      
@@ -244,7 +242,7 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
                                if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
                                else { allLines = 1;  }
                        }
-                       
+            
                        basis = validParameter.validFile(parameters, "basis", false);
                        if (basis == "not found") { basis = "otu"; }    
                        
@@ -255,7 +253,18 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
                        
                        temp = validParameter.validFile(parameters, "probs", false);                                    if (temp == "not found"){       temp = "true";                  }
                        probs = m->isTrue(temp);
+            
+            temp = validParameter.validFile(parameters, "persample", false);           if (temp == "not found"){       temp = "f";             }
+                       persample = m->isTrue(temp);
                        
+            if ((groupfile == "") && (countfile == "")) { if (persample) { m->mothurOut("persample is only valid with a group file, or count file with group information. Setting persample=f.\n"); persample = false; } 
+            }
+            if (countfile != "") {
+                CountTable cts;
+                if (!cts.testGroups(countfile)) { 
+                    if (persample) { m->mothurOut("persample is only valid with a group file, or count file with group information. Setting persample=f.\n"); persample = false; }
+                }
+            }
                        
                        if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
                        
@@ -282,11 +291,11 @@ int ClassifyOtuCommand::execute(){
                
                //if user gave a namesfile then use it
                if (namefile != "")     {       m->readNames(namefile, nameMap, true);  }
-        if (groupfile != "")    {   groupMap = new GroupMap(groupfile);  groupMap->readMap(); }
+        if (groupfile != "")    {   groupMap = new GroupMap(groupfile);  groupMap->readMap();  groups = groupMap->getNamesOfGroups(); }
         else { groupMap = NULL;  }
-        if (countfile != "") {  ct = new CountTable(); ct->readTable(countfile);    }
+        if (countfile != "") {  ct = new CountTable(); ct->readTable(countfile, true, false);  if (ct->hasGroupInfo()) { groups = ct->getNamesOfGroups(); } }
         else {  ct = NULL;    }
-               
+        
                //read taxonomy file and save in map for easy access in building bin trees
                m->readTax(taxfile, taxMap);
                
@@ -381,18 +390,13 @@ int ClassifyOtuCommand::execute(){
        }
 }
 //**********************************************************************************************************************
-vector<string> ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, int& size, string& conTax) {
+vector<string> ClassifyOtuCommand::findConsensusTaxonomy(vector<string> names, int& size, string& conTax) {
        try{
                conTax = "";
-               vector<string> names;
                vector<string> allNames;
                map<string, string>::iterator it;
                map<string, string>::iterator it2;
 
-               //parse names into vector
-               string binnames = thisList->get(bin);
-               m->splitAtComma(binnames, names);
-
                //create a tree containing sequences from this bin
                PhyloTree* phylo = new PhyloTree();
                
@@ -524,12 +528,15 @@ int ClassifyOtuCommand::process(ListVector* processList) {
                if (outputDir == "") { outputDir += m->hasPath(listfile); }
                                
                ofstream out;
-               string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("constaxonomy");
+        map<string, string> variables; 
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
+        variables["[distance]"] = processList->getLabel();
+               string outputFile = getOutputFileName("constaxonomy", variables);
                m->openOutputFile(outputFile, out);
                outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
                
                ofstream outSum;
-               string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("taxsummary");
+               string outputSumFile = getOutputFileName("taxsummary", variables);
                m->openOutputFile(outputSumFile, outSum);
                outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
                
@@ -543,35 +550,69 @@ int ClassifyOtuCommand::process(ListVector* processList) {
             if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap);  }
             else {  taxaSum = new PhyloSummary(groupMap); }
         }
-
+        
+        vector<ofstream*> outSums;
+        vector<ofstream*> outs;
+        vector<PhyloSummary*> taxaSums;
+        map<string, int> groupIndex;
+        if (persample) {
+            for (int i = 0; i < groups.size(); i++) {
+                groupIndex[groups[i]] = i;
+                ofstream* temp = new ofstream();
+                variables["[distance]"] = processList->getLabel() + "." + groups[i];
+                string outputFile = getOutputFileName("constaxonomy", variables);
+                m->openOutputFile(outputFile, *temp);
+                (*temp) << "OTU\tSize\tTaxonomy" << endl;
+                outs.push_back(temp);
+                outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
+                
+                ofstream* tempSum = new ofstream();
+                string outputSumFile = getOutputFileName("taxsummary", variables);
+                m->openOutputFile(outputSumFile, *tempSum);
+                outSums.push_back(tempSum);
+                outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
+                
+                PhyloSummary* taxaSumt;
+                if (countfile != "") {
+                    if (refTaxonomy != "") { taxaSumt = new PhyloSummary(refTaxonomy, ct);  }
+                    else {  taxaSumt = new PhyloSummary(ct); }
+                }else {
+                    if (refTaxonomy != "") { taxaSumt = new PhyloSummary(refTaxonomy, groupMap);  }
+                    else {  taxaSumt = new PhyloSummary(groupMap); }
+                }
+                taxaSums.push_back(taxaSumt);
+            }
+        }
+        
                //for each bin in the list vector
         string snumBins = toString(processList->getNumBins());
+        vector<string> binLabels = processList->getLabels();
                for (int i = 0; i < processList->getNumBins(); i++) {
                        
                        if (m->control_pressed) { break; }
                        
                        vector<string> names;
-                       names = findConsensusTaxonomy(i, processList, size, conTax);
+            string binnames = processList->get(i);
+            vector<string> thisNames;
+            m->splitAtComma(binnames, thisNames);
+            
+                       names = findConsensusTaxonomy(thisNames, size, conTax);
                
-                       if (m->control_pressed) { out.close();  return 0; }
-                       
-                       //output to new names file
-            string binLabel = "Otu";
-            string sbinNumber = toString(i+1);
-            if (sbinNumber.length() < snumBins.length()) { 
-                int diff = snumBins.length() - sbinNumber.length();
-                for (int h = 0; h < diff; h++) { binLabel += "0"; }
-            }
-            binLabel += sbinNumber;
+                       if (m->control_pressed) { break; }
 
-                       out << binLabel << '\t' << size << '\t' << conTax << endl;
+                       out << binLabels[i] << '\t' << size << '\t' << conTax << endl;
                        
                        string noConfidenceConTax = conTax;
                        m->removeConfidences(noConfidenceConTax);
                        
                        //add this bins taxonomy to summary
                        if (basis == "sequence") {
-                               for(int j = 0; j < names.size(); j++) {  taxaSum->addSeqToTree(names[j], noConfidenceConTax);  }
+                               for(int j = 0; j < names.size(); j++) {  
+                    //int numReps = 1;
+                    //if (countfile != "") {  numReps = ct->getNumSeqs(names[j]); }
+                    //for(int k = 0; k < numReps; k++) {  taxaSum->addSeqToTree(names[j], noConfidenceConTax);  }
+                    taxaSum->addSeqToTree(names[j], noConfidenceConTax);
+                }
                        }else { //otu
                 map<string, bool> containsGroup; 
                 if (countfile != "") {
@@ -602,6 +643,58 @@ int ClassifyOtuCommand::process(ListVector* processList) {
                 }
                                taxaSum->addSeqToTree(noConfidenceConTax, containsGroup);
                        }
+            
+            
+            if (persample) {
+                //divide names by group
+                map<string, vector<string> > parsedNames;
+                map<string, vector<string> >::iterator itParsed;
+                
+                //parse names by group
+                for (int j = 0; j < names.size(); j++) {
+                    if (groupfile != "") { 
+                        string group = groupMap->getGroup(names[j]); 
+                        itParsed = parsedNames.find(group);
+                        
+                        if (itParsed != parsedNames.end()) { itParsed->second.push_back(names[j]); }
+                        else { vector<string> tempNames; tempNames.push_back(names[j]); parsedNames[group] = tempNames; }
+                    }else { //count file was used
+                        vector<string> thisSeqsGroups = ct->getGroups(names[j]);
+                        for (int k = 0; k < thisSeqsGroups.size(); k++) {
+                            string group = thisSeqsGroups[k]; 
+                            itParsed = parsedNames.find(group);
+                            
+                            if (itParsed != parsedNames.end()) { itParsed->second.push_back(names[j]); }
+                            else { vector<string> tempNames; tempNames.push_back(names[j]); parsedNames[group] = tempNames; }
+                        }
+                    }
+                }
+                
+                for (itParsed = parsedNames.begin(); itParsed != parsedNames.end(); itParsed++) {
+                    vector<string> theseNames = findConsensusTaxonomy(itParsed->second, size, conTax);
+                    
+                    if (m->control_pressed) { break; }
+                    
+                    
+                    (*outs[groupIndex[itParsed->first]]) << binLabels[i] << '\t' << size << '\t' << conTax << endl;
+                    
+                    string noConfidenceConTax = conTax;
+                    m->removeConfidences(noConfidenceConTax);
+                    
+                    //add this bins taxonomy to summary
+                    if (basis == "sequence") {
+                        for(int j = 0; j < theseNames.size(); j++) {  
+                            int numReps = 1;
+                            if (countfile != "") {  numReps = ct->getGroupCount(theseNames[j], itParsed->first); } //get num seqs for this seq from this group
+                            for(int k = 0; k < numReps; k++) {  (taxaSums[groupIndex[itParsed->first]])->addSeqToTree(theseNames[j], noConfidenceConTax);  }
+                        }
+                    }else { //otu
+                        map<string, bool> containsGroup; 
+                        containsGroup[itParsed->first] = true;
+                        (taxaSums[groupIndex[itParsed->first]])->addSeqToTree(noConfidenceConTax, containsGroup);
+                    }
+                }
+            }
                }
 
                out.close();
@@ -609,6 +702,17 @@ int ClassifyOtuCommand::process(ListVector* processList) {
                //print summary file
                taxaSum->print(outSum);
                outSum.close();
+        
+        if (persample) {
+            for (int i = 0; i < groups.size(); i++) {
+                (*outs[i]).close();
+                taxaSums[i]->print(*outSums[i]);
+                (*outSums[i]).close();
+                delete outs[i];
+                delete outSums[i];
+                delete taxaSums[i];
+            }
+        }
                
                delete taxaSum;