From 6c2b1e530a5c0bb87040e58a3e410097acdfcc3d Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Fri, 7 Sep 2012 14:30:06 -0400 Subject: [PATCH] major change to the tree class to use the count table class instead of tree map. This change involved changes to subsample class for trees, changes to the tree reader and readTree class. This change fixed bug with parsimony random tree generation that was leaving out the name file info. added count file to unifrac.unweighted, unifrac.weighted, parsimony, bin.seqs, classify.otu, classify.seqs, classify.tree, cluster.fragments, consensus.seqs, unique.seqs, phylo.diversity, summary.tax. --- Mothur.xcodeproj/project.pbxproj | 4 +- aligncommand.cpp | 1 - binsequencecommand.cpp | 98 ++++++-- binsequencecommand.h | 5 +- chimeraperseuscommand.cpp | 2 +- classifyotucommand.cpp | 114 +++++++-- classifyotucommand.h | 6 +- classifyseqscommand.cpp | 197 +++++++++++---- classifyseqscommand.h | 5 +- classifytreecommand.cpp | 59 +++-- classifytreecommand.h | 4 +- clusterfragmentscommand.cpp | 65 +++-- clusterfragmentscommand.h | 4 +- consensus.cpp | 5 +- consensusseqscommand.cpp | 312 +++++++++++++---------- consensusseqscommand.h | 10 +- countseqscommand.cpp | 3 +- counttable.cpp | 418 ++++++++++++++++++++++++++++++- counttable.h | 19 +- deconvolutecommand.cpp | 88 +++++-- deconvolutecommand.h | 3 +- deuniquetreecommand.cpp | 5 +- indicatorcommand.cpp | 39 +-- indicatorcommand.h | 4 +- parsimony.cpp | 20 +- parsimony.h | 6 +- parsimonycommand.cpp | 88 ++++--- parsimonycommand.h | 8 +- phylodiversitycommand.cpp | 59 +++-- phylodiversitycommand.h | 6 +- phylosummary.cpp | 235 ++++++++++------- phylosummary.h | 12 +- readtree.cpp | 103 +++----- readtree.h | 15 +- secondarystructurecommand.cpp | 5 +- sensspeccommand.cpp | 18 +- shhhercommand.cpp | 2 +- subsample.cpp | 180 +++---------- subsample.h | 12 +- summarytaxcommand.cpp | 76 ++++-- summarytaxcommand.h | 3 +- tree.cpp | 262 ++++++++++--------- tree.h | 21 +- treegroupscommand.cpp | 41 +-- treegroupscommand.h | 4 +- treemap.cpp | 4 +- treemap.h | 4 +- treereader.cpp | 126 ++++------ treereader.h | 15 +- trimseqscommand.cpp | 27 +- unifracunweightedcommand.cpp | 116 +++++---- unifracunweightedcommand.h | 8 +- unifracweightedcommand.cpp | 124 +++++---- unifracweightedcommand.h | 8 +- unweighted.cpp | 46 ++-- unweighted.h | 10 +- validparameter.cpp | 8 + weighted.cpp | 34 +-- weighted.h | 6 +- 59 files changed, 2038 insertions(+), 1144 deletions(-) diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 4c2e330..37932a4 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -2286,8 +2286,8 @@ GCC_MODEL_TUNING = ""; GCC_OPTIMIZATION_LEVEL = 0; GCC_PREPROCESSOR_DEFINITIONS = ( - "VERSION=\"\\\"1.26.0\\\"\"", - "RELEASE_DATE=\"\\\"7/9/2012\\\"\"", + "VERSION=\"\\\"1.27.0\\\"\"", + "RELEASE_DATE=\"\\\"8/8/2012\\\"\"", ); GCC_WARN_ABOUT_MISSING_NEWLINE = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES; diff --git a/aligncommand.cpp b/aligncommand.cpp index a68fbfc..efc8ce4 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -572,7 +572,6 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { alignment->resize(candidateSeq->getUnaligned().length()+1); } - Sequence temp = templateDB->findClosestSequence(candidateSeq); Sequence* templateSeq = &temp; diff --git a/binsequencecommand.cpp b/binsequencecommand.cpp index 0c867e3..ad71d10 100644 --- a/binsequencecommand.cpp +++ b/binsequencecommand.cpp @@ -15,8 +15,9 @@ vector BinSeqCommand::setParameters(){ try { CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -34,7 +35,7 @@ vector BinSeqCommand::setParameters(){ string BinSeqCommand::getHelpString(){ try { string helpString = ""; - helpString += "The bin.seqs command parameters are list, fasta, name, label and group. The fasta and list are required, unless you have a valid current list and fasta file.\n"; + helpString += "The bin.seqs command parameters are list, fasta, name, count, label and group. The fasta and list are required, unless you have a valid current list and fasta file.\n"; helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and are separated by dashes.\n"; helpString += "The bin.seqs command should be in the following format: bin.seqs(fasta=yourFastaFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n"; helpString += "Example bin.seqs(fasta=amazon.fasta, group=amazon.groups, name=amazon.names).\n"; @@ -147,6 +148,14 @@ BinSeqCommand::BinSeqCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -195,11 +204,26 @@ BinSeqCommand::BinSeqCommand(string option) { if (groupfile == "not open") { abort = true; } else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namesfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } - if (namesfile == ""){ - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + + if (countfile == "") { + if (namesfile == ""){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -229,9 +253,8 @@ int BinSeqCommand::execute(){ fasta->readFastaFile(fastafile); //if user gave a namesfile then use it - if (namesfile != "") { - readNamesFile(); - } + if (namesfile != "") { readNamesFile(); } + if (countfile != "") { ct.readTable(countfile); } input = new InputData(listfile, "list"); list = input->getListVector(); @@ -362,7 +385,7 @@ void BinSeqCommand::readNamesFile() { //return 1 if error, 0 otherwise int BinSeqCommand::process(ListVector* list) { try { - string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + list->getLabel() + getOutputFileNameTag("fasta"); + string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + list->getLabel() + "." + getOutputFileNameTag("fasta"); m->openOutputFile(outputFileName, out); outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName); @@ -381,24 +404,47 @@ int BinSeqCommand::process(ListVector* list) { //do work for that name string sequence = fasta->getSequence(name); - if (sequence != "not found") { - //if you don't have groups - if (groupfile == "") { - name = name + "\t" + toString(i+1); - out << ">" << name << endl; - out << sequence << endl; - }else {//if you do have groups - string group = groupMap->getGroup(name); - if (group == "not found") { - m->mothurOut(name + " is missing from your group file. Please correct. "); m->mothurOutEndLine(); - return 1; - }else{ - name = name + "\t" + group + "\t" + toString(i+1); + + if (countfile != "") { + if (sequence != "not found") { + if (ct.hasGroupInfo()) { + vector groups = ct.getGroups(name); + string groupInfo = ""; + for (int k = 0; k < groups.size()-1; k++) { + groupInfo += groups[k] + "-"; + } + if (groups.size() != 0) { groupInfo += groups[groups.size()-1]; } + else { groupInfo = "not found"; } + name = name + "\t" + groupInfo + "\t" + toString(i+1)+ "\tNumRep=" + toString(ct.getNumSeqs(name)); + out << ">" << name << endl; + out << sequence << endl; + }else { + name = name + "\t" + toString(i+1) + "\tNumRep=" + toString(ct.getNumSeqs(name)); + out << ">" << name << endl; + out << sequence << endl; + } + + }else { m->mothurOut(name + " is missing from your fasta. Does your list file contain all sequence names or just the uniques?"); m->mothurOutEndLine(); return 1; } + }else { + if (sequence != "not found") { + //if you don't have groups + if (groupfile == "") { + name = name + "\t" + toString(i+1); out << ">" << name << endl; out << sequence << endl; + }else {//if you do have groups + string group = groupMap->getGroup(name); + if (group == "not found") { + m->mothurOut(name + " is missing from your group file. Please correct. "); m->mothurOutEndLine(); + return 1; + }else{ + name = name + "\t" + group + "\t" + toString(i+1); + out << ">" << name << endl; + out << sequence << endl; + } } - } - }else { m->mothurOut(name + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine(); return 1; } + }else { m->mothurOut(name + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine(); return 1; } + } } } diff --git a/binsequencecommand.h b/binsequencecommand.h index 1fb5664..5bdd401 100644 --- a/binsequencecommand.h +++ b/binsequencecommand.h @@ -16,6 +16,7 @@ #include "listvector.hpp" #include "fastamap.h" #include "groupmap.h" +#include "counttable.h" class BinSeqCommand : public Command { @@ -36,14 +37,14 @@ public: void help() { m->mothurOut(getHelpString()); } private: - + CountTable ct; ListVector* list; InputData* input; FastaMap* fasta; GroupMap* groupMap; bool abort, allLines; set labels; //holds labels to be used - string filename, fastafile, listfile, namesfile, groupfile, label, outputDir; + string filename, fastafile, listfile, namesfile, groupfile, countfile, label, outputDir; ofstream out; ifstream in, inNames; vector outputNames; diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp index e57c48a..7ae5d69 100644 --- a/chimeraperseuscommand.cpp +++ b/chimeraperseuscommand.cpp @@ -295,7 +295,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { bool ignore = false; if (countfileNames[i] == "current") { countfileNames[i] = m->getCountTableFile(); - if (nameFileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + if (countfileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } else { m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; //erase from file list diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp index 00ae690..660d53c 100644 --- a/classifyotucommand.cpp +++ b/classifyotucommand.cpp @@ -17,8 +17,9 @@ vector ClassifyOtuCommand::setParameters(){ CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy); CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "",false,false); parameters.push_back(pbasis); CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff); @@ -39,11 +40,12 @@ vector ClassifyOtuCommand::setParameters(){ string ClassifyOtuCommand::getHelpString(){ try { string helpString = ""; - helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, cutoff, label, basis and probs. The taxonomy and list parameters are required unless you have a valid current file.\n"; + helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, cutoff, label, basis and probs. The taxonomy and list parameters are required unless you have a valid current file.\n"; helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n"; helpString += "The name parameter allows you add a names file with your taxonomy file.\n"; helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n"; - helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n"; + helpString += "The count parameter allows you add a count file associated with your list file. When using the count parameter mothur assumes your list file contains only uniques.\n"; + helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n"; helpString += "For example consider the following basis=sequence could give Clostridiales 3 105 16 43 46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n"; helpString += "16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n"; helpString += "Now for basis=otu could give Clostridiales 3 7 6 1 2, where 7 is the number of otus that classified to Clostridiales.\n"; @@ -172,6 +174,14 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -211,6 +221,20 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { if (groupfile == "not open") { abort = true; } else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -235,10 +259,12 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; } - if (namefile == ""){ - vector files; files.push_back(taxfile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == ""){ + vector files; files.push_back(taxfile); + parser.getNameFile(files); + } + } } } @@ -255,7 +281,11 @@ int ClassifyOtuCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } //if user gave a namesfile then use it - if (namefile != "") { m->readNames(namefile, nameMap, true); } + if (namefile != "") { m->readNames(namefile, nameMap, true); } + if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); } + else { groupMap = NULL; } + if (countfile != "") { ct = new CountTable(); ct->readTable(countfile); } + else { ct = NULL; } //read taxonomy file and save in map for easy access in building bin trees m->readTax(taxfile, taxMap); @@ -270,7 +300,7 @@ int ClassifyOtuCommand::execute(){ set processedLabels; set userLabels = labels; - if (m->control_pressed) { outputTypes.clear(); delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { @@ -278,7 +308,7 @@ int ClassifyOtuCommand::execute(){ m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine(); process(list); - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; } + if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; return 0; } processedLabels.insert(list->getLabel()); userLabels.erase(list->getLabel()); @@ -293,7 +323,7 @@ int ClassifyOtuCommand::execute(){ process(list); - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; } processedLabels.insert(list->getLabel()); userLabels.erase(list->getLabel()); @@ -329,10 +359,12 @@ int ClassifyOtuCommand::execute(){ process(list); delete list; - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; } } delete input; + if (groupMap != NULL) { delete groupMap; } + if (ct != NULL) { delete ct; } if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -400,10 +432,16 @@ vector ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* th if (it == taxMap.end()) { //this name is not in taxonomy file, skip it m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }else{ + if (countfile != "") { + int numDups = ct->getNumSeqs(names[i]); + for (int j = 0; j < numDups; j++) { phylo->addSeqToTree(names[i], it->second); } + size += numDups; + }else{ //add seq to tree - phylo->addSeqToTree(names[i], it->second); - size++; - allNames.push_back(names[i]); + phylo->addSeqToTree(names[i], it->second); + size++; + } + allNames.push_back(names[i]); } } @@ -486,24 +524,25 @@ int ClassifyOtuCommand::process(ListVector* processList) { if (outputDir == "") { outputDir += m->hasPath(listfile); } ofstream out; - string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + getOutputFileNameTag("constaxonomy"); + string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("constaxonomy"); m->openOutputFile(outputFile, out); outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile); ofstream outSum; - string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + getOutputFileNameTag("taxsummary"); + string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("taxsummary"); m->openOutputFile(outputSumFile, outSum); outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile); out << "OTU\tSize\tTaxonomy" << endl; PhyloSummary* taxaSum; - if (refTaxonomy != "") { - taxaSum = new PhyloSummary(refTaxonomy, groupfile); + if (countfile != "") { + if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, ct); } + else { taxaSum = new PhyloSummary(ct); } }else { - taxaSum = new PhyloSummary(groupfile); - } - + if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap); } + else { taxaSum = new PhyloSummary(groupMap); } + } //for each bin in the list vector string snumBins = toString(processList->getNumBins()); @@ -534,7 +573,34 @@ int ClassifyOtuCommand::process(ListVector* processList) { if (basis == "sequence") { for(int j = 0; j < names.size(); j++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); } }else { //otu - taxaSum->addSeqToTree(noConfidenceConTax, names); + map containsGroup; + if (countfile != "") { + if (ct->hasGroupInfo()) { + vector mGroups = ct->getNamesOfGroups(); + for (int k = 0; k < names.size(); k++) { + vector counts = ct->getGroupCounts(names[k]); + for (int h = 0; h < counts.size(); h++) { + if (counts[h] != 0) { containsGroup[mGroups[h]] = true; } + } + } + } + }else { + if (groupfile != "") { + vector mGroups = groupMap->getNamesOfGroups(); + for (int j = 0; j < mGroups.size(); j++) { containsGroup[mGroups[j]] = false; } + + for (int k = 0; k < names.size(); k++) { + //find out the sequences group + string group = groupMap->getGroup(names[k]); + + if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); } + else { + containsGroup[group] = true; + } + } + } + } + taxaSum->addSeqToTree(noConfidenceConTax, containsGroup); } } diff --git a/classifyotucommand.h b/classifyotucommand.h index b924775..45e377c 100644 --- a/classifyotucommand.h +++ b/classifyotucommand.h @@ -13,6 +13,7 @@ #include "command.hpp" #include "listvector.hpp" #include "inputdata.h" +#include "counttable.h" class ClassifyOtuCommand : public Command { @@ -34,10 +35,11 @@ public: void help() { m->mothurOut(getHelpString()); } private: - + GroupMap* groupMap; + CountTable* ct; ListVector* list; InputData* input; - string listfile, namefile, taxfile, label, outputDir, refTaxonomy, groupfile, basis; + string listfile, namefile, taxfile, label, outputDir, refTaxonomy, groupfile, basis, countfile; bool abort, allLines, probs; int cutoff; set labels; //holds labels to be used diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index c76b047..43a021e 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -17,8 +17,10 @@ vector ClassifySeqsCommand::setParameters(){ CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy); CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate); CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch); CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize); CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod); @@ -50,11 +52,12 @@ string ClassifySeqsCommand::getHelpString(){ try { string helpString = ""; helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n"; - helpString += "The classify.seqs command parameters are reference, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"; + helpString += "The classify.seqs command parameters are reference, fasta, name, group, count, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"; helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"; helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n"; helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n"; helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n"; + helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n"; helpString += "The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n"; helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"; helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n"; @@ -127,7 +130,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(){ ClassifySeqsCommand::ClassifySeqsCommand(string option) { try { abort = false; calledHelp = false; - rdb = ReferenceDB::getInstance(); + rdb = ReferenceDB::getInstance(); hasName = false; hasCount=false; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -185,6 +188,14 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } fastaFileName = validParameter.validFile(parameters, "fasta", false); @@ -333,11 +344,90 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { } } } - + + if (namefileNames.size() != 0) { hasName = true; } + if (namefile != "") { if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); } } + //check for required parameters + countfile = validParameter.validFile(parameters, "count", false); + if (countfile == "not found") { + countfile = ""; + }else { + m->splitAtDash(countfile, countfileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < countfileNames.size(); i++) { + + bool ignore = false; + if (countfileNames[i] == "current") { + countfileNames[i] = m->getCountTableFile(); + if (countfileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(countfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(countfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + }else { + m->setCountTableFile(countfileNames[i]); + } + } + } + } + + if (countfileNames.size() != 0) { hasCount = true; if (countfileNames.size() != fastaFileNames.size()) {m->mothurOut("If you provide a count file, you must have one for each fasta file."); m->mothurOutEndLine(); } } + + //make sure there is at least one valid file left + if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + groupfile = validParameter.validFile(parameters, "group", false); if (groupfile == "not found") { groupfile = ""; } else { @@ -393,6 +483,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (groupfile != "") { if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); } + if (hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } }else { for (int i = 0; i < fastaFileNames.size(); i++) { groupfileNames.push_back(""); } } @@ -481,10 +572,12 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { } if (!abort) { - if (namefileNames.size() == 0){ - if (fastaFileNames.size() != 0) { - vector files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); - parser.getNameFile(files); + if (!hasCount) { + if (namefileNames.size() == 0){ + if (fastaFileNames.size() != 0) { + vector files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); + parser.getNameFile(files); + } } } } @@ -694,46 +787,58 @@ int ClassifySeqsCommand::execute(){ } #endif - string group = ""; - if (groupfile != "") { group = groupfileNames[s]; } - - PhyloSummary taxaSum(baseTName, group); - - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } - - if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); } - else { - ifstream in; - m->openInputFile(tempTaxonomyFile, in); - - //read in users taxonomy file and add sequences to tree - string name, taxon; - - while(!in.eof()){ - in >> name >> taxon; m->gobble(in); - - itNames = nameMap.find(name); - - if (itNames == nameMap.end()) { - m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); - }else{ - for (int i = 0; i < itNames->second.size(); i++) { - taxaSum.addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs - } - itNames->second.clear(); - nameMap.erase(itNames->first); - } - } - in.close(); - } + string group = ""; + GroupMap* groupMap = NULL; + CountTable* ct = NULL; + PhyloSummary* taxaSum; + if (hasCount) { + ct = new CountTable(); + ct->readTable(countfileNames[s]); + taxaSum = new PhyloSummary(baseTName, ct); + taxaSum->summarize(tempTaxonomyFile); + }else { + if (groupfile != "") { group = groupfileNames[s]; groupMap = new GroupMap(group); groupMap->readMap(); } + + taxaSum = new PhyloSummary(baseTName, groupMap); + + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + + if (namefile == "") { taxaSum->summarize(tempTaxonomyFile); } + else { + ifstream in; + m->openInputFile(tempTaxonomyFile, in); + + //read in users taxonomy file and add sequences to tree + string name, taxon; + + while(!in.eof()){ + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + + in >> name >> taxon; m->gobble(in); + + itNames = nameMap.find(name); + + if (itNames == nameMap.end()) { + m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); + }else{ + for (int i = 0; i < itNames->second.size(); i++) { + taxaSum->addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs + } + itNames->second.clear(); + nameMap.erase(itNames->first); + } + } + in.close(); + } + } m->mothurRemove(tempTaxonomyFile); - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } //print summary file ofstream outTaxTree; m->openOutputFile(taxSummary, outTaxTree); - taxaSum.print(outTaxTree); + taxaSum->print(outTaxTree); outTaxTree.close(); //output taxonomy with the unclassified bins added @@ -745,12 +850,12 @@ int ClassifySeqsCommand::execute(){ m->openOutputFile(unclass, outTax); //get maxLevel from phylotree so you know how many 'unclassified's to add - int maxLevel = taxaSum.getMaxLevel(); + int maxLevel = taxaSum->getMaxLevel(); //read taxfile - this reading and rewriting is done to preserve the confidence scores. string name, taxon; while (!inTax.eof()) { - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } m->mothurRemove(unclass); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } m->mothurRemove(unclass); delete classify; return 0; } inTax >> name >> taxon; m->gobble(inTax); @@ -761,6 +866,8 @@ int ClassifySeqsCommand::execute(){ inTax.close(); outTax.close(); + if (ct != NULL) { delete ct; } + if (groupMap != NULL) { delete groupMap; } delete taxaSum; m->mothurRemove(newTaxonomyFile); rename(unclass.c_str(), newTaxonomyFile.c_str()); diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 4965642..6d43dcb 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -62,6 +62,7 @@ private: vector lines; vector fastaFileNames; vector namefileNames; + vector countfileNames; vector groupfileNames; vector outputNames; map > nameMap; @@ -70,10 +71,10 @@ private: Classify* classify; ReferenceDB* rdb; - string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; + string fastaFileName, templateFileName, countfile, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; int processors, kmerSize, numWanted, cutoff, iters; float match, misMatch, gapOpen, gapExtend; - bool abort, probs, save, flip; + bool abort, probs, save, flip, hasName, hasCount; int driver(linePair*, string, string, string, string); int createProcesses(string, string, string, string); diff --git a/classifytreecommand.cpp b/classifytreecommand.cpp index 7861a01..69da8e0 100644 --- a/classifytreecommand.cpp +++ b/classifytreecommand.cpp @@ -15,8 +15,9 @@ vector ClassifyTreeCommand::setParameters(){ try { CommandParameter ptree("tree", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptree); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptaxonomy); - CommandParameter pname("name", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -37,8 +38,9 @@ string ClassifyTreeCommand::getHelpString(){ helpString += "The classify.tree command reads a tree and taxonomy file and output the consensus taxonomy for each node on the tree. \n"; helpString += "If you provide a group file, the concensus for each group will also be provided. \n"; helpString += "The new tree contains labels at each internal node. The label is the node number so you can relate the tree to the summary file.\n"; + helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n"; helpString += "The summary file lists the concensus taxonomy for the descendants of each node.\n"; - helpString += "The classify.tree command parameters are tree, group, name and taxonomy. The tree and taxonomy files are required.\n"; + helpString += "The classify.tree command parameters are tree, group, name, count and taxonomy. The tree and taxonomy files are required.\n"; helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n"; helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n"; helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e.yourTreefile).\n"; @@ -147,6 +149,14 @@ ClassifyTreeCommand::ClassifyTreeCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -178,16 +188,30 @@ ClassifyTreeCommand::ClassifyTreeCommand(string option) { else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; } m->mothurConvert(temp, cutoff); if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; } - if (namefile == "") { - vector files; files.push_back(treefile); - parser.getNameFile(files); + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } } - } } catch(exception& e) { @@ -213,7 +237,7 @@ int ClassifyTreeCommand::execute(){ TreeReader* reader = new TreeReader(treefile, groupfile, namefile); vector T = reader->getTrees(); - TreeMap* tmap = T[0]->getTreeMap(); + CountTable* tmap = T[0]->getCountTable(); Tree* outputTree = T[0]; delete reader; @@ -367,10 +391,15 @@ string ClassifyTreeCommand::getTaxonomy(set names, int& size) { if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it m->mothurOut((*it) + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }else{ - //add seq to tree - phylo->addSeqToTree((*it), itTax->second); - size++; - } + if (countfile != "") { + int numDups = ct->getNumSeqs((*it)); + for (int j = 0; j < numDups; j++) { phylo->addSeqToTree((*it), itTax->second); } + size += numDups; + }else{ + //add seq to tree + phylo->addSeqToTree((*it), itTax->second); + size++; + } } } if (m->control_pressed) { delete phylo; return conTax; } @@ -444,12 +473,12 @@ map > ClassifyTreeCommand::getDescendantList(Tree*& T, int i int lc = T->tree[i].getLChild(); int rc = T->tree[i].getRChild(); - TreeMap* tmap = T->getTreeMap(); + // TreeMap* tmap = T->getTreeMap(); if (lc == -1) { //you are a leaf your only descendant is yourself - string group = tmap->getGroup(T->tree[i].getName()); + vector groups = T->tree[i].getGroup(); set mynames; mynames.insert(T->tree[i].getName()); - names[group] = mynames; //mygroup -> me + for (int j = 0; j < groups.size(); j++) { names[groups[j]] = mynames; } //mygroup -> me names["AllGroups"] = mynames; }else{ //your descedants are the combination of your childrens descendants names = descendants[lc]; diff --git a/classifytreecommand.h b/classifytreecommand.h index 758a438..dd972b6 100644 --- a/classifytreecommand.h +++ b/classifytreecommand.h @@ -12,6 +12,7 @@ #include "command.hpp" #include "readtree.h" #include "treemap.h" +#include "counttable.h" class ClassifyTreeCommand : public Command { public: @@ -31,13 +32,14 @@ public: void help() { m->mothurOut(getHelpString()); } private: - string treefile, taxonomyfile, groupfile, namefile, outputDir; + string treefile, taxonomyfile, groupfile, namefile, countfile, outputDir; bool abort; vector outputNames; int numUniquesInName, cutoff; map nameMap; map nameCount; map taxMap; + CountTable* ct; int getClassifications(Tree*&); map > getDescendantList(Tree*&, int, map > >); diff --git a/clusterfragmentscommand.cpp b/clusterfragmentscommand.cpp index 4a33841..378e8ab 100644 --- a/clusterfragmentscommand.cpp +++ b/clusterfragmentscommand.cpp @@ -29,7 +29,8 @@ inline bool comparePriority(seqRNode first, seqRNode second) { vector ClusterFragmentsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs); CommandParameter ppercent("percent", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppercent); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -49,8 +50,8 @@ string ClusterFragmentsCommand::getHelpString(){ try { string helpString = ""; helpString += "The cluster.fragments command groups sequences that are part of a larger sequence.\n"; - helpString += "The cluster.fragments command outputs a new fasta and name file.\n"; - helpString += "The cluster.fragments command parameters are fasta, name, diffs and percent. The fasta parameter is required, unless you have a valid current file. \n"; + helpString += "The cluster.fragments command outputs a new fasta and name or count file.\n"; + helpString += "The cluster.fragments command parameters are fasta, name, count, diffs and percent. The fasta parameter is required, unless you have a valid current file. \n"; helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n"; helpString += "The diffs parameter allows you to set the number of differences allowed, default=0. \n"; helpString += "The percent parameter allows you to set percentage of differences allowed, default=0. percent=2 means if the number of difference is less than or equal to two percent of the length of the fragment, then cluster.\n"; @@ -78,6 +79,7 @@ string ClusterFragmentsCommand::getOutputFileNameTag(string type, string inputNa else { if (type == "fasta") { outputFileName = "fragclust.fasta"; } else if (type == "name") { outputFileName = "fragclust.names"; } + else if (type == "count") { outputFileName = "fragclust.count.table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -96,6 +98,7 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ClusterFragmentsCommand", "ClusterFragmentsCommand"); @@ -129,6 +132,7 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -150,6 +154,14 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -171,6 +183,13 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { if (namefile == "not found") { namefile = ""; } else if (namefile == "not open") { namefile = ""; abort = true; } else { readNameFile(); m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { ct.readTable(countfile); m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.fragments command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } string temp; temp = validParameter.validFile(parameters, "diffs", false); if (temp == "not found"){ temp = "0"; } @@ -179,10 +198,12 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { temp = validParameter.validFile(parameters, "percent", false); if (temp == "not found"){ temp = "0"; } m->mothurConvert(temp, percent); - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } @@ -229,10 +250,13 @@ int ClusterFragmentsCommand::execute(){ string jBases = alignSeqs[j].seq.getUnaligned(); if (isFragment(iBases, jBases)) { - //merge - alignSeqs[i].names += ',' + alignSeqs[j].names; - alignSeqs[i].numIdentical += alignSeqs[j].numIdentical; - + if (countfile != "") { + ct.mergeCounts(alignSeqs[i].names, alignSeqs[j].names); + }else { + //merge + alignSeqs[i].names += ',' + alignSeqs[j].names; + alignSeqs[i].numIdentical += alignSeqs[j].numIdentical; + } alignSeqs[j].active = 0; alignSeqs[j].numIdentical = 0; count++; @@ -254,6 +278,7 @@ int ClusterFragmentsCommand::execute(){ string newFastaFile = fileroot + getOutputFileNameTag("fasta"); string newNamesFile = fileroot + getOutputFileNameTag("name"); + if (countfile != "") { newNamesFile = fileroot + getOutputFileNameTag("count"); } if (m->control_pressed) { return 0; } @@ -285,6 +310,11 @@ int ClusterFragmentsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } return 0; @@ -372,7 +402,10 @@ int ClusterFragmentsCommand::readFASTA(){ else{ seqRNode tempNode(itSize->second, seq, names[seq.getName()], seq.getUnaligned().length()); alignSeqs.push_back(tempNode); - } + } + }else if(countfile != "") { + seqRNode tempNode(ct.getNumSeqs(seq.getName()), seq, seq.getName(), seq.getUnaligned().length()); + alignSeqs.push_back(tempNode); }else { //no names file, you are identical to yourself seqRNode tempNode(1, seq, seq.getName(), seq.getUnaligned().length()); alignSeqs.push_back(tempNode); @@ -396,17 +429,18 @@ void ClusterFragmentsCommand::printData(string newfasta, string newname){ ofstream outNames; m->openOutputFile(newfasta, outFasta); - m->openOutputFile(newname, outNames); + if (countfile == "") { m->openOutputFile(newname, outNames); } for (int i = 0; i < alignSeqs.size(); i++) { if (alignSeqs[i].numIdentical != 0) { alignSeqs[i].seq.printSequence(outFasta); - outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; + if (countfile == "") { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; } } } outFasta.close(); - outNames.close(); + if (countfile == "") { outNames.close(); } + else { ct.printTable(newname); } } catch(exception& e) { m->errorOut(e, "ClusterFragmentsCommand", "printData"); @@ -438,6 +472,5 @@ void ClusterFragmentsCommand::readNameFile(){ exit(1); } } - /**************************************************************************************************/ diff --git a/clusterfragmentscommand.h b/clusterfragmentscommand.h index c322529..e3d861a 100644 --- a/clusterfragmentscommand.h +++ b/clusterfragmentscommand.h @@ -13,6 +13,7 @@ #include "command.hpp" #include "sequence.hpp" +#include "counttable.h" /************************************************************/ struct seqRNode { @@ -46,8 +47,9 @@ public: void help() { m->mothurOut(getHelpString()); } private: + CountTable ct; bool abort; - string fastafile, namefile, outputDir; + string fastafile, namefile, countfile, outputDir; int diffs, percent; vector alignSeqs; map names; //represents the names file first column maps to second column diff --git a/consensus.cpp b/consensus.cpp index 1be052f..d45a395 100644 --- a/consensus.cpp +++ b/consensus.cpp @@ -21,7 +21,7 @@ Tree* Consensus::getTree(vector& t){ if (m->control_pressed) { return 0; } - consensusTree = new Tree(t[0]->getTreeMap()); + consensusTree = new Tree(t[0]->getCountTable()); it2 = nodePairs.find(treeSet); @@ -37,8 +37,7 @@ Tree* Consensus::getTree(vector& t){ if (m->control_pressed) { delete consensusTree; return 0; } - map empty; - consensusTree->assembleTree(empty); + consensusTree->assembleTree(); if (m->control_pressed) { delete consensusTree; return 0; } diff --git a/consensusseqscommand.cpp b/consensusseqscommand.cpp index d6158ba..4c7aefb 100644 --- a/consensusseqscommand.cpp +++ b/consensusseqscommand.cpp @@ -15,7 +15,8 @@ vector ConsensusSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plist); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pcutoff("cutoff", "Number", "", "100", "", "", "",false,false); parameters.push_back(pcutoff); @@ -36,7 +37,7 @@ string ConsensusSeqsCommand::getHelpString(){ try { string helpString = ""; helpString += "The consensus.seqs command can be used in 2 ways: create a consensus sequence from a fastafile, or with a listfile create a consensus sequence for each otu. Sequences must be aligned.\n"; - helpString += "The consensus.seqs command parameters are fasta, list, name, cutoff and label.\n"; + helpString += "The consensus.seqs command parameters are fasta, list, name, count, cutoff and label.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The list parameter allows you to enter a your list file. \n"; helpString += "The name parameter allows you to enter a names file associated with the fasta file. \n"; @@ -65,6 +66,7 @@ string ConsensusSeqsCommand::getOutputFileNameTag(string type, string inputName= else { if (type == "fasta") { outputFileName = "cons.fasta"; } else if (type == "name") { outputFileName = "cons.names"; } + else if (type == "count") { outputFileName = "cons.count.table"; } else if (type == "summary") { outputFileName = "cons.summary"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -84,6 +86,7 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["summary"] = tempOutNames; } catch(exception& e) { @@ -120,6 +123,7 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["summary"] = tempOutNames; @@ -151,6 +155,14 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["list"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -168,6 +180,13 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + listfile = validParameter.validFile(parameters, "list", true); if (listfile == "not open") { abort = true; } else if (listfile == "not found") { listfile = ""; } @@ -186,10 +205,12 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); } - if (namefile == ""){ - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == ""){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } catch(exception& e) { @@ -209,6 +230,7 @@ int ConsensusSeqsCommand::execute(){ if (m->control_pressed) { return 0; } if (namefile != "") { readNames(); } + if (countfile != "") { ct.readTable(countfile); } if (m->control_pressed) { return 0; } @@ -227,25 +249,12 @@ int ConsensusSeqsCommand::execute(){ string outputFastaFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta"); m->openOutputFile(outputFastaFile, outFasta); outputNames.push_back(outputFastaFile); outputTypes["fasta"].push_back(outputFastaFile); - - vector seqs; - int seqLength = 0; - for (map::iterator it = nameMap.begin(); it != nameMap.end(); it++) { - - if (m->control_pressed) { outSummary.close(); outFasta.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - - string seq = fastaMap[it->second]; - seqs.push_back(seq); - - if (seqLength == 0) { seqLength = seq.length(); } - else if (seqLength != seq.length()) { m->mothurOut("[ERROR]: sequence are not the same length, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } - - } - + vector< vector > percentages; percentages.resize(5); for (int j = 0; j < percentages.size(); j++) { percentages[j].resize(seqLength, 0.0); } string consSeq = ""; + int thisCount; //get counts for (int j = 0; j < seqLength; j++) { @@ -253,41 +262,55 @@ int ConsensusSeqsCommand::execute(){ vector counts; counts.resize(5, 0); //A,T,G,C,Gap int numDots = 0; - - for (int i = 0; i < seqs.size(); i++) { + thisCount = 0; + for (map::iterator it = fastaMap.begin(); it != fastaMap.end(); it++) { - if (seqs[i][j] == '.') { numDots++; } - - char base = toupper(seqs[i][j]); - if (base == 'A') { counts[0]++; } - else if (base == 'T') { counts[1]++; } - else if (base == 'G') { counts[2]++; } - else if (base == 'C') { counts[3]++; } - else { counts[4]++; } + string thisSeq = it->second; + int size = 0; + + if (countfile != "") { size = ct.getNumSeqs(it->first); } + else { + map::iterator itCount = nameFileMap.find(it->first); + if (itCount != nameFileMap.end()) { + size = itCount->second; + }else { m->mothurOut("[ERROR]: file mismatch, aborting.\n"); m->control_pressed = true; break; } + } + + for (int k = 0; k < size; k++) { + if (thisSeq[j] == '.') { numDots++; } + + char base = toupper(thisSeq[j]); + if (base == 'A') { counts[0]++; } + else if (base == 'T') { counts[1]++; } + else if (base == 'G') { counts[2]++; } + else if (base == 'C') { counts[3]++; } + else { counts[4]++; } + thisCount++; + } } char conBase = '.'; - if (numDots != seqs.size()) { conBase = getBase(counts, seqs.size()); } + if (numDots != thisCount) { conBase = getBase(counts, thisCount); } consSeq += conBase; - percentages[0][j] = counts[0] / (float) seqs.size(); - percentages[1][j] = counts[1] / (float) seqs.size(); - percentages[2][j] = counts[2] / (float) seqs.size(); - percentages[3][j] = counts[3] / (float) seqs.size(); - percentages[4][j] = counts[4] / (float) seqs.size(); - + percentages[0][j] = counts[0] / (float) thisCount; + percentages[1][j] = counts[1] / (float) thisCount; + percentages[2][j] = counts[2] / (float) thisCount; + percentages[3][j] = counts[3] / (float) thisCount; + percentages[4][j] = counts[4] / (float) thisCount; } for (int j = 0; j < seqLength; j++) { - outSummary << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << seqs.size() << '\t' << consSeq[j] << endl; + outSummary << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << thisCount << '\t' << consSeq[j] << endl; } outFasta << ">conseq" << endl << consSeq << endl; outSummary.close(); outFasta.close(); - + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } }else { @@ -414,12 +437,10 @@ int ConsensusSeqsCommand::processList(ListVector*& list){ if (m->control_pressed) { outSummary.close(); outName.close(); outFasta.close(); return 0; } string bin = list->get(i); - - string newName = ""; - string consSeq = getConsSeq(bin, outSummary, newName, i); + string consSeq = getConsSeq(bin, outSummary, i); outFasta << ">seq" << (i+1) << endl << consSeq << endl; - outName << "seq" << (i+1) << '\t' << "seq" << (i+1) << "," << newName << endl; + outName << "seq" << (i+1) << '\t' << "seq" << (i+1) << "," << bin << endl; } outSummary.close(); outName.close(); outFasta.close(); @@ -434,96 +455,127 @@ int ConsensusSeqsCommand::processList(ListVector*& list){ } //*************************************************************************************************************** -//made this smart enough to owrk with unique or non unique list file -string ConsensusSeqsCommand::getConsSeq(string bin, ofstream& outSummary, string& name, int binNumber){ +string ConsensusSeqsCommand::getConsSeq(string bin, ofstream& outSummary, int binNumber){ try{ string consSeq = ""; bool error = false; - - //the whole bin is the second column if no names file, otherwise build it - name = bin; - if (namefile != "") { name = ""; } - + int totalSize=0; + vector binNames; m->splitAtComma(bin, binNames); - - //get sequence strings for each name in the bin - vector seqs; - - set addedAlready; - int seqLength = 0; - for (int i = 0; i < binNames.size(); i++) { - - map::iterator it; - - it = nameMap.find(binNames[i]); - if (it == nameMap.end()) { - if (namefile == "") { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta file, please correct."); m->mothurOutEndLine(); error = true; } - else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta or name file, please correct."); m->mothurOutEndLine(); error = true; } - break; - }else { - - //add sequence string to seqs vector to process below - string seq = fastaMap[it->second]; - seqs.push_back(seq); - - if (seqLength == 0) { seqLength = seq.length(); } - else if (seqLength != seq.length()) { m->mothurOut("[ERROR]: sequence are not the same length, please correct."); m->mothurOutEndLine(); error = true; break; } - - if (namefile != "") { - //did we add this line from name file already? - if (addedAlready.count(it->second) == 0) { - name += "," + nameFileMap[it->second]; - addedAlready.insert(it->second); - } - } - - } - } - - if (error) { m->control_pressed = true; return consSeq; } - - if (namefile != "") { name = name.substr(1); } - - vector< vector > percentages; percentages.resize(5); + + vector< vector > percentages; percentages.resize(5); for (int j = 0; j < percentages.size(); j++) { percentages[j].resize(seqLength, 0.0); } - - //get counts - for (int j = 0; j < seqLength; j++) { - - if (m->control_pressed) { return consSeq; } - - vector counts; counts.resize(5, 0); //A,T,G,C,Gap - int numDots = 0; - - for (int i = 0; i < seqs.size(); i++) { - - if (seqs[i][j] == '.') { numDots++; } - - char base = toupper(seqs[i][j]); - if (base == 'A') { counts[0]++; } - else if (base == 'T') { counts[1]++; } - else if (base == 'G') { counts[2]++; } - else if (base == 'C') { counts[3]++; } - else { counts[4]++; } - } - - char conBase = '.'; - if (numDots != seqs.size()) { conBase = getBase(counts, seqs.size()); } - - consSeq += conBase; - - percentages[0][j] = counts[0] / (float) seqs.size(); - percentages[1][j] = counts[1] / (float) seqs.size(); - percentages[2][j] = counts[2] / (float) seqs.size(); - percentages[3][j] = counts[3] / (float) seqs.size(); - percentages[4][j] = counts[4] / (float) seqs.size(); - + + if (countfile != "") { + //get counts + for (int j = 0; j < seqLength; j++) { + + if (m->control_pressed) { return consSeq; } + + vector counts; counts.resize(5, 0); //A,T,G,C,Gap + int numDots = 0; + totalSize = 0; + for (int i = 0; i < binNames.size(); i++) { + if (m->control_pressed) { return consSeq; } + + string thisSeq = ""; + map::iterator itFasta = fastaMap.find(binNames[i]); + if (itFasta != fastaMap.end()) { + thisSeq = itFasta->second; + }else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + + int size = ct.getNumSeqs(binNames[i]); + if (size != 0) { + for (int k = 0; k < size; k++) { + if (thisSeq[j] == '.') { numDots++; } + + char base = toupper(thisSeq[j]); + if (base == 'A') { counts[0]++; } + else if (base == 'T') { counts[1]++; } + else if (base == 'G') { counts[2]++; } + else if (base == 'C') { counts[3]++; } + else { counts[4]++; } + totalSize++; + } + }else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your count file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + } + char conBase = '.'; + if (numDots != totalSize) { conBase = getBase(counts, totalSize); } + + consSeq += conBase; + + percentages[0][j] = counts[0] / (float) totalSize; + percentages[1][j] = counts[1] / (float) totalSize; + percentages[2][j] = counts[2] / (float) totalSize; + percentages[3][j] = counts[3] / (float) totalSize; + percentages[4][j] = counts[4] / (float) totalSize; + } + + }else { + + //get sequence strings for each name in the bin + vector seqs; + for (int i = 0; i < binNames.size(); i++) { + + map::iterator it; + it = nameMap.find(binNames[i]); + if (it == nameMap.end()) { + if (namefile == "") { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta file, please correct."); m->mothurOutEndLine(); error = true; } + else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta or name file, please correct."); m->mothurOutEndLine(); error = true; } + break; + }else { + //add sequence string to seqs vector to process below + map::iterator itFasta = fastaMap.find(it->second); + + if (itFasta != fastaMap.end()) { + string seq = itFasta->second; + seqs.push_back(seq); + }else { m->mothurOut("[ERROR]: file mismatch, aborting. \n"); } + } + } + + if (error) { m->control_pressed = true; return consSeq; } + totalSize = seqs.size(); + //get counts + for (int j = 0; j < seqLength; j++) { + + if (m->control_pressed) { return consSeq; } + + vector counts; counts.resize(5, 0); //A,T,G,C,Gap + int numDots = 0; + + for (int i = 0; i < seqs.size(); i++) { + + if (seqs[i][j] == '.') { numDots++; } + + char base = toupper(seqs[i][j]); + if (base == 'A') { counts[0]++; } + else if (base == 'T') { counts[1]++; } + else if (base == 'G') { counts[2]++; } + else if (base == 'C') { counts[3]++; } + else { counts[4]++; } + } + + char conBase = '.'; + if (numDots != seqs.size()) { conBase = getBase(counts, seqs.size()); } + + consSeq += conBase; + + percentages[0][j] = counts[0] / (float) seqs.size(); + percentages[1][j] = counts[1] / (float) seqs.size(); + percentages[2][j] = counts[2] / (float) seqs.size(); + percentages[3][j] = counts[3] / (float) seqs.size(); + percentages[4][j] = counts[4] / (float) seqs.size(); + + } } - + + + for (int j = 0; j < seqLength; j++) { - outSummary << (binNumber + 1) << '\t' << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << seqs.size() << '\t' << consSeq[j] << endl; + outSummary << (binNumber + 1) << '\t' << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << totalSize << '\t' << consSeq[j] << endl; } return consSeq; @@ -646,7 +698,8 @@ int ConsensusSeqsCommand::readFasta(){ ifstream in; m->openInputFile(fastafile, in); - + seqLength = 0; + while (!in.eof()) { if (m->control_pressed) { break; } @@ -657,7 +710,10 @@ int ConsensusSeqsCommand::readFasta(){ if (name != "") { fastaMap[name] = seq.getAligned(); nameMap[name] = name; //set nameMap incase no names file - nameFileMap[name] = name; + nameFileMap[name] = 1; + + if (seqLength == 0) { seqLength = seq.getAligned().length(); } + else if (seqLength != seq.getAligned().length()) { m->mothurOut("[ERROR]: sequence are not the same length, please correct."); m->mothurOutEndLine(); m->control_pressed = true; break; } } } @@ -688,7 +744,7 @@ int ConsensusSeqsCommand::readNames(){ it = nameMap.find(thisname); if (it != nameMap.end()) { //then this sequence was in the fastafile - nameFileMap[thisname] = repnames; //for later when outputting the new namesFile if the list file is unique + nameFileMap[thisname] = m->getNumNames(repnames); //for later when outputting the new namesFile if the list file is unique vector splitRepNames; m->splitAtComma(repnames, splitRepNames); diff --git a/consensusseqscommand.h b/consensusseqscommand.h index 1459b43..e0c9715 100644 --- a/consensusseqscommand.h +++ b/consensusseqscommand.h @@ -13,6 +13,7 @@ #include "command.hpp" #include "listvector.hpp" +#include "counttable.h" class ConsensusSeqsCommand : public Command { public: @@ -34,19 +35,20 @@ public: private: + CountTable ct; bool abort, allLines; - string fastafile, listfile, namefile, label, outputDir; + string fastafile, listfile, namefile, countfile, label, outputDir; set labels; vector outputNames; map fastaMap; map nameMap; - map nameFileMap; - int cutoff; + map nameFileMap; + int cutoff, seqLength; int readFasta(); int readNames(); int processList(ListVector*&); - string getConsSeq(string, ofstream&, string&, int); + string getConsSeq(string, ofstream&, int); char getBase(vector, int); }; diff --git a/countseqscommand.cpp b/countseqscommand.cpp index 7b9aebf..75d21c2 100644 --- a/countseqscommand.cpp +++ b/countseqscommand.cpp @@ -10,6 +10,7 @@ #include "countseqscommand.h" #include "groupmap.h" #include "sharedutilities.h" +#include "counttable.h" //********************************************************************************************************************** vector CountSeqsCommand::setParameters(){ @@ -175,7 +176,7 @@ int CountSeqsCommand::execute(){ int total = 0; if (!large) { total = processSmall(outputFileName); } else { total = processLarge(outputFileName); } - + if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } //set rabund file as new current rabundfile diff --git a/counttable.cpp b/counttable.cpp index 376bd73..5307bee 100644 --- a/counttable.cpp +++ b/counttable.cpp @@ -8,6 +8,57 @@ #include "counttable.h" +/************************************************************/ +int CountTable::createTable(set& n, map& g, set& gs) { + try { + int numGroups = 0; + groups.clear(); + totalGroups.clear(); + indexGroupMap.clear(); + indexNameMap.clear(); + counts.clear(); + for (set::iterator it = gs.begin(); it != gs.end(); it++) { groups.push_back(*it); hasGroups = true; } + numGroups = groups.size(); + totalGroups.resize(numGroups, 0); + + //sort groups to keep consistent with how we store the groups in groupmap + sort(groups.begin(), groups.end()); + for (int i = 0; i < groups.size(); i++) { indexGroupMap[groups[i]] = i; } + m->setAllGroups(groups); + + uniques = 0; + total = 0; + for (set::iterator it = n.begin(); it != n.end(); it++) { + + if (m->control_pressed) { break; } + + string seqName = *it; + + vector groupCounts; groupCounts.resize(numGroups, 0); + map::iterator itGroup = g.find(seqName); + + if (itGroup != g.end()) { + groupCounts[indexGroupMap[itGroup->second]] = 1; + totalGroups[indexGroupMap[itGroup->second]]++; + }else { m->mothurOut("[ERROR]: Your group file does not contain " + seqName + ". Please correct."); m->mothurOutEndLine(); } + + map::iterator it2 = indexNameMap.find(seqName); + if (it2 == indexNameMap.end()) { + if (hasGroups) { counts.push_back(groupCounts); } + indexNameMap[seqName] = uniques; + totals.push_back(1); + total++; + uniques++; + } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "createTable"); + exit(1); + } +} /************************************************************/ bool CountTable::testGroups(string file) { try { @@ -26,6 +77,118 @@ bool CountTable::testGroups(string file) { } } /************************************************************/ +int CountTable::createTable(string namefile, string groupfile, bool createGroup) { + try { + + if (namefile == "") { m->mothurOut("[ERROR]: namefile cannot be blank when creating a count table.\n"); m->control_pressed = true; } + + GroupMap* groupMap; + int numGroups = 0; + groups.clear(); + totalGroups.clear(); + indexGroupMap.clear(); + indexNameMap.clear(); + counts.clear(); + map originalGroupIndexes; + + if (groupfile != "") { + hasGroups = true; + groupMap = new GroupMap(groupfile); groupMap->readMap(); + numGroups = groupMap->getNumGroups(); + groups = groupMap->getNamesOfGroups(); + totalGroups.resize(numGroups, 0); + }else if(createGroup) { + hasGroups = true; + numGroups = 1; + groups.push_back("Group1"); + totalGroups.resize(numGroups, 0); + } + //sort groups to keep consistent with how we store the groups in groupmap + sort(groups.begin(), groups.end()); + for (int i = 0; i < groups.size(); i++) { indexGroupMap[groups[i]] = i; } + m->setAllGroups(groups); + + bool error = false; + string name; + uniques = 0; + total = 0; + + + //open input file + ifstream in; + m->openInputFile(namefile, in); + + int total = 0; + while (!in.eof()) { + if (m->control_pressed) { break; } + + string firstCol, secondCol; + in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in); + + vector names; + m->splitAtChar(secondCol, names, ','); + + map groupCounts; + int thisTotal = 0; + if (groupfile != "") { + //set to 0 + for (int i = 0; i < groups.size(); i++) { groupCounts[groups[i]] = 0; } + + //get counts for each of the users groups + for (int i = 0; i < names.size(); i++) { + string group = groupMap->getGroup(names[i]); + + if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); error=true; } + else { + map::iterator it = groupCounts.find(group); + + //if not found, then this sequence is not from a group we care about + if (it != groupCounts.end()) { + it->second++; + thisTotal++; + } + } + } + }else if (createGroup) { + groupCounts["Group1"]=0; + for (int i = 0; i < names.size(); i++) { + string group = "Group1"; + groupCounts["Group1"]++; thisTotal++; + } + }else { thisTotal = names.size(); } + + //if group info, then read it + vector thisGroupsCount; thisGroupsCount.resize(numGroups, 0); + for (int i = 0; i < numGroups; i++) { + thisGroupsCount[i] = groupCounts[groups[i]]; + totalGroups[i] += thisGroupsCount[i]; + } + + map::iterator it = indexNameMap.find(firstCol); + if (it == indexNameMap.end()) { + if (hasGroups) { counts.push_back(thisGroupsCount); } + indexNameMap[firstCol] = uniques; + totals.push_back(thisTotal); + total += thisTotal; + uniques++; + }else { + error = true; + m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + firstCol + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); + } + } + in.close(); + + if (error) { m->control_pressed = true; } + if (groupfile != "") { delete groupMap; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "createTable"); + exit(1); + } +} +/************************************************************/ int CountTable::readTable(string file) { try { filename = file; @@ -89,6 +252,68 @@ int CountTable::readTable(string file) { } } /************************************************************/ +int CountTable::printTable(string file) { + try { + ofstream out; + m->openOutputFile(file, out); + out << "Representative_Sequence\ttotal\t"; + for (int i = 0; i < groups.size(); i++) { out << groups[i] << '\t'; } + out << endl; + + for (map::iterator itNames = indexNameMap.begin(); itNames != indexNameMap.end(); itNames++) { + out << itNames->first << '\t' << totals[itNames->second] << '\t'; + if (hasGroups) { + + for (int i = 0; i < groups.size(); i++) { + out << counts[itNames->second][i] << '\t'; + } + } + out << endl; + } + out.close(); + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "printTable"); + exit(1); + } +} +/************************************************************/ +int CountTable::printHeaders(ofstream& out) { + try { + out << "Representative_Sequence\ttotal\t"; + for (int i = 0; i < groups.size(); i++) { out << groups[i] << '\t'; } + out << endl; + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "printHeaders"); + exit(1); + } +} +/************************************************************/ +int CountTable::printSeq(ofstream& out, string seqName) { + try { + map::iterator it = indexNameMap.find(seqName); + if (it == indexNameMap.end()) { + m->mothurOut("[ERROR]: " + seqName + " is not in your count table. Please correct.\n"); m->control_pressed = true; + }else { + out << it->first << '\t' << totals[it->second] << '\t'; + if (hasGroups) { + for (int i = 0; i < groups.size(); i++) { + out << counts[it->second][i] << '\t'; + } + } + out << endl; + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "printSeq"); + exit(1); + } +} +/************************************************************/ //group counts for a seq vector CountTable::getGroupCounts(string seqName) { try { @@ -154,6 +379,123 @@ int CountTable::getGroupCount(string seqName, string groupName) { exit(1); } } +/************************************************************/ +//set the number of sequences for the seq for the group +int CountTable::setAbund(string seqName, string groupName, int num) { + try { + if (hasGroups) { + map::iterator it = indexGroupMap.find(groupName); + if (it == indexGroupMap.end()) { + m->mothurOut("[ERROR]: " + groupName + " is not in your count table. Please correct.\n"); m->control_pressed = true; + }else { + map::iterator it2 = indexNameMap.find(seqName); + if (it2 == indexNameMap.end()) { + m->mothurOut("[ERROR]: " + seqName + " is not in your count table. Please correct.\n"); m->control_pressed = true; + }else { + int oldCount = counts[it2->second][it->second]; + counts[it2->second][it->second] = num; + totalGroups[it->second] += (num - oldCount); + total += (num - oldCount); + totals[it2->second] += (num - oldCount); + } + } + }else{ m->mothurOut("[ERROR]: Your count table does not have group info. Please correct.\n"); m->control_pressed = true; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "set"); + exit(1); + } +} +/************************************************************/ +//add group +int CountTable::addGroup(string groupName) { + try { + bool sanity = m->inUsersGroups(groupName, groups); + if (sanity) { m->mothurOut("[ERROR]: " + groupName + " is already in the count table, cannot add again.\n"); m->control_pressed = true; return 0; } + + groups.push_back(groupName); + if (!hasGroups) { counts.resize(uniques); } + + for (int i = 0; i < counts.size(); i++) { counts[i].push_back(0); } + totalGroups.push_back(0); + indexGroupMap[groupName] = groups.size()-1; + map originalGroupMap = indexGroupMap; + + //important to play well with others, :) + sort(groups.begin(), groups.end()); + + //fix indexGroupMap && totalGroups + vector newTotals; newTotals.resize(groups.size(), 0); + for (int i = 0; i < groups.size(); i++) { + indexGroupMap[groups[i]] = i; + //find original spot of group[i] + int index = originalGroupMap[groups[i]]; + newTotals[i] = totalGroups[index]; + } + totalGroups = newTotals; + + //fix counts vectors + for (int i = 0; i < counts.size(); i++) { + vector newCounts; newCounts.resize(groups.size(), 0); + for (int j = 0; j < groups.size(); j++) { + //find original spot of group[i] + int index = originalGroupMap[groups[j]]; + newCounts[j] = counts[i][index]; + } + counts[i] = newCounts; + } + hasGroups = true; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "addGroup"); + exit(1); + } +} +/************************************************************/ +//vector of groups for the seq +vector CountTable::getGroups(string seqName) { + try { + vector thisGroups; + if (hasGroups) { + vector thisCounts = getGroupCounts(seqName); + for (int i = 0; i < thisCounts.size(); i++) { + if (thisCounts[i] != 0) { thisGroups.push_back(groups[i]); } + } + }else{ m->mothurOut("[ERROR]: Your count table does not have group info. Please correct.\n"); m->control_pressed = true; } + + return thisGroups; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "getGroups"); + exit(1); + } +} +/************************************************************/ +//total number of seqs represented by seq +int CountTable::renameSeq(string oldSeqName, string newSeqName) { + try { + + map::iterator it = indexNameMap.find(oldSeqName); + if (it == indexNameMap.end()) { + m->mothurOut("[ERROR]: " + oldSeqName + " is not in your count table. Please correct.\n"); m->control_pressed = true; + }else { + int index = it->second; + indexNameMap.erase(it); + indexNameMap[newSeqName] = index; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "renameSeq"); + exit(1); + } +} + /************************************************************/ //total number of seqs represented by seq int CountTable::getNumSeqs(string seqName) { @@ -213,6 +555,30 @@ int CountTable::push_back(string seqName) { } } /************************************************************/ +//remove sequence +int CountTable::remove(string seqName) { + try { + map::iterator it = indexNameMap.find(seqName); + if (it == indexNameMap.end()) { + uniques--; + if (hasGroups){ //remove this sequences counts from group totals + for (int i = 0; i < totalGroups.size(); i++) { totalGroups[i] -= counts[it->second][i]; counts[it->second][i] = 0; } + } + int thisTotal = totals[it->second]; totals[it->second] = 0; + total -= thisTotal; + indexNameMap.erase(it); + }else { + m->mothurOut("[ERROR]: Your count table contains does not include " + seqName + ", cannot remove."); m->mothurOutEndLine(); m->control_pressed = true; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "push_back"); + exit(1); + } +} +/************************************************************/ //add seqeunce without group info int CountTable::push_back(string seqName, int thisTotal) { try { @@ -243,6 +609,7 @@ int CountTable::push_back(string seqName, vector groupCounts) { if ((hasGroups) && (groupCounts.size() != getNumGroups())) { m->mothurOut("[ERROR]: Your count table has a " + toString(getNumGroups()) + " groups and " + seqName + " has " + toString(groupCounts.size()) + ", please correct."); m->mothurOutEndLine(); m->control_pressed = true; } int thisTotal = 0; for (int i = 0; i < getNumGroups(); i++) { totalGroups[i] += groupCounts[i]; thisTotal += groupCounts[i]; } + if (hasGroups) { counts.push_back(groupCounts); } indexNameMap[seqName] = uniques; totals.push_back(thisTotal); total+= thisTotal; @@ -293,7 +660,30 @@ vector CountTable::getNamesOfSeqs() { } } /************************************************************/ -//returns names of seqs +//returns the names of all unique sequences in file +vector CountTable::getNamesOfSeqs(string group) { + try { + vector names; + if (hasGroups) { + map::iterator it = indexGroupMap.find(group); + if (it == indexGroupMap.end()) { + m->mothurOut("[ERROR]: " + group + " is not in your count table. Please correct.\n"); m->control_pressed = true; + }else { + for (map::iterator it2 = indexNameMap.begin(); it2 != indexNameMap.end(); it2++) { + if (counts[it2->second][it->second] != 0) { names.push_back(it2->first); } + } + } + }else{ m->mothurOut("[ERROR]: Your count table does not have group info. Please correct.\n"); m->control_pressed = true; } + + return names; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "getNamesOfSeqs"); + exit(1); + } +} +/************************************************************/ +//merges counts of seq1 and seq2, saving in seq1 int CountTable::mergeCounts(string seq1, string seq2) { try { map::iterator it = indexNameMap.find(seq1); @@ -305,17 +695,12 @@ int CountTable::mergeCounts(string seq1, string seq2) { m->mothurOut("[ERROR]: " + seq2 + " is not in your count table. Please correct.\n"); m->control_pressed = true; }else { //merge data - for (int i = 0; i < groups.size(); i++) { - counts[it->second][i] += counts[it2->second][i]; - counts[it2->second][i] = 0; - } + for (int i = 0; i < groups.size(); i++) { counts[it->second][i] += counts[it2->second][i]; } totals[it->second] += totals[it2->second]; - totals[it2->second] = 0; uniques--; indexNameMap.erase(it2); } } - return 0; } catch(exception& e) { @@ -323,6 +708,25 @@ int CountTable::mergeCounts(string seq1, string seq2) { exit(1); } } +/************************************************************/ +int CountTable::copy(CountTable* ct) { + try { + vector thisGroups = ct->getNamesOfGroups(); + for (int i = 0; i < thisGroups.size(); i++) { addGroup(thisGroups[i]); } + vector names = ct->getNamesOfSeqs(); + + for (int i = 0; i < names.size(); i++) { + vector thisCounts = ct->getGroupCounts(names[i]); + push_back(names[i], thisCounts); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "copy"); + exit(1); + } +} /************************************************************/ diff --git a/counttable.h b/counttable.h index 04e26d7..68ba8d2 100644 --- a/counttable.h +++ b/counttable.h @@ -38,36 +38,49 @@ #include "mothurout.h" #include "listvector.hpp" +#include "groupmap.h" class CountTable { public: - CountTable() { m = MothurOut::getInstance(); hasGroups = false; total = 0; } + CountTable() { m = MothurOut::getInstance(); hasGroups = false; total = 0; uniques = 0; } ~CountTable() {} - int readTable(string); + int createTable(set&, map&, set&); //seqNames, seqName->group, groupNames + int createTable(string, string, bool); //namefile, groupfile, createGroup + int readTable(string); + int printTable(string); + int printHeaders(ofstream&); + int printSeq(ofstream&, string); bool testGroups(string file); //used to check if file has group data without reading it. + int copy(CountTable*); bool hasGroupInfo() { return hasGroups; } int getNumGroups() { return groups.size(); } vector getNamesOfGroups() { return groups; } //returns group names, if no group info vector is blank. + int addGroup(string); + int renameSeq(string, string); //used to change name of sequence for use with trees + int setAbund(string, string, int); //set abundance number of seqs for that group for that seq int push_back(string); //add a sequence int push_back(string, int); //add a sequence int push_back(string, vector); //add a sequence with group info + int remove(string); //remove seq int get(string); //returns unique sequence index for reading distance matrices like NameAssignment int size() { return indexNameMap.size(); } + vector getGroups(string); //returns vector of groups represented by this sequences vector getGroupCounts(string); //returns group counts for a seq passed in, if no group info is in file vector is blank. Order is the same as the groups returned by getGroups function. int getGroupCount(string, string); //returns number of seqs for that group for that seq int getGroupCount(string); // returns total seqs for that group - int getNumSeqs(string); //returns total seqs for that seq + int getNumSeqs(string); //returns total seqs for that seq, 0 if not found int getNumSeqs() { return total; } //return total number of seqs int getNumUniqueSeqs() { return uniques; } //return number of unique/representative seqs int getGroupIndex(string); //returns index in getGroupCounts vector of specific group vector getNamesOfSeqs(); + vector getNamesOfSeqs(string); int mergeCounts(string, string); //combines counts for 2 seqs, saving under the first name passed in. ListVector getListVector(); diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index bab5a63..b6f71c8 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -14,7 +14,8 @@ vector DeconvoluteCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -31,7 +32,7 @@ vector DeconvoluteCommand::setParameters(){ string DeconvoluteCommand::getHelpString(){ try { string helpString = ""; - helpString += "The unique.seqs command reads a fastafile and creates a namesfile.\n"; + helpString += "The unique.seqs command reads a fastafile and creates a name or count file.\n"; helpString += "It creates a file where the first column is the groupname and the second column is a list of sequence names who have the same sequence. \n"; helpString += "If the sequence is unique the second column will just contain its name. \n"; helpString += "The unique.seqs command parameters are fasta and name. fasta is required, unless there is a valid current fasta file.\n"; @@ -56,6 +57,7 @@ string DeconvoluteCommand::getOutputFileNameTag(string type, string inputName="" else { if (type == "fasta") { outputFileName = "unique" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "names"; } + else if (type == "count") { outputFileName = "count.table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -73,6 +75,7 @@ DeconvoluteCommand::DeconvoluteCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "DeconvoluteCommand", "DeconvoluteCommand"); @@ -106,6 +109,7 @@ DeconvoluteCommand::DeconvoluteCommand(string option) { vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -127,6 +131,14 @@ DeconvoluteCommand::DeconvoluteCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -149,11 +161,21 @@ DeconvoluteCommand::DeconvoluteCommand(string option) { if (oldNameMapFName == "not open") { oldNameMapFName = ""; abort = true; } else if (oldNameMapFName == "not found"){ oldNameMapFName = ""; } else { m->setNameFile(oldNameMapFName); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } - if (oldNameMapFName == "") { - vector files; files.push_back(inFastaName); - parser.getNameFile(files); - } + if ((countfile != "") && (oldNameMapFName != "")) { m->mothurOut("When executing a unique.seqs command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + + if (countfile == "") { + if (oldNameMapFName == "") { + vector files; files.push_back(inFastaName); + parser.getNameFile(files); + } + } } @@ -171,6 +193,7 @@ int DeconvoluteCommand::execute() { //prepare filenames and open files string outNameFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + getOutputFileNameTag("name"); + string outCountFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + getOutputFileNameTag("count"); string outFastaFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + getOutputFileNameTag("fasta", inFastaName); map nameMap; @@ -179,6 +202,11 @@ int DeconvoluteCommand::execute() { m->readNames(oldNameMapFName, nameMap); if (oldNameMapFName == outNameFile){ outNameFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + "unique." + getOutputFileNameTag("name"); } } + CountTable ct; + if (countfile != "") { + ct.readTable(countfile); + if (countfile == outCountFile){ outCountFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + "unique." + getOutputFileNameTag("count"); } + } if (m->control_pressed) { return 0; } @@ -222,7 +250,10 @@ int DeconvoluteCommand::execute() { sequenceStrings[seq.getAligned()] = itNames->second; nameFileOrder.push_back(seq.getAligned()); } - }else { sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } + }else if (countfile != "") { + ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table + sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); + }else { sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } }else { //this is a dup if (oldNameMapFName != "") { itNames = nameMap.find(seq.getName()); @@ -232,7 +263,12 @@ int DeconvoluteCommand::execute() { }else { sequenceStrings[seq.getAligned()] += "," + itNames->second; } - }else { sequenceStrings[seq.getAligned()] += "," + seq.getName(); } + }else if (countfile != "") { + int num = ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table + if (num != 0) { //its in the table + ct.mergeCounts(itStrings->second, seq.getName()); //merges counts and saves in uniques name + } + }else { sequenceStrings[seq.getAligned()] += "," + seq.getName(); } } count++; @@ -252,34 +288,35 @@ int DeconvoluteCommand::execute() { //print new names file ofstream outNames; - m->openOutputFile(outNameFile, outNames); + if (countfile == "") { m->openOutputFile(outNameFile, outNames); outputNames.push_back(outNameFile); outputTypes["name"].push_back(outNameFile); } + else { m->openOutputFile(outCountFile, outNames); ct.printHeaders(outNames); outputTypes["count"].push_back(outCountFile); outputNames.push_back(outCountFile); } for (int i = 0; i < nameFileOrder.size(); i++) { - //for (itStrings = sequenceStrings.begin(); itStrings != sequenceStrings.end(); itStrings++) { - if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); m->mothurRemove(outNameFile); return 0; } + if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } itStrings = sequenceStrings.find(nameFileOrder[i]); if (itStrings != sequenceStrings.end()) { - //get rep name - int pos = (itStrings->second).find_first_of(','); - - if (pos == string::npos) { // only reps itself - outNames << itStrings->second << '\t' << itStrings->second << endl; - }else { - outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl; - } + if (countfile == "") { + //get rep name + int pos = (itStrings->second).find_first_of(','); + + if (pos == string::npos) { // only reps itself + outNames << itStrings->second << '\t' << itStrings->second << endl; + }else { + outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl; + } + }else { ct.printSeq(outNames, itStrings->second); } }else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->control_pressed = true; } } outNames.close(); - if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); m->mothurRemove(outNameFile); return 0; } + if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - m->mothurOut(outFastaFile); m->mothurOutEndLine(); - m->mothurOut(outNameFile); m->mothurOutEndLine(); - outputNames.push_back(outFastaFile); outputNames.push_back(outNameFile); outputTypes["fasta"].push_back(outFastaFile); outputTypes["name"].push_back(outNameFile); + outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); //set fasta file as new current fastafile @@ -293,6 +330,11 @@ int DeconvoluteCommand::execute() { if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } return 0; } diff --git a/deconvolutecommand.h b/deconvolutecommand.h index 7d4cb50..673ffc9 100644 --- a/deconvolutecommand.h +++ b/deconvolutecommand.h @@ -11,6 +11,7 @@ #include "command.hpp" #include "fastamap.h" +#include "counttable.h" /* The unique.seqs command reads a fasta file, finds the duplicate sequences and outputs a names file containing 2 columns. The first being the groupname and the second the list of identical sequence names. */ @@ -37,7 +38,7 @@ public: private: - string inFastaName, oldNameMapFName, outputDir; + string inFastaName, oldNameMapFName, outputDir, countfile; vector outputNames; bool abort; diff --git a/deuniquetreecommand.cpp b/deuniquetreecommand.cpp index 662282b..d334f8f 100644 --- a/deuniquetreecommand.cpp +++ b/deuniquetreecommand.cpp @@ -161,7 +161,8 @@ int DeuniqueTreeCommand::execute() { TreeReader* reader = new TreeReader(treefile, "", namefile); vector T = reader->getTrees(); - map nameMap = reader->getNameMap(); + map nameMap; + m->readNames(namefile, nameMap); delete reader; //print new Tree @@ -172,7 +173,7 @@ int DeuniqueTreeCommand::execute() { T[0]->print(out, nameMap); out.close(); - delete (T[0]->getTreeMap()); + delete (T[0]->getCountTable()); for (int i = 0; i < T.size(); i++) { delete T[i]; } //set phylip file as new current phylipfile diff --git a/indicatorcommand.cpp b/indicatorcommand.cpp index f98620b..dc9f121 100644 --- a/indicatorcommand.cpp +++ b/indicatorcommand.cpp @@ -287,17 +287,22 @@ int IndicatorCommand::execute(){ string groupfile = ""; m->setTreeFile(treefile); Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap - treeMap = new TreeMap(); + ct = new CountTable(); bool mismatch = false; - - for (int i = 0; i < m->Treenames.size(); i++) { - //sanity check - is this a group that is not in the sharedfile? + + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < m->Treenames.size(); i++) { + nameMap.insert(m->Treenames[i]); + //sanity check - is this a group that is not in the sharedfile? if (designfile == "") { + if (i == 0) { gps.insert("Group1"); } if (!(m->inUsersGroups(m->Treenames[i], m->getAllGroups()))) { m->mothurOut("[ERROR]: " + m->Treenames[i] + " is not a group in your shared or relabund file."); m->mothurOutEndLine(); mismatch = true; } - treeMap->addSeq(m->Treenames[i], "Group1"); + groupMap[m->Treenames[i]] = "Group1"; }else{ vector myGroups; myGroups.push_back(m->Treenames[i]); vector myNames = designMap->getNamesSeqs(myGroups); @@ -308,9 +313,10 @@ int IndicatorCommand::execute(){ mismatch = true; } } - treeMap->addSeq(m->Treenames[i], "Group1"); + groupMap[m->Treenames[i]] = "Group1"; } - } + } + ct->createTable(nameMap, groupMap, gps); if ((designfile != "") && (m->Treenames.size() != Groups.size())) { cout << Groups.size() << '\t' << m->Treenames.size() << endl; m->mothurOut("[ERROR]: You design file does not match your tree, aborting."); m->mothurOutEndLine(); mismatch = true; } @@ -318,14 +324,14 @@ int IndicatorCommand::execute(){ if (designfile != "") { delete designMap; } if (sharedfile != "") { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } else { for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } } - delete treeMap; + delete ct; return 0; } read = new ReadNewickTree(treefile); - int readOk = read->read(treeMap); + int readOk = read->read(ct); - if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete treeMap; delete read; return 0; } + if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete ct; delete read; return 0; } vector T = read->getTrees(); @@ -335,19 +341,18 @@ int IndicatorCommand::execute(){ if (designfile != "") { delete designMap; } if (sharedfile != "") { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } else { for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } } - for (int i = 0; i < T.size(); i++) { delete T[i]; } delete treeMap; return 0; + for (int i = 0; i < T.size(); i++) { delete T[i]; } delete ct; return 0; } - map nameMap; - T[0]->assembleTree(nameMap); + T[0]->assembleTree(); /***************************************************/ // create ouptut tree - respecting pickedGroups // /***************************************************/ - Tree* outputTree = new Tree(m->getNumGroups(), treeMap); + Tree* outputTree = new Tree(m->getNumGroups(), ct); outputTree->getSubTree(T[0], m->getGroups()); - outputTree->assembleTree(nameMap); + outputTree->assembleTree(); //no longer need original tree, we have output tree to use and label for (int i = 0; i < T.size(); i++) { delete T[i]; } @@ -356,14 +361,14 @@ int IndicatorCommand::execute(){ if (designfile != "") { delete designMap; } if (sharedfile != "") { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } else { for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } } - delete outputTree; delete treeMap; return 0; + delete outputTree; delete ct; return 0; } /***************************************************/ // get indicator species values // /***************************************************/ GetIndicatorSpecies(outputTree); - delete outputTree; delete treeMap; + delete outputTree; delete ct; }else { //run with design file only //get indicator species diff --git a/indicatorcommand.h b/indicatorcommand.h index 2c36c35..3c24dfb 100644 --- a/indicatorcommand.h +++ b/indicatorcommand.h @@ -12,7 +12,7 @@ #include "command.hpp" #include "readtree.h" -#include "treemap.h" +#include "counttable.h" #include "sharedrabundvector.h" #include "sharedrabundfloatvector.h" #include "inputdata.h" @@ -36,7 +36,7 @@ public: private: ReadTree* read; - TreeMap* treeMap; + CountTable* ct; GroupMap* designMap; string treefile, sharedfile, relabundfile, groups, label, inputFileName, outputDir, designfile; bool abort; diff --git a/parsimony.cpp b/parsimony.cpp index 3b0f317..6a0485c 100644 --- a/parsimony.cpp +++ b/parsimony.cpp @@ -15,7 +15,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) { try { processors = p; outputDir = o; - TreeMap* tmap = t->getTreeMap(); + CountTable* ct = t->getCountTable(); //if the users enters no groups then give them the score of all groups vector mGroups = m->getGroups(); @@ -38,7 +38,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) { vector groups; if (numGroups == 0) { //get score for all users groups - vector tGroups = tmap->getNamesOfGroups(); + vector tGroups = ct->getNamesOfGroups(); for (int i = 0; i < tGroups.size(); i++) { if (tGroups[i] != "xxx") { groups.push_back(tGroups[i]); @@ -57,7 +57,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); }else{ lines.clear(); int numPairs = namesOfGroupCombos.size(); @@ -74,10 +74,10 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) { lines.push_back(linePair(startPos, numPairsPerProcessor)); } - data = createProcesses(t, namesOfGroupCombos, tmap); + data = createProcesses(t, namesOfGroupCombos, ct); } #else - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); #endif return data; @@ -90,7 +90,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) { } /**************************************************************************************************/ -EstOutput Parsimony::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, TreeMap* tmap) { +EstOutput Parsimony::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, CountTable* ct) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; @@ -107,7 +107,7 @@ EstOutput Parsimony::createProcesses(Tree* t, vector< vector > namesOfGr process++; }else if (pid == 0){ EstOutput myresults; - myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap); + myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, ct); if (m->control_pressed) { exit(0); } @@ -127,7 +127,7 @@ EstOutput Parsimony::createProcesses(Tree* t, vector< vector > namesOfGr } } - results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap); + results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, ct); //force parent to wait until all the processes are done for (int i=0;i > namesOfGr } } /**************************************************************************************************/ -EstOutput Parsimony::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, TreeMap* tmap) { +EstOutput Parsimony::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, CountTable* ct) { try { EstOutput results; results.resize(num); - Tree* copyTree = new Tree(tmap); + Tree* copyTree = new Tree(ct); int count = 0; for (int h = start; h < (start+num); h++) { diff --git a/parsimony.h b/parsimony.h index 7316d50..bf0e0d4 100644 --- a/parsimony.h +++ b/parsimony.h @@ -12,7 +12,7 @@ */ #include "treecalculator.h" -#include "treemap.h" +#include "counttable.h" /***********************************************************************/ @@ -35,8 +35,8 @@ class Parsimony : public TreeCalculator { int processors; string outputDir; - EstOutput driver(Tree*, vector< vector >, int, int, TreeMap*); - EstOutput createProcesses(Tree*, vector< vector >, TreeMap*); + EstOutput driver(Tree*, vector< vector >, int, int, CountTable*); + EstOutput createProcesses(Tree*, vector< vector >, CountTable*); }; /***********************************************************************/ diff --git a/parsimonycommand.cpp b/parsimonycommand.cpp index f124b60..eabbb59 100644 --- a/parsimonycommand.cpp +++ b/parsimonycommand.cpp @@ -14,8 +14,9 @@ vector ParsimonyCommand::setParameters(){ try { CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter prandom("random", "String", "", "", "", "", "",false,false); parameters.push_back(prandom); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); @@ -36,7 +37,7 @@ vector ParsimonyCommand::setParameters(){ string ParsimonyCommand::getHelpString(){ try { string helpString = ""; - helpString += "The parsimony command parameters are tree, group, name, random, groups, processors and iters. tree parameter is required unless you have valid current tree file or are using random.\n"; + helpString += "The parsimony command parameters are tree, group, name, count, random, groups, processors and iters. tree parameter is required unless you have valid current tree file or are using random.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 1 valid group.\n"; helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n"; helpString += "The parsimony command should be in the following format: parsimony(random=yourOutputFilename, groups=yourGroups, iters=yourIters).\n"; @@ -145,6 +146,14 @@ ParsimonyCommand::ParsimonyCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -172,6 +181,20 @@ ParsimonyCommand::ParsimonyCommand(string option) { if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + } //if the user changes the output directory command factory will send this info to us in the output parameter @@ -193,10 +216,12 @@ ParsimonyCommand::ParsimonyCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); - if (namefile == "") { - vector files; files.push_back(treefile); - parser.getNameFile(files); - } + if (countfile=="") { + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + } } @@ -219,9 +244,11 @@ int ParsimonyCommand::execute() { m->setTreeFile(treefile); - TreeReader* reader = new TreeReader(treefile, groupfile, namefile); + TreeReader* reader; + if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); } + else { reader = new TreeReader(treefile, countfile); } T = reader->getTrees(); - tmap = T[0]->getTreeMap(); + ct = T[0]->getCountTable(); delete reader; if(outputDir == "") { outputDir += m->hasPath(treefile); } @@ -245,7 +272,7 @@ int ParsimonyCommand::execute() { //set users groups to analyze SharedUtil util; vector mGroups = m->getGroups(); - vector tGroups = tmap->getNamesOfGroups(); + vector tGroups = ct->getNamesOfGroups(); util.setGroups(mGroups, tGroups, allGroups, numGroups, "parsimony"); //sets the groups the user wants to analyze util.getCombos(groupComb, mGroups, numComp); m->setGroups(mGroups); @@ -260,7 +287,7 @@ int ParsimonyCommand::execute() { if (m->control_pressed) { delete reading; delete output; - delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (randomtree == "") { outSum.close(); } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); m->clearGroups(); @@ -285,7 +312,7 @@ int ParsimonyCommand::execute() { if (m->control_pressed) { delete reading; delete output; - delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (randomtree == "") { outSum.close(); } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); m->clearGroups(); @@ -314,7 +341,7 @@ int ParsimonyCommand::execute() { for (int j = 0; j < iters; j++) { //create new tree with same num nodes and leaves as users - randT = new Tree(tmap); + randT = new Tree(ct); //create random relationships between nodes randT->assembleRandomTree(); @@ -326,7 +353,7 @@ int ParsimonyCommand::execute() { delete reading; delete output; delete randT; if (randomtree == "") { outSum.close(); } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); - delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } m->clearGroups(); return 0; } @@ -355,13 +382,13 @@ int ParsimonyCommand::execute() { for (int j = 0; j < iters; j++) { //create new tree with same num nodes and leaves as users - randT = new Tree(tmap); + randT = new Tree(ct); //create random relationships between nodes randT->assembleRandomTree(); if (m->control_pressed) { - delete reading; delete output; delete randT; delete tmap; + delete reading; delete output; delete randT; delete ct; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0; } @@ -370,7 +397,7 @@ int ParsimonyCommand::execute() { randomData = pars.getValues(randT, processors, outputDir); if (m->control_pressed) { - delete reading; delete output; delete randT; delete tmap; + delete reading; delete output; delete randT; delete ct; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0; } @@ -424,7 +451,7 @@ int ParsimonyCommand::execute() { if (m->control_pressed) { delete reading; delete output; - delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (randomtree == "") { outSum.close(); } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0; @@ -437,7 +464,7 @@ int ParsimonyCommand::execute() { printParsimonyFile(); if (randomtree == "") { printUSummaryFile(); } - delete output; delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } + delete output; delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;} @@ -529,7 +556,7 @@ void ParsimonyCommand::getUserInput() { try { //create treemap - tmap = new TreeMap(); + ct = new CountTable(); m->mothurOut("Please enter the number of groups you would like to analyze: "); cin >> numGroups; @@ -539,30 +566,31 @@ void ParsimonyCommand::getUserInput() { count = 1; numEachGroup.resize(numGroups, 0); - + set nameMap; + map groupMap; + set gps; + for (int i = 1; i <= numGroups; i++) { m->mothurOut("Please enter the number of sequences in group " + toString(i) + ": "); cin >> num; m->mothurOutJustToLog(toString(num)); m->mothurOutEndLine(); - - //set tmaps seqsPerGroup - tmap->seqsPerGroup[toString(i)] = num; - tmap->addGroup(toString(i)); + gps.insert(toString(i)); + //set tmaps namesOfSeqs for (int j = 0; j < num; j++) { - tmap->namesOfSeqs.push_back(toString(count)); - tmap->treemap[toString(count)].groupname = toString(i); + groupMap[toString(count)] = i; + nameMap.insert(toString(count)); count++; } } - + ct->createTable(nameMap, groupMap, gps); + //clears buffer so next command doesn't have error string s; getline(cin, s); - m->Treenames = tmap->namesOfSeqs; - + m->Treenames = ct->getNamesOfSeqs(); } catch(exception& e) { m->errorOut(e, "ParsimonyCommand", "getUserInput"); diff --git a/parsimonycommand.h b/parsimonycommand.h index 79613f5..38a7505 100644 --- a/parsimonycommand.h +++ b/parsimonycommand.h @@ -11,7 +11,7 @@ #include "command.hpp" #include "parsimony.h" -#include "treemap.h" +#include "counttable.h" #include "progress.hpp" #include "sharedutilities.h" #include "fileoutput.h" @@ -41,10 +41,10 @@ private: vector T; //user trees Tree* randT; //random tree Tree* copyUserTree; - TreeMap* tmap; - TreeMap* savetmap; + CountTable* ct; + CountTable* savect; vector groupComb; // AB. AC, BC... - string sumFile, randomtree, allGroups, outputDir, treefile, groupfile, namefile; + string sumFile, randomtree, allGroups, outputDir, treefile, groupfile, namefile, countfile; int iters, numGroups, numComp, counter, processors, numUniquesInName; vector numEachGroup; //vector containing the number of sequences in each group the users wants for random distrib. vector< vector > userTreeScores; //scores for users trees for each comb. diff --git a/phylodiversitycommand.cpp b/phylodiversitycommand.cpp index ddd2b31..b0c11f6 100644 --- a/phylodiversitycommand.cpp +++ b/phylodiversitycommand.cpp @@ -15,8 +15,9 @@ vector PhyloDiversityCommand::setParameters(){ try { CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq); @@ -41,7 +42,7 @@ vector PhyloDiversityCommand::setParameters(){ string PhyloDiversityCommand::getHelpString(){ try { string helpString = ""; - helpString += "The phylo.diversity command parameters are tree, group, name, groups, iters, freq, processors, scale, rarefy, collect and summary. tree and group are required, unless you have valid current files.\n"; + helpString += "The phylo.diversity command parameters are tree, group, name, count, groups, iters, freq, processors, scale, rarefy, collect and summary. tree and group are required, unless you have valid current files.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n"; helpString += "The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n"; helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n"; @@ -156,6 +157,14 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -179,6 +188,19 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(treefile); } string temp; @@ -214,10 +236,12 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; } - if (namefile == "") { - vector files; files.push_back(treefile); - parser.getNameFile(files); - } + if (countfile=="") { + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + } } } @@ -236,14 +260,16 @@ int PhyloDiversityCommand::execute(){ int start = time(NULL); m->setTreeFile(treefile); - TreeReader* reader = new TreeReader(treefile, groupfile, namefile); + TreeReader* reader; + if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); } + else { reader = new TreeReader(treefile, countfile); } vector trees = reader->getTrees(); - tmap = trees[0]->getTreeMap(); + ct = trees[0]->getCountTable(); delete reader; SharedUtil util; vector mGroups = m->getGroups(); - vector tGroups = tmap->getNamesOfGroups(); + vector tGroups = ct->getNamesOfGroups(); util.setGroups(mGroups, tGroups, "phylo.diversity"); //sets the groups the user wants to analyze //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed @@ -255,7 +281,7 @@ int PhyloDiversityCommand::execute(){ //for each of the users trees for(int i = 0; i < trees.size(); i++) { - if (m->control_pressed) { delete tmap; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + if (m->control_pressed) { delete ct; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } ofstream outSum, outRare, outCollect; string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(i+1) + "." + getOutputFileNameTag("summary"); @@ -286,15 +312,16 @@ int PhyloDiversityCommand::execute(){ //find largest group total int largestGroup = 0; - for (int j = 0; j < mGroups.size(); j++) { - if (tmap->seqsPerGroup[mGroups[j]] > largestGroup) { largestGroup = tmap->seqsPerGroup[mGroups[j]]; } + for (int j = 0; j < mGroups.size(); j++) { + int numSeqsThisGroup = ct->getGroupCount(mGroups[j]); + if (numSeqsThisGroup > largestGroup) { largestGroup = numSeqsThisGroup; } //initialize diversity - diversity[mGroups[j]].resize(tmap->seqsPerGroup[mGroups[j]]+1, 0.0); //numSampled + diversity[mGroups[j]].resize(numSeqsThisGroup+1, 0.0); //numSampled //groupA 0.0 0.0 //initialize sumDiversity - sumDiversity[mGroups[j]].resize(tmap->seqsPerGroup[mGroups[j]]+1, 0.0); + sumDiversity[mGroups[j]].resize(numSeqsThisGroup+1, 0.0); } //convert freq percentage to number @@ -649,7 +676,7 @@ map PhyloDiversityCommand::getRootForGroups(Tree* t){ map done; //initialize root for all groups to -1 - for (int k = 0; k < (t->getTreeMap())->getNamesOfGroups().size(); k++) { done[(t->getTreeMap())->getNamesOfGroups()[k]] = false; } + for (int k = 0; k < (t->getCountTable())->getNamesOfGroups().size(); k++) { done[(t->getCountTable())->getNamesOfGroups()[k]] = false; } for (int i = 0; i < t->getNumLeaves(); i++) { diff --git a/phylodiversitycommand.h b/phylodiversitycommand.h index 9527692..ee76f05 100644 --- a/phylodiversitycommand.h +++ b/phylodiversitycommand.h @@ -11,7 +11,7 @@ */ #include "command.hpp" -#include "treemap.h" +#include "counttable.h" #include "sharedutilities.h" #include "tree.h" @@ -33,11 +33,11 @@ class PhyloDiversityCommand : public Command { int execute(); void help() { m->mothurOut(getHelpString()); } private: - TreeMap* tmap; + CountTable* ct; float freq; int iters, processors, numUniquesInName; bool abort, rarefy, summary, collect, scale; - string groups, outputDir, treefile, groupfile, namefile; + string groups, outputDir, treefile, groupfile, namefile, countfile; vector Groups, outputNames; //holds groups to be used, and outputFile names map getRootForGroups(Tree* t); diff --git a/phylosummary.cpp b/phylosummary.cpp index 5f7bbc3..ab6bb83 100644 --- a/phylosummary.cpp +++ b/phylosummary.cpp @@ -8,21 +8,68 @@ */ #include "phylosummary.h" - /**************************************************************************************************/ -PhyloSummary::PhyloSummary(string refTfile, string groupFile){ +PhyloSummary::PhyloSummary(string refTfile, CountTable* c){ try { m = MothurOut::getInstance(); maxLevel = 0; ignore = false; + numSeqs = 0; + + ct = c; + groupmap = NULL; + + //check for necessary files + string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum")); + ifstream FileTest(taxFileNameTest.c_str()); - if (groupFile != "") { - groupmap = new GroupMap(groupFile); - groupmap->readMap(); + if (!FileTest) { + m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1); }else{ - groupmap = NULL; + readTreeStruct(FileTest); } + + tree[0].rank = "0"; + assignRank(0); + + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "PhyloSummary"); + exit(1); + } +} + +/**************************************************************************************************/ + +PhyloSummary::PhyloSummary(CountTable* c){ + try { + m = MothurOut::getInstance(); + maxLevel = 0; + ignore = true; + numSeqs = 0; + + ct = c; + groupmap = NULL; + + tree.push_back(rawTaxNode("Root")); + tree[0].rank = "0"; + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "PhyloSummary"); + exit(1); + } +} +/**************************************************************************************************/ +PhyloSummary::PhyloSummary(string refTfile, GroupMap* g){ + try { + m = MothurOut::getInstance(); + maxLevel = 0; + ignore = false; + numSeqs = 0; + + groupmap = g; + ct = NULL; //check for necessary files string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum")); @@ -46,23 +93,18 @@ PhyloSummary::PhyloSummary(string refTfile, string groupFile){ /**************************************************************************************************/ -PhyloSummary::PhyloSummary(string groupFile){ +PhyloSummary::PhyloSummary(GroupMap* g){ try { m = MothurOut::getInstance(); maxLevel = 0; ignore = true; + numSeqs = 0; - if (groupFile != "") { - groupmap = new GroupMap(groupFile); - groupmap->readMap(); - }else{ - groupmap = NULL; - } + groupmap = g; + ct = NULL; tree.push_back(rawTaxNode("Root")); tree[0].rank = "0"; - - } catch(exception& e) { m->errorOut(e, "PhyloSummary", "PhyloSummary"); @@ -78,7 +120,6 @@ int PhyloSummary::summarize(string userTfile){ for (map::iterator itTemp = temp.begin(); itTemp != temp.end();) { addSeqToTree(itTemp->first, itTemp->second); - numSeqs++; temp.erase(itTemp++); } @@ -137,7 +178,9 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ childPointer = tree[currentNode].children.find(taxon); if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on - if (groupmap != NULL) { + int thisCount = 1; + + if (groupmap != NULL) { //find out the sequences group string group = groupmap->getGroup(seqName); @@ -150,9 +193,27 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ if (itGroup != tree[childPointer->second].groupCount.end()) { tree[childPointer->second].groupCount[group]++; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + vector groupCounts = ct->getGroupCounts(seqName); + vector groups = ct->getNamesOfGroups(); + for (int i = 0; i < groups.size(); i++) { + + if (groupCounts[i] != 0) { + //do you have a count for this group? + map::iterator itGroup = tree[childPointer->second].groupCount.find(groups[i]); + + //if yes, increment it - there should not be a case where we can't find it since we load group in read + if (itGroup != tree[childPointer->second].groupCount.end()) { + tree[childPointer->second].groupCount[groups[i]] += groupCounts[i]; + } + } + } + } + thisCount = ct->getNumSeqs(seqName); + } - tree[childPointer->second].total++; + tree[childPointer->second].total += thisCount; currentNode = childPointer->second; }else{ @@ -163,8 +224,8 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ tree[index].parent = currentNode; tree[index].level = (level+1); - tree[index].total = 1; tree[currentNode].children[taxon] = index; + int thisCount = 1; //initialize groupcounts if (groupmap != NULL) { @@ -184,9 +245,33 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ //if yes, increment it - there should not be a case where we can't find it since we load group in read if (itGroup != tree[index].groupCount.end()) { tree[index].groupCount[group]++; - } - } + } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + vector mGroups = ct->getNamesOfGroups(); + for (int j = 0; j < mGroups.size(); j++) { + tree[index].groupCount[mGroups[j]] = 0; + } + vector groupCounts = ct->getGroupCounts(seqName); + vector groups = ct->getNamesOfGroups(); + + for (int i = 0; i < groups.size(); i++) { + if (groupCounts[i] != 0) { + + //do you have a count for this group? + map::iterator itGroup = tree[index].groupCount.find(groups[i]); + + //if yes, increment it - there should not be a case where we can't find it since we load group in read + if (itGroup != tree[index].groupCount.end()) { + tree[index].groupCount[groups[i]]+=groupCounts[i]; + } + } + } + } + thisCount = ct->getNumSeqs(seqName); + } + tree[index].total = thisCount; currentNode = index; }else{ //otherwise, error @@ -210,7 +295,7 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ } /**************************************************************************************************/ -int PhyloSummary::addSeqToTree(string seqTaxonomy, vector names){ +int PhyloSummary::addSeqToTree(string seqTaxonomy, map containsGroup){ try { numSeqs++; @@ -235,32 +320,12 @@ int PhyloSummary::addSeqToTree(string seqTaxonomy, vector names){ childPointer = tree[currentNode].children.find(taxon); if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on - if (groupmap != NULL) { - - map containsGroup; - vector mGroups = groupmap->getNamesOfGroups(); - for (int j = 0; j < mGroups.size(); j++) { - containsGroup[mGroups[j]] = false; - } - - for (int k = 0; k < names.size(); k++) { - //find out the sequences group - string group = groupmap->getGroup(names[k]); - - if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); } - else { - containsGroup[group] = true; - } - } + for (map::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) { + if (itGroup->second == true) { + tree[childPointer->second].groupCount[itGroup->first]++; + } + } - for (map::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) { - if (itGroup->second == true) { - tree[childPointer->second].groupCount[itGroup->first]++; - } - } - - } - tree[childPointer->second].total++; currentNode = childPointer->second; @@ -274,33 +339,12 @@ int PhyloSummary::addSeqToTree(string seqTaxonomy, vector names){ tree[index].level = (level+1); tree[index].total = 1; tree[currentNode].children[taxon] = index; - - //initialize groupcounts - if (groupmap != NULL) { - map containsGroup; - vector mGroups = groupmap->getNamesOfGroups(); - for (int j = 0; j < mGroups.size(); j++) { - tree[index].groupCount[mGroups[j]] = 0; - containsGroup[mGroups[j]] = false; - } - - for (int k = 0; k < names.size(); k++) { - //find out the sequences group - string group = groupmap->getGroup(names[k]); - - if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); } - else { - containsGroup[group] = true; - } - } - - for (map::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) { - if (itGroup->second == true) { - tree[index].groupCount[itGroup->first]++; - } - } - } + for (map::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) { + if (itGroup->second == true) { + tree[index].groupCount[itGroup->first]++; + } + } currentNode = index; @@ -349,17 +393,24 @@ void PhyloSummary::print(ofstream& out){ try { if (ignore) { assignRank(0); } - + vector mGroups; //print labels out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t"; if (groupmap != NULL) { //so the labels match the counts below, since the map sorts them automatically... //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end()); - vector mGroups = groupmap->getNamesOfGroups(); + mGroups = groupmap->getNamesOfGroups(); for (int i = 0; i < mGroups.size(); i++) { out << mGroups[i] << '\t'; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + mGroups = ct->getNamesOfGroups(); + for (int i = 0; i < mGroups.size(); i++) { + out << mGroups[i] << '\t'; + } + } + } out << endl; @@ -373,9 +424,10 @@ void PhyloSummary::print(ofstream& out){ tree[0].total += tree[it->second].total; if (groupmap != NULL) { - vector mGroups = groupmap->getNamesOfGroups(); for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } - } + }else if ( ct != NULL) { + if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } } + } } } @@ -384,12 +436,10 @@ void PhyloSummary::print(ofstream& out){ if (groupmap != NULL) { - //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) { - // out << itGroup->second << '\t'; - //} - vector mGroups = groupmap->getNamesOfGroups(); - for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } - } + for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } + }else if ( ct != NULL) { + if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } } + } out << endl; //print rest @@ -427,7 +477,12 @@ void PhyloSummary::print(int i, ofstream& out){ //} vector mGroups = groupmap->getNamesOfGroups(); for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + vector mGroups = ct->getNamesOfGroups(); + for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; } + } + } out << endl; } @@ -473,7 +528,13 @@ void PhyloSummary::readTreeStruct(ifstream& in){ for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) { tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + for (int j = 0; j < (ct->getNamesOfGroups()).size(); j++) { + tree[i].groupCount[(ct->getNamesOfGroups())[j]] = 0; + } + } + } tree[i].total = 0; diff --git a/phylosummary.h b/phylosummary.h index cdec0d0..65a4674 100644 --- a/phylosummary.h +++ b/phylosummary.h @@ -13,6 +13,7 @@ #include "mothur.h" #include "mothurout.h" #include "groupmap.h" +#include "counttable.h" /**************************************************************************************************/ @@ -32,13 +33,15 @@ struct rawTaxNode { class PhyloSummary { public: - PhyloSummary(string); - PhyloSummary(string, string); - ~PhyloSummary() { if (groupmap != NULL) { delete groupmap; } } + PhyloSummary(GroupMap*); + PhyloSummary(string, GroupMap*); + PhyloSummary(CountTable*); + PhyloSummary(string, CountTable*); + ~PhyloSummary() {} int summarize(string); //pass it a taxonomy file and a group file and it makes the tree int addSeqToTree(string, string); - int addSeqToTree(string, vector); + int addSeqToTree(string, map); void print(ofstream&); int getMaxLevel() { return maxLevel; } @@ -49,6 +52,7 @@ private: void assignRank(int); void readTreeStruct(ifstream&); GroupMap* groupmap; + CountTable* ct; bool ignore; int numNodes; diff --git a/readtree.cpp b/readtree.cpp index 6fa4c3d..71c4bd5 100644 --- a/readtree.cpp +++ b/readtree.cpp @@ -20,12 +20,12 @@ ReadTree::ReadTree() { } } /***********************************************************************/ -int ReadTree::AssembleTrees(map nameMap) { +int ReadTree::AssembleTrees() { try { //assemble users trees for (int i = 0; i < Trees.size(); i++) { if (m->control_pressed) { return 0; } - Trees[i]->assembleTree(nameMap); + Trees[i]->assembleTree(); } return 0; } @@ -107,7 +107,7 @@ float ReadTree::readBranchLength(istream& f) { /***********************************************************************/ //This class reads a file in Newick form and stores it in a tree. -int ReadNewickTree::read(TreeMap* tmap) { +int ReadNewickTree::read(CountTable* ct) { try { holder = ""; int c, error; @@ -129,12 +129,12 @@ int ReadNewickTree::read(TreeMap* tmap) { } //make new tree - T = new Tree(tmap); + T = new Tree(ct); numNodes = T->getNumNodes(); numLeaves = T->getNumLeaves(); - error = readTreeString(tmap); + error = readTreeString(ct); //save trees for later commands Trees.push_back(T); @@ -143,9 +143,9 @@ int ReadNewickTree::read(TreeMap* tmap) { //if you are a nexus file }else if ((c = filehandle.peek()) == '#') { //get right number of seqs from nexus file. - Tree* temp = new Tree(tmap); delete temp; + Tree* temp = new Tree(ct); delete temp; - nexusTranslation(tmap); //reads file through the translation and updates treemap + nexusTranslation(ct); //reads file through the translation and updates treemap while((c = filehandle.peek()) != EOF) { // get past comments while ((c = filehandle.peek()) != EOF) { @@ -166,12 +166,12 @@ int ReadNewickTree::read(TreeMap* tmap) { filehandle.putback(c); //put back first ( of tree. //make new tree - T = new Tree(tmap); + T = new Tree(ct); numNodes = T->getNumNodes(); numLeaves = T->getNumLeaves(); //read tree info - error = readTreeString(tmap); + error = readTreeString(ct); //save trees for later commands Trees.push_back(T); @@ -191,7 +191,7 @@ int ReadNewickTree::read(TreeMap* tmap) { } /**************************************************************************************************/ //This function read the file through the translation of the sequences names and updates treemap. -string ReadNewickTree::nexusTranslation(TreeMap* tmap) { +string ReadNewickTree::nexusTranslation(CountTable* ct) { try { holder = ""; @@ -209,42 +209,14 @@ string ReadNewickTree::nexusTranslation(TreeMap* tmap) { filehandle >> holder; if(holder == "tree" && comment != 1){return holder;} } - - //update treemap - tmap->namesOfSeqs.clear(); - - /*char c; - string number, name; - while ((c = filehandle.peek()) != EOF) { - - filehandle >> number; - - if ((number == "tree") || (number == ";") ) { name = number; break; } - - filehandle >> name; - - char lastChar; - if (name.length() != 0) { lastChar = name[name.length()-1]; } - - if ((name == "tree") || (name == ";") ) { break; } - - if (lastChar == ',') { name.erase(name.end()-1); } //erase the comma - */ - + string number, name; for(int i=0;i> number; filehandle >> name; name.erase(name.end()-1); //erase the comma - - //insert new one with new name - string group = tmap->getGroup(name); - tmap->treemap[toString(number)].groupname = group; - tmap->treemap[toString(number)].vectorIndex = tmap->getIndex(name); - //erase old one. so treemap[sarah].groupnumber is now treemap[1].groupnumber. if number is 1 and name is sarah. - tmap->treemap.erase(name); - tmap->namesOfSeqs.push_back(number); + ct->renameSeq(name, toString(number)); } return name; @@ -256,7 +228,7 @@ string ReadNewickTree::nexusTranslation(TreeMap* tmap) { } /**************************************************************************************************/ -int ReadNewickTree::readTreeString(TreeMap* tmap) { +int ReadNewickTree::readTreeString(CountTable* ct) { try { int n = 0; @@ -269,7 +241,7 @@ int ReadNewickTree::readTreeString(TreeMap* tmap) { if(ch == '('){ n = numLeaves; //number of leaves / sequences, we want node 1 to start where the leaves left off - lc = readNewickInt(filehandle, n, T, tmap); + lc = readNewickInt(filehandle, n, T, ct); if (lc == -1) { m->mothurOut("error with lc"); m->mothurOutEndLine(); return -1; } //reports an error in reading if(filehandle.peek()==','){ @@ -281,7 +253,7 @@ int ReadNewickTree::readTreeString(TreeMap* tmap) { } if(rooted != 1){ - rc = readNewickInt(filehandle, n, T, tmap); + rc = readNewickInt(filehandle, n, T, ct); if (rc == -1) { m->mothurOut("error with rc"); m->mothurOutEndLine(); return -1; } //reports an error in reading if(filehandle.peek() == ')'){ readSpecialChar(filehandle,')',"right parenthesis"); @@ -326,7 +298,7 @@ int ReadNewickTree::readTreeString(TreeMap* tmap) { } /**************************************************************************************************/ -int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) { +int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, CountTable* ct) { try { if (m->control_pressed) { return -1; } @@ -339,7 +311,7 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) { //read all children vector childrenNodes; while(f.peek() != ')'){ - int child = readNewickInt(f, n, T, tmap); + int child = readNewickInt(f, n, T, ct); if (child == -1) { return -1; } //reports an error in reading //cout << "child = " << child << endl; childrenNodes.push_back(child); @@ -387,12 +359,7 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) { }else{ T->tree[n].setBranchLength(0.0); } - - //T->tree[n].setChildren(lc,rc); - //T->tree[lc].setParent(n); - //T->tree[rc].setParent(n); - //T->printTree(); cout << endl; - + return n++; }else{ @@ -410,33 +377,27 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) { f.putback(d); //set group info - string group = tmap->getGroup(name); + vector group = ct->getGroups(name); //find index in tree of name int n1 = T->getIndex(name); //adds sequence names that are not in group file to the "xxx" group - if(group == "not found") { + if(group.size() == 0) { m->mothurOut("Name: " + name + " is not in your groupfile, and will be disregarded. \n"); //readOk = -1; return n1; - tmap->namesOfSeqs.push_back(name); - tmap->treemap[name].groupname = "xxx"; - - map::iterator it; - it = tmap->seqsPerGroup.find("xxx"); - if (it == tmap->seqsPerGroup.end()) { //its a new group - tmap->addGroup("xxx"); - tmap->seqsPerGroup["xxx"] = 1; - }else { - tmap->seqsPerGroup["xxx"]++; - } - - group = "xxx"; - } - - vector tempGroup; tempGroup.push_back(group); - - T->tree[n1].setGroup(tempGroup); + vector currentGroups = ct->getNamesOfGroups(); + if (!m->inUsersGroups("xxx", currentGroups)) { ct->addGroup("xxx"); } + currentGroups = ct->getNamesOfGroups(); + vector thisCounts; thisCounts.resize(currentGroups.size(), 0); + for (int h = 0; h < currentGroups.size(); h++) { + if (currentGroups[h] == "xxx") { thisCounts[h] = 1; break; } + } + ct->push_back(name, thisCounts); + + group.push_back("xxx"); + } + T->tree[n1].setGroup(group); T->tree[n1].setChildren(-1,-1); if(blen == 1){ diff --git a/readtree.h b/readtree.h index 6b074de..8a69243 100644 --- a/readtree.h +++ b/readtree.h @@ -11,6 +11,7 @@ #include "mothur.h" #include "tree.h" +#include "counttable.h" #define MAX_LINE 513 #define SKIPLINE(f,c) {while((c=f.get())!=EOF && ((c) != '\n')){}} @@ -24,17 +25,17 @@ class ReadTree { ReadTree(); virtual ~ReadTree() {}; - virtual int read(TreeMap*) = 0; + virtual int read(CountTable*) = 0; int readSpecialChar(istream&, char, string); int readNodeChar(istream& f); float readBranchLength(istream& f); vector getTrees() { return Trees; } - int AssembleTrees(map); + int AssembleTrees(); protected: vector Trees; - TreeMap* treeMap; + CountTable* ct; int numNodes, numLeaves; MothurOut* m; @@ -48,13 +49,13 @@ class ReadNewickTree : public ReadTree { public: ReadNewickTree(string file) : treeFile(file) { m->openInputFile(file, filehandle); readOk = 0; } ~ReadNewickTree() {}; - int read(TreeMap*); + int read(CountTable*); private: Tree* T; - int readNewickInt(istream&, int&, Tree*, TreeMap*); - int readTreeString(TreeMap*); - string nexusTranslation(TreeMap*); + int readNewickInt(istream&, int&, Tree*, CountTable*); + int readTreeString(CountTable*); + string nexusTranslation(CountTable*); ifstream filehandle; string treeFile; string holder; diff --git a/secondarystructurecommand.cpp b/secondarystructurecommand.cpp index ee50ab1..869df02 100644 --- a/secondarystructurecommand.cpp +++ b/secondarystructurecommand.cpp @@ -15,7 +15,8 @@ vector AlignCheckCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); CommandParameter pmap("map", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pmap); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); vector myArray; @@ -31,7 +32,7 @@ vector AlignCheckCommand::setParameters(){ string AlignCheckCommand::getHelpString(){ try { string helpString = ""; - helpString += "The align.check command reads a fasta file and map file.\n"; + helpString += "The align.check command reads a fasta file and map file as well as an optional name file.\n"; helpString += "It outputs a file containing the secondary structure matches in the .align.check file.\n"; helpString += "The align.check command parameters are fasta and map, both are required.\n"; helpString += "The align.check command should be in the following format: align.check(fasta=yourFasta, map=yourMap).\n"; diff --git a/sensspeccommand.cpp b/sensspeccommand.cpp index cfa1f5b..b62bb00 100644 --- a/sensspeccommand.cpp +++ b/sensspeccommand.cpp @@ -14,7 +14,6 @@ vector SensSpecCommand::setParameters(){ try { CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip); - //CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname); CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pcolumn); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pcutoff("cutoff", "Number", "", "-1.00", "", "", "",false,false); parameters.push_back(pcutoff); @@ -136,16 +135,7 @@ SensSpecCommand::SensSpecCommand(string option) { path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["column"] = inputDir + it->second; } - } - - //it = parameters.find("name"); - //user has given a template file - //if(it != parameters.end()){ - //path = m->hasPath(it->second); - //if the user has not given a path then, add inputdir. else leave path alone. - //if (path == "") { parameters["name"] = inputDir + it->second; } - //} - + } } //check for required parameters listFile = validParameter.validFile(parameters, "list", true); @@ -196,12 +186,6 @@ SensSpecCommand::SensSpecCommand(string option) { else if(!m->isTrue(temp)) { hard = 0; } else if(m->isTrue(temp)) { hard = 1; } -// temp = validParameter.validFile(parameters, "name", true); -// if (temp == "not found") { nameFile = ""; } -// else if(temp == "not open") { abort = true; } -// else { nameFile = temp; } -// cout << "name:\t" << nameFile << endl; - temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "-1.00"; } m->mothurConvert(temp, cutoff); // cout << cutoff << endl; diff --git a/shhhercommand.cpp b/shhhercommand.cpp index fe0685b..19ffc89 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -1931,7 +1931,7 @@ void ShhherCommand::writeClusters(vector otuCounts){ int ShhherCommand::execute(){ try { - if (abort == true) { return 0; } + if (abort == true) { if (calledHelp) { return 0; } return 2; } getSingleLookUp(); if (m->control_pressed) { return 0; } getJointLookUp(); if (m->control_pressed) { return 0; } diff --git a/subsample.cpp b/subsample.cpp index 261297d..c55accd 100644 --- a/subsample.cpp +++ b/subsample.cpp @@ -8,62 +8,54 @@ #include "subsample.h" //********************************************************************************************************************** -Tree* SubSample::getSample(Tree* T, TreeMap* tmap, TreeMap* newTmap, int size, map originalNameMap) { +Tree* SubSample::getSample(Tree* T, CountTable* ct, CountTable* newCt, int size) { try { Tree* newTree = NULL; - map > newGroups; - vector subsampledSeqs = getSample(tmap, size, newGroups); + //remove seqs not in sample from counttable + vector Groups = ct->getNamesOfGroups(); + newCt->copy(ct); + newCt->addGroup("doNotIncludeMe"); - //remove seqs not in sample from treemap - for (map >::iterator it = newGroups.begin(); it != newGroups.end(); it++) { - for (int i = 0; i < (it->second).size(); i++) { - newTmap->addSeq((it->second)[i], it->first); - } - } - - newTree = new Tree(newTmap); - newTree->getCopy(T, originalNameMap); - - return newTree; - } - catch(exception& e) { - m->errorOut(e, "SubSample", "getSample-Tree"); - exit(1); - } -} -/********************************************************************************************************************** -Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, int size) { - try { - Tree* newTree = NULL; - - vector subsampledSeqs = getSample(tmap, size); - map sampledNameMap = deconvolute(whole, subsampledSeqs); + map doNotIncludeTotals; + vector namesSeqs = ct->getNamesOfSeqs(); + for (int i = 0; i < namesSeqs.size(); i++) { doNotIncludeTotals[namesSeqs[i]] = 0; } + + for (int i = 0; i < Groups.size(); i++) { + if (m->inUsersGroups(Groups[i], m->getGroups())) { + if (m->control_pressed) { break; } - //remove seqs not in sample from treemap - for (int i = 0; i < tmap->namesOfSeqs.size(); i++) { - //is that name in the subsample? - int count = 0; - for (int j = 0; j < subsampledSeqs.size(); j++) { - if (tmap->namesOfSeqs[i] == subsampledSeqs[j]) { break; } //found it - count++; + int thisSize = ct->getGroupCount(Groups[i]); + + if (thisSize >= size) { + + vector names = ct->getNamesOfSeqs(Groups[i]); + vector random; + for (int j = 0; j < names.size(); j++) { + int num = ct->getGroupCount(names[j], Groups[i]); + for (int k = 0; k < num; k++) { random.push_back(j); } + } + random_shuffle(random.begin(), random.end()); + + vector sampleRandoms; sampleRandoms.resize(names.size(), 0); + for (int j = 0; j < size; j++) { sampleRandoms[random[j]]++; } + for (int j = 0; j < sampleRandoms.size(); j++) { + newCt->setAbund(names[j], Groups[i], sampleRandoms[j]); + } + sampleRandoms.clear(); sampleRandoms.resize(names.size(), 0); + for (int j = size; j < thisSize; j++) { sampleRandoms[random[j]]++; } + for (int j = 0; j < sampleRandoms.size(); j++) { doNotIncludeTotals[names[j]] += sampleRandoms[j]; } + }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } } - if (m->control_pressed) { return newTree; } - - //if you didnt find it, remove it - if (count == subsampledSeqs.size()) { - tmap->removeSeq(tmap->namesOfSeqs[i]); - i--; //need this because removeSeq removes name from namesOfSeqs - } } - //create new tree - int numUniques = sampledNameMap.size(); - if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); } + for (map::iterator it = doNotIncludeTotals.begin(); it != doNotIncludeTotals.end(); it++) { + newCt->setAbund(it->first, "doNotIncludeMe", it->second); + } - newTree = new Tree(numUniques, tmap); //numNodes, treemap - newTree->getSubTree(T, subsampledSeqs, sampledNameMap); + newTree = new Tree(newCt); + newTree->getCopy(T, true); return newTree; } @@ -71,7 +63,7 @@ Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, in m->errorOut(e, "SubSample", "getSample-Tree"); exit(1); } -}*/ +} //********************************************************************************************************************** //assumes whole maps dupName -> uniqueName map SubSample::deconvolute(map whole, vector& wanted) { @@ -112,100 +104,6 @@ map SubSample::deconvolute(map whole, vector SubSample::getSample(TreeMap* tMap, int size, map >& sample) { - try { - vector temp2; - sample["doNotIncludeMe"] = temp2; - - vector namesInSample; - - vector Groups = tMap->getNamesOfGroups(); - for (int i = 0; i < Groups.size(); i++) { - - if (m->inUsersGroups(Groups[i], m->getGroups())) { - if (m->control_pressed) { break; } - - vector thisGroup; thisGroup.push_back(Groups[i]); - vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); - int thisSize = thisGroupsSeqs.size(); - vector temp; - sample[Groups[i]] = temp; - - if (thisSize >= size) { - - random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end()); - - for (int j = 0; j < size; j++) { sample[Groups[i]].push_back(thisGroupsSeqs[j]); namesInSample.push_back(thisGroupsSeqs[j]); } - for (int j = size; j < thisSize; j++) { sample["doNotIncludeMe"].push_back(thisGroupsSeqs[j]); } - - }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } - } - } - - return namesInSample; - } - catch(exception& e) { - m->errorOut(e, "SubSample", "getSample-TreeMap"); - exit(1); - } -} - -//********************************************************************************************************************** -vector SubSample::getSample(TreeMap* tMap, int size) { - try { - vector sample; - - vector Groups = tMap->getNamesOfGroups(); - for (int i = 0; i < Groups.size(); i++) { - - if (m->inUsersGroups(Groups[i], m->getGroups())) { - if (m->control_pressed) { break; } - - vector thisGroup; thisGroup.push_back(Groups[i]); - vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); - int thisSize = thisGroupsSeqs.size(); - - if (thisSize >= size) { - - random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end()); - - for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); } - }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } - } - } - - return sample; - } - catch(exception& e) { - m->errorOut(e, "SubSample", "getSample-TreeMap"); - exit(1); - } -} -//********************************************************************************************************************** -vector SubSample::getSample(TreeMap* tMap, vector Groups) { - try { - vector sample; - - //vector Groups = tMap->getNamesOfGroups(); - for (int i = 0; i < Groups.size(); i++) { - - if (m->control_pressed) { break; } - - vector thisGroup; thisGroup.push_back(Groups[i]); - vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); - int thisSize = thisGroupsSeqs.size(); - - for (int j = 0; j < thisSize; j++) { sample.push_back(thisGroupsSeqs[j]); } - } - - return sample; - } - catch(exception& e) { - m->errorOut(e, "SubSample", "getSample-TreeMap"); - exit(1); - } -} -//********************************************************************************************************************** vector SubSample::getSample(vector& thislookup, int size) { try { diff --git a/subsample.h b/subsample.h index b00f1a7..55abed9 100644 --- a/subsample.h +++ b/subsample.h @@ -13,6 +13,8 @@ #include "sharedrabundvector.h" #include "treemap.h" #include "tree.h" +#include "counttable.h" + //subsampling overwrites the sharedRabunds. If you need to reuse the original use the getSamplePreserve function. @@ -24,20 +26,14 @@ class SubSample { ~SubSample() {} vector getSample(vector&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times. Overwrites original vector passed in, if you need to preserve it deep copy first. - - //Tree* getSample(Tree*, TreeMap*, map, int); //creates new subsampled tree, destroys treemap so copy if needed. - Tree* getSample(Tree*, TreeMap*, TreeMap*, int, map); //creates new subsampled tree. Uses first treemap to fill new treemap with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe". + Tree* getSample(Tree*, CountTable*, CountTable*, int); //creates new subsampled tree. Uses first counttable to fill new counttable with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe". int getSample(SAbundVector*&, int); //destroys sabundvector passed in, so copy it if you need it private: MothurOut* m; int eliminateZeroOTUS(vector&); - - vector getSample(TreeMap*, vector); - vector getSample(TreeMap*, int); //names of seqs to include in sample tree - vector getSample(TreeMap* tMap, int size, map >& sample); //sample maps group -> seqs in group. seqs not in sample are in doNotIncludeMe group - map deconvolute(map wholeSet, vector& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap. + map deconvolute(map wholeSet, vector& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap. }; diff --git a/summarytaxcommand.cpp b/summarytaxcommand.cpp index 3e16e74..e932eee 100644 --- a/summarytaxcommand.cpp +++ b/summarytaxcommand.cpp @@ -14,8 +14,9 @@ vector SummaryTaxCommand::setParameters(){ try { CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -34,9 +35,10 @@ string SummaryTaxCommand::getHelpString(){ try { string helpString = ""; helpString += "The summary.tax command reads a taxonomy file and an optional name file, and summarizes the taxonomy information.\n"; - helpString += "The summary.tax command parameters are taxonomy, group and name. taxonomy is required, unless you have a valid current taxonomy file.\n"; + helpString += "The summary.tax command parameters are taxonomy, count, group and name. taxonomy is required, unless you have a valid current taxonomy file.\n"; helpString += "The name parameter allows you to enter a name file associated with your taxonomy file. \n"; helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n"; + helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n"; helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. It is not required, but providing it will keep the rankIDs in the summary file static.\n"; helpString += "The summary.tax command should be in the following format: \n"; helpString += "summary.tax(taxonomy=yourTaxonomyFile) \n"; @@ -142,6 +144,14 @@ SummaryTaxCommand::SummaryTaxCommand(string option) { if (path == "") { parameters["reftaxonomy"] = inputDir + it->second; } } + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } + } //initialize outputTypes @@ -166,7 +176,20 @@ SummaryTaxCommand::SummaryTaxCommand(string option) { if (groupfile == "not open") { groupfile = ""; abort = true; } else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true); if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); } else if (refTaxonomy == "not open") { refTaxonomy = ""; abort = true; } @@ -177,11 +200,12 @@ SummaryTaxCommand::SummaryTaxCommand(string option) { outputDir += m->hasPath(taxfile); //if user entered a file with a path then preserve it } - if (namefile == "") { - vector files; files.push_back(taxfile); - parser.getNameFile(files); + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(taxfile); + parser.getNameFile(files); + } } - } } catch(exception& e) { @@ -197,23 +221,35 @@ int SummaryTaxCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } int start = time(NULL); - PhyloSummary* taxaSum; - if (refTaxonomy != "") { - taxaSum = new PhyloSummary(refTaxonomy, groupfile); - }else { - taxaSum = new PhyloSummary(groupfile); - } + GroupMap* groupMap = NULL; + CountTable* ct = NULL; + if (groupfile != "") { + groupMap = new GroupMap(groupfile); + groupMap->readMap(); + }else if (countfile != "") { + ct = new CountTable(); + ct->readTable(countfile); + } - if (m->control_pressed) { delete taxaSum; return 0; } + PhyloSummary* taxaSum; + if (countfile != "") { + if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, ct); } + else { taxaSum = new PhyloSummary(ct); } + }else { + if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap); } + else { taxaSum = new PhyloSummary(groupMap); } + } + + if (m->control_pressed) { if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } delete taxaSum; return 0; } int numSeqs = 0; - if (namefile == "") { numSeqs = taxaSum->summarize(taxfile); } - else { + if ((namefile == "") || (countfile != "")) { numSeqs = taxaSum->summarize(taxfile); } + else if (namefile != "") { map > nameMap; map >::iterator itNames; m->readNames(namefile, nameMap); - if (m->control_pressed) { delete taxaSum; return 0; } + if (m->control_pressed) { if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } delete taxaSum; return 0; } ifstream in; m->openInputFile(taxfile, in); @@ -222,6 +258,9 @@ int SummaryTaxCommand::execute(){ string name, taxon; while(!in.eof()){ + + if (m->control_pressed) { break; } + in >> name >> taxon; m->gobble(in); itNames = nameMap.find(name); @@ -240,7 +279,7 @@ int SummaryTaxCommand::execute(){ in.close(); } - if (m->control_pressed) { delete taxaSum; return 0; } + if (m->control_pressed) { if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } delete taxaSum; return 0; } //print summary file ofstream outTaxTree; @@ -250,6 +289,7 @@ int SummaryTaxCommand::execute(){ outTaxTree.close(); delete taxaSum; + if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } diff --git a/summarytaxcommand.h b/summarytaxcommand.h index 5f0630f..e8033e2 100644 --- a/summarytaxcommand.h +++ b/summarytaxcommand.h @@ -11,6 +11,7 @@ */ #include "command.hpp" +#include "counttable.h" /**************************************************************************************************/ @@ -33,7 +34,7 @@ class SummaryTaxCommand : public Command { private: bool abort; - string taxfile, outputDir, namefile, groupfile, refTaxonomy; + string taxfile, outputDir, namefile, groupfile, refTaxonomy, countfile; vector outputNames; map nameMap; }; diff --git a/tree.cpp b/tree.cpp index 44ecadd..0bd98e0 100644 --- a/tree.cpp +++ b/tree.cpp @@ -10,7 +10,7 @@ #include "tree.h" /*****************************************************************/ -Tree::Tree(int num, TreeMap* t) : tmap(t) { +Tree::Tree(int num, CountTable* t) : ct(t) { try { m = MothurOut::getInstance(); @@ -36,21 +36,20 @@ Tree::Tree(string g) { //do not use tree generated by this its just to extract t } } /*****************************************************************/ -Tree::Tree(TreeMap* t) : tmap(t) { +Tree::Tree(CountTable* t) : ct(t) { try { m = MothurOut::getInstance(); if (m->runParse == true) { parseTreeFile(); m->runParse = false; } -//for(int i = 0; i < globaldata->Treenames.size(); i++) { cout << i << '\t' << globaldata->Treenames[i] << endl; } + numLeaves = m->Treenames.size(); numNodes = 2*numLeaves - 1; tree.resize(numNodes); //initialize groupNodeInfo - for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) { - groupNodeInfo[(tmap->getNamesOfGroups())[i]].resize(0); - } + vector namesOfGroups = ct->getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { groupNodeInfo[namesOfGroups[i]].resize(0); } //initialize tree with correct number of nodes, name and group info. for (int i = 0; i < numNodes; i++) { @@ -59,19 +58,35 @@ Tree::Tree(TreeMap* t) : tmap(t) { tree[i].setName(m->Treenames[i]); //save group info - string group = tmap->getGroup(m->Treenames[i]); - - vector tempGroups; tempGroups.push_back(group); - tree[i].setGroup(tempGroups); - groupNodeInfo[group].push_back(i); - - //set pcount and pGroup for groupname to 1. - tree[i].pcount[group] = 1; - tree[i].pGroups[group] = 1; - - //Treemap knows name, group and index to speed up search - tmap->setIndex(m->Treenames[i], i); - + int maxPars = 1; + vector group; + vector counts = ct->getGroupCounts(m->Treenames[i]); + for (int j = 0; j < namesOfGroups.size(); j++) { + if (counts[j] != 0) { //you have seqs from this group + groupNodeInfo[namesOfGroups[j]].push_back(i); + group.push_back(namesOfGroups[j]); + tree[i].pGroups[namesOfGroups[j]] = counts[j]; + tree[i].pcount[namesOfGroups[j]] = counts[j]; + //keep highest group + if(counts[j] > maxPars){ maxPars = counts[j]; } + } + } + tree[i].setGroup(group); + setIndex(m->Treenames[i], i); + + if (maxPars > 1) { //then we have some more dominant groups + //erase all the groups that are less than maxPars because you found a more dominant group. + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){ + if(it->second < maxPars){ + tree[i].pGroups.erase(it++); + }else { it++; } + } + //set one remaining groups to 1 + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){ + tree[i].pGroups[it->first] = 1; + } + }//end if + //intialize non leaf nodes }else if (i > (numLeaves-1)) { tree[i].setName(""); @@ -87,7 +102,7 @@ Tree::Tree(TreeMap* t) : tmap(t) { } } /*****************************************************************/ -Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { +Tree::Tree(CountTable* t, vector< vector >& sims) : ct(t) { try { m = MothurOut::getInstance(); @@ -98,9 +113,8 @@ Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { tree.resize(numNodes); //initialize groupNodeInfo - for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) { - groupNodeInfo[(tmap->getNamesOfGroups())[i]].resize(0); - } + vector namesOfGroups = ct->getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { groupNodeInfo[namesOfGroups[i]].resize(0); } //initialize tree with correct number of nodes, name and group info. for (int i = 0; i < numNodes; i++) { @@ -109,18 +123,34 @@ Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { tree[i].setName(m->Treenames[i]); //save group info - string group = tmap->getGroup(m->Treenames[i]); - - vector tempGroups; tempGroups.push_back(group); - tree[i].setGroup(tempGroups); - groupNodeInfo[group].push_back(i); - - //set pcount and pGroup for groupname to 1. - tree[i].pcount[group] = 1; - tree[i].pGroups[group] = 1; - - //Treemap knows name, group and index to speed up search - tmap->setIndex(m->Treenames[i], i); + int maxPars = 1; + vector group; + vector counts = ct->getGroupCounts(m->Treenames[i]); + for (int j = 0; j < namesOfGroups.size(); j++) { + if (counts[j] != 0) { //you have seqs from this group + groupNodeInfo[namesOfGroups[j]].push_back(i); + group.push_back(namesOfGroups[j]); + tree[i].pGroups[namesOfGroups[j]] = counts[j]; + tree[i].pcount[namesOfGroups[j]] = counts[j]; + //keep highest group + if(counts[j] > maxPars){ maxPars = counts[j]; } + } + } + tree[i].setGroup(group); + setIndex(m->Treenames[i], i); + + if (maxPars > 1) { //then we have some more dominant groups + //erase all the groups that are less than maxPars because you found a more dominant group. + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){ + if(it->second < maxPars){ + tree[i].pGroups.erase(it++); + }else { it++; } + } + //set one remaining groups to 1 + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){ + tree[i].pGroups[it->first] = 1; + } + }//end if //intialize non leaf nodes }else if (i > (numLeaves-1)) { @@ -129,11 +159,12 @@ Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { tree[i].setGroup(tempGroups); } } + //build tree from matrix //initialize indexes - map indexes; //maps row in simMatrix to vector index in the tree - for (int g = 0; g < numLeaves; g++) { indexes[g] = g; } + map thisIndexes; //maps row in simMatrix to vector index in the tree + for (int g = 0; g < numLeaves; g++) { thisIndexes[g] = g; } //do merges and create tree structure by setting parents and children //there are numGroups - 1 merges to do @@ -152,26 +183,26 @@ Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { //set non-leaf node info and update leaves to know their parents //non-leaf - tree[numLeaves + i].setChildren(indexes[row], indexes[column]); + tree[numLeaves + i].setChildren(thisIndexes[row], thisIndexes[column]); //parents - tree[indexes[row]].setParent(numLeaves + i); - tree[indexes[column]].setParent(numLeaves + i); + tree[thisIndexes[row]].setParent(numLeaves + i); + tree[thisIndexes[column]].setParent(numLeaves + i); //blength = distance / 2; float blength = ((1.0 - largest) / 2); //branchlengths - tree[indexes[row]].setBranchLength(blength - tree[indexes[row]].getLengthToLeaves()); - tree[indexes[column]].setBranchLength(blength - tree[indexes[column]].getLengthToLeaves()); + tree[thisIndexes[row]].setBranchLength(blength - tree[thisIndexes[row]].getLengthToLeaves()); + tree[thisIndexes[column]].setBranchLength(blength - tree[thisIndexes[column]].getLengthToLeaves()); //set your length to leaves to your childs length plus branchlength - tree[numLeaves + i].setLengthToLeaves(tree[indexes[row]].getLengthToLeaves() + tree[indexes[row]].getBranchLength()); + tree[numLeaves + i].setLengthToLeaves(tree[thisIndexes[row]].getLengthToLeaves() + tree[thisIndexes[row]].getBranchLength()); //update index - indexes[row] = numLeaves+i; - indexes[column] = numLeaves+i; + thisIndexes[row] = numLeaves+i; + thisIndexes[column] = numLeaves+i; //remove highest value that caused the merge. sims[row][column] = -1000.0; @@ -200,7 +231,7 @@ Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { } /*****************************************************************/ Tree::~Tree() {} -/*****************************************************************/ +/***************************************************************** void Tree::addNamesToCounts(map nameMap) { try { //ex. seq1 seq2,seq3,se4 @@ -297,15 +328,15 @@ void Tree::addNamesToCounts(map nameMap) { m->errorOut(e, "Tree", "addNamesToCounts"); exit(1); } -} +}*/ /*****************************************************************/ int Tree::getIndex(string searchName) { try { - //Treemap knows name, group and index to speed up search - // getIndex function will return the vector index or -1 if seq is not found. - int index = tmap->getIndex(searchName); - return index; - + map::iterator itIndex = indexes.find(searchName); + if (itIndex != indexes.end()) { + return itIndex->second; + } + return -1; } catch(exception& e) { m->errorOut(e, "Tree", "getIndex"); @@ -316,8 +347,10 @@ int Tree::getIndex(string searchName) { void Tree::setIndex(string searchName, int index) { try { - //set index in treemap - tmap->setIndex(searchName, index); + map::iterator itIndex = indexes.find(searchName); + if (itIndex == indexes.end()) { + indexes[searchName] = index; + } } catch(exception& e) { m->errorOut(e, "Tree", "setIndex"); @@ -325,14 +358,8 @@ void Tree::setIndex(string searchName, int index) { } } /*****************************************************************/ -int Tree::assembleTree(map nameMap) { - try { - //save for later - names = nameMap; - - //if user has given a names file we want to include that info in the pgroups and pcount info. - if(nameMap.size() != 0) { addNamesToCounts(nameMap); } - +int Tree::assembleTree() { + try { //build the pGroups in non leaf nodes to be used in the parsimony calcs. for (int i = numLeaves; i < numNodes; i++) { if (m->control_pressed) { return 1; } @@ -348,66 +375,66 @@ int Tree::assembleTree(map nameMap) { exit(1); } } -/***************************************************************** -int Tree::assembleTree(string n) { - try { - - //build the pGroups in non leaf nodes to be used in the parsimony calcs. - for (int i = numLeaves; i < numNodes; i++) { - if (m->control_pressed) { return 1; } - - tree[i].pGroups = (mergeGroups(i)); - tree[i].pcount = (mergeGcounts(i)); - } - //float B = clock(); - //cout << "assembleTree\t" << (B-A) / CLOCKS_PER_SEC << endl; - return 0; - } - catch(exception& e) { - m->errorOut(e, "Tree", "assembleTree"); - exit(1); - } -} /*****************************************************************/ //assumes leaf node names are in groups and no names file - used by indicator command void Tree::getSubTree(Tree* Ctree, vector Groups) { try { //copy Tree since we are going to destroy it - Tree* copy = new Tree(tmap); + Tree* copy = new Tree(ct); copy->getCopy(Ctree); - map empty; - copy->assembleTree(empty); + copy->assembleTree(); //we want to select some of the leaf nodes to create the output tree //go through the input Tree starting at parents of leaves + //initialize groupNodeInfo + vector namesOfGroups = ct->getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { groupNodeInfo[namesOfGroups[i]].resize(0); } + + //initialize tree with correct number of nodes, name and group info. for (int i = 0; i < numNodes; i++) { - //initialize leaf nodes if (i <= (numLeaves-1)) { tree[i].setName(Groups[i]); //save group info - string group = tmap->getGroup(Groups[i]); - vector tempGroups; tempGroups.push_back(group); - tree[i].setGroup(tempGroups); - groupNodeInfo[group].push_back(i); - - //set pcount and pGroup for groupname to 1. - tree[i].pcount[group] = 1; - tree[i].pGroups[group] = 1; - - //Treemap knows name, group and index to speed up search - tmap->setIndex(Groups[i], i); - - //intialize non leaf nodes + int maxPars = 1; + vector group; + vector counts = ct->getGroupCounts(Groups[i]); + for (int j = 0; j < namesOfGroups.size(); j++) { + if (counts[j] != 0) { //you have seqs from this group + groupNodeInfo[namesOfGroups[j]].push_back(i); + group.push_back(namesOfGroups[j]); + tree[i].pGroups[namesOfGroups[j]] = counts[j]; + tree[i].pcount[namesOfGroups[j]] = counts[j]; + //keep highest group + if(counts[j] > maxPars){ maxPars = counts[j]; } + } + } + tree[i].setGroup(group); + setIndex(Groups[i], i); + + if (maxPars > 1) { //then we have some more dominant groups + //erase all the groups that are less than maxPars because you found a more dominant group. + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){ + if(it->second < maxPars){ + tree[i].pGroups.erase(it++); + }else { it++; } + } + //set one remaining groups to 1 + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){ + tree[i].pGroups[it->first] = 1; + } + }//end if + + //intialize non leaf nodes }else if (i > (numLeaves-1)) { tree[i].setName(""); vector tempGroups; tree[i].setGroup(tempGroups); } } - + set removedLeaves; for (int i = 0; i < copy->getNumLeaves(); i++) { @@ -534,7 +561,7 @@ void Tree::getSubTree(Tree* Ctree, vector Groups) { exit(1); } } -/*****************************************************************/ +/***************************************************************** //assumes nameMap contains unique names as key or is empty. //assumes numLeaves defined in tree constructor equals size of seqsToInclude and seqsToInclude only contains unique seqs. int Tree::getSubTree(Tree* copy, vector seqsToInclude, map nameMap) { @@ -578,7 +605,7 @@ int Tree::populateNewTree(vector& oldtree, int node, int& index) { return (index++); }else { //you are a leaf - int indexInNewTree = tmap->getIndex(oldtree[node].getName()); + int indexInNewTree = getIndex(oldtree[node].getName()); return indexInNewTree; } } @@ -588,7 +615,7 @@ int Tree::populateNewTree(vector& oldtree, int node, int& index) { } } /*****************************************************************/ -void Tree::getCopy(Tree* copy, map nameMap) { +void Tree::getCopy(Tree* copy, bool subsample) { try { //for each node in the tree copy its info @@ -602,8 +629,6 @@ void Tree::getCopy(Tree* copy, map nameMap) { //copy children tree[i].setChildren(copy->tree[i].getLChild(), copy->tree[i].getRChild()); } - - if (nameMap.size() != 0) { addNamesToCounts(nameMap); } //build the pGroups in non leaf nodes to be used in the parsimony calcs. for (int i = numLeaves; i < numNodes; i++) { @@ -640,8 +665,8 @@ void Tree::getCopy(Tree* copy) { tree[i].setChildren(copy->tree[i].getLChild(), copy->tree[i].getRChild()); //copy index in node and tmap + setIndex(copy->tree[i].getName(), getIndex(copy->tree[i].getName())); tree[i].setIndex(copy->tree[i].getIndex()); - setIndex(copy->tree[i].getName(), getIndex(copy->tree[i].getName())); //copy pGroups tree[i].pGroups = copy->tree[i].pGroups; @@ -805,8 +830,8 @@ void Tree::randomLabels(vector g) { try { //initialize groupNodeInfo - for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) { - groupNodeInfo[(tmap->getNamesOfGroups())[i]].resize(0); + for (int i = 0; i < (ct->getNamesOfGroups()).size(); i++) { + groupNodeInfo[(ct->getNamesOfGroups())[i]].resize(0); } for(int i = 0; i < numLeaves; i++){ @@ -868,23 +893,20 @@ void Tree::randomBlengths() { /*************************************************************************************************/ void Tree::assembleRandomUnifracTree(vector g) { randomLabels(g); - map empty; - assembleTree(empty); + assembleTree(); } /*************************************************************************************************/ void Tree::assembleRandomUnifracTree(string groupA, string groupB) { vector temp; temp.push_back(groupA); temp.push_back(groupB); randomLabels(temp); - map empty; - assembleTree(empty); + assembleTree(); } /*************************************************************************************************/ //for now it's just random topology but may become random labels as well later that why this is such a simple function now... void Tree::assembleRandomTree() { randomTopology(); - map empty; - assembleTree(empty); + assembleTree(); } /**************************************************************************************************/ @@ -1103,16 +1125,16 @@ void Tree::printBranch(int node, ostream& out, string mode) { } } }else { //you are a leaf - string leafGroup = tmap->getGroup(tree[node].getName()); + vector leafGroup = ct->getGroups(tree[node].getName()); if (mode == "branch") { - out << leafGroup; + out << leafGroup[0]; //if there is a branch length then print it if (tree[node].getBranchLength() != -1) { out << ":" << tree[node].getBranchLength(); } }else if (mode == "boot") { - out << leafGroup; + out << leafGroup[0]; //if there is a label then print it if (tree[node].getLabel() != -1) { out << tree[node].getLabel(); @@ -1166,16 +1188,16 @@ void Tree::printBranch(int node, ostream& out, string mode, vector& theseN } } }else { //you are a leaf - string leafGroup = tmap->getGroup(theseNodes[node].getName()); + vector leafGroup = ct->getGroups(theseNodes[node].getName()); if (mode == "branch") { - out << leafGroup; + out << leafGroup[0]; //if there is a branch length then print it if (theseNodes[node].getBranchLength() != -1) { out << ":" << theseNodes[node].getBranchLength(); } }else if (mode == "boot") { - out << leafGroup; + out << leafGroup[0]; //if there is a label then print it if (theseNodes[node].getLabel() != -1) { out << theseNodes[node].getLabel(); diff --git a/tree.h b/tree.h index 03da5f6..88e49c0 100644 --- a/tree.h +++ b/tree.h @@ -11,22 +11,22 @@ */ #include "treenode.h" -#include "treemap.h" +#include "counttable.h" /* This class represents the treefile. */ class Tree { public: Tree(string); //do not use tree generated by this constructor its just to extract the treenames, its a chicken before the egg thing that needs to be revisited. - Tree(int, TreeMap*); - Tree(TreeMap*); //to generate a tree from a file - Tree(TreeMap*, vector< vector >&); //create tree from sim matrix + Tree(int, CountTable*); + Tree(CountTable*); //to generate a tree from a file + Tree(CountTable*, vector< vector >&); //create tree from sim matrix ~Tree(); - TreeMap* getTreeMap() { return tmap; } + CountTable* getCountTable() { return ct; } void getCopy(Tree*); //makes tree a copy of the one passed in. - void getCopy(Tree* copy, map); //makes a copy of the tree structure passed in, (just parents, children and br). Used with the Tree(TreeMap*) constructor. Assumes the tmap already has set seqs groups you want. Used by subsample to reassign seqs you don't want included to group "doNotIncludeMe". + void getCopy(Tree* copy, bool); //makes a copy of the tree structure passed in, (just parents, children and br). Used with the Tree(TreeMap*) constructor. Assumes the tmap already has set seqs groups you want. Used by subsample to reassign seqs you don't want included to group "doNotIncludeMe". void getSubTree(Tree*, vector); //makes tree a that contains only the names passed in. - int getSubTree(Tree* originalToCopy, vector seqToInclude, map nameMap); //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. + //int getSubTree(Tree* originalToCopy, vector seqToInclude, map nameMap); //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. void assembleRandomTree(); void assembleRandomUnifracTree(vector); @@ -45,21 +45,22 @@ public: int findRoot(); //return index of root node //this function takes the leaf info and populates the non leaf nodes - int assembleTree(map); + int assembleTree(); vector tree; //the first n nodes are the leaves, where n is the number of sequences. map< string, vector > groupNodeInfo; //maps group to indexes of leaf nodes with that group, different groups may contain same node because of names file. private: - TreeMap* tmap; + CountTable* ct; int numNodes, numLeaves; ofstream out; string filename; - map names; + //map names; map::iterator it, it2; map mergeGroups(int); //returns a map with a groupname and the number of times that group was seen in the children map mergeGcounts(int); + map indexes; //maps seqName -> index in tree vector void addNamesToCounts(map); void randomTopology(); diff --git a/treegroupscommand.cpp b/treegroupscommand.cpp index 6633e51..fb4887c 100644 --- a/treegroupscommand.cpp +++ b/treegroupscommand.cpp @@ -287,7 +287,7 @@ TreeGroupCommand::~TreeGroupCommand(){ if (abort == false) { if (format == "sharedfile") { delete input; } else { delete list; } - delete tmap; + delete ct; } } @@ -400,8 +400,16 @@ int TreeGroupCommand::execute(){ m->runParse = false; //create treemap class from groupmap for tree class to use - tmap = new TreeMap(); - tmap->makeSim(m->getAllGroups()); + ct = new CountTable(); + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < m->getAllGroups().size(); i++) { + nameMap.insert(m->getAllGroups()[i]); + gps.insert(m->getAllGroups()[i]); + groupMap[m->getAllGroups()[i]] = m->getAllGroups()[i]; + } + ct->createTable(nameMap, groupMap, gps); //clear globaldatas old tree names if any m->Treenames.clear(); @@ -429,22 +437,26 @@ int TreeGroupCommand::execute(){ nameMap = new NameAssignment(namefile); nameMap->readMap(); } - else{ - nameMap = NULL; - } + else{ nameMap = NULL; } readMatrix->read(nameMap); list = readMatrix->getListVector(); SparseDistanceMatrix* dMatrix = readMatrix->getDMatrix(); //make treemap - tmap = new TreeMap(); - - if (m->control_pressed) { return 0; } - - tmap->makeSim(list); + ct = new CountTable(); + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < list->getNumBins(); i++) { + string bin = list->get(i); + nameMap.insert(bin); + gps.insert(bin); + groupMap[bin] = bin; + } + ct->createTable(nameMap, groupMap, gps); - vector namesGroups = tmap->getNamesOfGroups(); + vector namesGroups = ct->getNamesOfGroups(); m->setGroups(namesGroups); //clear globaldatas old tree names if any @@ -505,13 +517,12 @@ int TreeGroupCommand::execute(){ Tree* TreeGroupCommand::createTree(vector< vector >& simMatrix){ try { //create tree - t = new Tree(tmap, simMatrix); + t = new Tree(ct, simMatrix); if (m->control_pressed) { delete t; t = NULL; return t; } //assemble tree - map empty; - t->assembleTree(empty); + t->assembleTree(); return t; } diff --git a/treegroupscommand.h b/treegroupscommand.h index 6790afb..7342b36 100644 --- a/treegroupscommand.h +++ b/treegroupscommand.h @@ -15,7 +15,7 @@ #include "groupmap.h" #include "validcalculator.h" #include "tree.h" -#include "treemap.h" +#include "counttable.h" #include "readmatrix.hpp" #include "readcolumn.h" #include "readphylip.h" @@ -104,7 +104,7 @@ private: NameAssignment* nameMap; ListVector* list; - TreeMap* tmap; + CountTable* ct; Tree* t; InputData* input; vector treeCalculators; diff --git a/treemap.cpp b/treemap.cpp index 7b9fd32..06d05d4 100644 --- a/treemap.cpp +++ b/treemap.cpp @@ -247,7 +247,7 @@ string TreeMap::getGroup(string sequenceName) { } } -/************************************************************/ +/************************************************************ void TreeMap::setIndex(string seq, int index) { it = treemap.find(seq); if (it != treemap.end()) { //sequence name was in group file @@ -257,7 +257,7 @@ void TreeMap::setIndex(string seq, int index) { treemap[seq].groupname = "not found"; } } -/************************************************************/ +/************************************************************ int TreeMap::getIndex(string seq) { it = treemap.find(seq); diff --git a/treemap.h b/treemap.h index 57822e0..7ffd1e7 100644 --- a/treemap.h +++ b/treemap.h @@ -29,8 +29,8 @@ public: int readMap(string); int getNumGroups(); int getNumSeqs(); - void setIndex(string, int); //sequencename, index - int getIndex(string); //returns vector index of sequence + //void setIndex(string, int); //sequencename, index + //int getIndex(string); //returns vector index of sequence bool isValidGroup(string); //return true if string is a valid group void removeSeq(string); //removes a sequence, this is to accomadate trees that do not contain all the seqs in your groupfile string getGroup(string); diff --git a/treereader.cpp b/treereader.cpp index b385d21..0e25f12 100644 --- a/treereader.cpp +++ b/treereader.cpp @@ -8,12 +8,23 @@ #include "treereader.h" #include "readtree.h" +#include "groupmap.h" /***********************************************************************/ - -TreeReader::TreeReader(string tf) : treefile(tf) { +TreeReader::TreeReader(string tf, string cf) : treefile(tf), countfile(cf) { try { m = MothurOut::getInstance(); + ct = new CountTable(); + ct->readTable(cf); + + //if no groupinfo in count file we need to add it + if (!ct->hasGroupInfo()) { + ct->addGroup("Group1"); + vector namesOfSeqs = ct->getNamesOfSeqs(); + for (int i = 0; i < namesOfSeqs.size(); i++) { + ct->setAbund(namesOfSeqs[i], "Group1", ct->getNumSeqs(namesOfSeqs[i])); + } + } namefile = ""; groupfile = ""; readTrees(); @@ -24,22 +35,32 @@ TreeReader::TreeReader(string tf) : treefile(tf) { } } /***********************************************************************/ - -TreeReader::TreeReader(string tf, string gf) : treefile(tf), groupfile(gf) { - try { - m = MothurOut::getInstance(); - namefile = ""; - readTrees(); - } - catch(exception& e) { - m->errorOut(e, "TreeReader", "TreeReader"); - exit(1); - } -} -/***********************************************************************/ TreeReader::TreeReader(string tf, string gf, string nf) : treefile(tf), groupfile(gf), namefile(nf) { try { m = MothurOut::getInstance(); + countfile = ""; + ct = new CountTable(); + if (namefile != "") { ct->createTable(namefile, groupfile, true); } + else { + Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < m->Treenames.size(); i++) { nameMap.insert(m->Treenames[i]); } + if (groupfile == "") { gps.insert("Group1"); for (int i = 0; i < m->Treenames.size(); i++) { groupMap[m->Treenames[i]] = "Group1"; } } + else { + GroupMap g(groupfile); + g.readMap(); + vector seqs = g.getNamesSeqs(); + for (int i = 0; i < seqs.size(); i++) { + string group = g.getGroup(seqs[i]); + groupMap[seqs[i]] = group; + gps.insert(group); + } + } + ct->createTable(nameMap, groupMap, gps); + } + readTrees(); } catch(exception& e) { @@ -51,22 +72,15 @@ TreeReader::TreeReader(string tf, string gf, string nf) : treefile(tf), groupfi bool TreeReader::readTrees() { try { - tmap = new TreeMap(); - if (groupfile != "") { tmap->readMap(groupfile); } - else{ //fake out by putting everyone in one group - Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap - for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); } - } - - int numUniquesInName = 0; - if (namefile != "") { numUniquesInName = readNamesFile(); } + int numUniquesInName = ct->getNumUniqueSeqs(); + //if (namefile != "") { numUniquesInName = readNamesFile(); } ReadTree* read = new ReadNewickTree(treefile); - int readOk = read->read(tmap); + int readOk = read->read(ct); if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete read; m->control_pressed=true; return 0; } - read->AssembleTrees(names); + read->AssembleTrees(); trees = read->getTrees(); delete read; @@ -74,18 +88,19 @@ bool TreeReader::readTrees() { //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size. int numNamesInTree; if (namefile != "") { - if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); } + if (numUniquesInName == m->Treenames.size()) { numNamesInTree = ct->getNumSeqs(); } else { numNamesInTree = m->Treenames.size(); } }else { numNamesInTree = m->Treenames.size(); } //output any names that are in group file but not in tree - if (numNamesInTree < tmap->getNumSeqs()) { - for (int i = 0; i < tmap->namesOfSeqs.size(); i++) { + if (numNamesInTree < ct->getNumSeqs()) { + vector namesSeqsCt = ct->getNamesOfSeqs(); + for (int i = 0; i < namesSeqsCt.size(); i++) { //is that name in the tree? int count = 0; for (int j = 0; j < m->Treenames.size(); j++) { - if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it + if (namesSeqsCt[i] == m->Treenames[j]) { break; } //found it count++; } @@ -93,14 +108,8 @@ bool TreeReader::readTrees() { //then you did not find it so report it if (count == m->Treenames.size()) { - //if it is in your namefile then don't remove - map::iterator it = nameMap.find(tmap->namesOfSeqs[i]); - - if (it == nameMap.end()) { - m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine(); - tmap->removeSeq(tmap->namesOfSeqs[i]); - i--; //need this because removeSeq removes name from namesOfSeqs - } + m->mothurOut(namesSeqsCt[i] + " is in your name or group file and not in your tree. It will be disregarded."); m->mothurOutEndLine(); + ct->remove(namesSeqsCt[i]); } } } @@ -112,47 +121,6 @@ bool TreeReader::readTrees() { exit(1); } } -/*****************************************************************/ -int TreeReader::readNamesFile() { - try { - nameMap.clear(); - names.clear(); - int numUniquesInName = 0; - - ifstream in; - m->openInputFile(namefile, in); - - string first, second; - map::iterator itNames; - - while(!in.eof()) { - in >> first >> second; m->gobble(in); - - numUniquesInName++; - - itNames = nameMap.find(first); - if (itNames == nameMap.end()) { - names[first] = second; - - //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them - vector dupNames; - m->splitAtComma(second, dupNames); - - for (int i = 0; i < dupNames.size(); i++) { - nameMap[dupNames[i]] = first; - if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } - } - }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); nameMap.clear(); names.clear(); namefile = ""; return 1; } - } - in.close(); - - return numUniquesInName; - } - catch(exception& e) { - m->errorOut(e, "TreeReader", "readNamesFile"); - exit(1); - } -} /***********************************************************************/ diff --git a/treereader.h b/treereader.h index fb9c791..ac24eb0 100644 --- a/treereader.h +++ b/treereader.h @@ -11,29 +11,26 @@ #include "mothurout.h" #include "tree.h" +#include "counttable.h" class TreeReader { public: - TreeReader(string tf); - TreeReader(string tf, string gf); + TreeReader(string tf, string cf); TreeReader(string tf, string gf, string nf); ~TreeReader() {} vector getTrees() { return trees; } - map getNames() { return nameMap; } //dups -> unique - map getNameMap() { return names; } //unique -> dups list - private: MothurOut* m; vector trees; - TreeMap* tmap; - map nameMap; //dupName -> uniqueName - map names; + CountTable* ct; + //map nameMap; //dupName -> uniqueName + // map names; - string treefile, groupfile, namefile; + string treefile, groupfile, namefile, countfile; bool readTrees(); int readNamesFile(); diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 88ed32b..a90e7ad 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -1255,7 +1255,9 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< while(!inOligos.eof()){ inOligos >> type; - + + if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); } + if(type[0] == '#'){ while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there m->gobble(inOligos); @@ -1266,6 +1268,8 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< for(int i=0;i> oligo; + + if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); } for(int i=0;i >& fastaFileNames, vector< map::iterator itPrime = primers.find(oligo); if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } } + primers[oligo]=indexPrimer; indexPrimer++; primerNameVector.push_back(group); } @@ -1311,15 +1317,22 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< //then this is illumina data with 4 columns if (temp != "") { - string reverseBarcode = reverseOligo(group); //reverse barcode - group = temp; + for(int i=0;idebug) { m->mothurOut("[DEBUG]: reading reverse " + group + ", and group = " + temp + ".\n"); } + + string reverseBarcode = reverseOligo(group); //reverse barcode //check for repeat barcodes map::iterator itBar = rbarcodes.find(reverseBarcode); - if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); } - + if (itBar != rbarcodes.end()) { m->mothurOut("reverse barcode " + group + " is in your oligos file already."); m->mothurOutEndLine(); } + + group = temp; rbarcodes[reverseBarcode]=indexBarcode; - } + }else { if (m->debug) { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); } } //check for repeat barcodes map::iterator itBar = barcodes.find(oligo); @@ -1438,7 +1451,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< break; } } - + if (allBlank) { m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine(); allFiles = false; diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index 0749cb7..edc4bbc 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -16,8 +16,9 @@ vector UnifracUnweightedCommand::setParameters(){ try { CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); @@ -42,7 +43,7 @@ vector UnifracUnweightedCommand::setParameters(){ string UnifracUnweightedCommand::getHelpString(){ try { string helpString = ""; - helpString += "The unifrac.unweighted command parameters are tree, group, name, groups, iters, distance, processors, root and random. tree parameter is required unless you have valid current tree file.\n"; + helpString += "The unifrac.unweighted command parameters are tree, group, name, count, groups, iters, distance, processors, root and random. tree parameter is required unless you have valid current tree file.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 1 valid group.\n"; helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n"; helpString += "The distance parameter allows you to create a distance file from the results. The default is false. You may set distance to lt, square or column.\n"; @@ -165,6 +166,14 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -186,6 +195,19 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(treefile); } @@ -233,7 +255,13 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { consensus = m->isTrue(temp); if (subsample && random) { m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true; } - if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; } + if (countfile == "") { if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; } } + else { + CountTable testCt; + if ((!testCt.testGroups(countfile)) && (subsample)) { + m->mothurOut("[ERROR]: if subsample=t, a count file with group info must be provided.\n"); abort=true; + } + } if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; } if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; } @@ -246,10 +274,12 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { m->setGroups(Groups); } - if (namefile == "") { - vector files; files.push_back(treefile); - parser.getNameFile(files); - } + if (countfile=="") { + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + } } } @@ -267,12 +297,12 @@ int UnifracUnweightedCommand::execute() { m->setTreeFile(treefile); - TreeReader* reader = new TreeReader(treefile, groupfile, namefile); + TreeReader* reader; + if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); } + else { reader = new TreeReader(treefile, countfile); } T = reader->getTrees(); - tmap = T[0]->getTreeMap(); - map nameMap = reader->getNames(); - map unique2Dup = reader->getNameMap(); - delete reader; + ct = T[0]->getCountTable(); + delete reader; sumFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + getOutputFileNameTag("uwsummary"); outputNames.push_back(sumFile); outputTypes["uwsummary"].push_back(sumFile); @@ -280,7 +310,7 @@ int UnifracUnweightedCommand::execute() { SharedUtil util; Groups = m->getGroups(); - vector namesGroups = tmap->getNamesOfGroups(); + vector namesGroups = ct->getNamesOfGroups(); util.setGroups(Groups, namesGroups, allGroups, numGroups, "unweighted"); //sets the groups the user wants to analyze Unweighted unweighted(includeRoot); @@ -292,10 +322,9 @@ int UnifracUnweightedCommand::execute() { //user has not set size, set size = smallest samples size if (subsampleSize == -1) { vector temp; temp.push_back(Groups[0]); - subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group + subsampleSize = ct->getGroupCount(Groups[0]); //num in first group for (int i = 1; i < Groups.size(); i++) { - temp.clear(); temp.push_back(Groups[i]); - int thisSize = (tmap->getNamesSeqs(temp)).size(); + int thisSize = ct->getGroupCount(Groups[i]); if (thisSize < subsampleSize) { subsampleSize = thisSize; } } m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n"); @@ -303,9 +332,7 @@ int UnifracUnweightedCommand::execute() { vector newGroups = Groups; Groups.clear(); for (int i = 0; i < newGroups.size(); i++) { - vector thisGroup; thisGroup.push_back(newGroups[i]); - vector thisGroupsSeqs = tmap->getNamesSeqs(thisGroup); - int thisSize = thisGroupsSeqs.size(); + int thisSize = ct->getGroupCount(newGroups[i]); if (thisSize >= subsampleSize) { Groups.push_back(newGroups[i]); } else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); } @@ -330,7 +357,7 @@ int UnifracUnweightedCommand::execute() { //get pscores for users trees for (int i = 0; i < T.size(); i++) { - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } counter = 0; @@ -351,7 +378,7 @@ int UnifracUnweightedCommand::execute() { userData = unweighted.getValues(T[i], processors, outputDir); //userData[0] = unweightedscore - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }return 0; } //output scores for each combination for(int k = 0; k < numComp; k++) { @@ -366,7 +393,7 @@ int UnifracUnweightedCommand::execute() { if (random) { runRandomCalcs(T[i], userData); } - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } int startSubsample = time(NULL); @@ -376,32 +403,28 @@ int UnifracUnweightedCommand::execute() { if (m->control_pressed) { break; } //copy to preserve old one - would do this in subsample but memory cleanup becomes messy. - TreeMap* newTmap = new TreeMap(); - //newTmap->getCopy(*tmap); - - //SubSample sample; - //Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize); - + CountTable* newCt = new CountTable(); + //uses method of setting groups to doNotIncludeMe SubSample sample; - Tree* subSampleTree = sample.getSample(T[i], tmap, newTmap, subsampleSize, unique2Dup); + Tree* subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize); //call new weighted function vector iterData; iterData.resize(numComp,0); Unweighted thisUnweighted(includeRoot); iterData = thisUnweighted.getValues(subSampleTree, processors, outputDir); //userData[0] = weightedscore - + //save data to make ave dist, std dist calcDistsTotals.push_back(iterData); - delete newTmap; + delete newCt; delete subSampleTree; if((thisIter+1) % 100 == 0){ m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine(); } } - m->mothurOut("It took " + toString(time(NULL) - startSubsample) + " secs to run the subsampling."); m->mothurOutEndLine(); + if (subsample) { m->mothurOut("It took " + toString(time(NULL) - startSubsample) + " secs to run the subsampling."); m->mothurOutEndLine(); } - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if (subsample) { getAverageSTDMatrices(calcDistsTotals, i); } if (consensus) { getConsensusTrees(calcDistsTotals, i); } @@ -420,7 +443,7 @@ int UnifracUnweightedCommand::execute() { outSum.close(); - delete tmap; + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -472,7 +495,7 @@ int UnifracUnweightedCommand::getAverageSTDMatrices(vector< vector >& di //find standard deviation vector stdDev; stdDev.resize(numComp, 0); - for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each for (int j = 0; j < dists[thisIter].size(); j++) { stdDev[j] += ((dists[thisIter][j] - averages[j]) * (dists[thisIter][j] - averages[j])); } @@ -578,8 +601,16 @@ int UnifracUnweightedCommand::getConsensusTrees(vector< vector >& dists, m->runParse = false; //create treemap class from groupmap for tree class to use - TreeMap newTmap; - newTmap.makeSim(m->getGroups()); + CountTable newCt; + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < m->getGroups().size(); i++) { + nameMap.insert(m->getGroups()[i]); + gps.insert(m->getGroups()[i]); + groupMap[m->getGroups()[i]] = m->getGroups()[i]; + } + newCt.createTable(nameMap, groupMap, gps); //clear old tree names if any m->Treenames.clear(); @@ -587,7 +618,7 @@ int UnifracUnweightedCommand::getConsensusTrees(vector< vector >& dists, //fills globaldatas tree names m->Treenames = m->getGroups(); - vector newTrees = buildTrees(dists, treeNum, newTmap); //also creates .all.tre file containing the trees created + vector newTrees = buildTrees(dists, treeNum, newCt); //also creates .all.tre file containing the trees created if (m->control_pressed) { return 0; } @@ -613,7 +644,7 @@ int UnifracUnweightedCommand::getConsensusTrees(vector< vector >& dists, } /**************************************************************************************************/ -vector UnifracUnweightedCommand::buildTrees(vector< vector >& dists, int treeNum, TreeMap& mytmap) { +vector UnifracUnweightedCommand::buildTrees(vector< vector >& dists, int treeNum, CountTable& myct) { try { vector trees; @@ -647,9 +678,8 @@ vector UnifracUnweightedCommand::buildTrees(vector< vector >& dis } //create tree - Tree* tempTree = new Tree(&mytmap, sims); - map empty; - tempTree->assembleTree(empty); + Tree* tempTree = new Tree(&myct, sims); + tempTree->assembleTree(); trees.push_back(tempTree); diff --git a/unifracunweightedcommand.h b/unifracunweightedcommand.h index 15c3b96..107083f 100644 --- a/unifracunweightedcommand.h +++ b/unifracunweightedcommand.h @@ -12,7 +12,7 @@ #include "command.hpp" #include "unweighted.h" -#include "treemap.h" +#include "counttable.h" #include "sharedutilities.h" #include "fileoutput.h" #include "readtree.h" @@ -39,7 +39,7 @@ class UnifracUnweightedCommand : public Command { private: FileOutput* output; vector T; //user trees - TreeMap* tmap; + CountTable* ct; string sumFile, allGroups; vector groupComb; // AB. AC, BC... int iters, numGroups, numComp, counter, processors, subsampleSize, subsampleIters; @@ -50,7 +50,7 @@ class UnifracUnweightedCommand : public Command { vector< map > rCumul; //map -vector entry for each combination. bool abort, phylip, random, includeRoot, consensus, subsample; - string groups, itersString, outputDir, outputForm, treefile, groupfile, namefile; + string groups, itersString, outputDir, outputForm, treefile, groupfile, namefile, countfile; vector Groups, outputNames; //holds groups to be used ofstream outSum, out; @@ -60,7 +60,7 @@ class UnifracUnweightedCommand : public Command { void printUWSummaryFile(int); void printUnweightedFile(); void createPhylipFile(int); - vector buildTrees(vector< vector >&, int, TreeMap&); + vector buildTrees(vector< vector >&, int, CountTable&); int getConsensusTrees(vector< vector >&, int); int getAverageSTDMatrices(vector< vector >&, int); diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index d1e8833..cbec749 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -16,8 +16,9 @@ vector UnifracWeightedCommand::setParameters(){ try { CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); @@ -42,7 +43,7 @@ vector UnifracWeightedCommand::setParameters(){ string UnifracWeightedCommand::getHelpString(){ try { string helpString = ""; - helpString += "The unifrac.weighted command parameters are tree, group, name, groups, iters, distance, processors, root, subsample, consensus and random. tree parameter is required unless you have valid current tree file.\n"; + helpString += "The unifrac.weighted command parameters are tree, group, name, count, groups, iters, distance, processors, root, subsample, consensus and random. tree parameter is required unless you have valid current tree file.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n"; helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n"; helpString += "The distance parameter allows you to create a distance file from the results. The default is false.\n"; @@ -164,6 +165,14 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -186,6 +195,19 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(treefile); } @@ -233,14 +255,22 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { consensus = m->isTrue(temp); if (subsample && random) { m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true; } - if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; } + if (countfile == "") { if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; } } + else { + CountTable testCt; + if ((!testCt.testGroups(countfile)) && (subsample)) { + m->mothurOut("[ERROR]: if subsample=t, a count file with group info must be provided.\n"); abort=true; + } + } if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; } if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; } - if (namefile == "") { - vector files; files.push_back(treefile); - parser.getNameFile(files); - } + if (countfile=="") { + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + } } @@ -258,14 +288,14 @@ int UnifracWeightedCommand::execute() { m->setTreeFile(treefile); - TreeReader* reader = new TreeReader(treefile, groupfile, namefile); + TreeReader* reader; + if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); } + else { reader = new TreeReader(treefile, countfile); } T = reader->getTrees(); - tmap = T[0]->getTreeMap(); - map nameMap = reader->getNames(); - map unique2Dup = reader->getNameMap(); + ct = T[0]->getCountTable(); delete reader; - - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; } + + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; } sumFile = outputDir + m->getSimpleName(treefile) + getOutputFileNameTag("wsummary"); m->openOutputFile(sumFile, outSum); @@ -274,11 +304,11 @@ int UnifracWeightedCommand::execute() { SharedUtil util; string s; //to make work with setgroups Groups = m->getGroups(); - vector nameGroups = tmap->getNamesOfGroups(); + vector nameGroups = ct->getNamesOfGroups(); util.setGroups(Groups, nameGroups, s, numGroups, "weighted"); //sets the groups the user wants to analyze m->setGroups(Groups); - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; } Weighted weighted(includeRoot); @@ -289,10 +319,9 @@ int UnifracWeightedCommand::execute() { //user has not set size, set size = smallest samples size if (subsampleSize == -1) { vector temp; temp.push_back(Groups[0]); - subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group + subsampleSize = ct->getGroupCount(Groups[0]); //num in first group for (int i = 1; i < Groups.size(); i++) { - temp.clear(); temp.push_back(Groups[i]); - int thisSize = (tmap->getNamesSeqs(temp)).size(); + int thisSize = ct->getGroupCount(Groups[i]); if (thisSize < subsampleSize) { subsampleSize = thisSize; } } m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n"); @@ -300,12 +329,10 @@ int UnifracWeightedCommand::execute() { vector newGroups = Groups; Groups.clear(); for (int i = 0; i < newGroups.size(); i++) { - vector thisGroup; thisGroup.push_back(newGroups[i]); - vector thisGroupsSeqs = tmap->getNamesSeqs(thisGroup); - int thisSize = thisGroupsSeqs.size(); + int thisSize = ct->getGroupCount(newGroups[i]); if (thisSize >= subsampleSize) { Groups.push_back(newGroups[i]); } - else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); } + else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); } } m->setGroups(Groups); } @@ -321,7 +348,7 @@ int UnifracWeightedCommand::execute() { //get weighted scores for users trees for (int i = 0; i < T.size(); i++) { - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } counter = 0; rScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC... @@ -337,7 +364,7 @@ int UnifracWeightedCommand::execute() { } userData = weighted.getValues(T[i], processors, outputDir); //userData[0] = weightedscore - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } //save users score for (int s=0; scontrol_pressed) { break; } //copy to preserve old one - would do this in subsample but memory cleanup becomes messy. - TreeMap* newTmap = new TreeMap(); - //newTmap->getCopy(*tmap); - - //SubSample sample; - //Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize); + CountTable* newCt = new CountTable(); //uses method of setting groups to doNotIncludeMe SubSample sample; - Tree* subSampleTree = sample.getSample(T[i], tmap, newTmap, subsampleSize, unique2Dup); - + Tree* subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize); + //call new weighted function vector iterData; iterData.resize(numComp,0); Weighted thisWeighted(includeRoot); @@ -379,20 +402,20 @@ int UnifracWeightedCommand::execute() { //save data to make ave dist, std dist calcDistsTotals.push_back(iterData); - delete newTmap; + delete newCt; delete subSampleTree; if((thisIter+1) % 100 == 0){ m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine(); } } - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if (subsample) { getAverageSTDMatrices(calcDistsTotals, i); } if (consensus) { getConsensusTrees(calcDistsTotals, i); } } - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if (phylip) { createPhylipFile(); } @@ -400,7 +423,7 @@ int UnifracWeightedCommand::execute() { //clear out users groups m->clearGroups(); - delete tmap; + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -557,9 +580,17 @@ int UnifracWeightedCommand::getConsensusTrees(vector< vector >& dists, i //used in tree constructor m->runParse = false; - //create treemap class from groupmap for tree class to use - TreeMap newTmap; - newTmap.makeSim(m->getGroups()); + ///create treemap class from groupmap for tree class to use + CountTable newCt; + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < m->getGroups().size(); i++) { + nameMap.insert(m->getGroups()[i]); + gps.insert(m->getGroups()[i]); + groupMap[m->getGroups()[i]] = m->getGroups()[i]; + } + newCt.createTable(nameMap, groupMap, gps); //clear old tree names if any m->Treenames.clear(); @@ -567,7 +598,7 @@ int UnifracWeightedCommand::getConsensusTrees(vector< vector >& dists, i //fills globaldatas tree names m->Treenames = m->getGroups(); - vector newTrees = buildTrees(dists, treeNum, newTmap); //also creates .all.tre file containing the trees created + vector newTrees = buildTrees(dists, treeNum, newCt); //also creates .all.tre file containing the trees created if (m->control_pressed) { return 0; } @@ -593,7 +624,7 @@ int UnifracWeightedCommand::getConsensusTrees(vector< vector >& dists, i } /**************************************************************************************************/ -vector UnifracWeightedCommand::buildTrees(vector< vector >& dists, int treeNum, TreeMap& mytmap) { +vector UnifracWeightedCommand::buildTrees(vector< vector >& dists, int treeNum, CountTable& myct) { try { vector trees; @@ -627,9 +658,8 @@ vector UnifracWeightedCommand::buildTrees(vector< vector >& dists } //create tree - Tree* tempTree = new Tree(&mytmap, sims); - map empty; - tempTree->assembleTree(empty); + Tree* tempTree = new Tree(&myct, sims); + tempTree->assembleTree(); trees.push_back(tempTree); @@ -682,7 +712,7 @@ int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, vector usersS //get scores for random trees for (int j = 0; j < iters; j++) { - + cout << j << endl; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ driver(thisTree, namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores); @@ -693,7 +723,7 @@ int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, vector usersS driver(thisTree, namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores); #endif - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } //report progress // m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine(); @@ -796,7 +826,7 @@ int UnifracWeightedCommand::createProcesses(Tree* t, vector< vector > na /**************************************************************************************************/ int UnifracWeightedCommand::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, vector< vector >& scores) { try { - Tree* randT = new Tree(tmap); + Tree* randT = new Tree(ct); Weighted weighted(includeRoot); diff --git a/unifracweightedcommand.h b/unifracweightedcommand.h index 06354ce..fead41b 100644 --- a/unifracweightedcommand.h +++ b/unifracweightedcommand.h @@ -12,7 +12,7 @@ #include "command.hpp" #include "weighted.h" -#include "treemap.h" +#include "counttable.h" #include "progress.hpp" #include "sharedutilities.h" #include "fileoutput.h" @@ -43,7 +43,7 @@ class UnifracWeightedCommand : public Command { linePair(int i, int j) : start(i), num(j) {} }; vector lines; - TreeMap* tmap; + CountTable* ct; FileOutput* output; vector T; //user trees vector utreeScores; //user tree unweighted scores @@ -58,7 +58,7 @@ class UnifracWeightedCommand : public Command { map validScores; //map contains scores from random bool abort, phylip, random, includeRoot, subsample, consensus; - string groups, itersString, outputForm, treefile, groupfile, namefile; + string groups, itersString, outputForm, treefile, groupfile, namefile, countfile; vector Groups, outputNames; //holds groups to be used int processors, subsampleSize, subsampleIters; ofstream outSum; @@ -73,7 +73,7 @@ class UnifracWeightedCommand : public Command { int createProcesses(Tree*, vector< vector >, vector< vector >&); int driver(Tree*, vector< vector >, int, int, vector< vector >&); int runRandomCalcs(Tree*, vector); - vector buildTrees(vector< vector >&, int, TreeMap&); + vector buildTrees(vector< vector >&, int, CountTable&); int getConsensusTrees(vector< vector >&, int); int getAverageSTDMatrices(vector< vector >&, int); diff --git a/unweighted.cpp b/unweighted.cpp index 864a9f8..e95834f 100644 --- a/unweighted.cpp +++ b/unweighted.cpp @@ -16,7 +16,7 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) { processors = p; outputDir = o; - TreeMap* tmap = t->getTreeMap(); + CountTable* ct = t->getCountTable(); //if the users enters no groups then give them the score of all groups int numGroups = m->getNumGroups(); @@ -36,9 +36,9 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) { vector groups; if (numGroups == 0) { //get score for all users groups - for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) { - if ((tmap->getNamesOfGroups())[i] != "xxx") { - groups.push_back((tmap->getNamesOfGroups())[i]); + for (int i = 0; i < (ct->getNamesOfGroups()).size(); i++) { + if ((ct->getNamesOfGroups())[i] != "xxx") { + groups.push_back((ct->getNamesOfGroups())[i]); } } namesOfGroupCombos.push_back(groups); @@ -52,7 +52,7 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); }else{ int numPairs = namesOfGroupCombos.size(); @@ -67,11 +67,11 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) { lines.push_back(linePair(startPos, numPairsPerProcessor)); } - data = createProcesses(t, namesOfGroupCombos, tmap); + data = createProcesses(t, namesOfGroupCombos, ct); lines.clear(); } #else - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); #endif return data; @@ -83,7 +83,7 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) { } /**************************************************************************************************/ -EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, TreeMap* tmap) { +EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, CountTable* ct) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; @@ -100,7 +100,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG process++; }else if (pid == 0){ EstOutput myresults; - myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap); + myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, ct); if (m->control_pressed) { exit(0); } @@ -122,7 +122,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG } } - results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap); + results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, ct); //force parent to wait until all the processes are done for (int i=0;i<(processors-1);i++) { @@ -167,7 +167,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG } } /**************************************************************************************************/ -EstOutput Unweighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, TreeMap* tmap) { +EstOutput Unweighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, CountTable* ct) { try { @@ -261,7 +261,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st processors = p; outputDir = o; - TreeMap* tmap = t->getTreeMap(); + CountTable* ct = t->getCountTable(); //if the users enters no groups then give them the score of all groups int numGroups = m->getNumGroups(); @@ -281,9 +281,9 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st vector groups; if (numGroups == 0) { //get score for all users groups - for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) { - if ((tmap->getNamesOfGroups())[i] != "xxx") { - groups.push_back((tmap->getNamesOfGroups())[i]); + for (int i = 0; i < (ct->getNamesOfGroups()).size(); i++) { + if ((ct->getNamesOfGroups())[i] != "xxx") { + groups.push_back((ct->getNamesOfGroups())[i]); } } namesOfGroupCombos.push_back(groups); @@ -297,7 +297,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, ct); }else{ int numPairs = namesOfGroupCombos.size(); @@ -311,12 +311,12 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st lines.push_back(linePair(startPos, numPairsPerProcessor)); } - data = createProcesses(t, namesOfGroupCombos, true, tmap); + data = createProcesses(t, namesOfGroupCombos, true, ct); lines.clear(); } #else - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, ct); #endif return data; @@ -328,7 +328,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st } /**************************************************************************************************/ -EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, bool usingGroups, TreeMap* tmap) { +EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, bool usingGroups, CountTable* ct) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; @@ -345,7 +345,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG process++; }else if (pid == 0){ EstOutput myresults; - myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, usingGroups, tmap); + myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, usingGroups, ct); if (m->control_pressed) { exit(0); } @@ -365,7 +365,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG } } - results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, usingGroups, tmap); + results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, usingGroups, ct); //force parent to wait until all the processes are done for (int i=0;i<(processors-1);i++) { @@ -409,14 +409,14 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG } } /**************************************************************************************************/ -EstOutput Unweighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, bool usingGroups, TreeMap* tmap) { +EstOutput Unweighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, bool usingGroups, CountTable* ct) { try { EstOutput results; results.resize(num); int count = 0; - Tree* copyTree = new Tree(tmap); + Tree* copyTree = new Tree(ct); for (int h = start; h < (start+num); h++) { diff --git a/unweighted.h b/unweighted.h index c6c13bb..b136b00 100644 --- a/unweighted.h +++ b/unweighted.h @@ -12,7 +12,7 @@ */ #include "treecalculator.h" -#include "treemap.h" +#include "counttable.h" /***********************************************************************/ @@ -38,10 +38,10 @@ class Unweighted : public TreeCalculator { map< vector, set > rootForGrouping; //maps a grouping combo to the roots for that combo bool includeRoot; - EstOutput driver(Tree*, vector< vector >, int, int, TreeMap*); - EstOutput createProcesses(Tree*, vector< vector >, TreeMap*); - EstOutput driver(Tree*, vector< vector >, int, int, bool, TreeMap*); - EstOutput createProcesses(Tree*, vector< vector >, bool, TreeMap*); + EstOutput driver(Tree*, vector< vector >, int, int, CountTable*); + EstOutput createProcesses(Tree*, vector< vector >, CountTable*); + EstOutput driver(Tree*, vector< vector >, int, int, bool, CountTable*); + EstOutput createProcesses(Tree*, vector< vector >, bool, CountTable*); int getRoot(Tree*, int, vector); }; diff --git a/validparameter.cpp b/validparameter.cpp index 3e1f349..7d1af25 100644 --- a/validparameter.cpp +++ b/validparameter.cpp @@ -307,6 +307,14 @@ string ValidParameters::validFile(map& container, string paramet if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ". I suspect you entered a column formatted file as a phylip file, aborting."); m->mothurOutEndLine(); return "not found"; } } + + //check for blank file + if (ableToOpen != 1) { + if (m->isBlank(container[parameter])) { + m->mothurOut("[ERROR]: " + container[parameter] + " is blank, aborting."); m->mothurOutEndLine(); return "not found"; + } + } + } }else { return "not found"; } diff --git a/weighted.cpp b/weighted.cpp index 85eed52..cf1291d 100644 --- a/weighted.cpp +++ b/weighted.cpp @@ -19,7 +19,7 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) { processors = p; outputDir = o; - TreeMap* tmap = t->getTreeMap(); + CountTable* ct = t->getCountTable(); numGroups = m->getNumGroups(); @@ -38,7 +38,7 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); }else{ int numPairs = namesOfGroupCombos.size(); @@ -52,12 +52,12 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) { lines.push_back(linePair(startPos, numPairsPerProcessor)); } - data = createProcesses(t, namesOfGroupCombos, tmap); + data = createProcesses(t, namesOfGroupCombos, ct); lines.clear(); } #else - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); #endif return data; @@ -69,7 +69,7 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) { } /**************************************************************************************************/ -EstOutput Weighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, TreeMap* tmap) { +EstOutput Weighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, CountTable* ct) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; @@ -87,7 +87,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector > namesOfGro }else if (pid == 0){ EstOutput Myresults; - Myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap); + Myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, ct); //m->mothurOut("Merging results."); m->mothurOutEndLine(); @@ -110,7 +110,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector > namesOfGro } } - results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap); + results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, ct); //force parent to wait until all the processes are done for (int i=0;i<(processors-1);i++) { @@ -155,7 +155,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector > namesOfGro } } /**************************************************************************************************/ -EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, TreeMap* tmap) { +EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, CountTable* ct) { try { EstOutput results; vector D; @@ -179,7 +179,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int numSeqsInGroupI = it->second; double sum = getLengthToRoot(t, t->groupNodeInfo[groupA][j], groupA, groupB); - double weightedSum = ((numSeqsInGroupI * sum) / (double)tmap->seqsPerGroup[groupA]); + double weightedSum = ((numSeqsInGroupI * sum) / (double)ct->getGroupCount(groupA)); D[count] += weightedSum; } @@ -190,7 +190,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int numSeqsInGroupL = it->second; double sum = getLengthToRoot(t, t->groupNodeInfo[groupB][j], groupA, groupB); - double weightedSum = ((numSeqsInGroupL * sum) / (double)tmap->seqsPerGroup[groupB]); + double weightedSum = ((numSeqsInGroupL * sum) / (double)ct->getGroupCount(groupB)); D[count] += weightedSum; } @@ -216,7 +216,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, it = t->tree[i].pcount.find(groupA); //if it does u = # of its descendants with a certain group / total number in tree with a certain group if (it != t->tree[i].pcount.end()) { - u = (double) t->tree[i].pcount[groupA] / (double) tmap->seqsPerGroup[groupA]; + u = (double) t->tree[i].pcount[groupA] / (double) ct->getGroupCount(groupA); }else { u = 0.00; } @@ -225,7 +225,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, //if it does subtract their percentage from u if (it != t->tree[i].pcount.end()) { - u -= (double) t->tree[i].pcount[groupB] / (double) tmap->seqsPerGroup[groupB]; + u -= (double) t->tree[i].pcount[groupB] / (double) ct->getGroupCount(groupB); } if (includeRoot) { @@ -270,7 +270,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { data.clear(); //clear out old values - TreeMap* tmap = t->getTreeMap(); + CountTable* ct = t->getCountTable(); if (m->control_pressed) { return data; } @@ -287,7 +287,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { int numSeqsInGroupI = it->second; double sum = getLengthToRoot(t, t->groupNodeInfo[groups[0]][j], groups[0], groups[1]); - double weightedSum = ((numSeqsInGroupI * sum) / (double)tmap->seqsPerGroup[groups[0]]); + double weightedSum = ((numSeqsInGroupI * sum) / (double)ct->getGroupCount(groups[0])); D += weightedSum; } @@ -298,7 +298,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { int numSeqsInGroupL = it->second; double sum = getLengthToRoot(t, t->groupNodeInfo[groups[1]][j], groups[0], groups[1]); - double weightedSum = ((numSeqsInGroupL * sum) / (double)tmap->seqsPerGroup[groups[1]]); + double weightedSum = ((numSeqsInGroupL * sum) / (double)ct->getGroupCount(groups[1])); D += weightedSum; } @@ -314,7 +314,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { it = t->tree[i].pcount.find(groupA); //if it does u = # of its descendants with a certain group / total number in tree with a certain group if (it != t->tree[i].pcount.end()) { - u = (double) t->tree[i].pcount[groupA] / (double) tmap->seqsPerGroup[groupA]; + u = (double) t->tree[i].pcount[groupA] / (double) ct->getGroupCount(groupA); }else { u = 0.00; } @@ -322,7 +322,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { it = t->tree[i].pcount.find(groupB); //if it does subtract their percentage from u if (it != t->tree[i].pcount.end()) { - u -= (double) t->tree[i].pcount[groupB] / (double) tmap->seqsPerGroup[groupB]; + u -= (double) t->tree[i].pcount[groupB] / (double) ct->getGroupCount(groupB); } if (includeRoot) { diff --git a/weighted.h b/weighted.h index 180409c..d4082fe 100644 --- a/weighted.h +++ b/weighted.h @@ -12,7 +12,7 @@ */ #include "treecalculator.h" -#include "treemap.h" +#include "counttable.h" /***********************************************************************/ @@ -41,8 +41,8 @@ class Weighted : public TreeCalculator { map< vector, set > rootForGrouping; //maps a grouping combo to the root for that combo bool includeRoot; - EstOutput driver(Tree*, vector< vector >, int, int, TreeMap*); - EstOutput createProcesses(Tree*, vector< vector >, TreeMap*); + EstOutput driver(Tree*, vector< vector >, int, int, CountTable*); + EstOutput createProcesses(Tree*, vector< vector >, CountTable*); double getLengthToRoot(Tree*, int, string, string); }; -- 2.39.2