X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=phylodiversitycommand.cpp;h=9d68cb5debbbc65472577c4861391bb8ffbde3bc;hb=8e67e9de1b200106bea5a468ac02125954656499;hp=ca0d02aca04b0a737d45c29b44cb02c2925ff257;hpb=55386dddad84cc1140d736cabaf4dd0ae16f2e01;p=mothur.git diff --git a/phylodiversitycommand.cpp b/phylodiversitycommand.cpp index ca0d02a..9d68cb5 100644 --- a/phylodiversitycommand.cpp +++ b/phylodiversitycommand.cpp @@ -8,6 +8,7 @@ */ #include "phylodiversitycommand.h" +#include "treereader.h" //********************************************************************************************************************** vector PhyloDiversityCommand::setParameters(){ @@ -60,7 +61,28 @@ string PhyloDiversityCommand::getHelpString(){ exit(1); } } - +//********************************************************************************************************************** +string PhyloDiversityCommand::getOutputFileNameTag(string type, string inputName=""){ + try { + string outputFileName = ""; + map >::iterator it; + + //is this a type this command creates + it = outputTypes.find(type); + if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } + else { + if (type == "phylodiv") { outputFileName = "phylodiv"; } + else if (type == "rarefy") { outputFileName = "phylodiv.rarefaction"; } + else if (type == "summary") { outputFileName = "phylodiv.summary"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } + } + return outputFileName; + } + catch(exception& e) { + m->errorOut(e, "PhyloDiversityCommand", "getOutputFileNameTag"); + exit(1); + } +} //********************************************************************************************************************** PhyloDiversityCommand::PhyloDiversityCommand(){ @@ -136,15 +158,9 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { } } - m->runParse = true; - m->clearGroups(); - m->clearAllGroups(); - m->Treenames.clear(); - m->names.clear(); - //check for required parameters treefile = validParameter.validFile(parameters, "tree", true); - if (treefile == "not open") { abort = true; } + if (treefile == "not open") { treefile = ""; abort = true; } else if (treefile == "not found") { //if there is a current design file, use it treefile = m->getTreeFile(); @@ -159,7 +175,7 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { else { m->setGroupFile(groupfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -167,10 +183,10 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { string temp; temp = validParameter.validFile(parameters, "freq", false); if (temp == "not found") { temp = "100"; } - convert(temp, freq); + m->mothurConvert(temp, freq); temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } - convert(temp, iters); + m->mothurConvert(temp, iters); temp = validParameter.validFile(parameters, "rarefy", false); if (temp == "not found") { temp = "F"; } rarefy = m->isTrue(temp); @@ -187,7 +203,7 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); - convert(temp, processors); + m->mothurConvert(temp, processors); groups = validParameter.validFile(parameters, "groups", false); if (groups == "not found") { groups = ""; } @@ -197,6 +213,11 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { } if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; } + + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } } } @@ -212,75 +233,18 @@ int PhyloDiversityCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } + int start = time(NULL); + m->setTreeFile(treefile); - - if (groupfile != "") { - //read in group map info. - tmap = new TreeMap(groupfile); - tmap->readMap(); - }else{ //fake out by putting everyone in one group - Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap - tmap = new TreeMap(); - - for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); } - } - - if (namefile != "") { readNamesFile(); } - - read = new ReadNewickTree(treefile); - int readOk = read->read(tmap); - - if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; } - - read->AssembleTrees(); - vector trees = read->getTrees(); - delete read; - - //make sure all files match - //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size. - int numNamesInTree; - if (namefile != "") { - if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); } - else { numNamesInTree = m->Treenames.size(); } - }else { numNamesInTree = m->Treenames.size(); } - - - //output any names that are in group file but not in tree - if (numNamesInTree < tmap->getNumSeqs()) { - for (int i = 0; i < tmap->namesOfSeqs.size(); i++) { - //is that name in the tree? - int count = 0; - for (int j = 0; j < m->Treenames.size(); j++) { - if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it - count++; - } - - if (m->control_pressed) { - delete tmap; for (int i = 0; i < trees.size(); i++) { delete trees[i]; } - for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); - m->clearGroups(); - return 0; - } - - //then you did not find it so report it - if (count == m->Treenames.size()) { - //if it is in your namefile then don't remove - map::iterator it = nameMap.find(tmap->namesOfSeqs[i]); - - if (it == nameMap.end()) { - m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine(); - tmap->removeSeq(tmap->namesOfSeqs[i]); - i--; //need this because removeSeq removes name from namesOfSeqs - } - } - } - } - - SharedUtil* util = new SharedUtil(); + TreeReader* reader = new TreeReader(treefile, groupfile, namefile); + vector trees = reader->getTrees(); + tmap = trees[0]->getTreeMap(); + delete reader; + + SharedUtil util; vector mGroups = m->getGroups(); vector tGroups = tmap->getNamesOfGroups(); - util->setGroups(mGroups, tGroups, "phylo.diversity"); //sets the groups the user wants to analyze - delete util; + util.setGroups(mGroups, tGroups, "phylo.diversity"); //sets the groups the user wants to analyze //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed for (int i = 0; i < mGroups.size(); i++) { if (mGroups[i] == "xxx") { mGroups.erase(mGroups.begin()+i); break; } } @@ -294,9 +258,9 @@ int PhyloDiversityCommand::execute(){ if (m->control_pressed) { delete tmap; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } ofstream outSum, outRare, outCollect; - string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(i+1) + ".phylodiv.summary"; - string outRareFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(i+1) + ".phylodiv.rarefaction"; - string outCollectFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(i+1) + ".phylodiv"; + string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(i+1) + "." + getOutputFileNameTag("summary"); + string outRareFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(i+1) + "." + getOutputFileNameTag("rarefy"); + string outCollectFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(i+1) + "." + getOutputFileNameTag("phylodiv"); if (summary) { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile); outputTypes["summary"].push_back(outSumFile); } if (rarefy) { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile); outputTypes["rarefy"].push_back(outRareFile); } @@ -348,7 +312,7 @@ int PhyloDiversityCommand::execute(){ if (numSampledList.count(diversity[mGroups[j]].size()-1) == 0) { numSampledList.insert(diversity[mGroups[j]].size()-1); } } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true); }else{ @@ -382,6 +346,9 @@ int PhyloDiversityCommand::execute(){ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.unweighted."); m->mothurOutEndLine(); + + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } @@ -398,7 +365,7 @@ int PhyloDiversityCommand::execute(){ //********************************************************************************************************************** int PhyloDiversityCommand::createProcesses(vector& procIters, Tree* t, map< string, vector >& div, map >& sumDiv, int numIters, int increment, vector& randomLeaf, set& numSampledList, ofstream& outCollect, ofstream& outSum){ try { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; vector processIDS; @@ -488,39 +455,50 @@ int PhyloDiversityCommand::driver(Tree* t, map< string, vector >& div, ma try { int numLeafNodes = randomLeaf.size(); vector mGroups = m->getGroups(); - + + map rootForGroup = getRootForGroups(t); //maps groupName to root node in tree. "root" for group may not be the trees root and we don't want to include the extra branches. + for (int l = 0; l < numIters; l++) { random_shuffle(randomLeaf.begin(), randomLeaf.end()); - + cout << l << endl; //initialize counts map counts; - map< string, set > countedBranch; - for (int j = 0; j < mGroups.size(); j++) { counts[mGroups[j]] = 0; countedBranch[mGroups[j]].insert(-2); } //add dummy index to initialize countedBranch sets + vector< map > countedBranch; + for (int i = 0; i < t->getNumNodes(); i++) { + map temp; + for (int j = 0; j < mGroups.size(); j++) { temp[mGroups[j]] = false; } + countedBranch.push_back(temp); + } + + for (int j = 0; j < mGroups.size(); j++) { counts[mGroups[j]] = 0; } for(int k = 0; k < numLeafNodes; k++){ if (m->control_pressed) { return 0; } //calc branch length of randomLeaf k - vector br = calcBranchLength(t, randomLeaf[k], countedBranch); + vector br = calcBranchLength(t, randomLeaf[k], countedBranch, rootForGroup); //for each group in the groups update the total branch length accounting for the names file vector groups = t->tree[randomLeaf[k]].getGroup(); for (int j = 0; j < groups.size(); j++) { - int numSeqsInGroupJ = 0; - map::iterator it; - it = t->tree[randomLeaf[k]].pcount.find(groups[j]); - if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j - numSeqsInGroupJ = it->second; - } - - if (numSeqsInGroupJ != 0) { div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j]; } - - for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) { - div[groups[j]][s] = div[groups[j]][s-1]; //update counts, but don't add in redundant branch lengths - } - counts[groups[j]] += numSeqsInGroupJ; + + if (m->inUsersGroups(groups[j], mGroups)) { + int numSeqsInGroupJ = 0; + map::iterator it; + it = t->tree[randomLeaf[k]].pcount.find(groups[j]); + if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j + numSeqsInGroupJ = it->second; + } + + if (numSeqsInGroupJ != 0) { div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j]; } + + for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) { + div[groups[j]][s] = div[groups[j]][s-1]; //update counts, but don't add in redundant branch lengths + } + counts[groups[j]] += numSeqsInGroupJ; + } } } @@ -615,9 +593,9 @@ void PhyloDiversityCommand::printData(set& num, map< string, vector } //********************************************************************************************************************** //need a vector of floats one branch length for every group the node represents. -vector PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set >& counted){ +vector PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, vector< map >& counted, map roots){ try { - + //calc the branch length //while you aren't at root vector sums; @@ -626,127 +604,101 @@ vector PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< st vector groups = t->tree[leaf].getGroup(); sums.resize(groups.size(), 0.0); - map > tempTotals; //maps node to total Branch Length - map< string, set > tempCounted; - set::iterator it; - - //you are a leaf + + //you are a leaf if(t->tree[index].getBranchLength() != -1){ for (int k = 0; k < groups.size(); k++) { sums[k] += abs(t->tree[index].getBranchLength()); - counted[groups[k]].insert(index); } } - - for (int k = 0; k < groups.size(); k++) { - tempTotals[groups[k]][index] = 0.0; - } - - index = t->tree[index].getParent(); - + + + index = t->tree[index].getParent(); + //while you aren't at root while(t->tree[index].getParent() != -1){ - + if (m->control_pressed) { return sums; } - int pcountSize = 0; for (int k = 0; k < groups.size(); k++) { - map::iterator itGroup = t->tree[index].pcount.find(groups[k]); - if (itGroup != t->tree[index].pcount.end()) { pcountSize++; } - - //do both your chidren have have descendants from the users groups? - int lc = t->tree[index].getLChild(); - int rc = t->tree[index].getRChild(); - - int LpcountSize = 0; - itGroup = t->tree[lc].pcount.find(groups[k]); - if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++; } - - int RpcountSize = 0; - itGroup = t->tree[rc].pcount.find(groups[k]); - if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++; } - - //if yes, add your childrens tempTotals - if ((LpcountSize != 0) && (RpcountSize != 0)) { - sums[k] += tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; - - for (it = tempCounted[groups[k]].begin(); it != tempCounted[groups[k]].end(); it++) { counted[groups[k]].insert(*it); } - - //cout << "added to total " << tempTotals[lc] << '\t' << tempTotals[rc] << endl; - if (t->tree[index].getBranchLength() != -1) { - if (counted[groups[k]].count(index) == 0) { - tempTotals[groups[k]][index] = abs(t->tree[index].getBranchLength()); - tempCounted[groups[k]].insert(index); - }else{ - tempTotals[groups[k]][index] = 0.0; - } - }else { - tempTotals[groups[k]][index] = 0.0; - } - }else { //if no, your tempTotal is your childrens temp totals + your branch length - tempTotals[groups[k]][index] = tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; - - if (counted[groups[k]].count(index) == 0) { - tempTotals[groups[k]][index] += abs(t->tree[index].getBranchLength()); - tempCounted[groups[k]].insert(index); - } - - } - //cout << "temptotal = "<< tempTotals[i] << endl; - } - - index = t->tree[index].getParent(); - } - + + if (index >= roots[groups[k]]) { counted[index][groups[k]] = true; } //if you are at this groups "root", then say we are done + + if (!counted[index][groups[k]]){ //if counted[index][groups[k] is true this groups has already added all br from here to root, so quit early + if (t->tree[index].getBranchLength() != -1) { + sums[k] += abs(t->tree[index].getBranchLength()); + } + counted[index][groups[k]] = true; + } + } + index = t->tree[index].getParent(); + } + return sums; - + } catch(exception& e) { m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength"); exit(1); } } -/*****************************************************************/ -int PhyloDiversityCommand::readNamesFile() { +//********************************************************************************************************************** +map PhyloDiversityCommand::getRootForGroups(Tree* t){ try { - m->names.clear(); - numUniquesInName = 0; - - ifstream in; - m->openInputFile(namefile, in); - - string first, second; - map::iterator itNames; - - while(!in.eof()) { - in >> first >> second; m->gobble(in); - - numUniquesInName++; - - itNames = m->names.find(first); - if (itNames == m->names.end()) { - m->names[first] = second; - - //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them - vector dupNames; - m->splitAtComma(second, dupNames); - - for (int i = 0; i < dupNames.size(); i++) { - nameMap[dupNames[i]] = dupNames[i]; - if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } - } - }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; } - } - in.close(); - - return 0; + map roots; //maps group to root for group, may not be root of tree + map done; + + //initialize root for all groups to -1 + for (int k = 0; k < (t->getTreeMap())->getNamesOfGroups().size(); k++) { done[(t->getTreeMap())->getNamesOfGroups()[k]] = false; } + + for (int i = 0; i < t->getNumLeaves(); i++) { + + vector groups = t->tree[i].getGroup(); + + int index = t->tree[i].getParent(); + + for (int j = 0; j < groups.size(); j++) { + + if (done[groups[j]] == false) { //we haven't found the root for this group yet + + done[groups[j]] = true; + roots[groups[j]] = i; //set root to self to start + + //while you aren't at root + while(t->tree[index].getParent() != -1){ + + if (m->control_pressed) { return roots; } + + //do both your chidren have have descendants from the users groups? + int lc = t->tree[index].getLChild(); + int rc = t->tree[index].getRChild(); + + int LpcountSize = 0; + map:: iterator itGroup = t->tree[lc].pcount.find(groups[j]); + if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++; } + + int RpcountSize = 0; + itGroup = t->tree[rc].pcount.find(groups[j]); + if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++; } + + if ((LpcountSize != 0) && (RpcountSize != 0)) { //possible root + roots[groups[j]] = index; + }else { ;} + + index = t->tree[index].getParent(); + } + } + } + } + + return roots; + } catch(exception& e) { - m->errorOut(e, "PhyloDiversityCommand", "readNamesFile"); + m->errorOut(e, "PhyloDiversityCommand", "getRootForGroups"); exit(1); } } - //**********************************************************************************************************************