From 72e0be6b9c80009d4dbee24e8d690ad9514dc6fb Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Fri, 11 May 2012 07:22:19 -0400 Subject: [PATCH] fixed path redirect issue in shhh.flows. optimized the subsampling in unifrac commands --- shhhercommand.cpp | 12 ++--- subsample.cpp | 75 ++++++++++++++++++---------- subsample.h | 5 +- tree.cpp | 97 ++---------------------------------- tree.h | 2 +- unifracunweightedcommand.cpp | 15 +++--- unifracweightedcommand.cpp | 11 ++-- 7 files changed, 78 insertions(+), 139 deletions(-) diff --git a/shhhercommand.cpp b/shhhercommand.cpp index 49e2faa..a086c98 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -218,7 +218,7 @@ ShhherCommand::ShhherCommand(string option) { if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; } } else{ - outputDir += m->hasPath(flowFileName); + if (outputDir == "") { outputDir = m->hasPath(flowFileName); } flowFileVector.push_back(flowFileName); } @@ -2980,7 +2980,7 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filenam try { string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(filename); } + if (outputDir == "") { thisOutputDir = m->hasPath(filename); } string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual"; ofstream qualityFile; @@ -3088,7 +3088,7 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filenam void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector otuCounts, vector& uniqueFlowgrams, vector& seqNameVector, vector >& aaI, vector& centroids){ try { string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(filename); } + if (outputDir == "") { thisOutputDir = m->hasPath(filename); } string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta"; ofstream fastaFile; m->openOutputFile(fastaFileName, fastaFile); @@ -3136,7 +3136,7 @@ void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTU void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector otuCounts, vector& seqNameVector, vector >& aaI, vector& nSeqsPerOTU){ try { string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(filename); } + if (outputDir == "") { thisOutputDir = m->hasPath(filename); } string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names"; ofstream nameFile; m->openOutputFile(nameFileName, nameFile); @@ -3174,7 +3174,7 @@ void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, s void ShhherCommand::writeGroups(string filename, int numSeqs, vector& seqNameVector){ try { string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(filename); } + if (outputDir == "") { thisOutputDir = m->hasPath(filename); } string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename)); string groupFileName = fileRoot + "shhh.groups"; ofstream groupFile; @@ -3199,7 +3199,7 @@ void ShhherCommand::writeGroups(string filename, int numSeqs, vector& se void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector otuCounts, vector& centroids, vector& uniqueFlowgrams, vector& seqNameVector, vector >& aaI, vector& nSeqsPerOTU, vector& lengths, vector& flowDataIntI){ try { string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(filename); } + if (outputDir == "") { thisOutputDir = m->hasPath(filename); } string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts"; ofstream otuCountsFile; m->openOutputFile(otuCountsFileName, otuCountsFile); diff --git a/subsample.cpp b/subsample.cpp index 70c16a8..457b7b9 100644 --- a/subsample.cpp +++ b/subsample.cpp @@ -8,38 +8,22 @@ #include "subsample.h" //********************************************************************************************************************** -Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, int size, map originalNameMap) { +Tree* SubSample::getSample(Tree* T, TreeMap* tmap, TreeMap* newTmap, int size, map originalNameMap) { try { Tree* newTree = NULL; - vector subsampledSeqs = getSample(tmap, size); - map sampledNameMap = deconvolute(whole, subsampledSeqs); + map > newGroups; + vector subsampledSeqs = getSample(tmap, size, newGroups); //remove seqs not in sample from treemap - for (int i = 0; i < tmap->namesOfSeqs.size(); i++) { - //is that name in the subsample? - int count = 0; - string name = tmap->namesOfSeqs[i]; - for (int j = 0; j < subsampledSeqs.size(); j++) { - if (name == subsampledSeqs[j]) { break; } //found it - count++; - } - - if (m->control_pressed) { return newTree; } - - //if you didnt find it, remove it - if (count == subsampledSeqs.size()) { - tmap->removeSeq(name); - tmap->addSeq(name, "doNotIncludeMe"); + for (map >::iterator it = newGroups.begin(); it != newGroups.end(); it++) { + for (int i = 0; i < (it->second).size(); i++) { + newTmap->addSeq((it->second)[i], it->first); } } - //create new tree - int numUniques = sampledNameMap.size(); - if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); } - - newTree = new Tree(tmap); - newTree->getCopy(T, originalNameMap, subsampledSeqs); + newTree = new Tree(newTmap); + newTree->getCopy(T, originalNameMap); return newTree; } @@ -48,7 +32,7 @@ Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, in exit(1); } } -//********************************************************************************************************************** +/********************************************************************************************************************** Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, int size) { try { Tree* newTree = NULL; @@ -87,7 +71,7 @@ Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, in m->errorOut(e, "SubSample", "getSample-Tree"); exit(1); } -} +}*/ //********************************************************************************************************************** //assumes whole maps dupName -> uniqueName map SubSample::deconvolute(map whole, vector& wanted) { @@ -126,7 +110,46 @@ map SubSample::deconvolute(map whole, vectorerrorOut(e, "SubSample", "deconvolute"); exit(1); } +} +//********************************************************************************************************************** +vector SubSample::getSample(TreeMap* tMap, int size, map >& sample) { + try { + vector temp2; + sample["doNotIncludeMe"] = temp2; + + vector namesInSample; + + vector Groups = tMap->getNamesOfGroups(); + for (int i = 0; i < Groups.size(); i++) { + + if (m->inUsersGroups(Groups[i], m->getGroups())) { + if (m->control_pressed) { break; } + + vector thisGroup; thisGroup.push_back(Groups[i]); + vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); + int thisSize = thisGroupsSeqs.size(); + vector temp; + sample[Groups[i]] = temp; + + if (thisSize >= size) { + + random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end()); + + for (int j = 0; j < size; j++) { sample[Groups[i]].push_back(thisGroupsSeqs[j]); namesInSample.push_back(thisGroupsSeqs[j]); } + for (int j = size; j < thisSize; j++) { sample["doNotIncludeMe"].push_back(thisGroupsSeqs[j]); } + + }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } + } + } + + return namesInSample; + } + catch(exception& e) { + m->errorOut(e, "SubSample", "getSample-TreeMap"); + exit(1); + } } + //********************************************************************************************************************** vector SubSample::getSample(TreeMap* tMap, int size) { try { diff --git a/subsample.h b/subsample.h index ef5c389..feca37e 100644 --- a/subsample.h +++ b/subsample.h @@ -25,8 +25,8 @@ class SubSample { vector getSample(vector&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times. Overwrites original vector passed in, if you need to preserve it deep copy first. - Tree* getSample(Tree*, TreeMap*, map, int); //creates new subsampled tree, destroys treemap so copy if needed. - Tree* getSample(Tree*, TreeMap*, map, int, map); //creates new subsampled tree, destroys treemap so copy if needed. + //Tree* getSample(Tree*, TreeMap*, map, int); //creates new subsampled tree, destroys treemap so copy if needed. + Tree* getSample(Tree*, TreeMap*, TreeMap*, int, map); //creates new subsampled tree. Uses first treemap to fill new treemap with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe". private: @@ -35,6 +35,7 @@ class SubSample { vector getSample(TreeMap*, vector); vector getSample(TreeMap*, int); //names of seqs to include in sample tree + vector getSample(TreeMap* tMap, int size, map >& sample); //sample maps group -> seqs in group. seqs not in sample are in doNotIncludeMe group map deconvolute(map wholeSet, vector& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap. diff --git a/tree.cpp b/tree.cpp index ed65250..44ecadd 100644 --- a/tree.cpp +++ b/tree.cpp @@ -588,18 +588,11 @@ int Tree::populateNewTree(vector& oldtree, int node, int& index) { } } /*****************************************************************/ -void Tree::getCopy(Tree* copy, map nameMap, vector namesToInclude) { +void Tree::getCopy(Tree* copy, map nameMap) { try { //for each node in the tree copy its info for (int i = 0; i < numNodes; i++) { - //copy name - tree[i].setName(copy->tree[i].getName()); - - //copy group - vector temp; - tree[i].setGroup(temp); - //copy branch length tree[i].setBranchLength(copy->tree[i].getBranchLength()); @@ -608,93 +601,9 @@ void Tree::getCopy(Tree* copy, map nameMap, vector names //copy children tree[i].setChildren(copy->tree[i].getLChild(), copy->tree[i].getRChild()); - - //copy index in node and tmap - tree[i].setIndex(copy->tree[i].getIndex()); - setIndex(copy->tree[i].getName(), getIndex(copy->tree[i].getName())); - - //copy pGroups - tree[i].pGroups.clear(); - - //copy pcount - tree[i].pcount.clear(); - } + } - groupNodeInfo.clear(); - - //now lets change prune the seqs not in namesToInclude by setting their group to "doNotIncludeMe" - for (int i = 0; i < numLeaves; i++) { - - if (m->control_pressed) { break; } - - string name = tree[i].getName(); - - map::iterator itNames = nameMap.find(name); - - if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file, please correct."); m->mothurOutEndLine(); exit(1); } - else { - vector dupNames; - m->splitAtComma(nameMap[name], dupNames); - - map::iterator itCounts; - int maxPars = 1; - set groupsAddedForThisNode; - for (int j = 0; j < dupNames.size(); j++) { - - string group = tmap->getGroup(dupNames[j]); - bool includeMe = m->inUsersGroups(dupNames[j], namesToInclude); - - if (!includeMe && (group != "doNotIncludeMe")) { m->mothurOut("[ERROR] : creating subtree in copy.\n"); m->control_pressed = true; } - else if (!includeMe) { - if (groupsAddedForThisNode.count(group) == 0) { groupNodeInfo[group].push_back(i); groupsAddedForThisNode.insert(group); } //if you have not already added this node for this group, then add it - - //update pcounts - itCounts = tree[i].pcount.find(group); - if (itCounts == tree[i].pcount.end()) { //new group, add it - tree[i].pcount[group] = 1; - }else { - tree[i].pcount[group]++; - } - - //update pgroups - itCounts = tree[i].pGroups.find(group); - if (itCounts == tree[i].pGroups.end()) { //new group, add it - tree[i].pGroups[group] = 1; - }else{ - tree[i].pGroups[group]++; - } - - //keep highest group - if(tree[i].pGroups[group] > maxPars){ - maxPars = tree[i].pGroups[group]; - } - } - }//end for - - if (maxPars > 1) { //then we have some more dominant groups - //erase all the groups that are less than maxPars because you found a more dominant group. - for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){ - if(it->second < maxPars){ - tree[i].pGroups.erase(it++); - }else { it++; } - } - //set one remaining groups to 1 - for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){ - tree[i].pGroups[it->first] = 1; - } - }//end if - - //update groups to reflect all the groups this node represents - vector nodeGroups; - map::iterator itGroups; - for (itGroups = tree[i].pcount.begin(); itGroups != tree[i].pcount.end(); itGroups++) { - nodeGroups.push_back(itGroups->first); - } - tree[i].setGroup(nodeGroups); - - }//end else - }//end for - + if (nameMap.size() != 0) { addNamesToCounts(nameMap); } //build the pGroups in non leaf nodes to be used in the parsimony calcs. for (int i = numLeaves; i < numNodes; i++) { diff --git a/tree.h b/tree.h index 40ae14f..03da5f6 100644 --- a/tree.h +++ b/tree.h @@ -24,7 +24,7 @@ public: TreeMap* getTreeMap() { return tmap; } void getCopy(Tree*); //makes tree a copy of the one passed in. - void getCopy(Tree* copy, map, vector); //makes a copy of the tree passed in, but everyone who is not in the vector has group set to doNotIncludeMe. Assumes the tmap already has these seqs group set to doNotIncludeMe. + void getCopy(Tree* copy, map); //makes a copy of the tree structure passed in, (just parents, children and br). Used with the Tree(TreeMap*) constructor. Assumes the tmap already has set seqs groups you want. Used by subsample to reassign seqs you don't want included to group "doNotIncludeMe". void getSubTree(Tree*, vector); //makes tree a that contains only the names passed in. int getSubTree(Tree* originalToCopy, vector seqToInclude, map nameMap); //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index db5cf90..318409f 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -342,23 +342,23 @@ int UnifracUnweightedCommand::execute() { if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + int startSubsample = time(NULL); + //subsample loop vector< vector > calcDistsTotals; //each iter, each groupCombos dists. this will be used to make .dist files for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f. - if (m->control_pressed) { break; } //copy to preserve old one - would do this in subsample but memory cleanup becomes messy. TreeMap* newTmap = new TreeMap(); - newTmap->getCopy(*tmap); + //newTmap->getCopy(*tmap); - SubSample sample; - Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize); + //SubSample sample; + //Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize); //uses method of setting groups to doNotIncludeMe - //SubSample sample; - //Tree* newTree2 = sample.getSample(T[i], newTmap, nameMap, subsampleSize, unique2Dup); - // newTree2->print(cout); + SubSample sample; + Tree* subSampleTree = sample.getSample(T[i], tmap, newTmap, subsampleSize, unique2Dup); //call new weighted function vector iterData; iterData.resize(numComp,0); @@ -373,6 +373,7 @@ int UnifracUnweightedCommand::execute() { if((thisIter+1) % 100 == 0){ m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine(); } } + m->mothurOut("It took " + toString(time(NULL) - startSubsample) + " secs to run the subsampling."); m->mothurOutEndLine(); if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index 633cb64..1df7988 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -237,6 +237,7 @@ int UnifracWeightedCommand::execute() { T = reader->getTrees(); tmap = T[0]->getTreeMap(); map nameMap = reader->getNames(); + map unique2Dup = reader->getNameMap(); delete reader; if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; } @@ -336,11 +337,15 @@ int UnifracWeightedCommand::execute() { //copy to preserve old one - would do this in subsample but memory cleanup becomes messy. TreeMap* newTmap = new TreeMap(); - newTmap->getCopy(*tmap); + //newTmap->getCopy(*tmap); + //SubSample sample; + //Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize); + + //uses method of setting groups to doNotIncludeMe SubSample sample; - Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize); - + Tree* subSampleTree = sample.getSample(T[i], tmap, newTmap, subsampleSize, unique2Dup); + //call new weighted function vector iterData; iterData.resize(numComp,0); Weighted thisWeighted(includeRoot); -- 2.39.2