From e98d56be8369a799e61a411bc13d3bd1fa3451e5 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Wed, 2 May 2012 15:22:40 -0400 Subject: [PATCH] fixed segfault in unifrac with subsample. in progress of implementing a version of the subsample tree that sets sees you don't want to includes groups to doNotIncludeMe. to test which is more efficient. --- subsample.cpp | 82 ++++++++++++++++++++--- subsample.h | 4 +- tree.cpp | 122 +++++++++++++++++++++++++++++++++++ tree.h | 1 + unifracunweightedcommand.cpp | 10 ++- 5 files changed, 208 insertions(+), 11 deletions(-) diff --git a/subsample.cpp b/subsample.cpp index b1e78a4..70c16a8 100644 --- a/subsample.cpp +++ b/subsample.cpp @@ -7,7 +7,47 @@ // #include "subsample.h" - +//********************************************************************************************************************** +Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, int size, map originalNameMap) { + try { + Tree* newTree = NULL; + + vector subsampledSeqs = getSample(tmap, size); + map sampledNameMap = deconvolute(whole, subsampledSeqs); + + //remove seqs not in sample from treemap + for (int i = 0; i < tmap->namesOfSeqs.size(); i++) { + //is that name in the subsample? + int count = 0; + string name = tmap->namesOfSeqs[i]; + for (int j = 0; j < subsampledSeqs.size(); j++) { + if (name == subsampledSeqs[j]) { break; } //found it + count++; + } + + if (m->control_pressed) { return newTree; } + + //if you didnt find it, remove it + if (count == subsampledSeqs.size()) { + tmap->removeSeq(name); + tmap->addSeq(name, "doNotIncludeMe"); + } + } + + //create new tree + int numUniques = sampledNameMap.size(); + if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); } + + newTree = new Tree(tmap); + newTree->getCopy(T, originalNameMap, subsampledSeqs); + + return newTree; + } + catch(exception& e) { + m->errorOut(e, "SubSample", "getSample-Tree"); + exit(1); + } +} //********************************************************************************************************************** Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, int size) { try { @@ -47,7 +87,7 @@ Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, in m->errorOut(e, "SubSample", "getSample-Tree"); exit(1); } -} +} //********************************************************************************************************************** //assumes whole maps dupName -> uniqueName map SubSample::deconvolute(map whole, vector& wanted) { @@ -93,6 +133,37 @@ vector SubSample::getSample(TreeMap* tMap, int size) { vector sample; vector Groups = tMap->getNamesOfGroups(); + for (int i = 0; i < Groups.size(); i++) { + + if (m->inUsersGroups(Groups[i], m->getGroups())) { + if (m->control_pressed) { break; } + + vector thisGroup; thisGroup.push_back(Groups[i]); + vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); + int thisSize = thisGroupsSeqs.size(); + + if (thisSize >= size) { + + random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end()); + + for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); } + }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } + } + } + + return sample; + } + catch(exception& e) { + m->errorOut(e, "SubSample", "getSample-TreeMap"); + exit(1); + } +} +//********************************************************************************************************************** +vector SubSample::getSample(TreeMap* tMap, vector Groups) { + try { + vector sample; + + //vector Groups = tMap->getNamesOfGroups(); for (int i = 0; i < Groups.size(); i++) { if (m->control_pressed) { break; } @@ -100,13 +171,8 @@ vector SubSample::getSample(TreeMap* tMap, int size) { vector thisGroup; thisGroup.push_back(Groups[i]); vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); int thisSize = thisGroupsSeqs.size(); - - if (thisSize >= size) { - - random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end()); - for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); } - }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } + for (int j = 0; j < thisSize; j++) { sample.push_back(thisGroupsSeqs[j]); } } return sample; diff --git a/subsample.h b/subsample.h index aaf5244..ef5c389 100644 --- a/subsample.h +++ b/subsample.h @@ -26,13 +26,15 @@ class SubSample { vector getSample(vector&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times. Overwrites original vector passed in, if you need to preserve it deep copy first. Tree* getSample(Tree*, TreeMap*, map, int); //creates new subsampled tree, destroys treemap so copy if needed. + Tree* getSample(Tree*, TreeMap*, map, int, map); //creates new subsampled tree, destroys treemap so copy if needed. private: MothurOut* m; int eliminateZeroOTUS(vector&); - vector getSample(TreeMap*, int); //returns map contains names of seqs in subsample -> group. + vector getSample(TreeMap*, vector); + vector getSample(TreeMap*, int); //names of seqs to include in sample tree map deconvolute(map wholeSet, vector& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap. diff --git a/tree.cpp b/tree.cpp index d9b4a9c..ed65250 100644 --- a/tree.cpp +++ b/tree.cpp @@ -588,6 +588,128 @@ int Tree::populateNewTree(vector& oldtree, int node, int& index) { } } /*****************************************************************/ +void Tree::getCopy(Tree* copy, map nameMap, vector namesToInclude) { + try { + + //for each node in the tree copy its info + for (int i = 0; i < numNodes; i++) { + //copy name + tree[i].setName(copy->tree[i].getName()); + + //copy group + vector temp; + tree[i].setGroup(temp); + + //copy branch length + tree[i].setBranchLength(copy->tree[i].getBranchLength()); + + //copy parent + tree[i].setParent(copy->tree[i].getParent()); + + //copy children + tree[i].setChildren(copy->tree[i].getLChild(), copy->tree[i].getRChild()); + + //copy index in node and tmap + tree[i].setIndex(copy->tree[i].getIndex()); + setIndex(copy->tree[i].getName(), getIndex(copy->tree[i].getName())); + + //copy pGroups + tree[i].pGroups.clear(); + + //copy pcount + tree[i].pcount.clear(); + } + + groupNodeInfo.clear(); + + //now lets change prune the seqs not in namesToInclude by setting their group to "doNotIncludeMe" + for (int i = 0; i < numLeaves; i++) { + + if (m->control_pressed) { break; } + + string name = tree[i].getName(); + + map::iterator itNames = nameMap.find(name); + + if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file, please correct."); m->mothurOutEndLine(); exit(1); } + else { + vector dupNames; + m->splitAtComma(nameMap[name], dupNames); + + map::iterator itCounts; + int maxPars = 1; + set groupsAddedForThisNode; + for (int j = 0; j < dupNames.size(); j++) { + + string group = tmap->getGroup(dupNames[j]); + bool includeMe = m->inUsersGroups(dupNames[j], namesToInclude); + + if (!includeMe && (group != "doNotIncludeMe")) { m->mothurOut("[ERROR] : creating subtree in copy.\n"); m->control_pressed = true; } + else if (!includeMe) { + if (groupsAddedForThisNode.count(group) == 0) { groupNodeInfo[group].push_back(i); groupsAddedForThisNode.insert(group); } //if you have not already added this node for this group, then add it + + //update pcounts + itCounts = tree[i].pcount.find(group); + if (itCounts == tree[i].pcount.end()) { //new group, add it + tree[i].pcount[group] = 1; + }else { + tree[i].pcount[group]++; + } + + //update pgroups + itCounts = tree[i].pGroups.find(group); + if (itCounts == tree[i].pGroups.end()) { //new group, add it + tree[i].pGroups[group] = 1; + }else{ + tree[i].pGroups[group]++; + } + + //keep highest group + if(tree[i].pGroups[group] > maxPars){ + maxPars = tree[i].pGroups[group]; + } + } + }//end for + + if (maxPars > 1) { //then we have some more dominant groups + //erase all the groups that are less than maxPars because you found a more dominant group. + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){ + if(it->second < maxPars){ + tree[i].pGroups.erase(it++); + }else { it++; } + } + //set one remaining groups to 1 + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){ + tree[i].pGroups[it->first] = 1; + } + }//end if + + //update groups to reflect all the groups this node represents + vector nodeGroups; + map::iterator itGroups; + for (itGroups = tree[i].pcount.begin(); itGroups != tree[i].pcount.end(); itGroups++) { + nodeGroups.push_back(itGroups->first); + } + tree[i].setGroup(nodeGroups); + + }//end else + }//end for + + + //build the pGroups in non leaf nodes to be used in the parsimony calcs. + for (int i = numLeaves; i < numNodes; i++) { + if (m->control_pressed) { break; } + + tree[i].pGroups = (mergeGroups(i)); + tree[i].pcount = (mergeGcounts(i)); + } + } + catch(exception& e) { + m->errorOut(e, "Tree", "getCopy"); + exit(1); + } +} +/*****************************************************************/ void Tree::getCopy(Tree* copy) { try { diff --git a/tree.h b/tree.h index 0660e8a..40ae14f 100644 --- a/tree.h +++ b/tree.h @@ -24,6 +24,7 @@ public: TreeMap* getTreeMap() { return tmap; } void getCopy(Tree*); //makes tree a copy of the one passed in. + void getCopy(Tree* copy, map, vector); //makes a copy of the tree passed in, but everyone who is not in the vector has group set to doNotIncludeMe. Assumes the tmap already has these seqs group set to doNotIncludeMe. void getSubTree(Tree*, vector); //makes tree a that contains only the names passed in. int getSubTree(Tree* originalToCopy, vector seqToInclude, map nameMap); //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index dbdee2a..db5cf90 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -245,6 +245,7 @@ int UnifracUnweightedCommand::execute() { T = reader->getTrees(); tmap = T[0]->getTreeMap(); map nameMap = reader->getNames(); + map unique2Dup = reader->getNameMap(); delete reader; sumFile = outputDir + m->getSimpleName(treefile) + ".uwsummary"; @@ -281,7 +282,7 @@ int UnifracUnweightedCommand::execute() { int thisSize = thisGroupsSeqs.size(); if (thisSize >= subsampleSize) { Groups.push_back(newGroups[i]); } - else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); } + else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); } } m->setGroups(Groups); } @@ -305,7 +306,7 @@ int UnifracUnweightedCommand::execute() { for (int i = 0; i < T.size(); i++) { if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - counter = 0; + counter = 0; if (random) { output = new ColumnFile(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted", itersString); @@ -354,6 +355,11 @@ int UnifracUnweightedCommand::execute() { SubSample sample; Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize); + //uses method of setting groups to doNotIncludeMe + //SubSample sample; + //Tree* newTree2 = sample.getSample(T[i], newTmap, nameMap, subsampleSize, unique2Dup); + // newTree2->print(cout); + //call new weighted function vector iterData; iterData.resize(numComp,0); Unweighted thisUnweighted(includeRoot); -- 2.39.2