]> git.donarmstrong.com Git - mothur.git/commitdiff
fixed segfault in unifrac with subsample. in progress of implementing a version of...
authorSarah Westcott <mothur.westcott@gmail.com>
Wed, 2 May 2012 19:22:40 +0000 (15:22 -0400)
committerSarah Westcott <mothur.westcott@gmail.com>
Wed, 2 May 2012 19:22:40 +0000 (15:22 -0400)
subsample.cpp
subsample.h
tree.cpp
tree.h
unifracunweightedcommand.cpp

index b1e78a44a0a2e5b5cd31e7c38e27710603ee1578..70c16a8a6668649e3a8ea697d6f32ec1a738dcc9 100644 (file)
@@ -7,7 +7,47 @@
 //
 
 #include "subsample.h"
-
+//**********************************************************************************************************************
+Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size, map<string, string> originalNameMap) {
+    try {
+        Tree* newTree = NULL;
+        
+        vector<string> subsampledSeqs = getSample(tmap, size);
+        map<string, string> sampledNameMap = deconvolute(whole, subsampledSeqs); 
+        
+        //remove seqs not in sample from treemap
+        for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
+            //is that name in the subsample?
+            int count = 0;
+            string name = tmap->namesOfSeqs[i];
+            for (int j = 0; j < subsampledSeqs.size(); j++) {
+                if (name == subsampledSeqs[j]) { break; } //found it
+                count++;
+            }
+            
+            if (m->control_pressed) { return newTree; }
+            
+            //if you didnt find it, remove it 
+            if (count == subsampledSeqs.size()) { 
+                tmap->removeSeq(name);
+                tmap->addSeq(name, "doNotIncludeMe");
+            }
+        }
+        
+        //create new tree
+        int numUniques = sampledNameMap.size();
+        if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); }
+        
+        newTree = new Tree(tmap);
+        newTree->getCopy(T, originalNameMap, subsampledSeqs);
+        
+        return newTree;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSample", "getSample-Tree");
+        exit(1);
+    }
+}
 //**********************************************************************************************************************
 Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size) {
     try {
@@ -47,7 +87,7 @@ Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, in
         m->errorOut(e, "SubSample", "getSample-Tree");
         exit(1);
     }
-}      
+}
 //**********************************************************************************************************************
 //assumes whole maps dupName -> uniqueName
 map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
@@ -93,6 +133,37 @@ vector<string> SubSample::getSample(TreeMap* tMap, int size) {
         vector<string> sample;
         
         vector<string> Groups = tMap->getNamesOfGroups();    
+        for (int i = 0; i < Groups.size(); i++) {
+            
+            if (m->inUsersGroups(Groups[i], m->getGroups())) {
+                if (m->control_pressed) { break; }
+                
+                vector<string> thisGroup; thisGroup.push_back(Groups[i]);
+                vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
+                int thisSize = thisGroupsSeqs.size();
+                
+                if (thisSize >= size) {        
+                    
+                    random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end());
+                    
+                    for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); }
+                }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
+            }
+        } 
+        
+        return sample;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SubSample", "getSample-TreeMap");
+               exit(1);
+       }
+}      
+//**********************************************************************************************************************
+vector<string> SubSample::getSample(TreeMap* tMap, vector<string> Groups) {
+    try {
+        vector<string> sample;
+        
+        //vector<string> Groups = tMap->getNamesOfGroups();    
         for (int i = 0; i < Groups.size(); i++) {
             
             if (m->control_pressed) { break; }
@@ -100,13 +171,8 @@ vector<string> SubSample::getSample(TreeMap* tMap, int size) {
             vector<string> thisGroup; thisGroup.push_back(Groups[i]);
             vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
             int thisSize = thisGroupsSeqs.size();
-            
-            if (thisSize >= size) {    
-                
-                random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end());
                 
-                for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); }
-            }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
+            for (int j = 0; j < thisSize; j++) { sample.push_back(thisGroupsSeqs[j]); }
         } 
         
         return sample;
index aaf52447b026127b27141bb8794a77a80f60c7f4..ef5c389a938820a61201d83629efc6b27c5237d8 100644 (file)
@@ -26,13 +26,15 @@ class SubSample {
         vector<string> getSample(vector<SharedRAbundVector*>&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times. Overwrites original vector passed in, if you need to preserve it deep copy first.
         
         Tree* getSample(Tree*, TreeMap*, map<string, string>, int); //creates new subsampled tree, destroys treemap so copy if needed.
+        Tree* getSample(Tree*, TreeMap*, map<string, string>, int, map<string, string>); //creates new subsampled tree, destroys treemap so copy if needed.
     
     private:
     
         MothurOut* m;
         int eliminateZeroOTUS(vector<SharedRAbundVector*>&);
     
-        vector<string> getSample(TreeMap*, int); //returns map contains names of seqs in subsample -> group. 
+        vector<string> getSample(TreeMap*, vector<string>);
+        vector<string> getSample(TreeMap*, int); //names of seqs to include in sample tree 
         map<string, string> deconvolute(map<string, string> wholeSet, vector<string>& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap.
 
 
index d9b4a9c1e8f4e31475aa5127adb38d339d1a8631..ed652508ef2e6b049c183777ae5eecab42100d97 100644 (file)
--- a/tree.cpp
+++ b/tree.cpp
@@ -588,6 +588,128 @@ int Tree::populateNewTree(vector<Node>& oldtree, int node, int& index) {
        }
 }
 /*****************************************************************/
+void Tree::getCopy(Tree* copy, map<string, string> nameMap, vector<string> namesToInclude) {
+       try {
+        
+               //for each node in the tree copy its info
+               for (int i = 0; i < numNodes; i++) {
+                       //copy name
+                       tree[i].setName(copy->tree[i].getName());
+            
+                       //copy group
+            vector<string> temp;
+                       tree[i].setGroup(temp);
+                       
+                       //copy branch length
+                       tree[i].setBranchLength(copy->tree[i].getBranchLength());
+            
+                       //copy parent
+                       tree[i].setParent(copy->tree[i].getParent());
+            
+                       //copy children
+                       tree[i].setChildren(copy->tree[i].getLChild(), copy->tree[i].getRChild());
+            
+                       //copy index in node and tmap
+                       tree[i].setIndex(copy->tree[i].getIndex());
+                       setIndex(copy->tree[i].getName(), getIndex(copy->tree[i].getName()));
+                       
+                       //copy pGroups
+                       tree[i].pGroups.clear();
+            
+                       //copy pcount
+                       tree[i].pcount.clear();
+               }
+               
+               groupNodeInfo.clear();
+        
+        //now lets change prune the seqs not in namesToInclude by setting their group to "doNotIncludeMe"
+        for (int i = 0; i < numLeaves; i++) {
+            
+            if (m->control_pressed) { break; }
+            
+                       string name = tree[i].getName();
+            
+                       map<string, string>::iterator itNames = nameMap.find(name);
+            
+                       if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file, please correct."); m->mothurOutEndLine(); exit(1);  }
+                       else {
+                               vector<string> dupNames;
+                               m->splitAtComma(nameMap[name], dupNames);
+                               
+                               map<string, int>::iterator itCounts;
+                               int maxPars = 1;
+                               set<string> groupsAddedForThisNode;
+                               for (int j = 0; j < dupNames.size(); j++) {
+                                       
+                                       string group = tmap->getGroup(dupNames[j]);
+                    bool includeMe = m->inUsersGroups(dupNames[j], namesToInclude);
+                    
+                    if (!includeMe && (group != "doNotIncludeMe")) { m->mothurOut("[ERROR] : creating subtree in copy.\n"); m->control_pressed = true; }
+                                       else if (!includeMe) {
+                                               if (groupsAddedForThisNode.count(group) == 0)  {  groupNodeInfo[group].push_back(i);  groupsAddedForThisNode.insert(group);  } //if you have not already added this node for this group, then add it
+                                               
+                                               //update pcounts
+                                               itCounts = tree[i].pcount.find(group);
+                                               if (itCounts == tree[i].pcount.end()) { //new group, add it
+                                                       tree[i].pcount[group] = 1;
+                                               }else {
+                                                       tree[i].pcount[group]++;
+                                               }
+                        
+                                               //update pgroups
+                                               itCounts = tree[i].pGroups.find(group);
+                                               if (itCounts == tree[i].pGroups.end()) { //new group, add it
+                                                       tree[i].pGroups[group] = 1;
+                                               }else{
+                                                       tree[i].pGroups[group]++;
+                                               }
+                                               
+                                               //keep highest group
+                                               if(tree[i].pGroups[group] > maxPars){
+                                                       maxPars = tree[i].pGroups[group];
+                                               }
+                    }
+                               }//end for
+                               
+                               if (maxPars > 1) { //then we have some more dominant groups
+                                       //erase all the groups that are less than maxPars because you found a more dominant group.
+                                       for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){
+                                               if(it->second < maxPars){
+                                                       tree[i].pGroups.erase(it++);
+                                               }else { it++; }
+                                       }
+                                       //set one remaining groups to 1
+                                       for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){
+                                               tree[i].pGroups[it->first] = 1;
+                                       }
+                               }//end if
+                               
+                               //update groups to reflect all the groups this node represents
+                               vector<string> nodeGroups;
+                               map<string, int>::iterator itGroups;
+                               for (itGroups = tree[i].pcount.begin(); itGroups != tree[i].pcount.end(); itGroups++) {
+                                       nodeGroups.push_back(itGroups->first);
+                               }
+                               tree[i].setGroup(nodeGroups);
+                               
+                       }//end else
+               }//end for              
+        
+        
+        //build the pGroups in non leaf nodes to be used in the parsimony calcs.
+               for (int i = numLeaves; i < numNodes; i++) {
+                       if (m->control_pressed) { break; }
+            
+                       tree[i].pGroups = (mergeGroups(i));
+                       tree[i].pcount = (mergeGcounts(i));
+               }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "Tree", "getCopy");
+               exit(1);
+       }
+}
+/*****************************************************************/
 void Tree::getCopy(Tree* copy) {
        try {
        
diff --git a/tree.h b/tree.h
index 0660e8a181632ae09668d1484236276e5898ed39..40ae14f8615651693985b8f7249c4098cde7f07b 100644 (file)
--- a/tree.h
+++ b/tree.h
@@ -24,6 +24,7 @@ public:
        
     TreeMap* getTreeMap() { return tmap; }
        void getCopy(Tree*);  //makes tree a copy of the one passed in.
+    void getCopy(Tree* copy, map<string, string>, vector<string>); //makes a copy of the tree passed in, but everyone who is not in the vector<string> has group set to doNotIncludeMe. Assumes the tmap already has these seqs group set to doNotIncludeMe.
        void getSubTree(Tree*, vector<string>);  //makes tree a that contains only the names passed in.
     int getSubTree(Tree* originalToCopy, vector<string> seqToInclude, map<string, string> nameMap);  //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. 
     
index dbdee2ad942db51860cb0a7f403662a895def1bf..db5cf90cf56ad18ad1f5fd80cb6ee5b79c1aaa32 100644 (file)
@@ -245,6 +245,7 @@ int UnifracUnweightedCommand::execute() {
         T = reader->getTrees();
         tmap = T[0]->getTreeMap();
         map<string, string> nameMap = reader->getNames();
+        map<string, string> unique2Dup = reader->getNameMap();
         delete reader; 
         
                sumFile = outputDir + m->getSimpleName(treefile) + ".uwsummary";
@@ -281,7 +282,7 @@ int UnifracUnweightedCommand::execute() {
                     int thisSize = thisGroupsSeqs.size();
                     
                     if (thisSize >= subsampleSize) {    Groups.push_back(newGroups[i]);        }
-                    else {  m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
+                    else {   m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
                 } 
                 m->setGroups(Groups);
             }
@@ -305,7 +306,7 @@ int UnifracUnweightedCommand::execute() {
                for (int i = 0; i < T.size(); i++) {
                        if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } return 0; }
                        
-                       counter = 0;
+            counter = 0;
                        
                        if (random)  {  
                                output = new ColumnFile(outputDir + m->getSimpleName(treefile)  + toString(i+1) + ".unweighted", itersString);
@@ -354,6 +355,11 @@ int UnifracUnweightedCommand::execute() {
                 SubSample sample;
                 Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
                 
+                //uses method of setting groups to doNotIncludeMe
+                //SubSample sample;
+                //Tree* newTree2 = sample.getSample(T[i], newTmap, nameMap, subsampleSize, unique2Dup);
+                // newTree2->print(cout);
+                
                 //call new weighted function
                 vector<double> iterData; iterData.resize(numComp,0);
                 Unweighted thisUnweighted(includeRoot);