]> git.donarmstrong.com Git - mothur.git/commitdiff
fixed path redirect issue in shhh.flows. optimized the subsampling in unifrac commands
authorSarah Westcott <mothur.westcott@gmail.com>
Fri, 11 May 2012 11:22:19 +0000 (07:22 -0400)
committerSarah Westcott <mothur.westcott@gmail.com>
Fri, 11 May 2012 11:22:19 +0000 (07:22 -0400)
shhhercommand.cpp
subsample.cpp
subsample.h
tree.cpp
tree.h
unifracunweightedcommand.cpp
unifracweightedcommand.cpp

index 49e2faa03c1cd2eb74ea502ee5952c0512678644..a086c987b9a29711a25e95e64953023df7deaa0f 100644 (file)
@@ -218,7 +218,7 @@ ShhherCommand::ShhherCommand(string option) {
                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
             }
             else{
-                outputDir += m->hasPath(flowFileName);
+                if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
                 flowFileVector.push_back(flowFileName);
             }
                
@@ -2980,7 +2980,7 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filenam
        
        try {
                string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
+               if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
                string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
         
                ofstream qualityFile;
@@ -3088,7 +3088,7 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filenam
 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
        try {
                string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
+               if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
                string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
                ofstream fastaFile;
                m->openOutputFile(fastaFileName, fastaFile);
@@ -3136,7 +3136,7 @@ void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTU
 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
        try {
                string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
+               if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
                string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
                ofstream nameFile;
                m->openOutputFile(nameFileName, nameFile);
@@ -3174,7 +3174,7 @@ void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, s
 void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
        try {
                string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
+               if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
                string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
                string groupFileName = fileRoot + "shhh.groups";
                ofstream groupFile;
@@ -3199,7 +3199,7 @@ void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& se
 void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
        try {
                string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
+               if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
                string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
                ofstream otuCountsFile;
                m->openOutputFile(otuCountsFileName, otuCountsFile);
index 70c16a8a6668649e3a8ea697d6f32ec1a738dcc9..457b7b9d5f6a1273b57e898b3adec90dff5dd9f0 100644 (file)
@@ -8,38 +8,22 @@
 
 #include "subsample.h"
 //**********************************************************************************************************************
-Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size, map<string, string> originalNameMap) {
+Tree* SubSample::getSample(Tree* T, TreeMap* tmap, TreeMap* newTmap, int size, map<string, string> originalNameMap) {
     try {
         Tree* newTree = NULL;
         
-        vector<string> subsampledSeqs = getSample(tmap, size);
-        map<string, string> sampledNameMap = deconvolute(whole, subsampledSeqs); 
+        map<string, vector<string> > newGroups;
+        vector<string> subsampledSeqs = getSample(tmap, size, newGroups);
         
         //remove seqs not in sample from treemap
-        for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
-            //is that name in the subsample?
-            int count = 0;
-            string name = tmap->namesOfSeqs[i];
-            for (int j = 0; j < subsampledSeqs.size(); j++) {
-                if (name == subsampledSeqs[j]) { break; } //found it
-                count++;
-            }
-            
-            if (m->control_pressed) { return newTree; }
-            
-            //if you didnt find it, remove it 
-            if (count == subsampledSeqs.size()) { 
-                tmap->removeSeq(name);
-                tmap->addSeq(name, "doNotIncludeMe");
+        for (map<string, vector<string> >::iterator it = newGroups.begin(); it != newGroups.end(); it++) {
+            for (int i = 0; i < (it->second).size(); i++) {
+                newTmap->addSeq((it->second)[i], it->first);
             }
         }
         
-        //create new tree
-        int numUniques = sampledNameMap.size();
-        if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); }
-        
-        newTree = new Tree(tmap);
-        newTree->getCopy(T, originalNameMap, subsampledSeqs);
+        newTree = new Tree(newTmap);
+        newTree->getCopy(T, originalNameMap);
         
         return newTree;
     }
@@ -48,7 +32,7 @@ Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, in
         exit(1);
     }
 }
-//**********************************************************************************************************************
+/**********************************************************************************************************************
 Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size) {
     try {
         Tree* newTree = NULL;
@@ -87,7 +71,7 @@ Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, in
         m->errorOut(e, "SubSample", "getSample-Tree");
         exit(1);
     }
-}
+}*/
 //**********************************************************************************************************************
 //assumes whole maps dupName -> uniqueName
 map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
@@ -126,7 +110,46 @@ map<string, string> SubSample::deconvolute(map<string, string> whole, vector<str
                m->errorOut(e, "SubSample", "deconvolute");
                exit(1);
        }
+}
+//**********************************************************************************************************************
+vector<string> SubSample::getSample(TreeMap* tMap, int size, map<string, vector<string> >& sample) {
+    try {
+        vector<string> temp2;
+        sample["doNotIncludeMe"] = temp2;
+        
+        vector<string> namesInSample;
+        
+        vector<string> Groups = tMap->getNamesOfGroups();    
+        for (int i = 0; i < Groups.size(); i++) {
+            
+            if (m->inUsersGroups(Groups[i], m->getGroups())) {
+                if (m->control_pressed) { break; }
+                
+                vector<string> thisGroup; thisGroup.push_back(Groups[i]);
+                vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
+                int thisSize = thisGroupsSeqs.size();
+                vector<string> temp;
+                sample[Groups[i]] = temp;
+                
+                if (thisSize >= size) {        
+                    
+                    random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end());
+                    
+                    for (int j = 0; j < size; j++) { sample[Groups[i]].push_back(thisGroupsSeqs[j]); namesInSample.push_back(thisGroupsSeqs[j]); }
+                    for (int j = size; j < thisSize; j++) { sample["doNotIncludeMe"].push_back(thisGroupsSeqs[j]); }
+                    
+                }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
+            }
+        } 
+        
+        return namesInSample;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SubSample", "getSample-TreeMap");
+               exit(1);
+       }
 }      
+
 //**********************************************************************************************************************
 vector<string> SubSample::getSample(TreeMap* tMap, int size) {
     try {
index ef5c389a938820a61201d83629efc6b27c5237d8..feca37eddbfbf42cd4d7674adea5a18ceed3acee 100644 (file)
@@ -25,8 +25,8 @@ class SubSample {
     
         vector<string> getSample(vector<SharedRAbundVector*>&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times. Overwrites original vector passed in, if you need to preserve it deep copy first.
         
-        Tree* getSample(Tree*, TreeMap*, map<string, string>, int); //creates new subsampled tree, destroys treemap so copy if needed.
-        Tree* getSample(Tree*, TreeMap*, map<string, string>, int, map<string, string>); //creates new subsampled tree, destroys treemap so copy if needed.
+        //Tree* getSample(Tree*, TreeMap*, map<string, string>, int); //creates new subsampled tree, destroys treemap so copy if needed.
+        Tree* getSample(Tree*, TreeMap*, TreeMap*, int, map<string, string>); //creates new subsampled tree. Uses first treemap to fill new treemap with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe".
     
     private:
     
@@ -35,6 +35,7 @@ class SubSample {
     
         vector<string> getSample(TreeMap*, vector<string>);
         vector<string> getSample(TreeMap*, int); //names of seqs to include in sample tree 
+        vector<string> getSample(TreeMap* tMap, int size, map<string, vector<string> >& sample); //sample maps group -> seqs in group. seqs not in sample are in doNotIncludeMe group
         map<string, string> deconvolute(map<string, string> wholeSet, vector<string>& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap.
 
 
index ed652508ef2e6b049c183777ae5eecab42100d97..44ecadd534b60d60b602d9e052259ac69709f9d3 100644 (file)
--- a/tree.cpp
+++ b/tree.cpp
@@ -588,18 +588,11 @@ int Tree::populateNewTree(vector<Node>& oldtree, int node, int& index) {
        }
 }
 /*****************************************************************/
-void Tree::getCopy(Tree* copy, map<string, string> nameMap, vector<string> namesToInclude) {
+void Tree::getCopy(Tree* copy, map<string, string> nameMap) {
        try {
         
                //for each node in the tree copy its info
                for (int i = 0; i < numNodes; i++) {
-                       //copy name
-                       tree[i].setName(copy->tree[i].getName());
-            
-                       //copy group
-            vector<string> temp;
-                       tree[i].setGroup(temp);
-                       
                        //copy branch length
                        tree[i].setBranchLength(copy->tree[i].getBranchLength());
             
@@ -608,93 +601,9 @@ void Tree::getCopy(Tree* copy, map<string, string> nameMap, vector<string> names
             
                        //copy children
                        tree[i].setChildren(copy->tree[i].getLChild(), copy->tree[i].getRChild());
-            
-                       //copy index in node and tmap
-                       tree[i].setIndex(copy->tree[i].getIndex());
-                       setIndex(copy->tree[i].getName(), getIndex(copy->tree[i].getName()));
-                       
-                       //copy pGroups
-                       tree[i].pGroups.clear();
-            
-                       //copy pcount
-                       tree[i].pcount.clear();
-               }
+        }
                
-               groupNodeInfo.clear();
-        
-        //now lets change prune the seqs not in namesToInclude by setting their group to "doNotIncludeMe"
-        for (int i = 0; i < numLeaves; i++) {
-            
-            if (m->control_pressed) { break; }
-            
-                       string name = tree[i].getName();
-            
-                       map<string, string>::iterator itNames = nameMap.find(name);
-            
-                       if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file, please correct."); m->mothurOutEndLine(); exit(1);  }
-                       else {
-                               vector<string> dupNames;
-                               m->splitAtComma(nameMap[name], dupNames);
-                               
-                               map<string, int>::iterator itCounts;
-                               int maxPars = 1;
-                               set<string> groupsAddedForThisNode;
-                               for (int j = 0; j < dupNames.size(); j++) {
-                                       
-                                       string group = tmap->getGroup(dupNames[j]);
-                    bool includeMe = m->inUsersGroups(dupNames[j], namesToInclude);
-                    
-                    if (!includeMe && (group != "doNotIncludeMe")) { m->mothurOut("[ERROR] : creating subtree in copy.\n"); m->control_pressed = true; }
-                                       else if (!includeMe) {
-                                               if (groupsAddedForThisNode.count(group) == 0)  {  groupNodeInfo[group].push_back(i);  groupsAddedForThisNode.insert(group);  } //if you have not already added this node for this group, then add it
-                                               
-                                               //update pcounts
-                                               itCounts = tree[i].pcount.find(group);
-                                               if (itCounts == tree[i].pcount.end()) { //new group, add it
-                                                       tree[i].pcount[group] = 1;
-                                               }else {
-                                                       tree[i].pcount[group]++;
-                                               }
-                        
-                                               //update pgroups
-                                               itCounts = tree[i].pGroups.find(group);
-                                               if (itCounts == tree[i].pGroups.end()) { //new group, add it
-                                                       tree[i].pGroups[group] = 1;
-                                               }else{
-                                                       tree[i].pGroups[group]++;
-                                               }
-                                               
-                                               //keep highest group
-                                               if(tree[i].pGroups[group] > maxPars){
-                                                       maxPars = tree[i].pGroups[group];
-                                               }
-                    }
-                               }//end for
-                               
-                               if (maxPars > 1) { //then we have some more dominant groups
-                                       //erase all the groups that are less than maxPars because you found a more dominant group.
-                                       for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){
-                                               if(it->second < maxPars){
-                                                       tree[i].pGroups.erase(it++);
-                                               }else { it++; }
-                                       }
-                                       //set one remaining groups to 1
-                                       for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){
-                                               tree[i].pGroups[it->first] = 1;
-                                       }
-                               }//end if
-                               
-                               //update groups to reflect all the groups this node represents
-                               vector<string> nodeGroups;
-                               map<string, int>::iterator itGroups;
-                               for (itGroups = tree[i].pcount.begin(); itGroups != tree[i].pcount.end(); itGroups++) {
-                                       nodeGroups.push_back(itGroups->first);
-                               }
-                               tree[i].setGroup(nodeGroups);
-                               
-                       }//end else
-               }//end for              
-        
+        if (nameMap.size() != 0) {  addNamesToCounts(nameMap);  }
         
         //build the pGroups in non leaf nodes to be used in the parsimony calcs.
                for (int i = numLeaves; i < numNodes; i++) {
diff --git a/tree.h b/tree.h
index 40ae14f8615651693985b8f7249c4098cde7f07b..03da5f6841d9f30cef2f946d6c9ecbc1d5c58cc0 100644 (file)
--- a/tree.h
+++ b/tree.h
@@ -24,7 +24,7 @@ public:
        
     TreeMap* getTreeMap() { return tmap; }
        void getCopy(Tree*);  //makes tree a copy of the one passed in.
-    void getCopy(Tree* copy, map<string, string>, vector<string>); //makes a copy of the tree passed in, but everyone who is not in the vector<string> has group set to doNotIncludeMe. Assumes the tmap already has these seqs group set to doNotIncludeMe.
+    void getCopy(Tree* copy, map<string, string>); //makes a copy of the tree structure passed in, (just parents, children and br). Used with the Tree(TreeMap*) constructor. Assumes the tmap already has set seqs groups you want.  Used by subsample to reassign seqs you don't want included to group "doNotIncludeMe".
        void getSubTree(Tree*, vector<string>);  //makes tree a that contains only the names passed in.
     int getSubTree(Tree* originalToCopy, vector<string> seqToInclude, map<string, string> nameMap);  //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. 
     
index db5cf90cf56ad18ad1f5fd80cb6ee5b79c1aaa32..318409fa8845f91fcaae0765a6f930626299e82d 100644 (file)
@@ -342,23 +342,23 @@ int UnifracUnweightedCommand::execute() {
                        
                        if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } return 0;  }
             
+            int startSubsample = time(NULL);
+            
             //subsample loop
             vector< vector<double> > calcDistsTotals;  //each iter, each groupCombos dists. this will be used to make .dist files
             for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f.
-                
                 if (m->control_pressed) { break; }
                 
                 //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
                 TreeMap* newTmap = new TreeMap();
-                newTmap->getCopy(*tmap);
+                //newTmap->getCopy(*tmap);
                 
-                SubSample sample;
-                Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
+                //SubSample sample;
+                //Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
                 
                 //uses method of setting groups to doNotIncludeMe
-                //SubSample sample;
-                //Tree* newTree2 = sample.getSample(T[i], newTmap, nameMap, subsampleSize, unique2Dup);
-                // newTree2->print(cout);
+                SubSample sample;
+                Tree* subSampleTree = sample.getSample(T[i], tmap, newTmap, subsampleSize, unique2Dup);
                 
                 //call new weighted function
                 vector<double> iterData; iterData.resize(numComp,0);
@@ -373,6 +373,7 @@ int UnifracUnweightedCommand::execute() {
                 
                 if((thisIter+1) % 100 == 0){   m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine();              }
             }
+            m->mothurOut("It took " + toString(time(NULL) - startSubsample) + " secs to run the subsampling."); m->mothurOutEndLine();
             
             if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0;  }
 
index 633cb643a213044e4f7e92c108ad012690deff09..1df79888eb8bdacb442b9dc2c89c51a7d4fa75ff 100644 (file)
@@ -237,6 +237,7 @@ int UnifracWeightedCommand::execute() {
         T = reader->getTrees();
         tmap = T[0]->getTreeMap();
         map<string, string> nameMap = reader->getNames();
+        map<string, string> unique2Dup = reader->getNameMap();
         delete reader;
     
         if (m->control_pressed) {  delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
@@ -336,11 +337,15 @@ int UnifracWeightedCommand::execute() {
                 
                 //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
                 TreeMap* newTmap = new TreeMap();
-                newTmap->getCopy(*tmap);
+                //newTmap->getCopy(*tmap);
                 
+                //SubSample sample;
+               //Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
+                
+                //uses method of setting groups to doNotIncludeMe
                 SubSample sample;
-                Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
-                   
+                Tree* subSampleTree = sample.getSample(T[i], tmap, newTmap, subsampleSize, unique2Dup);
+
                 //call new weighted function
                 vector<double> iterData; iterData.resize(numComp,0);
                 Weighted thisWeighted(includeRoot);