git.donarmstrong.com Git - mothur.git/commitdiff
fixed sub.sample so that it will eliminate samples with abundances that are less...
author westcott <westcott>
Fri, 8 Apr 2011 15:40:16 +0000 (15:40 +0000)
committer westcott <westcott>
Fri, 8 Apr 2011 15:40:16 +0000 (15:40 +0000)
formatcolumn.cpp
hcluster.cpp
heatmapsimcommand.cpp
readblast.cpp
readcluster.cpp
readcluster.h
readcolumn.cpp
subsamplecommand.cpp

index 2bbcf518b864aa5a5656a271f2922c531e74af11..4ce73b33186425cc9cb62add6258fe9663328513 100644 (file)
@@ -45,8 +45,8 @@ int FormatColumnMatrix::read(NameAssignment* nameMap){
        
                        map<string,int>::iterator itA = nameMap->find(firstName);
                        map<string,int>::iterator itB = nameMap->find(secondName);
-                       if(itA == nameMap->end()){      cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);   }
-                       if(itB == nameMap->end()){      cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                       if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                       if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
 
                        if (distance == -1) { distance = 1000000; }
                
index 88cba6ecce59b98c874a119c84c494aa1e152156..8a596f3163c06dd55129cdc8de0f5779a5d6cb4d 100644 (file)
@@ -396,8 +396,8 @@ vector<seqDist> HCluster::getSeqsFNNN(){
                        
                        map<string,int>::iterator itA = nameMap->find(firstName);
                        map<string,int>::iterator itB = nameMap->find(secondName);
-                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
-                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                       if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                       if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
                
                        //using cutoff
                        if (distance > cutoff) { break; }
@@ -760,8 +760,8 @@ int HCluster::processFile() {
                        
                        map<string,int>::iterator itA = nameMap->find(firstName);
                        map<string,int>::iterator itB = nameMap->find(secondName);
-                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
-                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                       if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                       if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
                
                        //using cutoff
                        if (distance > cutoff) { break; }
index 0ba5ef1f28261307e91d678a73b4993ff38fa0ec..dd8fa4c7ecf647f4cc3ab9b8657cb7c3f019d1ac 100644 (file)
@@ -512,8 +512,8 @@ int HeatMapSimCommand::runCommandDist() {
                                map<string, int>::iterator itA = nameMap->find(first);
                                map<string, int>::iterator itB = nameMap->find(second);
                                
-                               if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << first << "' was not found in the names file, please correct\n"; exit(1);  }
-                               if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << second << "' was not found in the names file, please correct\n"; exit(1);  }
+                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + first + "' was not found in the names file, please correct\n"); exit(1);  }
+                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + second + "' was not found in the names file, please correct\n"); exit(1);  }
                                
                                //save distance
                                matrix[itA->second][itB->second] = dist;
index 1efaf5b5e3526b8ec53ff3651f9de4b540720b7e..66f1db23d633eb83b3706bd15f6deb48e8c7cb5b 100644 (file)
@@ -90,8 +90,8 @@ int ReadBlast::read(NameAssignment* nameMap) {
                                //convert name to number
                                map<string,int>::iterator itA = nameMap->find(firstName);
                                map<string,int>::iterator itB = nameMap->find(secondName);
-                               if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
-                               if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
                                
                                thisRowsBlastScores[itB->second] = score;
                                
@@ -143,8 +143,8 @@ int ReadBlast::read(NameAssignment* nameMap) {
                                                //convert name to number
                                                map<string,int>::iterator itA = nameMap->find(firstName);
                                                map<string,int>::iterator itB = nameMap->find(secondName);
-                                               if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
-                                               if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
                                                
                                                //save score
                                                thisRowsBlastScores[itB->second] = score;
@@ -210,8 +210,8 @@ int ReadBlast::read(NameAssignment* nameMap) {
                                                //convert name to number
                                                map<string,int>::iterator itA = nameMap->find(firstName);
                                                map<string,int>::iterator itB = nameMap->find(secondName);
-                                               if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
-                                               if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
                                                
                                                thisRowsBlastScores[itB->second] = score;
                                                
index 497fc804faadc760bdcc1684f1a45a856ddd9ab1..d8227101b2c339b3904e95144d11f5cccb068dd3 100644 (file)
@@ -22,7 +22,7 @@ ReadCluster::ReadCluster(string distfile, float c, string o, bool s){
 
 /***********************************************************************/
 
-int ReadCluster::read(NameAssignment* nameMap){
+int ReadCluster::read(NameAssignment*& nameMap){
        try {
         
                if (format == "phylip") { convertPhylip2Column(nameMap); }
@@ -43,7 +43,7 @@ int ReadCluster::read(NameAssignment* nameMap){
 }
 /***********************************************************************/
 
-int ReadCluster::convertPhylip2Column(NameAssignment* nameMap){
+int ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){
        try {   
                //convert phylip file to column file
                map<int, string> rowToName;
index e4d3e4c240882410737b0c785fc04f5fd49432b2..eaf6b2dbb2bb422b67d6ffe55b3a1c854f7196d4 100644 (file)
@@ -23,7 +23,7 @@ class ReadCluster {
 public:
        ReadCluster(string, float, string, bool);
        ~ReadCluster();
-       int read(NameAssignment*);
+       int read(NameAssignment*&);
        string getOutputFile() { return OutPutFile; }
        void setFormat(string f) { format = f;  }
        ListVector* getListVector()             {       return list;    }
@@ -36,7 +36,7 @@ private:
        MothurOut* m;
        bool sortWanted;
        
-       int convertPhylip2Column(NameAssignment*);
+       int convertPhylip2Column(NameAssignment*&);
 };
 
 /******************************************************/
index f6f26d512f1a11dffa1de864e971d29dd919884c..53a8c4263dddc4687acc5a64cf7c5b1d27aac647 100644 (file)
@@ -55,13 +55,8 @@ int ReadColumnMatrix::read(NameAssignment* nameMap){
                        map<string,int>::iterator itA = nameMap->find(firstName);
                        map<string,int>::iterator itB = nameMap->find(secondName);
                                
-                       if(itA == nameMap->end()){
-                               cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);
-                       }
-                       if(itB == nameMap->end()){
-                               cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);
-                       }
-//if (((itA->second == 8) && (itB->second == 1588)) || ((itA->second == 1588) && (itB->second == 8))) { cout << "found it" << endl; }
+                       if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                       if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
 
                        if (distance == -1) { distance = 1000000; }
                        else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
@@ -117,12 +112,8 @@ int ReadColumnMatrix::read(NameAssignment* nameMap){
                                map<string,int>::iterator itA = nameMap->find(firstName);
                                map<string,int>::iterator itB = nameMap->find(secondName);
                                
-                               if(itA == nameMap->end()){
-                                       cerr << "BError: Sequence '" << firstName << "' was not found in the names file, please correct\n";
-                               }
-                               if(itB == nameMap->end()){
-                                       cerr << "BError: Sequence '" << secondName << "' was not found in the names file, please correct\n";
-                               }
+                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
+                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
                                
                                if (distance == -1) { distance = 1000000; }
                                else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
index 3b89c6d94eca1b6be43baef0da6ed3c9e286ed33..a055e81a1a1f6228ee069ad38760868c7cbaf082 100644 (file)
@@ -356,13 +356,6 @@ int SubSampleCommand::getSubSampleFasta() {
                
                if (m->control_pressed) { return 0; }
                
-               string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
-               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "subsample" + m->getExtension(fastafile);
-               
-               ofstream out;
-               m->openOutputFile(outputFileName, out);
-               outputTypes["fasta"].push_back(outputFileName);  outputNames.push_back(outputFileName);
                
                //make sure that if your picked groups size is not too big
                int thisSize = names.size();
@@ -375,13 +368,14 @@ int SubSampleCommand::getSubSampleFasta() {
                                        if (thisSize < size) {  size = thisSize;        }
                                }
                        }else { //make sure size is not too large
-                               int smallestSize = groupMap->getNumSeqs(Groups[0]);
-                               for (int i = 1; i < Groups.size(); i++) {
+                               vector<string> newGroups;
+                               for (int i = 0; i < Groups.size(); i++) {
                                        int thisSize = groupMap->getNumSeqs(Groups[i]);
                                        
-                                       if (thisSize < smallestSize) {  smallestSize = thisSize;        }
+                                       if (thisSize >= size) { newGroups.push_back(Groups[i]); }
+                                       else {  m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); }
                                }
-                               if (smallestSize < size) { size = smallestSize; m->mothurOut("You have selected a size that is larger than your smallest sample, using your samllest sample size, " + toString(smallestSize) + "."); m->mothurOutEndLine(); }
+                               Groups = newGroups;
                        }
                        
                        m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine();                        
@@ -477,6 +471,17 @@ int SubSampleCommand::getSubSampleFasta() {
                                }
                        }       
                }
+               
+               if (subset.size() == 0) {  m->mothurOut("The size you selected is too large, skipping fasta file."); m->mothurOutEndLine();  return 0; }
+               
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(fastafile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "subsample" + m->getExtension(fastafile);
+               
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               outputTypes["fasta"].push_back(outputFileName);  outputNames.push_back(outputFileName);
+               
                //read through fasta file outputting only the names on the subsample list
                ifstream in;
                m->openInputFile(fastafile, in);
@@ -644,14 +649,6 @@ int SubSampleCommand::readNames() {
 int SubSampleCommand::getSubSampleShared() {
        try {
                
-               string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir += m->hasPath(sharedfile);  }
-               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile);
-               
-               ofstream out;
-               m->openOutputFile(outputFileName, out);
-               outputTypes["shared"].push_back(outputFileName);  outputNames.push_back(outputFileName);
-               
                InputData* input = new InputData(sharedfile, "sharedfile");
                vector<SharedRAbundVector*> lookup = input->getSharedRAbundVectors();
                string lastLabel = lookup[0]->getLabel();
@@ -667,9 +664,34 @@ int SubSampleCommand::getSubSampleShared() {
                                
                                if (thisSize < size) {  size = thisSize;        }
                        }
+               }else {
+                       m->Groups.clear();
+                       vector<SharedRAbundVector*> temp;
+                       for (int i = 0; i < lookup.size(); i++) {
+                               if (lookup[i]->getNumSeqs() < size) { 
+                                       m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
+                                       delete lookup[i];
+                               }else { 
+                                       m->Groups.push_back(lookup[i]->getGroup()); 
+                                       temp.push_back(lookup[i]);
+                               }
+                       } 
+                       lookup = temp;
+                       Groups = m->Groups;
                }
                
-               m->mothurOut("Sampling " + toString(size) + " from " + toString(lookup[0]->getNumSeqs()) + "."); m->mothurOutEndLine();
+               if (lookup.size() == 0) {  m->mothurOut("The size you selected is too large, skipping shared file."); m->mothurOutEndLine(); delete input; return 0; }
+               
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(sharedfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile);
+               
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               outputTypes["shared"].push_back(outputFileName);  outputNames.push_back(outputFileName);
+               
+               
+               m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine();
                
                //as long as you are not at the end of the file or done wih the lines you want
                while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
@@ -874,13 +896,14 @@ int SubSampleCommand::getSubSampleList() {
                                        if (thisSize < size) {  size = thisSize;        }
                                }
                        }else { //make sure size is not too large
-                               int smallestSize = groupMap->getNumSeqs(Groups[0]);
-                               for (int i = 1; i < Groups.size(); i++) {
+                               vector<string> newGroups;
+                               for (int i = 0; i < Groups.size(); i++) {
                                        int thisSize = groupMap->getNumSeqs(Groups[i]);
                                        
-                                       if (thisSize < smallestSize) {  smallestSize = thisSize;        }
+                                       if (thisSize >= size) { newGroups.push_back(Groups[i]); }
+                                       else {  m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); }
                                }
-                               if (smallestSize < size) { size = smallestSize; m->mothurOut("You have selected a size that is larger than your smallest sample, using your samllest sample size, " + toString(smallestSize) + "."); m->mothurOutEndLine(); }
+                               Groups = newGroups;
                        }
                        
                        m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine();        
@@ -1151,15 +1174,6 @@ int SubSampleCommand::processList(ListVector*& list, ofstream& out, set<string>&
 //**********************************************************************************************************************
 int SubSampleCommand::getSubSampleRabund() {
        try {
-               
-               string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir += m->hasPath(rabundfile);  }
-               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(rabundfile)) + "subsample" + m->getExtension(rabundfile);
-               
-               ofstream out;
-               m->openOutputFile(outputFileName, out);
-               outputTypes["rabund"].push_back(outputFileName);  outputNames.push_back(outputFileName);
-               
                InputData* input = new InputData(rabundfile, "rabund");
                RAbundVector* rabund = input->getRAbundVector();
                string lastLabel = rabund->getLabel();
@@ -1170,10 +1184,18 @@ int SubSampleCommand::getSubSampleRabund() {
                
                if (size == 0) { //user has not set size, set size = 10%
                        size = int((rabund->getNumSeqs()) * 0.10);
-               }
+               }else if (size > rabund->getNumSeqs()) { m->mothurOut("The size you selected is too large, skipping rabund file."); m->mothurOutEndLine(); delete input; delete rabund; return 0; }
                
                m->mothurOut("Sampling " + toString(size) + " from " + toString(rabund->getNumSeqs()) + "."); m->mothurOutEndLine();
                
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(rabundfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(rabundfile)) + "subsample" + m->getExtension(rabundfile);
+               
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               outputTypes["rabund"].push_back(outputFileName);  outputNames.push_back(outputFileName);
+               
                //as long as you are not at the end of the file or done wih the lines you want
                while((rabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                        if (m->control_pressed) {  delete input; delete rabund; out.close(); return 0;  }
@@ -1308,15 +1330,7 @@ int SubSampleCommand::processRabund(RAbundVector*& rabund, ofstream& out) {
 //**********************************************************************************************************************
 int SubSampleCommand::getSubSampleSabund() {
        try {
-               
-               string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir += m->hasPath(sabundfile);  }
-               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sabundfile)) + "subsample" + m->getExtension(sabundfile);
-               
-               ofstream out;
-               m->openOutputFile(outputFileName, out);
-               outputTypes["sabund"].push_back(outputFileName);  outputNames.push_back(outputFileName);
-               
+                               
                InputData* input = new InputData(sabundfile, "sabund");
                SAbundVector* sabund = input->getSAbundVector();
                string lastLabel = sabund->getLabel();
@@ -1327,10 +1341,20 @@ int SubSampleCommand::getSubSampleSabund() {
                
                if (size == 0) { //user has not set size, set size = 10%
                        size = int((sabund->getNumSeqs()) * 0.10);
-               }
+               }else if (size > sabund->getNumSeqs()) { m->mothurOut("The size you selected is too large, skipping sabund file."); m->mothurOutEndLine(); delete input; delete sabund; return 0; }
+               
                
                m->mothurOut("Sampling " + toString(size) + " from " + toString(sabund->getNumSeqs()) + "."); m->mothurOutEndLine();
                
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(sabundfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sabundfile)) + "subsample" + m->getExtension(sabundfile);
+               
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               outputTypes["sabund"].push_back(outputFileName);  outputNames.push_back(outputFileName);
+               
+               
                //as long as you are not at the end of the file or done wih the lines you want
                while((sabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                        if (m->control_pressed) {  delete input; delete sabund; out.close(); return 0;  }