From 5cc45e630ccee2c3503e1bfe7e9b3524898cef8a Mon Sep 17 00:00:00 2001 From: westcott Date: Fri, 8 Apr 2011 15:40:16 +0000 Subject: [PATCH] fixed sub.sample so that it will eliminate samples with abundances that are less than the size selected --- formatcolumn.cpp | 4 +- hcluster.cpp | 8 +-- heatmapsimcommand.cpp | 4 +- readblast.cpp | 12 ++--- readcluster.cpp | 4 +- readcluster.h | 4 +- readcolumn.cpp | 17 ++----- subsamplecommand.cpp | 112 +++++++++++++++++++++++++----------------- 8 files changed, 90 insertions(+), 75 deletions(-) diff --git a/formatcolumn.cpp b/formatcolumn.cpp index 2bbcf51..4ce73b3 100644 --- a/formatcolumn.cpp +++ b/formatcolumn.cpp @@ -45,8 +45,8 @@ int FormatColumnMatrix::read(NameAssignment* nameMap){ map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } if (distance == -1) { distance = 1000000; } diff --git a/hcluster.cpp b/hcluster.cpp index 88cba6e..8a596f3 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -396,8 +396,8 @@ vector HCluster::getSeqsFNNN(){ map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } //using cutoff if (distance > cutoff) { break; } @@ -760,8 +760,8 @@ int HCluster::processFile() { map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } //using cutoff if (distance > cutoff) { break; } diff --git a/heatmapsimcommand.cpp b/heatmapsimcommand.cpp index 0ba5ef1..dd8fa4c 100644 --- a/heatmapsimcommand.cpp +++ b/heatmapsimcommand.cpp @@ -512,8 +512,8 @@ int HeatMapSimCommand::runCommandDist() { map::iterator itA = nameMap->find(first); map::iterator itB = nameMap->find(second); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << first << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << second << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + first + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + second + "' was not found in the names file, please correct\n"); exit(1); } //save distance matrix[itA->second][itB->second] = dist; diff --git a/readblast.cpp b/readblast.cpp index 1efaf5b..66f1db2 100644 --- a/readblast.cpp +++ b/readblast.cpp @@ -90,8 +90,8 @@ int ReadBlast::read(NameAssignment* nameMap) { //convert name to number map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } thisRowsBlastScores[itB->second] = score; @@ -143,8 +143,8 @@ int ReadBlast::read(NameAssignment* nameMap) { //convert name to number map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } //save score thisRowsBlastScores[itB->second] = score; @@ -210,8 +210,8 @@ int ReadBlast::read(NameAssignment* nameMap) { //convert name to number map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } thisRowsBlastScores[itB->second] = score; diff --git a/readcluster.cpp b/readcluster.cpp index 497fc80..d822710 100644 --- a/readcluster.cpp +++ b/readcluster.cpp @@ -22,7 +22,7 @@ ReadCluster::ReadCluster(string distfile, float c, string o, bool s){ /***********************************************************************/ -int ReadCluster::read(NameAssignment* nameMap){ +int ReadCluster::read(NameAssignment*& nameMap){ try { if (format == "phylip") { convertPhylip2Column(nameMap); } @@ -43,7 +43,7 @@ int ReadCluster::read(NameAssignment* nameMap){ } /***********************************************************************/ -int ReadCluster::convertPhylip2Column(NameAssignment* nameMap){ +int ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){ try { //convert phylip file to column file map rowToName; diff --git a/readcluster.h b/readcluster.h index e4d3e4c..eaf6b2d 100644 --- a/readcluster.h +++ b/readcluster.h @@ -23,7 +23,7 @@ class ReadCluster { public: ReadCluster(string, float, string, bool); ~ReadCluster(); - int read(NameAssignment*); + int read(NameAssignment*&); string getOutputFile() { return OutPutFile; } void setFormat(string f) { format = f; } ListVector* getListVector() { return list; } @@ -36,7 +36,7 @@ private: MothurOut* m; bool sortWanted; - int convertPhylip2Column(NameAssignment*); + int convertPhylip2Column(NameAssignment*&); }; /******************************************************/ diff --git a/readcolumn.cpp b/readcolumn.cpp index f6f26d5..53a8c42 100644 --- a/readcolumn.cpp +++ b/readcolumn.cpp @@ -55,13 +55,8 @@ int ReadColumnMatrix::read(NameAssignment* nameMap){ map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ - cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); - } - if(itB == nameMap->end()){ - cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); - } -//if (((itA->second == 8) && (itB->second == 1588)) || ((itA->second == 1588) && (itB->second == 8))) { cout << "found it" << endl; } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } if (distance == -1) { distance = 1000000; } else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. @@ -117,12 +112,8 @@ int ReadColumnMatrix::read(NameAssignment* nameMap){ map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ - cerr << "BError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; - } - if(itB == nameMap->end()){ - cerr << "BError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; - } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } if (distance == -1) { distance = 1000000; } else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index 3b89c6d..a055e81 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -356,13 +356,6 @@ int SubSampleCommand::getSubSampleFasta() { if (m->control_pressed) { return 0; } - string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "subsample" + m->getExtension(fastafile); - - ofstream out; - m->openOutputFile(outputFileName, out); - outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); //make sure that if your picked groups size is not too big int thisSize = names.size(); @@ -375,13 +368,14 @@ int SubSampleCommand::getSubSampleFasta() { if (thisSize < size) { size = thisSize; } } }else { //make sure size is not too large - int smallestSize = groupMap->getNumSeqs(Groups[0]); - for (int i = 1; i < Groups.size(); i++) { + vector newGroups; + for (int i = 0; i < Groups.size(); i++) { int thisSize = groupMap->getNumSeqs(Groups[i]); - if (thisSize < smallestSize) { smallestSize = thisSize; } + if (thisSize >= size) { newGroups.push_back(Groups[i]); } + else { m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); } } - if (smallestSize < size) { size = smallestSize; m->mothurOut("You have selected a size that is larger than your smallest sample, using your samllest sample size, " + toString(smallestSize) + "."); m->mothurOutEndLine(); } + Groups = newGroups; } m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); @@ -477,6 +471,17 @@ int SubSampleCommand::getSubSampleFasta() { } } } + + if (subset.size() == 0) { m->mothurOut("The size you selected is too large, skipping fasta file."); m->mothurOutEndLine(); return 0; } + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "subsample" + m->getExtension(fastafile); + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); + //read through fasta file outputting only the names on the subsample list ifstream in; m->openInputFile(fastafile, in); @@ -644,14 +649,6 @@ int SubSampleCommand::readNames() { int SubSampleCommand::getSubSampleShared() { try { - string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile); - - ofstream out; - m->openOutputFile(outputFileName, out); - outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); - InputData* input = new InputData(sharedfile, "sharedfile"); vector lookup = input->getSharedRAbundVectors(); string lastLabel = lookup[0]->getLabel(); @@ -667,9 +664,34 @@ int SubSampleCommand::getSubSampleShared() { if (thisSize < size) { size = thisSize; } } + }else { + m->Groups.clear(); + vector temp; + for (int i = 0; i < lookup.size(); i++) { + if (lookup[i]->getNumSeqs() < size) { + m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete lookup[i]; + }else { + m->Groups.push_back(lookup[i]->getGroup()); + temp.push_back(lookup[i]); + } + } + lookup = temp; + Groups = m->Groups; } - m->mothurOut("Sampling " + toString(size) + " from " + toString(lookup[0]->getNumSeqs()) + "."); m->mothurOutEndLine(); + if (lookup.size() == 0) { m->mothurOut("The size you selected is too large, skipping shared file."); m->mothurOutEndLine(); delete input; return 0; } + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); + + + m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); //as long as you are not at the end of the file or done wih the lines you want while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { @@ -874,13 +896,14 @@ int SubSampleCommand::getSubSampleList() { if (thisSize < size) { size = thisSize; } } }else { //make sure size is not too large - int smallestSize = groupMap->getNumSeqs(Groups[0]); - for (int i = 1; i < Groups.size(); i++) { + vector newGroups; + for (int i = 0; i < Groups.size(); i++) { int thisSize = groupMap->getNumSeqs(Groups[i]); - if (thisSize < smallestSize) { smallestSize = thisSize; } + if (thisSize >= size) { newGroups.push_back(Groups[i]); } + else { m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); } } - if (smallestSize < size) { size = smallestSize; m->mothurOut("You have selected a size that is larger than your smallest sample, using your samllest sample size, " + toString(smallestSize) + "."); m->mothurOutEndLine(); } + Groups = newGroups; } m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); @@ -1151,15 +1174,6 @@ int SubSampleCommand::processList(ListVector*& list, ofstream& out, set& //********************************************************************************************************************** int SubSampleCommand::getSubSampleRabund() { try { - - string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(rabundfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(rabundfile)) + "subsample" + m->getExtension(rabundfile); - - ofstream out; - m->openOutputFile(outputFileName, out); - outputTypes["rabund"].push_back(outputFileName); outputNames.push_back(outputFileName); - InputData* input = new InputData(rabundfile, "rabund"); RAbundVector* rabund = input->getRAbundVector(); string lastLabel = rabund->getLabel(); @@ -1170,10 +1184,18 @@ int SubSampleCommand::getSubSampleRabund() { if (size == 0) { //user has not set size, set size = 10% size = int((rabund->getNumSeqs()) * 0.10); - } + }else if (size > rabund->getNumSeqs()) { m->mothurOut("The size you selected is too large, skipping rabund file."); m->mothurOutEndLine(); delete input; delete rabund; return 0; } m->mothurOut("Sampling " + toString(size) + " from " + toString(rabund->getNumSeqs()) + "."); m->mothurOutEndLine(); + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(rabundfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(rabundfile)) + "subsample" + m->getExtension(rabundfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["rabund"].push_back(outputFileName); outputNames.push_back(outputFileName); + //as long as you are not at the end of the file or done wih the lines you want while((rabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { if (m->control_pressed) { delete input; delete rabund; out.close(); return 0; } @@ -1308,15 +1330,7 @@ int SubSampleCommand::processRabund(RAbundVector*& rabund, ofstream& out) { //********************************************************************************************************************** int SubSampleCommand::getSubSampleSabund() { try { - - string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(sabundfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sabundfile)) + "subsample" + m->getExtension(sabundfile); - - ofstream out; - m->openOutputFile(outputFileName, out); - outputTypes["sabund"].push_back(outputFileName); outputNames.push_back(outputFileName); - + InputData* input = new InputData(sabundfile, "sabund"); SAbundVector* sabund = input->getSAbundVector(); string lastLabel = sabund->getLabel(); @@ -1327,10 +1341,20 @@ int SubSampleCommand::getSubSampleSabund() { if (size == 0) { //user has not set size, set size = 10% size = int((sabund->getNumSeqs()) * 0.10); - } + }else if (size > sabund->getNumSeqs()) { m->mothurOut("The size you selected is too large, skipping sabund file."); m->mothurOutEndLine(); delete input; delete sabund; return 0; } + m->mothurOut("Sampling " + toString(size) + " from " + toString(sabund->getNumSeqs()) + "."); m->mothurOutEndLine(); + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(sabundfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sabundfile)) + "subsample" + m->getExtension(sabundfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["sabund"].push_back(outputFileName); outputNames.push_back(outputFileName); + + //as long as you are not at the end of the file or done wih the lines you want while((sabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { if (m->control_pressed) { delete input; delete sabund; out.close(); return 0; } -- 2.39.2