X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=subsamplecommand.cpp;h=717b1d3231c20368a2d23e9b86d52a6707f03b12;hb=2c97dd48b8e27ee0a6a86c7a082f4c504c3357c6;hp=651ee028426f2713954557c4cdaf44af29112104;hpb=55386dddad84cc1140d736cabaf4dd0ae16f2e01;p=mothur.git diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index 651ee02..717b1d3 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -9,6 +9,8 @@ #include "subsamplecommand.h" #include "sharedutilities.h" +#include "deconvolutecommand.h" +#include "subsample.h" //********************************************************************************************************************** vector SubSampleCommand::setParameters(){ @@ -233,7 +235,7 @@ SubSampleCommand::SubSampleCommand(string option) { } string temp = validParameter.validFile(parameters, "size", false); if (temp == "not found"){ temp = "0"; } - convert(temp, size); + m->mothurConvert(temp, size); temp = validParameter.validFile(parameters, "persample", false); if (temp == "not found"){ temp = "f"; } persample = m->isTrue(temp); @@ -254,6 +256,10 @@ SubSampleCommand::SubSampleCommand(string option) { if ((groupfile != "") && ((fastafile != "") && (listfile != ""))) { m->mothurOut("A new group file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; } + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } @@ -423,60 +429,49 @@ int SubSampleCommand::getSubSampleFasta() { set subset; //dont want repeat sequence names added if (persample) { - for (int i = 0; i < Groups.size(); i++) { - - //randomly select a subset of those names from this group to include in the subsample - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { return 0; } + //initialize counts + map groupCounts; + map::iterator itGroupCounts; + for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } + + for (int j = 0; j < names.size(); j++) { - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - if (subset.count(names[myrand]) == 0) { - - string group = groupMap->getGroup(names[myrand]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (group == Groups[i]) { subset.insert(names[myrand]); break; } - } + if (m->control_pressed) { return 0; } + + string group = groupMap->getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + else{ + itGroupCounts = groupCounts.find(group); + if (itGroupCounts != groupCounts.end()) { + if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } } - } + } } }else { //randomly select a subset of those names to include in the subsample - for (int j = 0; j < size; j++) { + //since names was randomly shuffled just grab the next one + for (int j = 0; j < names.size(); j++) { if (m->control_pressed) { return 0; } - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); + if (groupfile != "") { //if there is a groupfile given fill in group info + string group = groupMap->getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - if (subset.count(names[myrand]) == 0) { - - if (groupfile != "") { //if there is a groupfile given fill in group info - string group = groupMap->getGroup(names[myrand]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups - if (m->inUsersGroups(group, Groups)) { - subset.insert(names[myrand]); break; - } - }else{ - subset.insert(names[myrand]); break; - } - }else{ //save everyone, group - subset.insert(names[myrand]); break; - } + if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups + if (m->inUsersGroups(group, Groups)) { + subset.insert(names[j]); + } + }else{ + subset.insert(names[j]); } - } + }else{ //save everyone, group + subset.insert(names[j]); + } + + //do we have enough?? + if (subset.size() == size) { break; } } } @@ -488,7 +483,6 @@ int SubSampleCommand::getSubSampleFasta() { ofstream out; m->openOutputFile(outputFileName, out); - outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); //read through fasta file outputting only the names on the subsample list ifstream in; @@ -531,6 +525,34 @@ int SubSampleCommand::getSubSampleFasta() { m->mothurOut("[ERROR]: The subset selected contained " + toString(subset.size()) + " sequences, but I only found " + toString(count) + " of those in the fastafile."); m->mothurOutEndLine(); } + if (namefile != "") { + m->mothurOut("Deconvoluting subsampled fasta file... "); m->mothurOutEndLine(); + + //use unique.seqs to create new name and fastafile + string inputString = "fasta=" + outputFileName; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* uniqueCommand = new DeconvoluteCommand(inputString); + uniqueCommand->execute(); + + map > filenames = uniqueCommand->getOutputFiles(); + + delete uniqueCommand; + m->mothurCalling = false; + + outputTypes["name"].push_back(filenames["name"][0]); outputNames.push_back(filenames["name"][0]); + m->mothurRemove(outputFileName); + outputFileName = filenames["fasta"][0]; + + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + m->mothurOut("Done."); m->mothurOutEndLine(); + } + + outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); + //if a groupfile is provided read through the group file only outputting the names on the subsample list if (groupfile != "") { @@ -780,67 +802,28 @@ int SubSampleCommand::processShared(vector& thislookup) { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + ".subsample" + m->getExtension(sharedfile); - - - ofstream out; + + SubSample sample; + vector subsampledLabels = sample.getSample(thislookup, size); + + if (m->control_pressed) { return 0; } + + ofstream out; m->openOutputFile(outputFileName, out); outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); - int numBins = thislookup[0]->getNumBins(); - for (int i = 0; i < thislookup.size(); i++) { - int thisSize = thislookup[i]->getNumSeqs(); - - if (thisSize != size) { - - string thisgroup = thislookup[i]->getGroup(); - - OrderVector* order = new OrderVector(); - for(int p=0;pgetAbundance(p);j++){ - order->push_back(p); - } - } - random_shuffle(order->begin(), order->end()); - - SharedRAbundVector* temp = new SharedRAbundVector(numBins); - temp->setLabel(thislookup[i]->getLabel()); - temp->setGroup(thislookup[i]->getGroup()); - - delete thislookup[i]; - thislookup[i] = temp; - - - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { delete order; out.close(); return 0; } - - //get random number to sample from order between 0 and thisSize-1. - int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - int bin = order->get(myrand); - - int abund = thislookup[i]->getAbundance(bin); - thislookup[i]->set(bin, (abund+1), thisgroup); - } - delete order; - } - } - - //subsampling may have created some otus with no sequences in them - eliminateZeroOTUS(thislookup); - - if (m->control_pressed) { out.close(); return 0; } - + m->currentBinLabels = subsampledLabels; + thislookup[0]->printHeaders(out); for (int i = 0; i < thislookup.size(); i++) { out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t'; thislookup[i]->print(out); } - out.close(); - - //save mothurOut's binLabels to restore for next label + + + //save mothurOut's binLabels to restore for next label m->currentBinLabels = saveBinLabels; return 0; @@ -1011,37 +994,30 @@ int SubSampleCommand::getSubSampleList() { //randomly select a subset of those names to include in the subsample set subset; //dont want repeat sequence names added if (persample) { - for (int i = 0; i < Groups.size(); i++) { + //initialize counts + map groupCounts; + map::iterator itGroupCounts; + for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } + + for (int j = 0; j < names.size(); j++) { - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { break; } - - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(names.size()) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - if (subset.count(names[myrand]) == 0) { //you are not already added - if (groupMap->getGroup(names[myrand]) == Groups[i]) { subset.insert(names[myrand]); break; } - } + if (m->control_pressed) { return 0; } + + string group = groupMap->getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + else{ + itGroupCounts = groupCounts.find(group); + if (itGroupCounts != groupCounts.end()) { + if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } } - } + } } }else{ for (int j = 0; j < size; j++) { if (m->control_pressed) { break; } - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(names.size()) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - if (subset.count(names[myrand]) == 0) { subset.insert(names[myrand]); break; } - } + subset.insert(names[j]); } } @@ -1322,10 +1298,7 @@ int SubSampleCommand::processRabund(RAbundVector*& rabund, ofstream& out) { if (m->control_pressed) { delete order; return 0; } - //get random number to sample from order between 0 and thisSize-1. - int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - int bin = order->get(myrand); + int bin = order->get(j); int abund = rabund->get(bin); rabund->set(bin, (abund+1)); @@ -1484,10 +1457,7 @@ int SubSampleCommand::processSabund(SAbundVector*& sabund, ofstream& out) { if (m->control_pressed) { delete order; return 0; } - //get random number to sample from order between 0 and thisSize-1. - int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - int bin = order->get(myrand); + int bin = order->get(j); int abund = rabund->get(bin); rabund->set(bin, (abund+1)); @@ -1514,57 +1484,6 @@ int SubSampleCommand::processSabund(SAbundVector*& sabund, ofstream& out) { } } //********************************************************************************************************************** -int SubSampleCommand::eliminateZeroOTUS(vector& thislookup) { - try { - - vector newLookup; - for (int i = 0; i < thislookup.size(); i++) { - SharedRAbundVector* temp = new SharedRAbundVector(); - temp->setLabel(thislookup[i]->getLabel()); - temp->setGroup(thislookup[i]->getGroup()); - newLookup.push_back(temp); - } - - //for each bin - vector newBinLabels; - for (int i = 0; i < thislookup[0]->getNumBins(); i++) { - if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } - - //look at each sharedRabund and make sure they are not all zero - bool allZero = true; - for (int j = 0; j < thislookup.size(); j++) { - if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; } - } - - //if they are not all zero add this bin - if (!allZero) { - for (int j = 0; j < thislookup.size(); j++) { - newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup()); - } - //if there is a bin label use it otherwise make one - string binLabel = "Otu" + toString(i+1); - if (i < m->currentBinLabels.size()) { binLabel = m->currentBinLabels[i]; } - - newBinLabels.push_back(binLabel); - } - } - - for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; } - thislookup.clear(); - - thislookup = newLookup; - m->currentBinLabels = newBinLabels; - - return 0; - - } - catch(exception& e) { - m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS"); - exit(1); - } -} - -//**********************************************************************************************************************