X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=subsamplecommand.cpp;h=c352feb099a83879bae9e1aca0f35d5bfe56e691;hb=2bb9267aa4b4ecdf8488b06605cc9f3f36fa4332;hp=73c1e5c76c48db67f0676ebb4a409aa0660b241d;hpb=4c16a1dac0538d5ba2ac925674747ab174612ab8;p=mothur.git diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index 73c1e5c..c352feb 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -9,6 +9,7 @@ #include "subsamplecommand.h" #include "sharedutilities.h" +#include "deconvolutecommand.h" //********************************************************************************************************************** vector SubSampleCommand::setParameters(){ @@ -88,6 +89,7 @@ SubSampleCommand::SubSampleCommand(string option) { //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} else { vector myArray = setParameters(); @@ -181,32 +183,38 @@ SubSampleCommand::SubSampleCommand(string option) { //check for required parameters listfile = validParameter.validFile(parameters, "list", true); if (listfile == "not open") { listfile = ""; abort = true; } - else if (listfile == "not found") { listfile = ""; } + else if (listfile == "not found") { listfile = ""; } + else { m->setListFile(listfile); } sabundfile = validParameter.validFile(parameters, "sabund", true); if (sabundfile == "not open") { sabundfile = ""; abort = true; } else if (sabundfile == "not found") { sabundfile = ""; } + else { m->setSabundFile(sabundfile); } rabundfile = validParameter.validFile(parameters, "rabund", true); if (rabundfile == "not open") { rabundfile = ""; abort = true; } else if (rabundfile == "not found") { rabundfile = ""; } + else { m->setRabundFile(rabundfile); } fastafile = validParameter.validFile(parameters, "fasta", true); if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } + else { m->setFastaFile(fastafile); } sharedfile = validParameter.validFile(parameters, "shared", true); if (sharedfile == "not open") { sharedfile = ""; abort = true; } else if (sharedfile == "not found") { sharedfile = ""; } + else { m->setSharedFile(sharedfile); } namefile = validParameter.validFile(parameters, "name", true); if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } + else { m->setNameFile(namefile); } groupfile = validParameter.validFile(parameters, "group", true); if (groupfile == "not open") { groupfile = ""; abort = true; } else if (groupfile == "not found") { groupfile = ""; } - + else { m->setGroupFile(groupfile); } //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -222,11 +230,11 @@ SubSampleCommand::SubSampleCommand(string option) { else { pickedGroups = true; m->splitAtDash(groups, Groups); - m->Groups = Groups; + m->setGroups(Groups); } string temp = validParameter.validFile(parameters, "size", false); if (temp == "not found"){ temp = "0"; } - convert(temp, size); + m->mothurConvert(temp, size); temp = validParameter.validFile(parameters, "persample", false); if (temp == "not found"){ temp = "f"; } persample = m->isTrue(temp); @@ -247,6 +255,10 @@ SubSampleCommand::SubSampleCommand(string option) { if ((groupfile != "") && ((fastafile != "") && (listfile != ""))) { m->mothurOut("A new group file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; } + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } @@ -263,19 +275,19 @@ int SubSampleCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } if (sharedfile != "") { getSubSampleShared(); } - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); return 0; } } if (listfile != "") { getSubSampleList(); } - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); return 0; } } if (rabundfile != "") { getSubSampleRabund(); } - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); return 0; } } if (sabundfile != "") { getSubSampleSabund(); } - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); return 0; } } if (fastafile != "") { getSubSampleFasta(); } - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); return 0; } } //set fasta file as new current fastafile string current = ""; @@ -342,7 +354,8 @@ int SubSampleCommand::getSubSampleFasta() { //takes care of user setting groupNames that are invalid or setting groups=all SharedUtil* util = new SharedUtil(); - util->setGroups(Groups, groupMap->namesOfGroups); + vector namesGroups = groupMap->getNamesOfGroups(); + util->setGroups(Groups, namesGroups); delete util; //file mismatch quit @@ -415,60 +428,49 @@ int SubSampleCommand::getSubSampleFasta() { set subset; //dont want repeat sequence names added if (persample) { - for (int i = 0; i < Groups.size(); i++) { - - //randomly select a subset of those names from this group to include in the subsample - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { return 0; } + //initialize counts + map groupCounts; + map::iterator itGroupCounts; + for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } + + for (int j = 0; j < names.size(); j++) { - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - if (subset.count(names[myrand]) == 0) { - - string group = groupMap->getGroup(names[myrand]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (group == Groups[i]) { subset.insert(names[myrand]); break; } - } + if (m->control_pressed) { return 0; } + + string group = groupMap->getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + else{ + itGroupCounts = groupCounts.find(group); + if (itGroupCounts != groupCounts.end()) { + if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } } - } + } } }else { //randomly select a subset of those names to include in the subsample - for (int j = 0; j < size; j++) { + //since names was randomly shuffled just grab the next one + for (int j = 0; j < names.size(); j++) { if (m->control_pressed) { return 0; } - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); + if (groupfile != "") { //if there is a groupfile given fill in group info + string group = groupMap->getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - if (subset.count(names[myrand]) == 0) { - - if (groupfile != "") { //if there is a groupfile given fill in group info - string group = groupMap->getGroup(names[myrand]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups - if (m->inUsersGroups(group, Groups)) { - subset.insert(names[myrand]); break; - } - }else{ - subset.insert(names[myrand]); break; - } - }else{ //save everyone, group - subset.insert(names[myrand]); break; - } + if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups + if (m->inUsersGroups(group, Groups)) { + subset.insert(names[j]); + } + }else{ + subset.insert(names[j]); } - } + }else{ //save everyone, group + subset.insert(names[j]); + } + + //do we have enough?? + if (subset.size() == size) { break; } } } @@ -480,7 +482,6 @@ int SubSampleCommand::getSubSampleFasta() { ofstream out; m->openOutputFile(outputFileName, out); - outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); //read through fasta file outputting only the names on the subsample list ifstream in; @@ -523,6 +524,32 @@ int SubSampleCommand::getSubSampleFasta() { m->mothurOut("[ERROR]: The subset selected contained " + toString(subset.size()) + " sequences, but I only found " + toString(count) + " of those in the fastafile."); m->mothurOutEndLine(); } + if (namefile != "") { + m->mothurOut("Deconvoluting subsampled fasta file... "); m->mothurOutEndLine(); + + //use unique.seqs to create new name and fastafile + string inputString = "fasta=" + outputFileName; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); + + Command* uniqueCommand = new DeconvoluteCommand(inputString); + uniqueCommand->execute(); + + map > filenames = uniqueCommand->getOutputFiles(); + + delete uniqueCommand; + + outputTypes["name"].push_back(filenames["name"][0]); outputNames.push_back(filenames["name"][0]); + m->mothurRemove(outputFileName); + outputFileName = filenames["fasta"][0]; + + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + m->mothurOut("Done."); m->mothurOutEndLine(); + } + + outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); + //if a groupfile is provided read through the group file only outputting the names on the subsample list if (groupfile != "") { @@ -665,43 +692,35 @@ int SubSampleCommand::getSubSampleShared() { if (thisSize < size) { size = thisSize; } } }else { - m->Groups.clear(); + m->clearGroups(); + Groups.clear(); vector temp; for (int i = 0; i < lookup.size(); i++) { if (lookup[i]->getNumSeqs() < size) { m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); delete lookup[i]; }else { - m->Groups.push_back(lookup[i]->getGroup()); + Groups.push_back(lookup[i]->getGroup()); temp.push_back(lookup[i]); } } lookup = temp; - Groups = m->Groups; + m->setGroups(Groups); } if (lookup.size() == 0) { m->mothurOut("The size you selected is too large, skipping shared file."); m->mothurOutEndLine(); delete input; return 0; } - string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile); - - ofstream out; - m->openOutputFile(outputFileName, out); - outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); - - m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); //as long as you are not at the end of the file or done wih the lines you want while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { - if (m->control_pressed) { delete input; for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; } out.close(); return 0; } + if (m->control_pressed) { delete input; for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; } return 0; } if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){ m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); - processShared(lookup, out); + processShared(lookup); processedLabels.insert(lookup[0]->getLabel()); userLabels.erase(lookup[0]->getLabel()); @@ -715,7 +734,7 @@ int SubSampleCommand::getSubSampleShared() { lookup = input->getSharedRAbundVectors(lastLabel); m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); - processShared(lookup, out); + processShared(lookup); processedLabels.insert(lookup[0]->getLabel()); userLabels.erase(lookup[0]->getLabel()); @@ -733,7 +752,7 @@ int SubSampleCommand::getSubSampleShared() { } - if (m->control_pressed) { out.close(); return 0; } + if (m->control_pressed) { return 0; } //output error messages about any remaining user labels set::iterator it; @@ -755,13 +774,12 @@ int SubSampleCommand::getSubSampleShared() { m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); - processShared(lookup, out); + processShared(lookup); for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } delete input; - out.close(); return 0; @@ -772,9 +790,21 @@ int SubSampleCommand::getSubSampleShared() { } } //********************************************************************************************************************** -int SubSampleCommand::processShared(vector& thislookup, ofstream& out) { +int SubSampleCommand::processShared(vector& thislookup) { try { + //save mothurOut's binLabels to restore for next label + vector saveBinLabels = m->currentBinLabels; + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + ".subsample" + m->getExtension(sharedfile); + + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); + int numBins = thislookup[0]->getNumBins(); for (int i = 0; i < thislookup.size(); i++) { int thisSize = thislookup[i]->getNumSeqs(); @@ -801,12 +831,13 @@ int SubSampleCommand::processShared(vector& thislookup, ofs for (int j = 0; j < size; j++) { - if (m->control_pressed) { delete order; return 0; } + if (m->control_pressed) { delete order; out.close(); return 0; } //get random number to sample from order between 0 and thisSize-1. - int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); + //don't need this because of the random shuffle above + //int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - int bin = order->get(myrand); + int bin = order->get(j); int abund = thislookup[i]->getAbundance(bin); thislookup[i]->set(bin, (abund+1), thisgroup); @@ -818,13 +849,20 @@ int SubSampleCommand::processShared(vector& thislookup, ofs //subsampling may have created some otus with no sequences in them eliminateZeroOTUS(thislookup); - if (m->control_pressed) { return 0; } + if (m->control_pressed) { out.close(); return 0; } + + thislookup[0]->printHeaders(out); for (int i = 0; i < thislookup.size(); i++) { out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t'; thislookup[i]->print(out); } + out.close(); + + //save mothurOut's binLabels to restore for next label + m->currentBinLabels = saveBinLabels; + return 0; } @@ -862,7 +900,8 @@ int SubSampleCommand::getSubSampleList() { //takes care of user setting groupNames that are invalid or setting groups=all SharedUtil* util = new SharedUtil(); - util->setGroups(Groups, groupMap->namesOfGroups); + vector namesGroups = groupMap->getNamesOfGroups(); + util->setGroups(Groups, namesGroups); delete util; //create outputfiles @@ -992,37 +1031,30 @@ int SubSampleCommand::getSubSampleList() { //randomly select a subset of those names to include in the subsample set subset; //dont want repeat sequence names added if (persample) { - for (int i = 0; i < Groups.size(); i++) { + //initialize counts + map groupCounts; + map::iterator itGroupCounts; + for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } + + for (int j = 0; j < names.size(); j++) { - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { break; } - - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(names.size()) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - if (subset.count(names[myrand]) == 0) { //you are not already added - if (groupMap->getGroup(names[myrand]) == Groups[i]) { subset.insert(names[myrand]); break; } - } + if (m->control_pressed) { return 0; } + + string group = groupMap->getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + else{ + itGroupCounts = groupCounts.find(group); + if (itGroupCounts != groupCounts.end()) { + if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } } - } + } } }else{ for (int j = 0; j < size; j++) { if (m->control_pressed) { break; } - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(names.size()) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - if (subset.count(names[myrand]) == 0) { subset.insert(names[myrand]); break; } - } + subset.insert(names[j]); } } @@ -1303,10 +1335,7 @@ int SubSampleCommand::processRabund(RAbundVector*& rabund, ofstream& out) { if (m->control_pressed) { delete order; return 0; } - //get random number to sample from order between 0 and thisSize-1. - int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - int bin = order->get(myrand); + int bin = order->get(j); int abund = rabund->get(bin); rabund->set(bin, (abund+1)); @@ -1465,10 +1494,7 @@ int SubSampleCommand::processSabund(SAbundVector*& sabund, ofstream& out) { if (m->control_pressed) { delete order; return 0; } - //get random number to sample from order between 0 and thisSize-1. - int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - int bin = order->get(myrand); + int bin = order->get(j); int abund = rabund->get(bin); rabund->set(bin, (abund+1)); @@ -1507,6 +1533,8 @@ int SubSampleCommand::eliminateZeroOTUS(vector& thislookup) } //for each bin + vector newBinLabels; + string snumBins = toString(thislookup[0]->getNumBins()); for (int i = 0; i < thislookup[0]->getNumBins(); i++) { if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } @@ -1521,6 +1549,17 @@ int SubSampleCommand::eliminateZeroOTUS(vector& thislookup) for (int j = 0; j < thislookup.size(); j++) { newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup()); } + //if there is a bin label use it otherwise make one + string binLabel = "Otu"; + string sbinNumber = toString(i+1); + if (sbinNumber.length() < snumBins.length()) { + int diff = snumBins.length() - sbinNumber.length(); + for (int h = 0; h < diff; h++) { binLabel += "0"; } + } + binLabel += sbinNumber; + if (i < m->currentBinLabels.size()) { binLabel = m->currentBinLabels[i]; } + + newBinLabels.push_back(binLabel); } } @@ -1528,6 +1567,7 @@ int SubSampleCommand::eliminateZeroOTUS(vector& thislookup) thislookup.clear(); thislookup = newLookup; + m->currentBinLabels = newBinLabels; return 0;