X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=subsamplecommand.cpp;h=3e954759b078ff9371f5c28b64b3de86ef3c89cc;hp=717b1d3231c20368a2d23e9b86d52a6707f03b12;hb=cf9987b67aa49777a4c91c2d21f96e58bf17aa82;hpb=0caf3fbabaa3ece404f8ce77f4c883dc5b1bf1dc diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index 717b1d3..3e95475 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -10,24 +10,27 @@ #include "subsamplecommand.h" #include "sharedutilities.h" #include "deconvolutecommand.h" +#include "getseqscommand.h" #include "subsample.h" //********************************************************************************************************************** vector SubSampleCommand::setParameters(){ try { - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter plist("list", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(plist); - CommandParameter pshared("shared", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(pshared); - CommandParameter prabund("rabund", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(prabund); - CommandParameter psabund("sabund", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(psabund); - CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); - CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); - CommandParameter psize("size", "Number", "", "0", "", "", "",false,false); parameters.push_back(psize); - CommandParameter ppersample("persample", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ppersample); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FLSSR", "none","fasta",false,false,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname); + CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptaxonomy); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup); + CommandParameter plist("list", "InputTypes", "", "", "none", "FLSSR", "none","list",false,false,true); parameters.push_back(plist); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "FLSSR", "none","shared",false,false,true); parameters.push_back(pshared); + CommandParameter prabund("rabund", "InputTypes", "", "", "none", "FLSSR", "none","rabund",false,false); parameters.push_back(prabund); + CommandParameter psabund("sabund", "InputTypes", "", "", "none", "FLSSR", "none","sabund",false,false); parameters.push_back(psabund); + CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); + CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups); + CommandParameter psize("size", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(psize); + CommandParameter ppersample("persample", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(ppersample); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -43,7 +46,7 @@ string SubSampleCommand::getHelpString(){ try { string helpString = ""; helpString += "The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n"; - helpString += "The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size, persample and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n"; + helpString += "The sub.sample command parameters are fasta, name, list, group, count, rabund, sabund, shared, taxonomy, groups, size, persample and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n"; helpString += "The namefile is only used with the fasta file, not with the listfile, because the list file should contain all sequences.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n"; helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n"; @@ -64,6 +67,29 @@ string SubSampleCommand::getHelpString(){ } } //********************************************************************************************************************** +string SubSampleCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "fasta") { pattern = "[filename],subsample,[extension]"; } + else if (type == "sabund") { pattern = "[filename],subsample,[extension]"; } + else if (type == "name") { pattern = "[filename],subsample,[extension]"; } + else if (type == "group") { pattern = "[filename],subsample,[extension]"; } + else if (type == "count") { pattern = "[filename],subsample,[extension]"; } + else if (type == "list") { pattern = "[filename],[distance],subsample,[extension]"; } + else if (type == "taxonomy") { pattern = "[filename],subsample,[extension]"; } + else if (type == "shared") { pattern = "[filename],[distance],subsample,[extension]"; } + else if (type == "rabund") { pattern = "[filename],subsample,[extension]"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** SubSampleCommand::SubSampleCommand(){ try { abort = true; calledHelp = true; @@ -76,6 +102,8 @@ SubSampleCommand::SubSampleCommand(){ outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; outputTypes["group"] = tempOutNames; + outputTypes["count"] = tempOutNames; + outputTypes["taxonomy"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "SubSampleCommand", "GetRelAbundCommand"); @@ -115,6 +143,8 @@ SubSampleCommand::SubSampleCommand(string option) { outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; outputTypes["group"] = tempOutNames; + outputTypes["count"] = tempOutNames; + outputTypes["taxonomy"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -179,6 +209,22 @@ SubSampleCommand::SubSampleCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } + + it = parameters.find("taxonomy"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["taxonomy"] = inputDir + it->second; } + } } //check for required parameters @@ -216,7 +262,28 @@ SubSampleCommand::SubSampleCommand(string option) { if (groupfile == "not open") { groupfile = ""; abort = true; } else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + + taxonomyfile = validParameter.validFile(parameters, "taxonomy", true); + if (taxonomyfile == "not open") { taxonomyfile = ""; abort = true; } + else if (taxonomyfile == "not found") { taxonomyfile = ""; } + else { m->setTaxonomyFile(taxonomyfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { + m->setCountTableFile(countfile); + ct.readTable(countfile, true, false); + } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... label = validParameter.validFile(parameters, "label", false); @@ -240,26 +307,37 @@ SubSampleCommand::SubSampleCommand(string option) { temp = validParameter.validFile(parameters, "persample", false); if (temp == "not found"){ temp = "f"; } persample = m->isTrue(temp); - if (groupfile == "") { persample = false; } + if ((groupfile == "") && (countfile == "")) { persample = false; } + if (countfile != "") { + if (!ct.hasGroupInfo()) { + persample = false; + if (pickedGroups) { m->mothurOut("You cannot pick groups without group info in your count file."); m->mothurOutEndLine(); abort = true; } + } + } - if ((namefile != "") && (fastafile == "")) { m->mothurOut("You may only use a namefile with a fastafile."); m->mothurOutEndLine(); abort = true; } + if ((namefile != "") && ((fastafile == "") && (taxonomyfile == ""))) { m->mothurOut("You may only use a name file with a fasta file or taxonomy file."); m->mothurOutEndLine(); abort = true; } + + if ((taxonomyfile != "") && ((fastafile == "") && (listfile == ""))) { m->mothurOut("You may only use a taxonomyfile with a fastafile or listfile."); m->mothurOutEndLine(); abort = true; } + if ((fastafile == "") && (listfile == "") && (sabundfile == "") && (rabundfile == "") && (sharedfile == "")) { m->mothurOut("You must provide a fasta, list, sabund, rabund or shared file as an input file."); m->mothurOutEndLine(); abort = true; } - if (pickedGroups && ((groupfile == "") && (sharedfile == ""))) { - m->mothurOut("You cannot pick groups without a valid group file or shared file."); m->mothurOutEndLine(); abort = true; } + if (pickedGroups && ((groupfile == "") && (sharedfile == "") && (countfile == ""))) { + m->mothurOut("You cannot pick groups without a valid group, count or shared file."); m->mothurOutEndLine(); abort = true; } - if ((groupfile != "") && ((fastafile == "") && (listfile == ""))) { - m->mothurOut("Group file only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; } + if (((groupfile != "") || (countfile != "")) && ((fastafile == "") && (listfile == ""))) { + m->mothurOut("Group or count files are only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; } - if ((groupfile != "") && ((fastafile != "") && (listfile != ""))) { - m->mothurOut("A new group file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; } + if (((groupfile != "") || (countfile != "")) && ((fastafile != "") && (listfile != ""))) { + m->mothurOut("A new group or count file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; } - if ((fastafile != "") && (namefile == "")) { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -326,7 +404,16 @@ int SubSampleCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + itTypes = outputTypes.find("taxonomy"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } + } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); @@ -347,49 +434,67 @@ int SubSampleCommand::getSubSampleFasta() { if (namefile != "") { readNames(); } //fills names with all names in namefile. else { getNames(); }//no name file, so get list of names to pick from - GroupMap* groupMap; + GroupMap groupMap; if (groupfile != "") { - - groupMap = new GroupMap(groupfile); - groupMap->readMap(); + groupMap.readMap(groupfile); //takes care of user setting groupNames that are invalid or setting groups=all - SharedUtil* util = new SharedUtil(); - vector namesGroups = groupMap->getNamesOfGroups(); - util->setGroups(Groups, namesGroups); - delete util; + SharedUtil util; + vector namesGroups = groupMap.getNamesOfGroups(); + util.setGroups(Groups, namesGroups); //file mismatch quit - if (names.size() != groupMap->getNumSeqs()) { - m->mothurOut("[ERROR]: your fasta file contains " + toString(names.size()) + " sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); + if (names.size() != groupMap.getNumSeqs()) { + m->mothurOut("[ERROR]: your fasta file contains " + toString(names.size()) + " sequences, and your groupfile contains " + toString(groupMap.getNumSeqs()) + ", please correct."); m->mothurOutEndLine(); - delete groupMap; return 0; } - } + }else if (countfile != "") { + if (ct.hasGroupInfo()) { + SharedUtil util; + vector namesGroups = ct.getNamesOfGroups(); + util.setGroups(Groups, namesGroups); + } + + //file mismatch quit + if (names.size() != ct.getNumUniqueSeqs()) { + m->mothurOut("[ERROR]: your fasta file contains " + toString(names.size()) + " sequences, and your count file contains " + toString(ct.getNumUniqueSeqs()) + " unique sequences, please correct."); + m->mothurOutEndLine(); + return 0; + } + } if (m->control_pressed) { return 0; } - //make sure that if your picked groups size is not too big - int thisSize = names.size(); + int thisSize = 0; + if (countfile == "") { thisSize = names.size(); } + else { thisSize = ct. getNumSeqs(); } //all seqs not just unique + if (persample) { if (size == 0) { //user has not set size, set size = smallest samples size - size = groupMap->getNumSeqs(Groups[0]); + if (countfile == "") { size = groupMap.getNumSeqs(Groups[0]); } + else { size = ct.getGroupCount(Groups[0]); } + for (int i = 1; i < Groups.size(); i++) { - int thisSize = groupMap->getNumSeqs(Groups[i]); + int thisSize = 0; + if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); } + else { thisSize = ct.getGroupCount(Groups[i]); } if (thisSize < size) { size = thisSize; } } }else { //make sure size is not too large vector newGroups; for (int i = 0; i < Groups.size(); i++) { - int thisSize = groupMap->getNumSeqs(Groups[i]); + int thisSize = 0; + if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); } + else { thisSize = ct.getGroupCount(Groups[i]); } if (thisSize >= size) { newGroups.push_back(Groups[i]); } else { m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); } } Groups = newGroups; + if (newGroups.size() == 0) { m->mothurOut("[ERROR]: all groups removed."); m->mothurOutEndLine(); m->control_pressed = true; } } m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); @@ -397,7 +502,8 @@ int SubSampleCommand::getSubSampleFasta() { if (pickedGroups) { int total = 0; for(int i = 0; i < Groups.size(); i++) { - total += groupMap->getNumSeqs(Groups[i]); + if (countfile == "") { total += groupMap.getNumSeqs(Groups[i]); } + else { total += ct.getGroupCount(Groups[i]); } } if (size == 0) { //user has not set size, set size = 10% samples size @@ -415,72 +521,103 @@ int SubSampleCommand::getSubSampleFasta() { } if (size == 0) { //user has not set size, set size = 10% samples size - size = int (names.size() * 0.10); + if (countfile == "") { size = int (names.size() * 0.10); } + else { size = int (ct.getNumSeqs() * 0.10); } } - if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); - size = thisSize; - } - - if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); } + + if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); + size = thisSize; + } + + if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); } } random_shuffle(names.begin(), names.end()); set subset; //dont want repeat sequence names added if (persample) { - //initialize counts - map groupCounts; - map::iterator itGroupCounts; - for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } + if (countfile == "") { + //initialize counts + map groupCounts; + map::iterator itGroupCounts; + for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } - for (int j = 0; j < names.size(); j++) { + for (int j = 0; j < names.size(); j++) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { return 0; } - string group = groupMap->getGroup(names[j]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - else{ - itGroupCounts = groupCounts.find(group); - if (itGroupCounts != groupCounts.end()) { - if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } - } - } - } + string group = groupMap.getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + else{ + itGroupCounts = groupCounts.find(group); + if (itGroupCounts != groupCounts.end()) { + if (itGroupCounts->second < size) { subset.insert(names[j]); (itGroupCounts->second)++; } + } + } + } + }else { + SubSample sample; + CountTable sampledCt = sample.getSample(ct, size, Groups); + vector sampledSeqs = sampledCt.getNamesOfSeqs(); + for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); } + + string countOutputDir = outputDir; + if (outputDir == "") { countOutputDir += m->hasPath(countfile); } + map variables; + variables["[filename]"] = countOutputDir + m->getRootName(m->getSimpleName(countfile)); + variables["[extension]"] = m->getExtension(countfile); + string countOutputFileName = getOutputFileName("count", variables); + outputTypes["count"].push_back(countOutputFileName); outputNames.push_back(countOutputFileName); + sampledCt.printTable(countOutputFileName); + } }else { - - //randomly select a subset of those names to include in the subsample - //since names was randomly shuffled just grab the next one - for (int j = 0; j < names.size(); j++) { - - if (m->control_pressed) { return 0; } - - if (groupfile != "") { //if there is a groupfile given fill in group info - string group = groupMap->getGroup(names[j]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups - if (m->inUsersGroups(group, Groups)) { - subset.insert(names[j]); - } - }else{ - subset.insert(names[j]); - } - }else{ //save everyone, group - subset.insert(names[j]); - } - - //do we have enough?? - if (subset.size() == size) { break; } - } + if (countfile == "") { + //randomly select a subset of those names to include in the subsample + //since names was randomly shuffled just grab the next one + for (int j = 0; j < names.size(); j++) { + + if (m->control_pressed) { return 0; } + + if (groupfile != "") { //if there is a groupfile given fill in group info + string group = groupMap.getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + + if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups + if (m->inUsersGroups(group, Groups)) { subset.insert(names[j]); } + }else{ subset.insert(names[j]); } + }else{ //save everyone, group + subset.insert(names[j]); + } + + //do we have enough?? + if (subset.size() == size) { break; } + } + }else { + SubSample sample; + CountTable sampledCt = sample.getSample(ct, size, Groups, pickedGroups); + vector sampledSeqs = sampledCt.getNamesOfSeqs(); + for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); } + + string countOutputDir = outputDir; + if (outputDir == "") { countOutputDir += m->hasPath(countfile); } + map variables; + variables["[filename]"] = countOutputDir + m->getRootName(m->getSimpleName(countfile)); + variables["[extension]"] = m->getExtension(countfile); + string countOutputFileName = getOutputFileName("count", variables); + outputTypes["count"].push_back(countOutputFileName); outputNames.push_back(countOutputFileName); + sampledCt.printTable(countOutputFileName); + } } if (subset.size() == 0) { m->mothurOut("The size you selected is too large, skipping fasta file."); m->mothurOutEndLine(); return 0; } string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "subsample" + m->getExtension(fastafile); - + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)); + variables["[extension]"] = m->getExtension(fastafile); + string outputFileName = getOutputFileName("fasta", variables); ofstream out; m->openOutputFile(outputFileName, out); @@ -520,6 +657,7 @@ int SubSampleCommand::getSubSampleFasta() { } in.close(); out.close(); + if (count != subset.size()) { m->mothurOut("[ERROR]: The subset selected contained " + toString(subset.size()) + " sequences, but I only found " + toString(count) + " of those in the fastafile."); m->mothurOutEndLine(); @@ -527,7 +665,10 @@ int SubSampleCommand::getSubSampleFasta() { if (namefile != "") { m->mothurOut("Deconvoluting subsampled fasta file... "); m->mothurOutEndLine(); - + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile)); + variables["[extension]"] = m->getExtension(namefile); + string outputNameFileName = getOutputFileName("name", variables); //use unique.seqs to create new name and fastafile string inputString = "fasta=" + outputFileName; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); @@ -542,14 +683,41 @@ int SubSampleCommand::getSubSampleFasta() { delete uniqueCommand; m->mothurCalling = false; - outputTypes["name"].push_back(filenames["name"][0]); outputNames.push_back(filenames["name"][0]); - m->mothurRemove(outputFileName); - outputFileName = filenames["fasta"][0]; - + m->renameFile(filenames["name"][0], outputNameFileName); + m->renameFile(filenames["fasta"][0], outputFileName); + + outputTypes["name"].push_back(outputNameFileName); outputNames.push_back(outputNameFileName); + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Done."); m->mothurOutEndLine(); - } + + if (taxonomyfile != "") { + set tempSubset; + //get new unique names from fasta file + //read through fasta file outputting only the names on the subsample list after deconvolute + ifstream in2; + m->openInputFile(outputFileName, in2); + + while (!in2.eof()) { + Sequence seq(in2); m->gobble(in2); + if (seq.getName() != "") { + tempSubset.insert(seq.getName()); + } + } + in2.close(); + + //send that list to getTax + int tcount = getTax(tempSubset); + if (tcount != tempSubset.size()) { m->mothurOut("[ERROR]: subsampled fasta file contains " + toString(tempSubset.size()) + " sequences, but I only found " + toString(tcount) + " in your taxonomy file, please correct."); m->mothurOutEndLine(); } + } + }else { + if (taxonomyfile != "") { + int tcount = getTax(subset); + if (tcount != subset.size()) { m->mothurOut("[ERROR]: subsampled fasta file contains " + toString(subset.size()) + " sequences, but I only found " + toString(tcount) + " in your taxonomy file, please correct."); m->mothurOutEndLine(); } + + } //should only contain uniques. + } outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); @@ -558,7 +726,10 @@ int SubSampleCommand::getSubSampleFasta() { string groupOutputDir = outputDir; if (outputDir == "") { groupOutputDir += m->hasPath(groupfile); } - string groupOutputFileName = groupOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "subsample" + m->getExtension(groupfile); + map variables; + variables["[filename]"] = groupOutputDir + m->getRootName(m->getSimpleName(groupfile)); + variables["[extension]"] = m->getExtension(groupfile); + string groupOutputFileName = getOutputFileName("group", variables); ofstream outGroup; m->openOutputFile(groupOutputFileName, outGroup); @@ -639,34 +810,13 @@ int SubSampleCommand::getNames() { int SubSampleCommand::readNames() { try { - ifstream in; - m->openInputFile(namefile, in); - - string thisname, repnames; - map >::iterator it; - - while(!in.eof()){ - - if (m->control_pressed) { in.close(); return 0; } - - in >> thisname; m->gobble(in); //read from first column - in >> repnames; //read from second column - - it = nameMap.find(thisname); - if (it == nameMap.end()) { - - vector splitRepNames; - m->splitAtComma(repnames, splitRepNames); - - nameMap[thisname] = splitRepNames; - for (int i = 0; i < splitRepNames.size(); i++) { names.push_back(splitRepNames[i]); } - - }else{ m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine(); } - - m->gobble(in); - } - in.close(); - + nameMap.clear(); + m->readNames(namefile, nameMap); + + //save names of all sequences + map >::iterator it; + for (it = nameMap.begin(); it != nameMap.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { names.push_back((it->second)[i]); } } + return 0; } @@ -797,12 +947,15 @@ int SubSampleCommand::processShared(vector& thislookup) { try { //save mothurOut's binLabels to restore for next label - vector saveBinLabels = m->currentBinLabels; + vector saveBinLabels = m->currentSharedBinLabels; string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + ".subsample" + m->getExtension(sharedfile); - + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)); + variables["[extension]"] = m->getExtension(sharedfile); + variables["[distance]"] = thislookup[0]->getLabel(); + string outputFileName = getOutputFileName("shared", variables); SubSample sample; vector subsampledLabels = sample.getSample(thislookup, size); @@ -812,7 +965,7 @@ int SubSampleCommand::processShared(vector& thislookup) { m->openOutputFile(outputFileName, out); outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); - m->currentBinLabels = subsampledLabels; + m->currentSharedBinLabels = subsampledLabels; thislookup[0]->printHeaders(out); @@ -824,7 +977,7 @@ int SubSampleCommand::processShared(vector& thislookup) { //save mothurOut's binLabels to restore for next label - m->currentBinLabels = saveBinLabels; + m->currentSharedBinLabels = saveBinLabels; return 0; @@ -837,83 +990,86 @@ int SubSampleCommand::processShared(vector& thislookup) { //********************************************************************************************************************** int SubSampleCommand::getSubSampleList() { try { - - string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(listfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(listfile)) + "subsample" + m->getExtension(listfile); - - ofstream out; - m->openOutputFile(outputFileName, out); - outputTypes["list"].push_back(outputFileName); outputNames.push_back(outputFileName); - + + if (namefile != "") { m->readNames(namefile, nameMap); } + InputData* input = new InputData(listfile, "list"); ListVector* list = input->getListVector(); string lastLabel = list->getLabel(); - //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set processedLabels; set userLabels = labels; - + ofstream outGroup; - GroupMap* groupMap; + GroupMap groupMap; if (groupfile != "") { - - groupMap = new GroupMap(groupfile); - groupMap->readMap(); + groupMap.readMap(groupfile); //takes care of user setting groupNames that are invalid or setting groups=all - SharedUtil* util = new SharedUtil(); - vector namesGroups = groupMap->getNamesOfGroups(); - util->setGroups(Groups, namesGroups); - delete util; + SharedUtil util; vector namesGroups = groupMap.getNamesOfGroups(); util.setGroups(Groups, namesGroups); //create outputfiles string groupOutputDir = outputDir; if (outputDir == "") { groupOutputDir += m->hasPath(groupfile); } string groupOutputFileName = groupOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "subsample" + m->getExtension(groupfile); - m->openOutputFile(groupOutputFileName, outGroup); outputTypes["group"].push_back(groupOutputFileName); outputNames.push_back(groupOutputFileName); //file mismatch quit - if (list->getNumSeqs() != groupMap->getNumSeqs()) { - m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); + if (list->getNumSeqs() != groupMap.getNumSeqs()) { + m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your groupfile contains " + toString(groupMap.getNumSeqs()) + ", please correct."); + m->mothurOutEndLine(); delete list; delete input; outGroup.close(); return 0; + } + }else if (countfile != "") { + if (ct.hasGroupInfo()) { + SharedUtil util; + vector namesGroups = ct.getNamesOfGroups(); + util.setGroups(Groups, namesGroups); + } + + //file mismatch quit + if (list->getNumSeqs() != ct.getNumUniqueSeqs()) { + m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your count file contains " + toString(ct.getNumUniqueSeqs()) + " unique sequences, please correct."); m->mothurOutEndLine(); - delete groupMap; - delete list; - delete input; - out.close(); - outGroup.close(); return 0; - } - } - + } + } + //make sure that if your picked groups size is not too big if (persample) { if (size == 0) { //user has not set size, set size = smallest samples size - size = groupMap->getNumSeqs(Groups[0]); + if (countfile == "") { size = groupMap.getNumSeqs(Groups[0]); } + else { size = ct.getGroupCount(Groups[0]); } + for (int i = 1; i < Groups.size(); i++) { - int thisSize = groupMap->getNumSeqs(Groups[i]); + int thisSize = 0; + if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); } + else { thisSize = ct.getGroupCount(Groups[i]); } if (thisSize < size) { size = thisSize; } } }else { //make sure size is not too large vector newGroups; for (int i = 0; i < Groups.size(); i++) { - int thisSize = groupMap->getNumSeqs(Groups[i]); + int thisSize = 0; + if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); } + else { thisSize = ct.getGroupCount(Groups[i]); } if (thisSize >= size) { newGroups.push_back(Groups[i]); } else { m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); } } Groups = newGroups; + if (newGroups.size() == 0) { m->mothurOut("[ERROR]: all groups removed."); m->mothurOutEndLine(); m->control_pressed = true; } } - m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); + m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); }else{ - if (pickedGroups) { + if (pickedGroups) { int total = 0; for(int i = 0; i < Groups.size(); i++) { - total += groupMap->getNumSeqs(Groups[i]); + if (countfile == "") { total += groupMap.getNumSeqs(Groups[i]); } + else { total += ct.getGroupCount(Groups[i]); } } if (size == 0) { //user has not set size, set size = 10% samples size @@ -921,128 +1077,119 @@ int SubSampleCommand::getSubSampleList() { } if (total < size) { - m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine(); + if (size != 0) { + m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine(); + } size = int (total * 0.10); } m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine(); - }else{ - - if (size == 0) { //user has not set size, set size = 10% samples size - size = int (list->getNumSeqs() * 0.10); + }else { + if (size == 0) { //user has not set size, set size = 10% samples size + if (countfile == "") { size = int (list->getNumSeqs() * 0.10); } + else { size = int (ct.getNumSeqs() * 0.10); } } - int thisSize = list->getNumSeqs(); + int thisSize = 0; + if (countfile == "") { thisSize = list->getNumSeqs(); } + else { thisSize = ct.getNumSeqs(); } + if (size > thisSize) { m->mothurOut("Your list file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); size = thisSize; } - m->mothurOut("Sampling " + toString(size) + " from " + toString(list->getNumSeqs()) + "."); m->mothurOutEndLine(); - } - } - - - //fill names - for (int i = 0; i < list->getNumBins(); i++) { - string binnames = list->get(i); - - //parse names - string individual = ""; - int length = binnames.length(); - for(int j=0;jgetGroup(individual); - if (group == "not found") { m->mothurOut("[ERROR]: " + individual + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups - if (m->inUsersGroups(group, Groups)) { - names.push_back(individual); - } - }else{ - names.push_back(individual); - } - }else{ //save everyone, group - names.push_back(individual); - } - individual = ""; - } - else{ - individual += binnames[j]; - } - } - //save last name - if (groupfile != "") { //if there is a groupfile given fill in group info - string group = groupMap->getGroup(individual); - if (group == "not found") { m->mothurOut("[ERROR]: " + individual + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups - if (m->inUsersGroups(group, Groups)) { - names.push_back(individual); - } - }else{ - names.push_back(individual); - } - }else{ //save everyone, group - names.push_back(individual); - } - } - - random_shuffle(names.begin(), names.end()); - - //randomly select a subset of those names to include in the subsample - set subset; //dont want repeat sequence names added - if (persample) { - //initialize counts - map groupCounts; - map::iterator itGroupCounts; - for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } - - for (int j = 0; j < names.size(); j++) { - - if (m->control_pressed) { return 0; } - - string group = groupMap->getGroup(names[j]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - else{ - itGroupCounts = groupCounts.find(group); - if (itGroupCounts != groupCounts.end()) { - if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } - } - } - } - }else{ - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { break; } - - subset.insert(names[j]); - } - } - - if (groupfile != "") { - //write out new groupfile - for (set::iterator it = subset.begin(); it != subset.end(); it++) { - string group = groupMap->getGroup(*it); - if (group == "not found") { group = "NOTFOUND"; } - - outGroup << *it << '\t' << group << endl; - } - outGroup.close(); delete groupMap; - } - + m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); + } + } + + set subset; //dont want repeat sequence names added + if (countfile == "") { + //fill names + for (int i = 0; i < list->getNumBins(); i++) { + string binnames = list->get(i); + vector thisBin; + m->splitAtComma(binnames, thisBin); + + for(int j=0;jmothurOut("[ERROR]: " + thisBin[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + + //if hte user picked groups, we only want to keep the names of sequences from those groups + if (pickedGroups) { if (m->inUsersGroups(group, Groups)) { names.push_back(thisBin[j]); } } + else{ names.push_back(thisBin[j]); } + }//save everyone, group + else{ names.push_back(thisBin[j]); } + } + } + + random_shuffle(names.begin(), names.end()); + + //randomly select a subset of those names to include in the subsample + if (persample) { + //initialize counts + map groupCounts; + map::iterator itGroupCounts; + for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } + + for (int j = 0; j < names.size(); j++) { + + if (m->control_pressed) { delete list; delete input; return 0; } + + string group = groupMap.getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + else{ + itGroupCounts = groupCounts.find(group); + if (itGroupCounts != groupCounts.end()) { + if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } + } + } + } + }else{ + for (int j = 0; j < size; j++) { + if (m->control_pressed) { break; } + subset.insert(names[j]); + } + } + + if (groupfile != "") { + //write out new groupfile + for (set::iterator it = subset.begin(); it != subset.end(); it++) { + string group = groupMap.getGroup(*it); + if (group == "not found") { group = "NOTFOUND"; } + outGroup << *it << '\t' << group << endl; + } + outGroup.close(); + } + }else { + SubSample sample; CountTable sampledCt; + + if (persample) { sampledCt = sample.getSample(ct, size, Groups); } + else { sampledCt = sample.getSample(ct, size, Groups, pickedGroups); } + + vector sampledSeqs = sampledCt.getNamesOfSeqs(); + for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); } + + string countOutputDir = outputDir; + if (outputDir == "") { countOutputDir += m->hasPath(countfile); } + map variables; + variables["[filename]"] = countOutputDir + m->getRootName(m->getSimpleName(countfile)); + variables["[extension]"] = m->getExtension(countfile); + string countOutputFileName = getOutputFileName("count", variables); + outputTypes["count"].push_back(countOutputFileName); outputNames.push_back(countOutputFileName); + sampledCt.printTable(countOutputFileName); + } //as long as you are not at the end of the file or done wih the lines you want while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { - if (m->control_pressed) { delete list; delete input; out.close(); return 0; } + if (m->control_pressed) { delete list; delete input; return 0; } if(allLines == 1 || labels.count(list->getLabel()) == 1){ m->mothurOut(list->getLabel()); m->mothurOutEndLine(); - processList(list, out, subset); + processList(list, subset); processedLabels.insert(list->getLabel()); userLabels.erase(list->getLabel()); @@ -1056,7 +1203,7 @@ int SubSampleCommand::getSubSampleList() { list = input->getListVector(lastLabel); m->mothurOut(list->getLabel()); m->mothurOutEndLine(); - processList(list, out, subset); + processList(list, subset); processedLabels.insert(list->getLabel()); userLabels.erase(list->getLabel()); @@ -1074,7 +1221,7 @@ int SubSampleCommand::getSubSampleList() { } - if (m->control_pressed) { if (list != NULL) { delete list; } delete input; out.close(); return 0; } + if (m->control_pressed) { if (list != NULL) { delete list; } delete input; return 0; } //output error messages about any remaining user labels set::iterator it; @@ -1097,14 +1244,77 @@ int SubSampleCommand::getSubSampleList() { m->mothurOut(list->getLabel()); m->mothurOutEndLine(); - processList(list, out, subset); + processList(list, subset); delete list; list = NULL; } - out.close(); if (list != NULL) { delete list; } delete input; + + if (taxonomyfile != "") { + if (namefile == "") { + InputData input(listfile, "list"); + ListVector* list = input.getListVector(); + string lastLabel = list->getLabel(); + + for (int i = 0; i < list->getNumBins(); i++) { + vector temp; + string bin = list->get(i); + m->splitAtComma(bin, temp); + for (int j = 0; j < temp.size(); j++) { vector tempFakeOut; tempFakeOut.push_back(temp[j]); nameMap[temp[j]] = tempFakeOut; } + } + delete list; + + int tcount = getTax(subset); + if (tcount != subset.size()) { m->mothurOut("[ERROR]: subsampled list file contains " + toString(subset.size()) + " sequences, but I only found " + toString(tcount) + " in your taxonomy file, did you forget a name file? Please correct."); m->mothurOutEndLine(); } + }else { + string tempAccnos = "temp.accnos"; + ofstream outAccnos; + m->openOutputFile(tempAccnos, outAccnos); + for (set::iterator it = subset.begin(); it != subset.end(); it++) { outAccnos << *it << endl; } + outAccnos.close(); + + m->mothurOut("Sampling taxonomy and name file... "); m->mothurOutEndLine(); + string thisNameOutputDir = outputDir; + if (outputDir == "") { thisNameOutputDir += m->hasPath(namefile); } + map variables; + variables["[filename]"] = thisNameOutputDir + m->getRootName(m->getSimpleName(namefile)); + variables["[extension]"] = m->getExtension(namefile); + string outputNameFileName = getOutputFileName("name", variables); + + string thisTaxOutputDir = outputDir; + if (outputDir == "") { thisTaxOutputDir += m->hasPath(taxonomyfile); } + variables["[filename]"] = thisTaxOutputDir + m->getRootName(m->getSimpleName(taxonomyfile)); + variables["[extension]"] = m->getExtension(taxonomyfile); + string outputTaxFileName = getOutputFileName("taxonomy", variables); + + + //use unique.seqs to create new name and fastafile + string inputString = "dups=f, name=" + namefile + ", taxonomy=" + taxonomyfile + ", accnos=" + tempAccnos; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: get.seqs(" + inputString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* getCommand = new GetSeqsCommand(inputString); + getCommand->execute(); + + map > filenames = getCommand->getOutputFiles(); + + delete getCommand; + m->mothurCalling = false; + + m->renameFile(filenames["name"][0], outputNameFileName); + m->renameFile(filenames["taxonomy"][0], outputTaxFileName); + + outputTypes["name"].push_back(outputNameFileName); outputNames.push_back(outputNameFileName); + outputNames.push_back(outputTaxFileName); outputTypes["taxonomy"].push_back(outputTaxFileName); + + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + m->mothurOut("Done."); m->mothurOutEndLine(); + } + } return 0; @@ -1115,48 +1325,54 @@ int SubSampleCommand::getSubSampleList() { } } //********************************************************************************************************************** -int SubSampleCommand::processList(ListVector*& list, ofstream& out, set& subset) { +int SubSampleCommand::processList(ListVector*& list, set& subset) { try { - + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(listfile); } + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(listfile)); + variables["[extension]"] = m->getExtension(listfile); + variables["[distance]"] = list->getLabel(); + string outputFileName = getOutputFileName("list", variables); + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["list"].push_back(outputFileName); outputNames.push_back(outputFileName); + int numBins = list->getNumBins(); ListVector* temp = new ListVector(); temp->setLabel(list->getLabel()); + vector binLabels = list->getLabels(); + vector newLabels; for (int i = 0; i < numBins; i++) { if (m->control_pressed) { break; } - string binnames = list->get(i); - - //parse names - string individual = ""; - string newNames = ""; - int length = binnames.length(); - for(int j=0;jget(i); + vector binnames; + m->splitAtComma(bin, binnames); + string newNames = ""; + for(int j=0;jpush_back(newNames); + newLabels.push_back(binLabels[i]); } } + temp->setLabels(newLabels); delete list; list = temp; - if (m->control_pressed) { return 0; } + if (m->control_pressed) { out.close(); return 0; } + list->printHeaders(out); list->print(out); + out.close(); return 0; @@ -1185,8 +1401,10 @@ int SubSampleCommand::getSubSampleRabund() { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(rabundfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(rabundfile)) + "subsample" + m->getExtension(rabundfile); - + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(rabundfile)); + variables["[extension]"] = m->getExtension(rabundfile); + string outputFileName = getOutputFileName("rabund", variables); ofstream out; m->openOutputFile(outputFileName, out); outputTypes["rabund"].push_back(outputFileName); outputNames.push_back(outputFileName); @@ -1340,8 +1558,10 @@ int SubSampleCommand::getSubSampleSabund() { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(sabundfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sabundfile)) + "subsample" + m->getExtension(sabundfile); - + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(sabundfile)); + variables["[extension]"] = m->getExtension(sabundfile); + string outputFileName = getOutputFileName("sabund", variables); ofstream out; m->openOutputFile(outputFileName, out); outputTypes["sabund"].push_back(outputFileName); outputNames.push_back(outputFileName); @@ -1482,7 +1702,63 @@ int SubSampleCommand::processSabund(SAbundVector*& sabund, ofstream& out) { m->errorOut(e, "SubSampleCommand", "processSabund"); exit(1); } -} +} + +//********************************************************************************************************************** +int SubSampleCommand::getTax(set& subset) { + try { + + string thisTaxOutputDir = outputDir; + if (outputDir == "") { thisTaxOutputDir += m->hasPath(taxonomyfile); } + map variables; + variables["[filename]"] = thisTaxOutputDir + m->getRootName(m->getSimpleName(taxonomyfile)); + variables["[extension]"] = m->getExtension(taxonomyfile); + string outputTaxFileName = getOutputFileName("taxonomy", variables); + ofstream outTax; + m->openOutputFile(outputTaxFileName, outTax); + outputNames.push_back(outputTaxFileName); outputTypes["taxonomy"].push_back(outputTaxFileName); + + //read through fasta file outputting only the names on the subsample list + ifstream inTax; + m->openInputFile(taxonomyfile, inTax); + + string tname, tax; + int tcount = 0; + map >::iterator itNameMap; + + while(!inTax.eof()){ + + if (m->control_pressed) { inTax.close(); outTax.close(); return 0; } + + inTax >> tname; m->gobble(inTax); //read from first column + inTax >> tax; m->gobble(inTax); //read from second column + + //does the subset contain a sequence that this sequence represents + itNameMap = nameMap.find(tname); + if (itNameMap != nameMap.end()) { + vector nameRepresents = itNameMap->second; + + + for (int i = 0; i < nameRepresents.size(); i++){ + if (subset.count(nameRepresents[i]) != 0) { + outTax << nameRepresents[i] << '\t' << tax << endl; + tcount++; + + } + } + }else{ m->mothurOut("[ERROR]: " + tname + " is missing, please correct."); m->mothurOutEndLine(); } + } + inTax.close(); + outTax.close(); + + return tcount; + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "getTax"); + exit(1); + } +} + //**********************************************************************************************************************