+ //make sure that, if you picked groups, the size is not too big
+ int thisSize = 0;
+ if (countfile == "") { thisSize = names.size(); }
+ else { thisSize = ct. getNumSeqs(); } //all seqs not just unique
+
+ if (persample) {
+ if (size == 0) { //user has not set size, set size = smallest samples size
+ if (countfile == "") { size = groupMap.getNumSeqs(Groups[0]); }
+ else { size = ct.getGroupCount(Groups[0]); }
+
+ for (int i = 1; i < Groups.size(); i++) {
+ int thisSize = 0;
+ if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); }
+ else { thisSize = ct.getGroupCount(Groups[i]); }
+
+ if (thisSize < size) { size = thisSize; }
+ }
+ }else { //make sure size is not too large
+ vector<string> newGroups;
+ for (int i = 0; i < Groups.size(); i++) {
+ int thisSize = 0;
+ if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); }
+ else { thisSize = ct.getGroupCount(Groups[i]); }
+
+ if (thisSize >= size) { newGroups.push_back(Groups[i]); }
+ else { m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); }
+ }
+ Groups = newGroups;
+ if (newGroups.size() == 0) { m->mothurOut("[ERROR]: all groups removed."); m->mothurOutEndLine(); m->control_pressed = true; }
+ }
+
+ m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine();
+ }else {
+ if (pickedGroups) {
+ int total = 0;
+ for(int i = 0; i < Groups.size(); i++) {
+ if (countfile == "") { total += groupMap.getNumSeqs(Groups[i]); }
+ else { total += ct.getGroupCount(Groups[i]); }
+ }
+
+ if (size == 0) { //user has not set size, set size = 10% samples size
+ size = int (total * 0.10);
+ }
+
+ if (total < size) {
+ if (size != 0) {
+ m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine();
+ }
+ size = int (total * 0.10);
+ }
+
+ m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine();
+ }
+
+ if (size == 0) { //user has not set size, set size = 10% samples size
+ if (countfile == "") { size = int (names.size() * 0.10); }
+ else { size = int (ct.getNumSeqs() * 0.10); }
+ }
+
+
+ if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine();
+ size = thisSize;
+ }
+
+ if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); }
+
+ }
+ random_shuffle(names.begin(), names.end());
+
+ set<string> subset; //dont want repeat sequence names added
+ if (persample) {
+ if (countfile == "") {
+ //initialize counts
+ map<string, int> groupCounts;
+ map<string, int>::iterator itGroupCounts;
+ for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
+
+ for (int j = 0; j < names.size(); j++) {
+
+ if (m->control_pressed) { return 0; }
+
+ string group = groupMap.getGroup(names[j]);
+ if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+ else{
+ itGroupCounts = groupCounts.find(group);
+ if (itGroupCounts != groupCounts.end()) {
+ if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; }
+ }
+ }
+ }
+ }else {
+ SubSample sample;
+ CountTable sampledCt = sample.getSample(ct, size, Groups);
+ vector<string> sampledSeqs = sampledCt.getNamesOfSeqs();
+ for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); }
+
+ string countOutputDir = outputDir;
+ if (outputDir == "") { countOutputDir += m->hasPath(countfile); }
+ map<string, string> variables;
+ variables["[filename]"] = countOutputDir + m->getRootName(m->getSimpleName(countfile));
+ variables["[extension]"] = m->getExtension(countfile);
+ string countOutputFileName = getOutputFileName("count", variables);
+ outputTypes["count"].push_back(countOutputFileName); outputNames.push_back(countOutputFileName);
+ sampledCt.printTable(countOutputFileName);
+ }
+ }else {
+ if (countfile == "") {
+ //randomly select a subset of those names to include in the subsample
+ //since names was randomly shuffled just grab the next one
+ for (int j = 0; j < names.size(); j++) {
+
+ if (m->control_pressed) { return 0; }
+
+ if (groupfile != "") { //if there is a groupfile given fill in group info
+ string group = groupMap.getGroup(names[j]);
+ if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+
+ if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
+ if (m->inUsersGroups(group, Groups)) { subset.insert(names[j]); }
+ }else{ subset.insert(names[j]); }
+ }else{ //save everyone, group
+ subset.insert(names[j]);
+ }
+
+ //do we have enough??
+ if (subset.size() == size) { break; }
+ }
+ }else {
+ SubSample sample;
+ CountTable sampledCt = sample.getSample(ct, size, Groups, pickedGroups);
+ vector<string> sampledSeqs = sampledCt.getNamesOfSeqs();
+ for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); }
+
+ string countOutputDir = outputDir;
+ if (outputDir == "") { countOutputDir += m->hasPath(countfile); }
+ map<string, string> variables;
+ variables["[filename]"] = countOutputDir + m->getRootName(m->getSimpleName(countfile));
+ variables["[extension]"] = m->getExtension(countfile);
+ string countOutputFileName = getOutputFileName("count", variables);
+ outputTypes["count"].push_back(countOutputFileName); outputNames.push_back(countOutputFileName);
+ sampledCt.printTable(countOutputFileName);
+ }
+ }
+
+ if (subset.size() == 0) { m->mothurOut("The size you selected is too large, skipping fasta file."); m->mothurOutEndLine(); return 0; }
+
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
+ map<string, string> variables;
+ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
+ variables["[extension]"] = m->getExtension(fastafile);
+ string outputFileName = getOutputFileName("fasta", variables);