+int SubSampleCommand::getSubSampleFasta() {
+ try {
+
+ if (namefile != "") { readNames(); } //fills names with all names in namefile.
+ else { getNames(); }//no name file, so get list of names to pick from
+
+ GroupMap* groupMap;
+ if (groupfile != "") {
+
+ groupMap = new GroupMap(groupfile);
+ groupMap->readMap();
+
+ //takes care of user setting groupNames that are invalid or setting groups=all
+ SharedUtil* util = new SharedUtil();
+ util->setGroups(Groups, groupMap->namesOfGroups);
+ delete util;
+
+ //file mismatch quit
+ if (names.size() != groupMap->getNumSeqs()) {
+ m->mothurOut("[ERROR]: your fasta file contains " + toString(names.size()) + " sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
+ m->mothurOutEndLine();
+ delete groupMap;
+ return 0;
+ }
+ }
+
+ if (m->control_pressed) { return 0; }
+
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
+ string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "subsample" + m->getExtension(fastafile);
+
+ ofstream out;
+ m->openOutputFile(outputFileName, out);
+ outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+ //make sure that if your picked groups size is not too big
+
+ if (pickedGroups) {
+ int total = 0;
+ for(int i = 0; i < Groups.size(); i++) {
+ total += groupMap->getNumSeqs(Groups[i]);
+ }
+
+ if (size == 0) { //user has not set size, set size = 10% samples size
+ size = int (total * 0.10);
+ }
+
+ if (total < size) {
+ if (size != 0) {
+ m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine();
+ }
+ size = int (total * 0.10);
+ }
+
+ m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine();
+ }
+
+ if (size == 0) { //user has not set size, set size = 10% samples size
+ size = int (names.size() * 0.10);
+ }
+
+ int thisSize = names.size();
+ if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine();
+ size = thisSize;
+ }
+
+ random_shuffle(names.begin(), names.end());
+
+ if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); }
+
+ //randomly select a subset of those names to include in the subsample
+ set<string> subset; //dont want repeat sequence names added
+ for (int j = 0; j < size; j++) {
+
+ if (m->control_pressed) { return 0; }
+
+ //get random sequence to add, making sure we have not already added it
+ bool done = false;
+ int myrand;
+ while (!done) {
+ myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1));
+
+ if (subset.count(names[myrand]) == 0) {
+
+ if (groupfile != "") { //if there is a groupfile given fill in group info
+ string group = groupMap->getGroup(names[myrand]);
+ if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+
+ if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
+ if (m->inUsersGroups(group, Groups)) {
+ subset.insert(names[myrand]); break;
+ }
+ }else{
+ subset.insert(names[myrand]); break;
+ }
+ }else{ //save everyone, group
+ subset.insert(names[myrand]); break;
+ }
+ }
+ }
+ }
+
+ //read through fasta file outputting only the names on the subsample list
+ ifstream in;
+ m->openInputFile(fastafile, in);
+
+ string thisname;
+ int count = 0;
+ map<string, vector<string> >::iterator itNameMap;
+
+ while(!in.eof()){
+
+ if (m->control_pressed) { in.close(); out.close(); return 0; }
+
+ Sequence currSeq(in);
+ thisname = currSeq.getName();
+
+ if (thisname != "") {
+
+ //does the subset contain a sequence that this sequence represents
+ itNameMap = nameMap.find(thisname);
+ if (itNameMap != nameMap.end()) {
+ vector<string> nameRepresents = itNameMap->second;
+
+ for (int i = 0; i < nameRepresents.size(); i++){
+ if (subset.count(nameRepresents[i]) != 0) {
+ out << ">" << nameRepresents[i] << endl << currSeq.getAligned() << endl;
+ count++;
+ }
+ }
+ }else{
+ m->mothurOut("[ERROR]: " + thisname + " is not in your namefile, please correct."); m->mothurOutEndLine();
+ }
+ }
+ m->gobble(in);
+ }
+ in.close();
+ out.close();
+
+ if (count != subset.size()) {
+ m->mothurOut("[ERROR]: The subset selected contained " + toString(subset.size()) + " sequences, but I only found " + toString(count) + " of those in the fastafile."); m->mothurOutEndLine();
+ }
+
+ //if a groupfile is provided read through the group file only outputting the names on the subsample list
+ if (groupfile != "") {
+
+ string groupOutputDir = outputDir;
+ if (outputDir == "") { groupOutputDir += m->hasPath(groupfile); }
+ string groupOutputFileName = groupOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "subsample" + m->getExtension(groupfile);
+
+ ofstream outGroup;
+ m->openOutputFile(groupOutputFileName, outGroup);
+ outputTypes["group"].push_back(groupOutputFileName); outputNames.push_back(groupOutputFileName);
+
+ ifstream inGroup;
+ m->openInputFile(groupfile, inGroup);
+ string name, group;
+
+ while(!inGroup.eof()){
+
+ if (m->control_pressed) { inGroup.close(); outGroup.close(); return 0; }
+
+ inGroup >> name; m->gobble(inGroup); //read from first column
+ inGroup >> group; //read from second column
+
+ //if this name is in the accnos file
+ if (subset.count(name) != 0) {
+ outGroup << name << '\t' << group << endl;
+ subset.erase(name);
+ }
+
+ m->gobble(inGroup);
+ }
+ inGroup.close();
+ outGroup.close();
+
+ //sanity check
+ if (subset.size() != 0) {
+ m->mothurOut("Your groupfile does not match your fasta file."); m->mothurOutEndLine();
+ for (set<string>::iterator it = subset.begin(); it != subset.end(); it++) {
+ m->mothurOut("[ERROR]: " + *it + " is missing from your groupfile."); m->mothurOutEndLine();
+ }
+ }
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SubSampleCommand", "getSubSampleFasta");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int SubSampleCommand::getNames() {
+ try {
+
+ ifstream in;
+ m->openInputFile(fastafile, in);
+
+ string thisname;
+ while(!in.eof()){
+
+ if (m->control_pressed) { in.close(); return 0; }
+
+ Sequence currSeq(in);
+ thisname = currSeq.getName();
+
+ if (thisname != "") {
+ vector<string> temp; temp.push_back(thisname);
+ nameMap[thisname] = temp;
+ names.push_back(thisname);
+ }
+ m->gobble(in);
+ }
+ in.close();
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SubSampleCommand", "getNames");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int SubSampleCommand::readNames() {
+ try {
+
+ ifstream in;
+ m->openInputFile(namefile, in);
+
+ string thisname, repnames;
+ map<string, vector<string> >::iterator it;
+
+ while(!in.eof()){
+
+ if (m->control_pressed) { in.close(); return 0; }
+
+ in >> thisname; m->gobble(in); //read from first column
+ in >> repnames; //read from second column
+
+ it = nameMap.find(thisname);
+ if (it == nameMap.end()) {
+
+ vector<string> splitRepNames;
+ m->splitAtComma(repnames, splitRepNames);
+
+ nameMap[thisname] = splitRepNames;
+ for (int i = 0; i < splitRepNames.size(); i++) { names.push_back(splitRepNames[i]); }
+
+ }else{ m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine(); }
+
+ m->gobble(in);
+ }
+ in.close();
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SubSampleCommand", "readNames");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************