+ vector<string> reps;
+ string rep = "notFound";
+
+ if ((names.size() == 1)) {
+ return names[0];
+ }else{
+ //fill seqIndex and initialize sums
+ int maxAbund = 0;
+ for (int i = 0; i < names.size(); i++) {
+
+ if (m->control_pressed) { return "control"; }
+
+ if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone.
+ int numRep = 0;
+ if (group != "") { numRep = ct.getGroupCount(names[i], group); }
+ else { numRep = ct.getGroupCount(names[i]); }
+ if (numRep > maxAbund) {
+ reps.clear();
+ reps.push_back(names[i]);
+ maxAbund = numRep;
+ }else if(numRep == maxAbund) { //tie
+ reps.push_back(names[i]);
+ }
+ }else { //name file used, we assume list file contains all sequences
+ map<string, int>::iterator itNameMap = nameToIndex.find(names[i]);
+ if (itNameMap == nameToIndex.end()) {} //assume that this sequence is not a unique
+ else {
+ if (itNameMap->second > maxAbund) {
+ reps.clear();
+ reps.push_back(names[i]);
+ maxAbund = itNameMap->second;
+ }else if(itNameMap->second == maxAbund) { //tie
+ reps.push_back(names[i]);
+ }
+ }
+ }
+ }
+
+ if (reps.size() == 0) { m->mothurOut("[ERROR]: no rep found, file mismatch?? Quitting.\n"); m->control_pressed = true; }
+ else if (reps.size() == 1) { rep = reps[0]; }
+ else { //tie
+ int index = m->getRandomIndex(reps.size()-1);
+ rep = reps[index];
+ }
+ }
+
+ return rep;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "GetOTURepCommand", "findRepAbund");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+string GetOTURepCommand::findRep(vector<string> names, string group) {
+ try{
+ //if using abundance
+ if (method == "abundance") { return (findRepAbund(names, group)); }
+ else { //find rep based on distance
+
+ // if only 1 sequence in bin or processing the "unique" label, then
+ // the first sequence of the OTU is the representative one
+ if ((names.size() == 1)) {
+ return names[0];
+ }else{
+ vector<int> seqIndex; //(names.size());
+ map<string, string>::iterator itNameFile;
+ map<string, int>::iterator itNameIndex;
+
+ //fill seqIndex and initialize sums
+ for (size_t i = 0; i < names.size(); i++) {
+ if (weighted) {
+ seqIndex.push_back(nameToIndex[names[i]]);
+ if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone.
+ int numRep = 0;
+ if (group != "") { numRep = ct.getGroupCount(names[i], group); }
+ else { numRep = ct.getGroupCount(names[i]); }
+ for (int j = 1; j < numRep; j++) { //don't add yourself again
+ seqIndex.push_back(nameToIndex[names[i]]);
+ }
+ }
+ }else {
+ if (namefile == "") {
+ itNameIndex = nameToIndex.find(names[i]);
+
+ if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
+ if (large) { seqIndex.push_back((rowPositions.size()-1)); }
+ else { seqIndex.push_back((seqVec.size()-1)); }
+ }else {
+ seqIndex.push_back(itNameIndex->second);
+ }
+
+ }else {
+ itNameFile = nameFileMap.find(names[i]);
+
+ if (itNameFile == nameFileMap.end()) {
+ m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+ }else{
+ string name1 = itNameFile->first;
+ string name2 = itNameFile->second;
+
+ if (name1 == name2) { //then you are unique so add your real dists
+ seqIndex.push_back(nameToIndex[names[i]]);
+ }else { //add dummy
+ if (large) { seqIndex.push_back((rowPositions.size()-1)); }
+ else { seqIndex.push_back((seqVec.size()-1)); }
+ }
+ }
+ }
+ }
+ }
+
+ vector<float> max_dist(seqIndex.size(), 0.0);
+ vector<float> total_dist(seqIndex.size(), 0.0);
+
+ // loop through all entries in seqIndex
+ SeqMap::iterator it;
+ SeqMap currMap;
+ for (size_t i=0; i < seqIndex.size(); i++) {
+ if (m->control_pressed) { return "control"; }
+
+ if (!large) { currMap = seqVec[seqIndex[i]]; }
+ else { currMap = getMap(seqIndex[i]); }
+
+ for (size_t j=0; j < seqIndex.size(); j++) {
+ it = currMap.find(seqIndex[j]);
+ if (it != currMap.end()) {
+ max_dist[i] = max(max_dist[i], it->second);
+ max_dist[j] = max(max_dist[j], it->second);
+ total_dist[i] += it->second;
+ total_dist[j] += it->second;
+ }else{ //if you can't find the distance make it the cutoff
+ max_dist[i] = max(max_dist[i], cutoff);
+ max_dist[j] = max(max_dist[j], cutoff);
+ total_dist[i] += cutoff;
+ total_dist[j] += cutoff;
+ }
+ }
+ }
+
+ // sequence with the smallest maximum distance is the representative
+ //if tie occurs pick sequence with smallest average distance
+ float min = 10000;
+ int minIndex;
+ for (size_t i=0; i < max_dist.size(); i++) {
+ if (m->control_pressed) { return "control"; }
+ if (max_dist[i] < min) {
+ min = max_dist[i];
+ minIndex = i;
+ }else if (max_dist[i] == min) {
+ float currentAverage = total_dist[minIndex] / (float) total_dist.size();
+ float newAverage = total_dist[i] / (float) total_dist.size();
+
+ if (newAverage < currentAverage) {
+ min = max_dist[i];
+ minIndex = i;
+ }
+ }
+ }
+
+ return(names[minIndex]);
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "GetOTURepCommand", "FindRep");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+int GetOTURepCommand::process(ListVector* processList) {
+ try{
+ string name, sequence;
+ string nameRep;
+
+ //create output file
+ if (outputDir == "") { outputDir += m->hasPath(listfile); }
+
+ ofstream newNamesOutput;
+ string outputNamesFile;
+ map<string, ofstream*> filehandles;
+
+ map<string, string> variables;
+ variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
+
+ if (Groups.size() == 0) { //you don't want to use groups
+ variables["[tag]"] = processList->getLabel();
+ if (countfile == "") {
+ outputNamesFile = getOutputFileName("name", variables);
+ outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
+ }else {
+ outputNamesFile = getOutputFileName("count", variables);
+ outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile);
+ }
+ outputNameFiles[outputNamesFile] = processList->getLabel();
+ m->openOutputFile(outputNamesFile, newNamesOutput);
+ newNamesOutput << "noGroup" << endl;
+ }else{ //you want to use groups
+ ofstream* temp;
+ for (int i=0; i<Groups.size(); i++) {
+ temp = new ofstream;
+ variables["[tag]"] = processList->getLabel();
+ variables["[group]"] = Groups[i];
+ filehandles[Groups[i]] = temp;
+ outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".";
+ if (countfile == "") {
+ outputNamesFile = getOutputFileName("name", variables);
+ outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
+ }else {
+ outputNamesFile = getOutputFileName("count", variables);
+ outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile);
+ }
+
+ m->openOutputFile(outputNamesFile, *(temp));
+ *(temp) << Groups[i] << endl;
+ outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
+ }
+ }
+
+ //for each bin in the list vector
+ for (int i = 0; i < processList->size(); i++) {
+ if (m->control_pressed) {
+ out.close();
+ if (Groups.size() == 0) { //you don't want to use groups
+ newNamesOutput.close();