5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(){
42 //initialize outputTypes
43 vector<string> tempOutNames;
44 outputTypes["fasta"] = tempOutNames;
45 outputTypes["name"] = tempOutNames;
48 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
52 //**********************************************************************************************************************
53 vector<string> GetOTURepCommand::getValidParameters(){
55 string Array[] = {"fasta","list","label","name", "group", "weighted","sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
56 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
60 m->errorOut(e, "GetOTURepCommand", "getValidParameters");
64 //**********************************************************************************************************************
65 vector<string> GetOTURepCommand::getRequiredParameters(){
67 string Array[] = {"fasta","list"};
68 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
72 m->errorOut(e, "GetOTURepCommand", "getRequiredParameters");
76 //**********************************************************************************************************************
77 vector<string> GetOTURepCommand::getRequiredFiles(){
79 vector<string> myArray;
83 m->errorOut(e, "GetOTURepCommand", "getRequiredFiles");
87 //**********************************************************************************************************************
88 GetOTURepCommand::GetOTURepCommand(string option) {
90 globaldata = GlobalData::getInstance();
95 //allow user to run help
96 if (option == "help") {
99 //valid paramters for this command
100 string Array[] = {"fasta","list","label","name","weighted", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
101 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
103 OptionParser parser(option);
104 map<string, string> parameters = parser.getParameters();
106 ValidParameters validParameter;
107 map<string, string>::iterator it;
109 //check to make sure all parameters are valid for command
110 for (it = parameters.begin(); it != parameters.end(); it++) {
111 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
114 //initialize outputTypes
115 vector<string> tempOutNames;
116 outputTypes["fasta"] = tempOutNames;
117 outputTypes["name"] = tempOutNames;
119 //if the user changes the input directory command factory will send this info to us in the output parameter
120 string inputDir = validParameter.validFile(parameters, "inputdir", false);
121 if (inputDir == "not found"){ inputDir = ""; }
124 it = parameters.find("list");
125 //user has given a template file
126 if(it != parameters.end()){
127 path = m->hasPath(it->second);
128 //if the user has not given a path then, add inputdir. else leave path alone.
129 if (path == "") { parameters["list"] = inputDir + it->second; }
132 it = parameters.find("fasta");
133 //user has given a template file
134 if(it != parameters.end()){
135 path = m->hasPath(it->second);
136 //if the user has not given a path then, add inputdir. else leave path alone.
137 if (path == "") { parameters["fasta"] = inputDir + it->second; }
140 it = parameters.find("phylip");
141 //user has given a template file
142 if(it != parameters.end()){
143 path = m->hasPath(it->second);
144 //if the user has not given a path then, add inputdir. else leave path alone.
145 if (path == "") { parameters["phylip"] = inputDir + it->second; }
148 it = parameters.find("column");
149 //user has given a template file
150 if(it != parameters.end()){
151 path = m->hasPath(it->second);
152 //if the user has not given a path then, add inputdir. else leave path alone.
153 if (path == "") { parameters["column"] = inputDir + it->second; }
156 it = parameters.find("name");
157 //user has given a template file
158 if(it != parameters.end()){
159 path = m->hasPath(it->second);
160 //if the user has not given a path then, add inputdir. else leave path alone.
161 if (path == "") { parameters["name"] = inputDir + it->second; }
164 it = parameters.find("group");
165 //user has given a template file
166 if(it != parameters.end()){
167 path = m->hasPath(it->second);
168 //if the user has not given a path then, add inputdir. else leave path alone.
169 if (path == "") { parameters["group"] = inputDir + it->second; }
174 //if the user changes the output directory command factory will send this info to us in the output parameter
175 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
177 //check for required parameters
178 fastafile = validParameter.validFile(parameters, "fasta", true);
179 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
180 else if (fastafile == "not open") { abort = true; }
182 listfile = validParameter.validFile(parameters, "list", true);
183 if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
184 else if (listfile == "not open") { abort = true; }
186 phylipfile = validParameter.validFile(parameters, "phylip", true);
187 if (phylipfile == "not found") { phylipfile = ""; }
188 else if (phylipfile == "not open") { abort = true; }
189 else { distFile = phylipfile; format = "phylip"; }
191 columnfile = validParameter.validFile(parameters, "column", true);
192 if (columnfile == "not found") { columnfile = ""; }
193 else if (columnfile == "not open") { abort = true; }
194 else { distFile = columnfile; format = "column"; }
196 namefile = validParameter.validFile(parameters, "name", true);
197 if (namefile == "not open") { abort = true; }
198 else if (namefile == "not found") { namefile = ""; }
200 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
201 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
203 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
205 //check for optional parameter and set defaults
206 // ...at some point should added some additional type checking...
207 label = validParameter.validFile(parameters, "label", false);
208 if (label == "not found") { label = ""; allLines = 1; }
210 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
211 else { allLines = 1; }
214 groupfile = validParameter.validFile(parameters, "group", true);
215 if (groupfile == "not open") { groupfile = ""; abort = true; }
216 else if (groupfile == "not found") { groupfile = ""; }
218 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
219 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
220 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
224 if ((sorted == "group") && (groupfile == "")) {
225 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
229 groups = validParameter.validFile(parameters, "groups", false);
230 if (groups == "not found") { groups = ""; }
232 if (groupfile == "") {
233 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
236 m->splitAtDash(groups, Groups);
239 globaldata->Groups = Groups;
241 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
242 large = m->isTrue(temp);
244 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { if (namefile == "") { temp = "F"; } else { temp = "t"; } }
245 weighted = m->isTrue(temp);
247 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
249 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
250 convert(temp, precision);
252 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
253 convert(temp, cutoff);
254 cutoff += (5 / (precision * 10.0));
257 catch(exception& e) {
258 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
263 //**********************************************************************************************************************
265 void GetOTURepCommand::help(){
267 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n");
268 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
269 m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
270 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
271 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
272 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
273 m->mothurOut("The default value for label is all labels in your inputfile.\n");
274 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
275 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
276 m->mothurOut("The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false with no namesfile and true when a name file is provided.\n");
277 m->mothurOut("The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n");
278 m->mothurOut("For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n");
279 m->mothurOut("For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n");
280 m->mothurOut("If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n");
281 m->mothurOut("The group parameter allows you provide a group file.\n");
282 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
283 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
284 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
285 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
287 catch(exception& e) {
288 m->errorOut(e, "GetOTURepCommand", "help");
293 //**********************************************************************************************************************
295 GetOTURepCommand::~GetOTURepCommand(){}
297 //**********************************************************************************************************************
299 int GetOTURepCommand::execute(){
302 if (abort == true) { return 0; }
306 //read distance files
307 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
308 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
309 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
311 readMatrix->setCutoff(cutoff);
314 nameMap = new NameAssignment(namefile);
316 }else{ nameMap = NULL; }
318 readMatrix->read(nameMap);
320 if (m->control_pressed) { delete readMatrix; return 0; }
323 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
324 globaldata->gListVector = readMatrix->getListVector();
326 SparseMatrix* matrix = readMatrix->getMatrix();
328 // Create a data structure to quickly access the distance information.
329 // It consists of a vector of distance maps, where each map contains
330 // all distances of a certain sequence. Vector and maps are accessed
331 // via the index of a sequence in the distance matrix
332 seqVec = vector<SeqMap>(globaldata->gListVector->size());
333 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
334 if (m->control_pressed) { delete readMatrix; return 0; }
335 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
337 //add dummy map for unweighted calc
339 seqVec.push_back(dummy);
345 if (m->control_pressed) { return 0; }
347 //process file and set up indexes
348 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
349 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
350 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
352 formatMatrix->setCutoff(cutoff);
355 nameMap = new NameAssignment(namefile);
357 }else{ nameMap = NULL; }
359 formatMatrix->read(nameMap);
361 if (m->control_pressed) { delete formatMatrix; return 0; }
364 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
365 globaldata->gListVector = formatMatrix->getListVector();
367 distFile = formatMatrix->getFormattedFileName();
369 //positions in file where the distances for each sequence begin
370 //rowPositions[1] = position in file where distance related to sequence 1 start.
371 rowPositions = formatMatrix->getRowPositions();
372 rowPositions.push_back(-1); //dummy row for unweighted calc
377 //openfile for getMap to use
378 m->openInputFile(distFile, inRow);
380 if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
384 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
385 if (globaldata->gListVector != NULL) {
386 vector<string> names;
388 //map names to rows in sparsematrix
389 for (int i = 0; i < globaldata->gListVector->size(); i++) {
391 binnames = globaldata->gListVector->get(i);
393 m->splitAtComma(binnames, names);
395 for (int j = 0; j < names.size(); j++) {
396 nameToIndex[names[j]] = i;
399 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
402 if (m->control_pressed) {
403 if (large) { inRow.close(); remove(distFile.c_str()); }
407 if (groupfile != "") {
408 //read in group map info.
409 groupMap = new GroupMap(groupfile);
410 int error = groupMap->readMap();
411 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
413 if (Groups.size() != 0) {
414 SharedUtil* util = new SharedUtil();
415 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
420 //set format to list so input can get listvector
421 globaldata->setFormat("list");
424 read = new ReadOTUFile(listfile);
425 read->read(&*globaldata);
427 input = globaldata->ginput;
428 list = globaldata->gListVector;
429 string lastLabel = list->getLabel();
431 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
432 set<string> processedLabels;
433 set<string> userLabels = labels;
435 if (m->control_pressed) {
436 if (large) { inRow.close(); remove(distFile.c_str()); }
437 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
440 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
442 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
444 if (allLines == 1 || labels.count(list->getLabel()) == 1){
445 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
446 error = process(list);
447 if (error == 1) { return 0; } //there is an error in hte input files, abort command
449 if (m->control_pressed) {
450 if (large) { inRow.close(); remove(distFile.c_str()); }
451 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
452 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
455 processedLabels.insert(list->getLabel());
456 userLabels.erase(list->getLabel());
459 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
460 string saveLabel = list->getLabel();
463 list = input->getListVector(lastLabel);
464 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
465 error = process(list);
466 if (error == 1) { return 0; } //there is an error in hte input files, abort command
468 if (m->control_pressed) {
469 if (large) { inRow.close(); remove(distFile.c_str()); }
470 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
471 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
474 processedLabels.insert(list->getLabel());
475 userLabels.erase(list->getLabel());
477 //restore real lastlabel to save below
478 list->setLabel(saveLabel);
481 lastLabel = list->getLabel();
484 list = input->getListVector();
487 //output error messages about any remaining user labels
488 bool needToRun = false;
489 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
490 m->mothurOut("Your file does not include the label " + (*it));
491 if (processedLabels.count(lastLabel) != 1) {
492 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
495 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
499 //run last label if you need to
500 if (needToRun == true) {
501 if (list != NULL) { delete list; }
502 list = input->getListVector(lastLabel);
503 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
504 error = process(list);
506 if (error == 1) { return 0; } //there is an error in hte input files, abort command
508 if (m->control_pressed) {
509 if (large) { inRow.close(); remove(distFile.c_str()); }
510 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
511 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
515 //close and remove formatted matrix file
518 remove(distFile.c_str());
521 globaldata->gListVector = NULL;
522 delete input; globaldata->ginput = NULL;
525 if (!weighted) { nameFileMap.clear(); }
528 fasta = new FastaMap();
529 fasta->readFastaFile(fastafile);
531 //if user gave a namesfile then use it
532 if (namefile != "") { readNamesFile(); }
534 //output create and output the .rep.fasta files
535 map<string, string>::iterator itNameFile;
536 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
537 processNames(itNameFile->first, itNameFile->second);
541 if (groupfile != "") {
542 delete groupMap; globaldata->gGroupmap = NULL;
545 if (m->control_pressed) { return 0; }
547 m->mothurOutEndLine();
548 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
549 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
550 m->mothurOutEndLine();
554 catch(exception& e) {
555 m->errorOut(e, "GetOTURepCommand", "execute");
560 //**********************************************************************************************************************
561 void GetOTURepCommand::readNamesFile() {
564 vector<string> dupNames;
565 m->openInputFile(namefile, in);
567 string name, names, sequence;
570 in >> name; //read from first column A
571 in >> names; //read from second column A,B,C,D
575 //parse names into vector
576 m->splitAtComma(names, dupNames);
578 //store names in fasta map
579 sequence = fasta->getSequence(name);
580 for (int i = 0; i < dupNames.size(); i++) {
581 fasta->push_back(dupNames[i], sequence);
589 catch(exception& e) {
590 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
594 //**********************************************************************************************************************
595 //read names file to find the weighted rep for each bin
596 void GetOTURepCommand::readNamesFile(bool w) {
599 vector<string> dupNames;
600 m->openInputFile(namefile, in);
602 string name, names, sequence;
605 in >> name; m->gobble(in); //read from first column A
606 in >> names; //read from second column A,B,C,D
610 //parse names into vector
611 m->splitAtComma(names, dupNames);
613 for (int i = 0; i < dupNames.size(); i++) {
614 nameFileMap[dupNames[i]] = name;
622 catch(exception& e) {
623 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
627 //**********************************************************************************************************************
628 string GetOTURepCommand::findRep(vector<string> names) {
630 // if only 1 sequence in bin or processing the "unique" label, then
631 // the first sequence of the OTU is the representative one
632 if ((names.size() == 1) || (list->getLabel() == "unique")) {
635 vector<int> seqIndex(names.size());
636 vector<float> max_dist(names.size());
637 vector<float> total_dist(names.size());
638 map<string, string>::iterator itNameFile;
639 map<string, int>::iterator itNameIndex;
641 //fill seqIndex and initialize sums
642 for (size_t i = 0; i < names.size(); i++) {
644 seqIndex[i] = nameToIndex[names[i]];
646 if (namefile == "") {
647 itNameIndex = nameToIndex.find(names[i]);
649 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
650 if (large) { seqIndex[i] = (rowPositions.size()-1); }
651 else { seqIndex[i] = (seqVec.size()-1); }
653 seqIndex[i] = itNameIndex->second;
657 itNameFile = nameFileMap.find(names[i]);
659 if (itNameFile == nameFileMap.end()) {
660 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
662 string name1 = itNameFile->first;
663 string name2 = itNameFile->second;
665 if (name1 == name2) { //then you are unique so add your real dists
666 seqIndex[i] = nameToIndex[names[i]];
668 if (large) { seqIndex[i] = (rowPositions.size()-1); }
669 else { seqIndex[i] = (seqVec.size()-1); }
678 // loop through all entries in seqIndex
681 for (size_t i=0; i < seqIndex.size(); i++) {
682 if (m->control_pressed) { return "control"; }
684 if (!large) { currMap = seqVec[seqIndex[i]]; }
685 else { currMap = getMap(seqIndex[i]); }
687 for (size_t j=0; j < seqIndex.size(); j++) {
688 it = currMap.find(seqIndex[j]);
689 if (it != currMap.end()) {
690 max_dist[i] = max(max_dist[i], it->second);
691 max_dist[j] = max(max_dist[j], it->second);
692 total_dist[i] += it->second;
693 total_dist[j] += it->second;
694 }else{ //if you can't find the distance make it the cutoff
695 max_dist[i] = max(max_dist[i], cutoff);
696 max_dist[j] = max(max_dist[j], cutoff);
697 total_dist[i] += cutoff;
698 total_dist[j] += cutoff;
703 // sequence with the smallest maximum distance is the representative
704 //if tie occurs pick sequence with smallest average distance
707 for (size_t i=0; i < max_dist.size(); i++) {
708 if (m->control_pressed) { return "control"; }
709 if (max_dist[i] < min) {
712 }else if (max_dist[i] == min) {
713 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
714 float newAverage = total_dist[i] / (float) total_dist.size();
716 if (newAverage < currentAverage) {
723 return(names[minIndex]);
726 catch(exception& e) {
727 m->errorOut(e, "GetOTURepCommand", "FindRep");
732 //**********************************************************************************************************************
733 int GetOTURepCommand::process(ListVector* processList) {
735 string name, sequence;
739 if (outputDir == "") { outputDir += m->hasPath(listfile); }
741 ofstream newNamesOutput;
742 string outputNamesFile;
743 map<string, ofstream*> filehandles;
745 if (Groups.size() == 0) { //you don't want to use groups
746 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
747 m->openOutputFile(outputNamesFile, newNamesOutput);
748 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
749 outputNameFiles[outputNamesFile] = processList->getLabel();
750 }else{ //you want to use groups
752 for (int i=0; i<Groups.size(); i++) {
754 filehandles[Groups[i]] = temp;
755 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
757 m->openOutputFile(outputNamesFile, *(temp));
758 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
759 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
763 //for each bin in the list vector
764 for (int i = 0; i < processList->size(); i++) {
765 if (m->control_pressed) {
767 if (Groups.size() == 0) { //you don't want to use groups
768 newNamesOutput.close();
770 for (int j=0; j<Groups.size(); j++) {
771 (*(filehandles[Groups[j]])).close();
772 delete filehandles[Groups[j]];
778 string temp = processList->get(i);
779 vector<string> namesInBin;
780 m->splitAtComma(temp, namesInBin);
782 if (Groups.size() == 0) {
783 nameRep = findRep(namesInBin);
784 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
786 map<string, vector<string> > NamesInGroup;
787 for (int j=0; j<Groups.size(); j++) { //initialize groups
788 NamesInGroup[Groups[j]].resize(0);
791 for (int j=0; j<namesInBin.size(); j++) {
792 string thisgroup = groupMap->getGroup(namesInBin[j]);
794 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
796 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
797 NamesInGroup[thisgroup].push_back(namesInBin[j]);
801 //get rep for each group in otu
802 for (int j=0; j<Groups.size(); j++) {
803 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
804 //get rep for each group
805 nameRep = findRep(NamesInGroup[Groups[j]]);
807 //output group rep and other members of this group
808 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
810 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
811 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
814 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
820 if (Groups.size() == 0) { //you don't want to use groups
821 newNamesOutput.close();
823 for (int i=0; i<Groups.size(); i++) {
824 (*(filehandles[Groups[i]])).close();
825 delete filehandles[Groups[i]];
832 catch(exception& e) {
833 m->errorOut(e, "GetOTURepCommand", "process");
837 //**********************************************************************************************************************
838 int GetOTURepCommand::processNames(string filename, string label) {
842 if (outputDir == "") { outputDir += m->hasPath(listfile); }
843 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
844 m->openOutputFile(outputFileName, out);
845 vector<repStruct> reps;
846 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
849 string tempNameFile = filename + ".temp";
850 m->openOutputFile(tempNameFile, out2);
853 m->openInputFile(filename, in);
857 string rep, binnames;
858 in >> i >> rep >> binnames; m->gobble(in);
859 out2 << rep << '\t' << binnames << endl;
861 vector<string> names;
862 m->splitAtComma(binnames, names);
863 int binsize = names.size();
865 //if you have a groupfile
867 if (groupfile != "") {
868 map<string, string> groups;
869 map<string, string>::iterator groupIt;
871 //find the groups that are in this bin
872 for (size_t i = 0; i < names.size(); i++) {
873 string groupName = groupMap->getGroup(names[i]);
874 if (groupName == "not found") {
875 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
878 groups[groupName] = groupName;
882 //turn the groups into a string
883 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
884 group += groupIt->first + "-";
887 group = group.substr(0, group.length()-1);
891 //print out name and sequence for that bin
892 string sequence = fasta->getSequence(rep);
894 if (sequence != "not found") {
895 if (sorted == "") { //print them out
896 rep = rep + "\t" + toString(i+1);
897 rep = rep + "|" + toString(binsize);
898 if (groupfile != "") {
899 rep = rep + "|" + group;
901 out << ">" << rep << endl;
902 out << sequence << endl;
904 repStruct newRep(rep, i+1, binsize, group);
905 reps.push_back(newRep);
908 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
913 if (sorted != "") { //then sort them and print them
914 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
915 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
916 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
917 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
920 for (int i = 0; i < reps.size(); i++) {
921 string sequence = fasta->getSequence(reps[i].name);
922 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
923 outputName = outputName + "|" + toString(reps[i].size);
924 if (groupfile != "") {
925 outputName = outputName + "|" + reps[i].group;
927 out << ">" << outputName << endl;
928 out << sequence << endl;
936 remove(filename.c_str());
937 rename(tempNameFile.c_str(), filename.c_str());
942 catch(exception& e) {
943 m->errorOut(e, "GetOTURepCommand", "processNames");
947 //**********************************************************************************************************************
948 SeqMap GetOTURepCommand::getMap(int row) {
952 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
953 if (rowPositions[row] != -1){
955 inRow.seekg(rowPositions[row]);
957 int rowNum, numDists, colNum;
960 inRow >> rowNum >> numDists;
962 for(int i = 0; i < numDists; i++) {
963 inRow >> colNum >> dist;
964 rowMap[colNum] = dist;
971 catch(exception& e) {
972 m->errorOut(e, "GetOTURepCommand", "getMap");
976 //**********************************************************************************************************************