5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(){
41 abort = true; calledHelp = true;
42 vector<string> tempOutNames;
43 outputTypes["fasta"] = tempOutNames;
44 outputTypes["name"] = tempOutNames;
47 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
51 //**********************************************************************************************************************
52 vector<string> GetOTURepCommand::getValidParameters(){
54 string Array[] = {"fasta","list","label","name", "group", "weighted","sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
55 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
59 m->errorOut(e, "GetOTURepCommand", "getValidParameters");
63 //**********************************************************************************************************************
64 vector<string> GetOTURepCommand::getRequiredParameters(){
66 string Array[] = {"fasta","list"};
67 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
71 m->errorOut(e, "GetOTURepCommand", "getRequiredParameters");
75 //**********************************************************************************************************************
76 vector<string> GetOTURepCommand::getRequiredFiles(){
78 vector<string> myArray;
82 m->errorOut(e, "GetOTURepCommand", "getRequiredFiles");
86 //**********************************************************************************************************************
87 GetOTURepCommand::GetOTURepCommand(string option) {
89 globaldata = GlobalData::getInstance();
90 abort = false; calledHelp = false;
94 //allow user to run help
95 if (option == "help") {
96 help(); abort = true; calledHelp = true;
98 //valid paramters for this command
99 string Array[] = {"fasta","list","label","name","weighted", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
100 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
102 OptionParser parser(option);
103 map<string, string> parameters = parser.getParameters();
105 ValidParameters validParameter;
106 map<string, string>::iterator it;
108 //check to make sure all parameters are valid for command
109 for (it = parameters.begin(); it != parameters.end(); it++) {
110 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
113 //initialize outputTypes
114 vector<string> tempOutNames;
115 outputTypes["fasta"] = tempOutNames;
116 outputTypes["name"] = tempOutNames;
118 //if the user changes the input directory command factory will send this info to us in the output parameter
119 string inputDir = validParameter.validFile(parameters, "inputdir", false);
120 if (inputDir == "not found"){ inputDir = ""; }
123 it = parameters.find("list");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["list"] = inputDir + it->second; }
131 it = parameters.find("fasta");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["fasta"] = inputDir + it->second; }
139 it = parameters.find("phylip");
140 //user has given a template file
141 if(it != parameters.end()){
142 path = m->hasPath(it->second);
143 //if the user has not given a path then, add inputdir. else leave path alone.
144 if (path == "") { parameters["phylip"] = inputDir + it->second; }
147 it = parameters.find("column");
148 //user has given a template file
149 if(it != parameters.end()){
150 path = m->hasPath(it->second);
151 //if the user has not given a path then, add inputdir. else leave path alone.
152 if (path == "") { parameters["column"] = inputDir + it->second; }
155 it = parameters.find("name");
156 //user has given a template file
157 if(it != parameters.end()){
158 path = m->hasPath(it->second);
159 //if the user has not given a path then, add inputdir. else leave path alone.
160 if (path == "") { parameters["name"] = inputDir + it->second; }
163 it = parameters.find("group");
164 //user has given a template file
165 if(it != parameters.end()){
166 path = m->hasPath(it->second);
167 //if the user has not given a path then, add inputdir. else leave path alone.
168 if (path == "") { parameters["group"] = inputDir + it->second; }
173 //if the user changes the output directory command factory will send this info to us in the output parameter
174 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
176 //check for required parameters
177 fastafile = validParameter.validFile(parameters, "fasta", true);
178 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
179 else if (fastafile == "not open") { abort = true; }
181 listfile = validParameter.validFile(parameters, "list", true);
182 if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
183 else if (listfile == "not open") { abort = true; }
185 phylipfile = validParameter.validFile(parameters, "phylip", true);
186 if (phylipfile == "not found") { phylipfile = ""; }
187 else if (phylipfile == "not open") { abort = true; }
188 else { distFile = phylipfile; format = "phylip"; }
190 columnfile = validParameter.validFile(parameters, "column", true);
191 if (columnfile == "not found") { columnfile = ""; }
192 else if (columnfile == "not open") { abort = true; }
193 else { distFile = columnfile; format = "column"; }
195 namefile = validParameter.validFile(parameters, "name", true);
196 if (namefile == "not open") { abort = true; }
197 else if (namefile == "not found") { namefile = ""; }
199 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
200 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
202 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
204 //check for optional parameter and set defaults
205 // ...at some point should added some additional type checking...
206 label = validParameter.validFile(parameters, "label", false);
207 if (label == "not found") { label = ""; allLines = 1; }
209 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
210 else { allLines = 1; }
213 groupfile = validParameter.validFile(parameters, "group", true);
214 if (groupfile == "not open") { groupfile = ""; abort = true; }
215 else if (groupfile == "not found") { groupfile = ""; }
217 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
218 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
219 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
223 if ((sorted == "group") && (groupfile == "")) {
224 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
228 groups = validParameter.validFile(parameters, "groups", false);
229 if (groups == "not found") { groups = ""; }
231 if (groupfile == "") {
232 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
235 m->splitAtDash(groups, Groups);
238 globaldata->Groups = Groups;
240 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
241 large = m->isTrue(temp);
243 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { if (namefile == "") { temp = "F"; } else { temp = "t"; } }
244 weighted = m->isTrue(temp);
246 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
248 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
249 convert(temp, precision);
251 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
252 convert(temp, cutoff);
253 cutoff += (5 / (precision * 10.0));
256 catch(exception& e) {
257 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
262 //**********************************************************************************************************************
264 void GetOTURepCommand::help(){
266 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n");
267 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
268 m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
269 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
270 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
271 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
272 m->mothurOut("The default value for label is all labels in your inputfile.\n");
273 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
274 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
275 m->mothurOut("The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false with no namesfile and true when a name file is provided.\n");
276 m->mothurOut("The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n");
277 m->mothurOut("For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n");
278 m->mothurOut("For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n");
279 m->mothurOut("If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n");
280 m->mothurOut("The group parameter allows you provide a group file.\n");
281 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
282 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
283 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
284 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
286 catch(exception& e) {
287 m->errorOut(e, "GetOTURepCommand", "help");
292 //**********************************************************************************************************************
294 GetOTURepCommand::~GetOTURepCommand(){}
296 //**********************************************************************************************************************
298 int GetOTURepCommand::execute(){
301 if (abort == true) { if (calledHelp) { return 0; } return 2; }
305 //read distance files
306 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
307 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
308 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
310 readMatrix->setCutoff(cutoff);
313 nameMap = new NameAssignment(namefile);
315 }else{ nameMap = NULL; }
317 readMatrix->read(nameMap);
319 if (m->control_pressed) { delete readMatrix; return 0; }
322 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
323 globaldata->gListVector = readMatrix->getListVector();
325 SparseMatrix* matrix = readMatrix->getMatrix();
327 // Create a data structure to quickly access the distance information.
328 // It consists of a vector of distance maps, where each map contains
329 // all distances of a certain sequence. Vector and maps are accessed
330 // via the index of a sequence in the distance matrix
331 seqVec = vector<SeqMap>(globaldata->gListVector->size());
332 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
333 if (m->control_pressed) { delete readMatrix; return 0; }
334 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
336 //add dummy map for unweighted calc
338 seqVec.push_back(dummy);
344 if (m->control_pressed) { return 0; }
346 //process file and set up indexes
347 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
348 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
349 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
351 formatMatrix->setCutoff(cutoff);
354 nameMap = new NameAssignment(namefile);
356 }else{ nameMap = NULL; }
358 formatMatrix->read(nameMap);
360 if (m->control_pressed) { delete formatMatrix; return 0; }
363 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
364 globaldata->gListVector = formatMatrix->getListVector();
366 distFile = formatMatrix->getFormattedFileName();
368 //positions in file where the distances for each sequence begin
369 //rowPositions[1] = position in file where distance related to sequence 1 start.
370 rowPositions = formatMatrix->getRowPositions();
371 rowPositions.push_back(-1); //dummy row for unweighted calc
376 //openfile for getMap to use
377 m->openInputFile(distFile, inRow);
379 if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
383 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
384 if (globaldata->gListVector != NULL) {
385 vector<string> names;
387 //map names to rows in sparsematrix
388 for (int i = 0; i < globaldata->gListVector->size(); i++) {
390 binnames = globaldata->gListVector->get(i);
392 m->splitAtComma(binnames, names);
394 for (int j = 0; j < names.size(); j++) {
395 nameToIndex[names[j]] = i;
398 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
401 if (m->control_pressed) {
402 if (large) { inRow.close(); remove(distFile.c_str()); }
406 if (groupfile != "") {
407 //read in group map info.
408 groupMap = new GroupMap(groupfile);
409 int error = groupMap->readMap();
410 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
412 if (Groups.size() != 0) {
413 SharedUtil* util = new SharedUtil();
414 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
419 //set format to list so input can get listvector
420 globaldata->setFormat("list");
423 read = new ReadOTUFile(listfile);
424 read->read(&*globaldata);
426 input = globaldata->ginput;
427 list = globaldata->gListVector;
428 string lastLabel = list->getLabel();
430 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
431 set<string> processedLabels;
432 set<string> userLabels = labels;
434 if (m->control_pressed) {
435 if (large) { inRow.close(); remove(distFile.c_str()); }
436 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
439 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
441 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
443 if (allLines == 1 || labels.count(list->getLabel()) == 1){
444 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
445 error = process(list);
446 if (error == 1) { return 0; } //there is an error in hte input files, abort command
448 if (m->control_pressed) {
449 if (large) { inRow.close(); remove(distFile.c_str()); }
450 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
451 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
454 processedLabels.insert(list->getLabel());
455 userLabels.erase(list->getLabel());
458 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
459 string saveLabel = list->getLabel();
462 list = input->getListVector(lastLabel);
463 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
464 error = process(list);
465 if (error == 1) { return 0; } //there is an error in hte input files, abort command
467 if (m->control_pressed) {
468 if (large) { inRow.close(); remove(distFile.c_str()); }
469 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
470 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
473 processedLabels.insert(list->getLabel());
474 userLabels.erase(list->getLabel());
476 //restore real lastlabel to save below
477 list->setLabel(saveLabel);
480 lastLabel = list->getLabel();
483 list = input->getListVector();
486 //output error messages about any remaining user labels
487 bool needToRun = false;
488 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
489 m->mothurOut("Your file does not include the label " + (*it));
490 if (processedLabels.count(lastLabel) != 1) {
491 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
494 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
498 //run last label if you need to
499 if (needToRun == true) {
500 if (list != NULL) { delete list; }
501 list = input->getListVector(lastLabel);
502 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
503 error = process(list);
505 if (error == 1) { return 0; } //there is an error in hte input files, abort command
507 if (m->control_pressed) {
508 if (large) { inRow.close(); remove(distFile.c_str()); }
509 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
510 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
514 //close and remove formatted matrix file
517 remove(distFile.c_str());
520 globaldata->gListVector = NULL;
521 delete input; globaldata->ginput = NULL;
524 if (!weighted) { nameFileMap.clear(); }
527 fasta = new FastaMap();
528 fasta->readFastaFile(fastafile);
530 //if user gave a namesfile then use it
531 if (namefile != "") { readNamesFile(); }
533 //output create and output the .rep.fasta files
534 map<string, string>::iterator itNameFile;
535 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
536 processNames(itNameFile->first, itNameFile->second);
540 if (groupfile != "") {
541 delete groupMap; globaldata->gGroupmap = NULL;
544 if (m->control_pressed) { return 0; }
546 //set fasta file as new current fastafile - use first one??
548 itTypes = outputTypes.find("fasta");
549 if (itTypes != outputTypes.end()) {
550 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
553 itTypes = outputTypes.find("name");
554 if (itTypes != outputTypes.end()) {
555 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
558 m->mothurOutEndLine();
559 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
560 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
561 m->mothurOutEndLine();
565 catch(exception& e) {
566 m->errorOut(e, "GetOTURepCommand", "execute");
571 //**********************************************************************************************************************
572 void GetOTURepCommand::readNamesFile() {
575 vector<string> dupNames;
576 m->openInputFile(namefile, in);
578 string name, names, sequence;
581 in >> name; //read from first column A
582 in >> names; //read from second column A,B,C,D
586 //parse names into vector
587 m->splitAtComma(names, dupNames);
589 //store names in fasta map
590 sequence = fasta->getSequence(name);
591 for (int i = 0; i < dupNames.size(); i++) {
592 fasta->push_back(dupNames[i], sequence);
600 catch(exception& e) {
601 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
605 //**********************************************************************************************************************
606 //read names file to find the weighted rep for each bin
607 void GetOTURepCommand::readNamesFile(bool w) {
610 vector<string> dupNames;
611 m->openInputFile(namefile, in);
613 string name, names, sequence;
616 in >> name; m->gobble(in); //read from first column A
617 in >> names; //read from second column A,B,C,D
621 //parse names into vector
622 m->splitAtComma(names, dupNames);
624 for (int i = 0; i < dupNames.size(); i++) {
625 nameFileMap[dupNames[i]] = name;
633 catch(exception& e) {
634 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
638 //**********************************************************************************************************************
639 string GetOTURepCommand::findRep(vector<string> names) {
641 // if only 1 sequence in bin or processing the "unique" label, then
642 // the first sequence of the OTU is the representative one
643 if ((names.size() == 1) || (list->getLabel() == "unique")) {
646 vector<int> seqIndex(names.size());
647 vector<float> max_dist(names.size());
648 vector<float> total_dist(names.size());
649 map<string, string>::iterator itNameFile;
650 map<string, int>::iterator itNameIndex;
652 //fill seqIndex and initialize sums
653 for (size_t i = 0; i < names.size(); i++) {
655 seqIndex[i] = nameToIndex[names[i]];
657 if (namefile == "") {
658 itNameIndex = nameToIndex.find(names[i]);
660 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
661 if (large) { seqIndex[i] = (rowPositions.size()-1); }
662 else { seqIndex[i] = (seqVec.size()-1); }
664 seqIndex[i] = itNameIndex->second;
668 itNameFile = nameFileMap.find(names[i]);
670 if (itNameFile == nameFileMap.end()) {
671 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
673 string name1 = itNameFile->first;
674 string name2 = itNameFile->second;
676 if (name1 == name2) { //then you are unique so add your real dists
677 seqIndex[i] = nameToIndex[names[i]];
679 if (large) { seqIndex[i] = (rowPositions.size()-1); }
680 else { seqIndex[i] = (seqVec.size()-1); }
689 // loop through all entries in seqIndex
692 for (size_t i=0; i < seqIndex.size(); i++) {
693 if (m->control_pressed) { return "control"; }
695 if (!large) { currMap = seqVec[seqIndex[i]]; }
696 else { currMap = getMap(seqIndex[i]); }
698 for (size_t j=0; j < seqIndex.size(); j++) {
699 it = currMap.find(seqIndex[j]);
700 if (it != currMap.end()) {
701 max_dist[i] = max(max_dist[i], it->second);
702 max_dist[j] = max(max_dist[j], it->second);
703 total_dist[i] += it->second;
704 total_dist[j] += it->second;
705 }else{ //if you can't find the distance make it the cutoff
706 max_dist[i] = max(max_dist[i], cutoff);
707 max_dist[j] = max(max_dist[j], cutoff);
708 total_dist[i] += cutoff;
709 total_dist[j] += cutoff;
714 // sequence with the smallest maximum distance is the representative
715 //if tie occurs pick sequence with smallest average distance
718 for (size_t i=0; i < max_dist.size(); i++) {
719 if (m->control_pressed) { return "control"; }
720 if (max_dist[i] < min) {
723 }else if (max_dist[i] == min) {
724 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
725 float newAverage = total_dist[i] / (float) total_dist.size();
727 if (newAverage < currentAverage) {
734 return(names[minIndex]);
737 catch(exception& e) {
738 m->errorOut(e, "GetOTURepCommand", "FindRep");
743 //**********************************************************************************************************************
744 int GetOTURepCommand::process(ListVector* processList) {
746 string name, sequence;
750 if (outputDir == "") { outputDir += m->hasPath(listfile); }
752 ofstream newNamesOutput;
753 string outputNamesFile;
754 map<string, ofstream*> filehandles;
756 if (Groups.size() == 0) { //you don't want to use groups
757 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
758 m->openOutputFile(outputNamesFile, newNamesOutput);
759 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
760 outputNameFiles[outputNamesFile] = processList->getLabel();
761 }else{ //you want to use groups
763 for (int i=0; i<Groups.size(); i++) {
765 filehandles[Groups[i]] = temp;
766 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
768 m->openOutputFile(outputNamesFile, *(temp));
769 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
770 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
774 //for each bin in the list vector
775 for (int i = 0; i < processList->size(); i++) {
776 if (m->control_pressed) {
778 if (Groups.size() == 0) { //you don't want to use groups
779 newNamesOutput.close();
781 for (int j=0; j<Groups.size(); j++) {
782 (*(filehandles[Groups[j]])).close();
783 delete filehandles[Groups[j]];
789 string temp = processList->get(i);
790 vector<string> namesInBin;
791 m->splitAtComma(temp, namesInBin);
793 if (Groups.size() == 0) {
794 nameRep = findRep(namesInBin);
795 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
797 map<string, vector<string> > NamesInGroup;
798 for (int j=0; j<Groups.size(); j++) { //initialize groups
799 NamesInGroup[Groups[j]].resize(0);
802 for (int j=0; j<namesInBin.size(); j++) {
803 string thisgroup = groupMap->getGroup(namesInBin[j]);
805 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
807 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
808 NamesInGroup[thisgroup].push_back(namesInBin[j]);
812 //get rep for each group in otu
813 for (int j=0; j<Groups.size(); j++) {
814 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
815 //get rep for each group
816 nameRep = findRep(NamesInGroup[Groups[j]]);
818 //output group rep and other members of this group
819 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
821 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
822 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
825 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
831 if (Groups.size() == 0) { //you don't want to use groups
832 newNamesOutput.close();
834 for (int i=0; i<Groups.size(); i++) {
835 (*(filehandles[Groups[i]])).close();
836 delete filehandles[Groups[i]];
843 catch(exception& e) {
844 m->errorOut(e, "GetOTURepCommand", "process");
848 //**********************************************************************************************************************
849 int GetOTURepCommand::processNames(string filename, string label) {
853 if (outputDir == "") { outputDir += m->hasPath(listfile); }
854 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
855 m->openOutputFile(outputFileName, out);
856 vector<repStruct> reps;
857 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
860 string tempNameFile = filename + ".temp";
861 m->openOutputFile(tempNameFile, out2);
864 m->openInputFile(filename, in);
868 string rep, binnames;
869 in >> i >> rep >> binnames; m->gobble(in);
870 out2 << rep << '\t' << binnames << endl;
872 vector<string> names;
873 m->splitAtComma(binnames, names);
874 int binsize = names.size();
876 //if you have a groupfile
878 if (groupfile != "") {
879 map<string, string> groups;
880 map<string, string>::iterator groupIt;
882 //find the groups that are in this bin
883 for (size_t i = 0; i < names.size(); i++) {
884 string groupName = groupMap->getGroup(names[i]);
885 if (groupName == "not found") {
886 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
889 groups[groupName] = groupName;
893 //turn the groups into a string
894 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
895 group += groupIt->first + "-";
898 group = group.substr(0, group.length()-1);
902 //print out name and sequence for that bin
903 string sequence = fasta->getSequence(rep);
905 if (sequence != "not found") {
906 if (sorted == "") { //print them out
907 rep = rep + "\t" + toString(i+1);
908 rep = rep + "|" + toString(binsize);
909 if (groupfile != "") {
910 rep = rep + "|" + group;
912 out << ">" << rep << endl;
913 out << sequence << endl;
915 repStruct newRep(rep, i+1, binsize, group);
916 reps.push_back(newRep);
919 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
924 if (sorted != "") { //then sort them and print them
925 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
926 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
927 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
928 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
931 for (int i = 0; i < reps.size(); i++) {
932 string sequence = fasta->getSequence(reps[i].name);
933 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
934 outputName = outputName + "|" + toString(reps[i].size);
935 if (groupfile != "") {
936 outputName = outputName + "|" + reps[i].group;
938 out << ">" << outputName << endl;
939 out << sequence << endl;
947 remove(filename.c_str());
948 rename(tempNameFile.c_str(), filename.c_str());
953 catch(exception& e) {
954 m->errorOut(e, "GetOTURepCommand", "processNames");
958 //**********************************************************************************************************************
959 SeqMap GetOTURepCommand::getMap(int row) {
963 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
964 if (rowPositions[row] != -1){
966 inRow.seekg(rowPositions[row]);
968 int rowNum, numDists, colNum;
971 inRow >> rowNum >> numDists;
973 for(int i = 0; i < numDists; i++) {
974 inRow >> colNum >> dist;
975 rowMap[colNum] = dist;
982 catch(exception& e) {
983 m->errorOut(e, "GetOTURepCommand", "getMap");
987 //**********************************************************************************************************************