5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(){
42 //initialize outputTypes
43 vector<string> tempOutNames;
44 outputTypes["fasta"] = tempOutNames;
45 outputTypes["name"] = tempOutNames;
48 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
52 //**********************************************************************************************************************
53 vector<string> GetOTURepCommand::getValidParameters(){
55 string Array[] = {"fasta","list","label","name", "group", "weighted","sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
56 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
60 m->errorOut(e, "GetOTURepCommand", "getValidParameters");
64 //**********************************************************************************************************************
65 vector<string> GetOTURepCommand::getRequiredParameters(){
67 string Array[] = {"fasta","list"};
68 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
72 m->errorOut(e, "GetOTURepCommand", "getRequiredParameters");
76 //**********************************************************************************************************************
77 vector<string> GetOTURepCommand::getRequiredFiles(){
79 vector<string> myArray;
83 m->errorOut(e, "GetOTURepCommand", "getRequiredFiles");
87 //**********************************************************************************************************************
88 GetOTURepCommand::GetOTURepCommand(string option) {
90 globaldata = GlobalData::getInstance();
95 //allow user to run help
96 if (option == "help") {
99 //valid paramters for this command
100 string Array[] = {"fasta","list","label","name","weighted", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
101 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
103 OptionParser parser(option);
104 map<string, string> parameters = parser.getParameters();
106 ValidParameters validParameter;
107 map<string, string>::iterator it;
109 //check to make sure all parameters are valid for command
110 for (it = parameters.begin(); it != parameters.end(); it++) {
111 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
114 //initialize outputTypes
115 vector<string> tempOutNames;
116 outputTypes["fasta"] = tempOutNames;
117 outputTypes["name"] = tempOutNames;
119 //if the user changes the input directory command factory will send this info to us in the output parameter
120 string inputDir = validParameter.validFile(parameters, "inputdir", false);
121 if (inputDir == "not found"){ inputDir = ""; }
124 it = parameters.find("list");
125 //user has given a template file
126 if(it != parameters.end()){
127 path = m->hasPath(it->second);
128 //if the user has not given a path then, add inputdir. else leave path alone.
129 if (path == "") { parameters["list"] = inputDir + it->second; }
132 it = parameters.find("fasta");
133 //user has given a template file
134 if(it != parameters.end()){
135 path = m->hasPath(it->second);
136 //if the user has not given a path then, add inputdir. else leave path alone.
137 if (path == "") { parameters["fasta"] = inputDir + it->second; }
140 it = parameters.find("phylip");
141 //user has given a template file
142 if(it != parameters.end()){
143 path = m->hasPath(it->second);
144 //if the user has not given a path then, add inputdir. else leave path alone.
145 if (path == "") { parameters["phylip"] = inputDir + it->second; }
148 it = parameters.find("column");
149 //user has given a template file
150 if(it != parameters.end()){
151 path = m->hasPath(it->second);
152 //if the user has not given a path then, add inputdir. else leave path alone.
153 if (path == "") { parameters["column"] = inputDir + it->second; }
156 it = parameters.find("name");
157 //user has given a template file
158 if(it != parameters.end()){
159 path = m->hasPath(it->second);
160 //if the user has not given a path then, add inputdir. else leave path alone.
161 if (path == "") { parameters["name"] = inputDir + it->second; }
164 it = parameters.find("group");
165 //user has given a template file
166 if(it != parameters.end()){
167 path = m->hasPath(it->second);
168 //if the user has not given a path then, add inputdir. else leave path alone.
169 if (path == "") { parameters["group"] = inputDir + it->second; }
174 //if the user changes the output directory command factory will send this info to us in the output parameter
175 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
177 //check for required parameters
178 fastafile = validParameter.validFile(parameters, "fasta", true);
179 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
180 else if (fastafile == "not open") { abort = true; }
182 listfile = validParameter.validFile(parameters, "list", true);
183 if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
184 else if (listfile == "not open") { abort = true; }
186 phylipfile = validParameter.validFile(parameters, "phylip", true);
187 if (phylipfile == "not found") { phylipfile = ""; }
188 else if (phylipfile == "not open") { abort = true; }
189 else { distFile = phylipfile; format = "phylip"; }
191 columnfile = validParameter.validFile(parameters, "column", true);
192 if (columnfile == "not found") { columnfile = ""; }
193 else if (columnfile == "not open") { abort = true; }
194 else { distFile = columnfile; format = "column"; }
196 namefile = validParameter.validFile(parameters, "name", true);
197 if (namefile == "not open") { abort = true; }
198 else if (namefile == "not found") { namefile = ""; }
200 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
201 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
203 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
205 //check for optional parameter and set defaults
206 // ...at some point should added some additional type checking...
207 label = validParameter.validFile(parameters, "label", false);
208 if (label == "not found") { label = ""; allLines = 1; }
210 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
211 else { allLines = 1; }
214 groupfile = validParameter.validFile(parameters, "group", true);
215 if (groupfile == "not open") { groupfile = ""; abort = true; }
216 else if (groupfile == "not found") { groupfile = ""; }
218 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
219 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
220 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
224 if ((sorted == "group") && (groupfile == "")) {
225 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
229 groups = validParameter.validFile(parameters, "groups", false);
230 if (groups == "not found") { groups = ""; }
232 if (groupfile == "") {
233 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
236 m->splitAtDash(groups, Groups);
239 globaldata->Groups = Groups;
241 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
242 large = m->isTrue(temp);
244 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { if (namefile == "") { temp = "F"; } else { temp = "t"; } }
245 weighted = m->isTrue(temp);
247 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
249 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
250 convert(temp, precision);
252 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
253 convert(temp, cutoff);
254 cutoff += (5 / (precision * 10.0));
257 catch(exception& e) {
258 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
263 //**********************************************************************************************************************
265 void GetOTURepCommand::help(){
267 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n");
268 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
269 m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
270 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
271 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
272 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
273 m->mothurOut("The default value for label is all labels in your inputfile.\n");
274 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
275 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
276 m->mothurOut("The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false with no namesfile and true when a name file is provided.\n");
277 m->mothurOut("The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n");
278 m->mothurOut("For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n");
279 m->mothurOut("For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n");
280 m->mothurOut("If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n");
281 m->mothurOut("The group parameter allows you provide a group file.\n");
282 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
283 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
284 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
285 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
287 catch(exception& e) {
288 m->errorOut(e, "GetOTURepCommand", "help");
293 //**********************************************************************************************************************
295 GetOTURepCommand::~GetOTURepCommand(){}
297 //**********************************************************************************************************************
299 int GetOTURepCommand::execute(){
302 if (abort == true) { return 0; }
306 //read distance files
307 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
308 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
309 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
311 readMatrix->setCutoff(cutoff);
314 nameMap = new NameAssignment(namefile);
316 }else{ nameMap = NULL; }
318 readMatrix->read(nameMap);
320 if (m->control_pressed) { delete readMatrix; return 0; }
323 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
324 globaldata->gListVector = readMatrix->getListVector();
326 SparseMatrix* matrix = readMatrix->getMatrix();
328 // Create a data structure to quickly access the distance information.
329 // It consists of a vector of distance maps, where each map contains
330 // all distances of a certain sequence. Vector and maps are accessed
331 // via the index of a sequence in the distance matrix
332 seqVec = vector<SeqMap>(globaldata->gListVector->size());
333 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
334 if (m->control_pressed) { delete readMatrix; return 0; }
335 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
337 //add dummy map for unweighted calc
339 seqVec.push_back(dummy);
345 if (m->control_pressed) { return 0; }
347 //process file and set up indexes
348 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
349 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
350 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
352 formatMatrix->setCutoff(cutoff);
355 nameMap = new NameAssignment(namefile);
357 }else{ nameMap = NULL; }
359 formatMatrix->read(nameMap);
361 if (m->control_pressed) { delete formatMatrix; return 0; }
364 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
365 globaldata->gListVector = formatMatrix->getListVector();
367 distFile = formatMatrix->getFormattedFileName();
369 //positions in file where the distances for each sequence begin
370 //rowPositions[1] = position in file where distance related to sequence 1 start.
371 rowPositions = formatMatrix->getRowPositions();
372 rowPositions.push_back(-1); //dummy row for unweighted calc
377 //openfile for getMap to use
378 m->openInputFile(distFile, inRow);
380 if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
384 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
385 if (globaldata->gListVector != NULL) {
386 vector<string> names;
388 //map names to rows in sparsematrix
389 for (int i = 0; i < globaldata->gListVector->size(); i++) {
391 binnames = globaldata->gListVector->get(i);
393 m->splitAtComma(binnames, names);
395 for (int j = 0; j < names.size(); j++) {
396 nameToIndex[names[j]] = i;
399 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
402 if (m->control_pressed) {
403 if (large) { inRow.close(); remove(distFile.c_str()); }
407 if (groupfile != "") {
408 //read in group map info.
409 groupMap = new GroupMap(groupfile);
410 int error = groupMap->readMap();
411 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
413 if (Groups.size() != 0) {
414 SharedUtil* util = new SharedUtil();
415 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
420 //set format to list so input can get listvector
421 globaldata->setFormat("list");
424 read = new ReadOTUFile(listfile);
425 read->read(&*globaldata);
427 input = globaldata->ginput;
428 list = globaldata->gListVector;
429 string lastLabel = list->getLabel();
431 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
432 set<string> processedLabels;
433 set<string> userLabels = labels;
435 if (m->control_pressed) {
436 if (large) { inRow.close(); remove(distFile.c_str()); }
437 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
440 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
442 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
444 if (allLines == 1 || labels.count(list->getLabel()) == 1){
445 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
446 error = process(list);
447 if (error == 1) { return 0; } //there is an error in hte input files, abort command
449 if (m->control_pressed) {
450 if (large) { inRow.close(); remove(distFile.c_str()); }
451 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
452 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
455 processedLabels.insert(list->getLabel());
456 userLabels.erase(list->getLabel());
459 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
460 string saveLabel = list->getLabel();
463 list = input->getListVector(lastLabel);
464 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
465 error = process(list);
466 if (error == 1) { return 0; } //there is an error in hte input files, abort command
468 if (m->control_pressed) {
469 if (large) { inRow.close(); remove(distFile.c_str()); }
470 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
471 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
474 processedLabels.insert(list->getLabel());
475 userLabels.erase(list->getLabel());
477 //restore real lastlabel to save below
478 list->setLabel(saveLabel);
481 lastLabel = list->getLabel();
484 list = input->getListVector();
487 //output error messages about any remaining user labels
488 bool needToRun = false;
489 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
490 m->mothurOut("Your file does not include the label " + (*it));
491 if (processedLabels.count(lastLabel) != 1) {
492 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
495 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
499 //run last label if you need to
500 if (needToRun == true) {
501 if (list != NULL) { delete list; }
502 list = input->getListVector(lastLabel);
503 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
504 error = process(list);
506 if (error == 1) { return 0; } //there is an error in hte input files, abort command
508 if (m->control_pressed) {
509 if (large) { inRow.close(); remove(distFile.c_str()); }
510 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
511 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
515 //close and remove formatted matrix file
518 remove(distFile.c_str());
521 globaldata->gListVector = NULL;
522 delete input; globaldata->ginput = NULL;
525 if (!weighted) { nameFileMap.clear(); }
528 fasta = new FastaMap();
529 fasta->readFastaFile(fastafile);
531 //if user gave a namesfile then use it
532 if (namefile != "") { readNamesFile(); }
534 //output create and output the .rep.fasta files
535 map<string, string>::iterator itNameFile;
536 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
537 processNames(itNameFile->first, itNameFile->second);
541 if (groupfile != "") {
542 delete groupMap; globaldata->gGroupmap = NULL;
545 if (m->control_pressed) { return 0; }
547 m->mothurOutEndLine();
548 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
549 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
550 m->mothurOutEndLine();
554 catch(exception& e) {
555 m->errorOut(e, "GetOTURepCommand", "execute");
560 //**********************************************************************************************************************
561 void GetOTURepCommand::readNamesFile() {
563 vector<string> dupNames;
564 m->openInputFile(namefile, inNames);
566 string name, names, sequence;
568 while(!inNames.eof()){
569 inNames >> name; //read from first column A
570 inNames >> names; //read from second column A,B,C,D
574 //parse names into vector
575 m->splitAtComma(names, dupNames);
577 //store names in fasta map
578 sequence = fasta->getSequence(name);
579 for (int i = 0; i < dupNames.size(); i++) {
580 fasta->push_back(dupNames[i], sequence);
588 catch(exception& e) {
589 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
593 //**********************************************************************************************************************
594 //read names file to find the weighted rep for each bin
595 void GetOTURepCommand::readNamesFile(bool w) {
597 vector<string> dupNames;
598 m->openInputFile(namefile, inNames);
600 string name, names, sequence;
602 while(!inNames.eof()){
603 inNames >> name; m->gobble(inNames); //read from first column A
604 inNames >> names; //read from second column A,B,C,D
608 //parse names into vector
609 m->splitAtComma(names, dupNames);
611 for (int i = 0; i < dupNames.size(); i++) {
612 nameFileMap[dupNames[i]] = name;
620 catch(exception& e) {
621 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
625 //**********************************************************************************************************************
626 string GetOTURepCommand::findRep(vector<string> names) {
628 // if only 1 sequence in bin or processing the "unique" label, then
629 // the first sequence of the OTU is the representative one
630 if ((names.size() == 1) || (list->getLabel() == "unique")) {
633 vector<int> seqIndex(names.size());
634 vector<float> max_dist(names.size());
635 vector<float> total_dist(names.size());
636 map<string, string>::iterator itNameFile;
637 map<string, int>::iterator itNameIndex;
639 //fill seqIndex and initialize sums
640 for (size_t i = 0; i < names.size(); i++) {
642 seqIndex[i] = nameToIndex[names[i]];
644 if (namefile == "") {
645 itNameIndex = nameToIndex.find(names[i]);
647 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
648 if (large) { seqIndex[i] = (rowPositions.size()-1); }
649 else { seqIndex[i] = (seqVec.size()-1); }
651 seqIndex[i] = itNameIndex->second;
655 itNameFile = nameFileMap.find(names[i]);
657 if (itNameFile == nameFileMap.end()) {
658 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
660 string name1 = itNameFile->first;
661 string name2 = itNameFile->second;
663 if (name1 == name2) { //then you are unique so add your real dists
664 seqIndex[i] = nameToIndex[names[i]];
666 if (large) { seqIndex[i] = (rowPositions.size()-1); }
667 else { seqIndex[i] = (seqVec.size()-1); }
676 // loop through all entries in seqIndex
679 for (size_t i=0; i < seqIndex.size(); i++) {
680 if (m->control_pressed) { return "control"; }
682 if (!large) { currMap = seqVec[seqIndex[i]]; }
683 else { currMap = getMap(seqIndex[i]); }
685 for (size_t j=0; j < seqIndex.size(); j++) {
686 it = currMap.find(seqIndex[j]);
687 if (it != currMap.end()) {
688 max_dist[i] = max(max_dist[i], it->second);
689 max_dist[j] = max(max_dist[j], it->second);
690 total_dist[i] += it->second;
691 total_dist[j] += it->second;
692 }else{ //if you can't find the distance make it the cutoff
693 max_dist[i] = max(max_dist[i], cutoff);
694 max_dist[j] = max(max_dist[j], cutoff);
695 total_dist[i] += cutoff;
696 total_dist[j] += cutoff;
701 // sequence with the smallest maximum distance is the representative
702 //if tie occurs pick sequence with smallest average distance
705 for (size_t i=0; i < max_dist.size(); i++) {
706 if (m->control_pressed) { return "control"; }
707 if (max_dist[i] < min) {
710 }else if (max_dist[i] == min) {
711 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
712 float newAverage = total_dist[i] / (float) total_dist.size();
714 if (newAverage < currentAverage) {
721 return(names[minIndex]);
724 catch(exception& e) {
725 m->errorOut(e, "GetOTURepCommand", "FindRep");
730 //**********************************************************************************************************************
731 int GetOTURepCommand::process(ListVector* processList) {
733 string name, sequence;
737 if (outputDir == "") { outputDir += m->hasPath(listfile); }
739 ofstream newNamesOutput;
740 string outputNamesFile;
741 map<string, ofstream*> filehandles;
743 if (Groups.size() == 0) { //you don't want to use groups
744 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
745 m->openOutputFile(outputNamesFile, newNamesOutput);
746 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
747 outputNameFiles[outputNamesFile] = processList->getLabel();
748 }else{ //you want to use groups
750 for (int i=0; i<Groups.size(); i++) {
752 filehandles[Groups[i]] = temp;
753 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
755 m->openOutputFile(outputNamesFile, *(temp));
756 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
757 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
761 //for each bin in the list vector
762 for (int i = 0; i < processList->size(); i++) {
763 if (m->control_pressed) {
765 if (Groups.size() == 0) { //you don't want to use groups
766 newNamesOutput.close();
768 for (int j=0; j<Groups.size(); j++) {
769 (*(filehandles[Groups[j]])).close();
770 delete filehandles[Groups[j]];
776 string temp = processList->get(i);
777 vector<string> namesInBin;
778 m->splitAtComma(temp, namesInBin);
780 if (Groups.size() == 0) {
781 nameRep = findRep(namesInBin);
782 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
784 map<string, vector<string> > NamesInGroup;
785 for (int j=0; j<Groups.size(); j++) { //initialize groups
786 NamesInGroup[Groups[j]].resize(0);
789 for (int j=0; j<namesInBin.size(); j++) {
790 string thisgroup = groupMap->getGroup(namesInBin[j]);
792 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
794 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
795 NamesInGroup[thisgroup].push_back(namesInBin[j]);
799 //get rep for each group in otu
800 for (int j=0; j<Groups.size(); j++) {
801 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
802 //get rep for each group
803 nameRep = findRep(NamesInGroup[Groups[j]]);
805 //output group rep and other members of this group
806 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
808 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
809 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
812 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
818 if (Groups.size() == 0) { //you don't want to use groups
819 newNamesOutput.close();
821 for (int i=0; i<Groups.size(); i++) {
822 (*(filehandles[Groups[i]])).close();
823 delete filehandles[Groups[i]];
830 catch(exception& e) {
831 m->errorOut(e, "GetOTURepCommand", "process");
835 //**********************************************************************************************************************
836 int GetOTURepCommand::processNames(string filename, string label) {
840 if (outputDir == "") { outputDir += m->hasPath(listfile); }
841 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
842 m->openOutputFile(outputFileName, out);
843 vector<repStruct> reps;
844 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
847 string tempNameFile = filename + ".temp";
848 m->openOutputFile(tempNameFile, out2);
851 m->openInputFile(filename, in);
855 string rep, binnames;
856 in >> i >> rep >> binnames; m->gobble(in);
857 out2 << rep << '\t' << binnames << endl;
859 vector<string> names;
860 m->splitAtComma(binnames, names);
861 int binsize = names.size();
863 //if you have a groupfile
865 if (groupfile != "") {
866 map<string, string> groups;
867 map<string, string>::iterator groupIt;
869 //find the groups that are in this bin
870 for (size_t i = 0; i < names.size(); i++) {
871 string groupName = groupMap->getGroup(names[i]);
872 if (groupName == "not found") {
873 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
876 groups[groupName] = groupName;
880 //turn the groups into a string
881 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
882 group += groupIt->first + "-";
885 group = group.substr(0, group.length()-1);
889 //print out name and sequence for that bin
890 string sequence = fasta->getSequence(rep);
892 if (sequence != "not found") {
893 if (sorted == "") { //print them out
894 rep = rep + "\t" + toString(i+1);
895 rep = rep + "|" + toString(binsize);
896 if (groupfile != "") {
897 rep = rep + "|" + group;
899 out << ">" << rep << endl;
900 out << sequence << endl;
902 repStruct newRep(rep, i+1, binsize, group);
903 reps.push_back(newRep);
906 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
911 if (sorted != "") { //then sort them and print them
912 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
913 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
914 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
915 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
918 for (int i = 0; i < reps.size(); i++) {
919 string sequence = fasta->getSequence(reps[i].name);
920 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
921 outputName = outputName + "|" + toString(reps[i].size);
922 if (groupfile != "") {
923 outputName = outputName + "|" + reps[i].group;
925 out << ">" << outputName << endl;
926 out << sequence << endl;
934 remove(filename.c_str());
935 rename(tempNameFile.c_str(), filename.c_str());
940 catch(exception& e) {
941 m->errorOut(e, "GetOTURepCommand", "processNames");
945 //**********************************************************************************************************************
946 SeqMap GetOTURepCommand::getMap(int row) {
950 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
951 if (rowPositions[row] != -1){
953 inRow.seekg(rowPositions[row]);
955 int rowNum, numDists, colNum;
958 inRow >> rowNum >> numDists;
960 for(int i = 0; i < numDists; i++) {
961 inRow >> colNum >> dist;
962 rowMap[colNum] = dist;
969 catch(exception& e) {
970 m->errorOut(e, "GetOTURepCommand", "getMap");
974 //**********************************************************************************************************************