5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
44 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
58 vector<string> myArray;
59 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
63 m->errorOut(e, "GetOTURepCommand", "setParameters");
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){
70 string helpString = "";
71 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
72 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n";
74 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77 helpString += "The default value for label is all labels in your inputfile.\n";
78 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
80 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
81 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85 helpString += "The group parameter allows you provide a group file.\n";
86 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
93 m->errorOut(e, "GetOTURepCommand", "getHelpString");
97 //**********************************************************************************************************************
98 GetOTURepCommand::GetOTURepCommand(){
100 abort = true; calledHelp = true;
102 vector<string> tempOutNames;
103 outputTypes["fasta"] = tempOutNames;
104 outputTypes["name"] = tempOutNames;
106 catch(exception& e) {
107 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
111 //**********************************************************************************************************************
112 GetOTURepCommand::GetOTURepCommand(string option) {
114 abort = false; calledHelp = false;
117 //allow user to run help
118 if (option == "help") {
119 help(); abort = true; calledHelp = true;
120 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
122 vector<string> myArray = setParameters();
124 OptionParser parser(option);
125 map<string, string> parameters = parser.getParameters();
127 ValidParameters validParameter;
128 map<string, string>::iterator it;
130 //check to make sure all parameters are valid for command
131 for (it = parameters.begin(); it != parameters.end(); it++) {
132 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
135 //initialize outputTypes
136 vector<string> tempOutNames;
137 outputTypes["fasta"] = tempOutNames;
138 outputTypes["name"] = tempOutNames;
140 //if the user changes the input directory command factory will send this info to us in the output parameter
141 string inputDir = validParameter.validFile(parameters, "inputdir", false);
142 if (inputDir == "not found"){ inputDir = ""; }
145 it = parameters.find("list");
146 //user has given a template file
147 if(it != parameters.end()){
148 path = m->hasPath(it->second);
149 //if the user has not given a path then, add inputdir. else leave path alone.
150 if (path == "") { parameters["list"] = inputDir + it->second; }
153 it = parameters.find("fasta");
154 //user has given a template file
155 if(it != parameters.end()){
156 path = m->hasPath(it->second);
157 //if the user has not given a path then, add inputdir. else leave path alone.
158 if (path == "") { parameters["fasta"] = inputDir + it->second; }
161 it = parameters.find("phylip");
162 //user has given a template file
163 if(it != parameters.end()){
164 path = m->hasPath(it->second);
165 //if the user has not given a path then, add inputdir. else leave path alone.
166 if (path == "") { parameters["phylip"] = inputDir + it->second; }
169 it = parameters.find("column");
170 //user has given a template file
171 if(it != parameters.end()){
172 path = m->hasPath(it->second);
173 //if the user has not given a path then, add inputdir. else leave path alone.
174 if (path == "") { parameters["column"] = inputDir + it->second; }
177 it = parameters.find("name");
178 //user has given a template file
179 if(it != parameters.end()){
180 path = m->hasPath(it->second);
181 //if the user has not given a path then, add inputdir. else leave path alone.
182 if (path == "") { parameters["name"] = inputDir + it->second; }
185 it = parameters.find("group");
186 //user has given a template file
187 if(it != parameters.end()){
188 path = m->hasPath(it->second);
189 //if the user has not given a path then, add inputdir. else leave path alone.
190 if (path == "") { parameters["group"] = inputDir + it->second; }
195 //if the user changes the output directory command factory will send this info to us in the output parameter
196 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
198 //check for required parameters
199 fastafile = validParameter.validFile(parameters, "fasta", true);
200 if (fastafile == "not found") { fastafile = ""; }
201 else if (fastafile == "not open") { abort = true; }
202 else { m->setFastaFile(fastafile); }
204 listfile = validParameter.validFile(parameters, "list", true);
205 if (listfile == "not found") {
206 listfile = m->getListFile();
207 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
208 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
210 else if (listfile == "not open") { abort = true; }
211 else { m->setListFile(listfile); }
213 phylipfile = validParameter.validFile(parameters, "phylip", true);
214 if (phylipfile == "not found") { phylipfile = ""; }
215 else if (phylipfile == "not open") { abort = true; }
216 else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
218 columnfile = validParameter.validFile(parameters, "column", true);
219 if (columnfile == "not found") { columnfile = ""; }
220 else if (columnfile == "not open") { abort = true; }
221 else { distFile = columnfile; format = "column"; m->setColumnFile(columnfile); }
223 namefile = validParameter.validFile(parameters, "name", true);
224 if (namefile == "not open") { abort = true; }
225 else if (namefile == "not found") { namefile = ""; }
226 else { m->setNameFile(namefile); }
228 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
229 //give priority to column, then phylip
230 columnfile = m->getColumnFile();
231 if (columnfile != "") { distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
233 phylipfile = m->getPhylipFile();
234 if (phylipfile != "") { distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
236 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
240 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
242 if (columnfile != "") {
243 if (namefile == "") {
244 namefile = m->getNameFile();
245 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
247 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
253 //check for optional parameter and set defaults
254 // ...at some point should added some additional type checking...
255 label = validParameter.validFile(parameters, "label", false);
256 if (label == "not found") { label = ""; allLines = 1; }
258 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
259 else { allLines = 1; }
262 groupfile = validParameter.validFile(parameters, "group", true);
263 if (groupfile == "not open") { groupfile = ""; abort = true; }
264 else if (groupfile == "not found") { groupfile = ""; }
265 else { m->setGroupFile(groupfile); }
267 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
268 if (sorted == "none") { sorted=""; }
269 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
270 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
274 if ((sorted == "group") && (groupfile == "")) {
275 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
279 groups = validParameter.validFile(parameters, "groups", false);
280 if (groups == "not found") { groups = ""; }
282 if (groupfile == "") {
283 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
286 m->splitAtDash(groups, Groups);
289 m->setGroups(Groups);
291 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
292 large = m->isTrue(temp);
294 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
295 weighted = m->isTrue(temp);
297 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
299 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
300 m->mothurConvert(temp, precision);
302 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
303 m->mothurConvert(temp, cutoff);
304 cutoff += (5 / (precision * 10.0));
307 catch(exception& e) {
308 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
313 //**********************************************************************************************************************
315 int GetOTURepCommand::execute(){
318 if (abort == true) { if (calledHelp) { return 0; } return 2; }
323 //read distance files
324 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
325 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
326 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
328 readMatrix->setCutoff(cutoff);
331 nameMap = new NameAssignment(namefile);
333 }else{ nameMap = NULL; }
335 readMatrix->read(nameMap);
337 if (m->control_pressed) { delete readMatrix; return 0; }
339 list = readMatrix->getListVector();
341 SparseMatrix* matrix = readMatrix->getMatrix();
343 // Create a data structure to quickly access the distance information.
344 // It consists of a vector of distance maps, where each map contains
345 // all distances of a certain sequence. Vector and maps are accessed
346 // via the index of a sequence in the distance matrix
347 seqVec = vector<SeqMap>(list->size());
348 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
349 if (m->control_pressed) { delete readMatrix; return 0; }
350 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
352 //add dummy map for unweighted calc
354 seqVec.push_back(dummy);
360 if (m->control_pressed) { return 0; }
362 //process file and set up indexes
363 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
364 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
365 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
367 formatMatrix->setCutoff(cutoff);
370 nameMap = new NameAssignment(namefile);
372 }else{ nameMap = NULL; }
374 formatMatrix->read(nameMap);
376 if (m->control_pressed) { delete formatMatrix; return 0; }
378 list = formatMatrix->getListVector();
380 distFile = formatMatrix->getFormattedFileName();
382 //positions in file where the distances for each sequence begin
383 //rowPositions[1] = position in file where distance related to sequence 1 start.
384 rowPositions = formatMatrix->getRowPositions();
385 rowPositions.push_back(-1); //dummy row for unweighted calc
390 //openfile for getMap to use
391 m->openInputFile(distFile, inRow);
393 if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
397 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
399 vector<string> names;
401 //map names to rows in sparsematrix
402 for (int i = 0; i < list->size(); i++) {
404 binnames = list->get(i);
406 m->splitAtComma(binnames, names);
408 for (int j = 0; j < names.size(); j++) {
409 nameToIndex[names[j]] = i;
412 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
415 if (m->control_pressed) {
416 if (large) { inRow.close(); m->mothurRemove(distFile); }
420 if (groupfile != "") {
421 //read in group map info.
422 groupMap = new GroupMap(groupfile);
423 int error = groupMap->readMap();
424 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
426 if (Groups.size() != 0) {
427 SharedUtil* util = new SharedUtil();
428 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
429 util->setGroups(Groups, gNamesOfGroups, "getoturep");
430 groupMap->setNamesOfGroups(gNamesOfGroups);
435 //done with listvector from matrix
436 if (list != NULL) { delete list; }
438 input = new InputData(listfile, "list");
439 list = input->getListVector();
440 string lastLabel = list->getLabel();
442 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
443 set<string> processedLabels;
444 set<string> userLabels = labels;
446 if (m->control_pressed) {
447 if (large) { inRow.close(); m->mothurRemove(distFile); }
448 delete input; delete list; return 0;
451 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
453 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
455 if (allLines == 1 || labels.count(list->getLabel()) == 1){
456 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
457 error = process(list);
458 if (error == 1) { return 0; } //there is an error in hte input files, abort command
460 if (m->control_pressed) {
461 if (large) { inRow.close(); m->mothurRemove(distFile); }
462 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
463 delete input; delete list; return 0;
466 processedLabels.insert(list->getLabel());
467 userLabels.erase(list->getLabel());
470 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
471 string saveLabel = list->getLabel();
474 list = input->getListVector(lastLabel);
475 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
476 error = process(list);
477 if (error == 1) { return 0; } //there is an error in hte input files, abort command
479 if (m->control_pressed) {
480 if (large) { inRow.close(); m->mothurRemove(distFile); }
481 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
482 delete input; delete list; return 0;
485 processedLabels.insert(list->getLabel());
486 userLabels.erase(list->getLabel());
488 //restore real lastlabel to save below
489 list->setLabel(saveLabel);
492 lastLabel = list->getLabel();
495 list = input->getListVector();
498 //output error messages about any remaining user labels
499 bool needToRun = false;
500 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
501 m->mothurOut("Your file does not include the label " + (*it));
502 if (processedLabels.count(lastLabel) != 1) {
503 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
506 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
510 //run last label if you need to
511 if (needToRun == true) {
512 if (list != NULL) { delete list; }
513 list = input->getListVector(lastLabel);
514 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
515 error = process(list);
517 if (error == 1) { return 0; } //there is an error in hte input files, abort command
519 if (m->control_pressed) {
520 if (large) { inRow.close(); m->mothurRemove(distFile); }
521 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
522 delete input; delete list; return 0;
526 //close and remove formatted matrix file
529 m->mothurRemove(distFile);
534 if (!weighted) { nameFileMap.clear(); }
537 if (fastafile != "") {
539 fasta = new FastaMap();
540 fasta->readFastaFile(fastafile);
542 //if user gave a namesfile then use it
543 if (namefile != "") { readNamesFile(); }
545 //output create and output the .rep.fasta files
546 map<string, string>::iterator itNameFile;
547 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
548 processFastaNames(itNameFile->first, itNameFile->second);
551 //output create and output the .rep.fasta files
552 map<string, string>::iterator itNameFile;
553 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
554 processNames(itNameFile->first, itNameFile->second);
559 if (groupfile != "") { delete groupMap; }
561 if (m->control_pressed) { return 0; }
563 //set fasta file as new current fastafile - use first one??
565 itTypes = outputTypes.find("fasta");
566 if (itTypes != outputTypes.end()) {
567 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
570 itTypes = outputTypes.find("name");
571 if (itTypes != outputTypes.end()) {
572 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
575 m->mothurOutEndLine();
576 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
577 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
578 m->mothurOutEndLine();
582 catch(exception& e) {
583 m->errorOut(e, "GetOTURepCommand", "execute");
588 //**********************************************************************************************************************
589 void GetOTURepCommand::readNamesFile() {
592 vector<string> dupNames;
593 m->openInputFile(namefile, in);
595 string name, names, sequence;
598 in >> name; //read from first column A
599 in >> names; //read from second column A,B,C,D
603 //parse names into vector
604 m->splitAtComma(names, dupNames);
606 //store names in fasta map
607 sequence = fasta->getSequence(name);
608 for (int i = 0; i < dupNames.size(); i++) {
609 fasta->push_back(dupNames[i], sequence);
617 catch(exception& e) {
618 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
622 //**********************************************************************************************************************
623 //read names file to find the weighted rep for each bin
624 void GetOTURepCommand::readNamesFile(bool w) {
627 vector<string> dupNames;
628 m->openInputFile(namefile, in);
630 string name, names, sequence;
633 in >> name; m->gobble(in); //read from first column A
634 in >> names; //read from second column A,B,C,D
638 //parse names into vector
639 m->splitAtComma(names, dupNames);
641 for (int i = 0; i < dupNames.size(); i++) {
642 nameFileMap[dupNames[i]] = name;
650 catch(exception& e) {
651 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
655 //**********************************************************************************************************************
656 string GetOTURepCommand::findRep(vector<string> names) {
658 // if only 1 sequence in bin or processing the "unique" label, then
659 // the first sequence of the OTU is the representative one
660 if ((names.size() == 1)) {
663 vector<int> seqIndex(names.size());
664 vector<float> max_dist(names.size());
665 vector<float> total_dist(names.size());
666 map<string, string>::iterator itNameFile;
667 map<string, int>::iterator itNameIndex;
669 //fill seqIndex and initialize sums
670 for (size_t i = 0; i < names.size(); i++) {
672 seqIndex[i] = nameToIndex[names[i]];
674 if (namefile == "") {
675 itNameIndex = nameToIndex.find(names[i]);
677 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
678 if (large) { seqIndex[i] = (rowPositions.size()-1); }
679 else { seqIndex[i] = (seqVec.size()-1); }
681 seqIndex[i] = itNameIndex->second;
685 itNameFile = nameFileMap.find(names[i]);
687 if (itNameFile == nameFileMap.end()) {
688 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
690 string name1 = itNameFile->first;
691 string name2 = itNameFile->second;
693 if (name1 == name2) { //then you are unique so add your real dists
694 seqIndex[i] = nameToIndex[names[i]];
696 if (large) { seqIndex[i] = (rowPositions.size()-1); }
697 else { seqIndex[i] = (seqVec.size()-1); }
706 // loop through all entries in seqIndex
709 for (size_t i=0; i < seqIndex.size(); i++) {
710 if (m->control_pressed) { return "control"; }
712 if (!large) { currMap = seqVec[seqIndex[i]]; }
713 else { currMap = getMap(seqIndex[i]); }
715 for (size_t j=0; j < seqIndex.size(); j++) {
716 it = currMap.find(seqIndex[j]);
717 if (it != currMap.end()) {
718 max_dist[i] = max(max_dist[i], it->second);
719 max_dist[j] = max(max_dist[j], it->second);
720 total_dist[i] += it->second;
721 total_dist[j] += it->second;
722 }else{ //if you can't find the distance make it the cutoff
723 max_dist[i] = max(max_dist[i], cutoff);
724 max_dist[j] = max(max_dist[j], cutoff);
725 total_dist[i] += cutoff;
726 total_dist[j] += cutoff;
731 // sequence with the smallest maximum distance is the representative
732 //if tie occurs pick sequence with smallest average distance
735 for (size_t i=0; i < max_dist.size(); i++) {
736 if (m->control_pressed) { return "control"; }
737 if (max_dist[i] < min) {
740 }else if (max_dist[i] == min) {
741 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
742 float newAverage = total_dist[i] / (float) total_dist.size();
744 if (newAverage < currentAverage) {
751 return(names[minIndex]);
754 catch(exception& e) {
755 m->errorOut(e, "GetOTURepCommand", "FindRep");
760 //**********************************************************************************************************************
761 int GetOTURepCommand::process(ListVector* processList) {
763 string name, sequence;
767 if (outputDir == "") { outputDir += m->hasPath(listfile); }
769 ofstream newNamesOutput;
770 string outputNamesFile;
771 map<string, ofstream*> filehandles;
773 if (Groups.size() == 0) { //you don't want to use groups
774 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
775 m->openOutputFile(outputNamesFile, newNamesOutput);
776 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
777 outputNameFiles[outputNamesFile] = processList->getLabel();
778 }else{ //you want to use groups
780 for (int i=0; i<Groups.size(); i++) {
782 filehandles[Groups[i]] = temp;
783 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
785 m->openOutputFile(outputNamesFile, *(temp));
786 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
787 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
791 //for each bin in the list vector
792 for (int i = 0; i < processList->size(); i++) {
793 if (m->control_pressed) {
795 if (Groups.size() == 0) { //you don't want to use groups
796 newNamesOutput.close();
798 for (int j=0; j<Groups.size(); j++) {
799 (*(filehandles[Groups[j]])).close();
800 delete filehandles[Groups[j]];
806 string temp = processList->get(i);
807 vector<string> namesInBin;
808 m->splitAtComma(temp, namesInBin);
810 if (Groups.size() == 0) {
811 nameRep = findRep(namesInBin);
812 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
814 map<string, vector<string> > NamesInGroup;
815 for (int j=0; j<Groups.size(); j++) { //initialize groups
816 NamesInGroup[Groups[j]].resize(0);
819 for (int j=0; j<namesInBin.size(); j++) {
820 string thisgroup = groupMap->getGroup(namesInBin[j]);
822 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
824 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
825 NamesInGroup[thisgroup].push_back(namesInBin[j]);
829 //get rep for each group in otu
830 for (int j=0; j<Groups.size(); j++) {
831 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
832 //get rep for each group
833 nameRep = findRep(NamesInGroup[Groups[j]]);
835 //output group rep and other members of this group
836 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
838 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
839 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
842 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
848 if (Groups.size() == 0) { //you don't want to use groups
849 newNamesOutput.close();
851 for (int i=0; i<Groups.size(); i++) {
852 (*(filehandles[Groups[i]])).close();
853 delete filehandles[Groups[i]];
860 catch(exception& e) {
861 m->errorOut(e, "GetOTURepCommand", "process");
865 //**********************************************************************************************************************
866 int GetOTURepCommand::processFastaNames(string filename, string label) {
870 if (outputDir == "") { outputDir += m->hasPath(listfile); }
871 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
872 m->openOutputFile(outputFileName, out);
873 vector<repStruct> reps;
874 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
877 string tempNameFile = filename + ".temp";
878 m->openOutputFile(tempNameFile, out2);
881 m->openInputFile(filename, in);
885 string rep, binnames;
886 in >> i >> rep >> binnames; m->gobble(in);
887 out2 << rep << '\t' << binnames << endl;
889 vector<string> names;
890 m->splitAtComma(binnames, names);
891 int binsize = names.size();
893 //if you have a groupfile
895 if (groupfile != "") {
896 map<string, string> groups;
897 map<string, string>::iterator groupIt;
899 //find the groups that are in this bin
900 for (size_t i = 0; i < names.size(); i++) {
901 string groupName = groupMap->getGroup(names[i]);
902 if (groupName == "not found") {
903 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
906 groups[groupName] = groupName;
910 //turn the groups into a string
911 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
912 group += groupIt->first + "-";
915 group = group.substr(0, group.length()-1);
919 //print out name and sequence for that bin
920 string sequence = fasta->getSequence(rep);
922 if (sequence != "not found") {
923 if (sorted == "") { //print them out
924 rep = rep + "\t" + toString(i+1);
925 rep = rep + "|" + toString(binsize);
926 if (groupfile != "") {
927 rep = rep + "|" + group;
929 out << ">" << rep << endl;
930 out << sequence << endl;
932 repStruct newRep(rep, i+1, binsize, group);
933 reps.push_back(newRep);
936 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
941 if (sorted != "") { //then sort them and print them
942 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
943 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
944 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
945 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
948 for (int i = 0; i < reps.size(); i++) {
949 string sequence = fasta->getSequence(reps[i].name);
950 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
951 outputName = outputName + "|" + toString(reps[i].size);
952 if (groupfile != "") {
953 outputName = outputName + "|" + reps[i].group;
955 out << ">" << outputName << endl;
956 out << sequence << endl;
964 m->mothurRemove(filename);
965 rename(tempNameFile.c_str(), filename.c_str());
970 catch(exception& e) {
971 m->errorOut(e, "GetOTURepCommand", "processFastaNames");
975 //**********************************************************************************************************************
976 int GetOTURepCommand::processNames(string filename, string label) {
980 if (outputDir == "") { outputDir += m->hasPath(listfile); }
983 string tempNameFile = filename + ".temp";
984 m->openOutputFile(tempNameFile, out2);
987 m->openInputFile(filename, in);
990 string rep, binnames;
992 if (m->control_pressed) { break; }
993 in >> i >> rep >> binnames; m->gobble(in);
994 out2 << rep << '\t' << binnames << endl;
999 m->mothurRemove(filename);
1000 rename(tempNameFile.c_str(), filename.c_str());
1004 catch(exception& e) {
1005 m->errorOut(e, "GetOTURepCommand", "processNames");
1009 //**********************************************************************************************************************
1010 SeqMap GetOTURepCommand::getMap(int row) {
1014 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
1015 if (rowPositions[row] != -1){
1017 inRow.seekg(rowPositions[row]);
1019 int rowNum, numDists, colNum;
1022 inRow >> rowNum >> numDists;
1024 for(int i = 0; i < numDists; i++) {
1025 inRow >> colNum >> dist;
1026 rowMap[colNum] = dist;
1033 catch(exception& e) {
1034 m->errorOut(e, "GetOTURepCommand", "getMap");
1038 //**********************************************************************************************************************