5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
44 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
58 vector<string> myArray;
59 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
63 m->errorOut(e, "GetOTURepCommand", "setParameters");
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){
70 string helpString = "";
71 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name, unless you have valid current files.\n";
72 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n";
74 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77 helpString += "The default value for label is all labels in your inputfile.\n";
78 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
80 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
81 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85 helpString += "The group parameter allows you provide a group file.\n";
86 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
93 m->errorOut(e, "GetOTURepCommand", "getHelpString");
97 //**********************************************************************************************************************
98 GetOTURepCommand::GetOTURepCommand(){
100 abort = true; calledHelp = true;
102 vector<string> tempOutNames;
103 outputTypes["fasta"] = tempOutNames;
104 outputTypes["name"] = tempOutNames;
106 catch(exception& e) {
107 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
111 //**********************************************************************************************************************
112 GetOTURepCommand::GetOTURepCommand(string option) {
114 abort = false; calledHelp = false;
117 //allow user to run help
118 if (option == "help") {
119 help(); abort = true; calledHelp = true;
120 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
122 vector<string> myArray = setParameters();
124 OptionParser parser(option);
125 map<string, string> parameters = parser.getParameters();
127 ValidParameters validParameter;
128 map<string, string>::iterator it;
130 //check to make sure all parameters are valid for command
131 for (it = parameters.begin(); it != parameters.end(); it++) {
132 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
135 //initialize outputTypes
136 vector<string> tempOutNames;
137 outputTypes["fasta"] = tempOutNames;
138 outputTypes["name"] = tempOutNames;
140 //if the user changes the input directory command factory will send this info to us in the output parameter
141 string inputDir = validParameter.validFile(parameters, "inputdir", false);
142 if (inputDir == "not found"){ inputDir = ""; }
145 it = parameters.find("list");
146 //user has given a template file
147 if(it != parameters.end()){
148 path = m->hasPath(it->second);
149 //if the user has not given a path then, add inputdir. else leave path alone.
150 if (path == "") { parameters["list"] = inputDir + it->second; }
153 it = parameters.find("fasta");
154 //user has given a template file
155 if(it != parameters.end()){
156 path = m->hasPath(it->second);
157 //if the user has not given a path then, add inputdir. else leave path alone.
158 if (path == "") { parameters["fasta"] = inputDir + it->second; }
161 it = parameters.find("phylip");
162 //user has given a template file
163 if(it != parameters.end()){
164 path = m->hasPath(it->second);
165 //if the user has not given a path then, add inputdir. else leave path alone.
166 if (path == "") { parameters["phylip"] = inputDir + it->second; }
169 it = parameters.find("column");
170 //user has given a template file
171 if(it != parameters.end()){
172 path = m->hasPath(it->second);
173 //if the user has not given a path then, add inputdir. else leave path alone.
174 if (path == "") { parameters["column"] = inputDir + it->second; }
177 it = parameters.find("name");
178 //user has given a template file
179 if(it != parameters.end()){
180 path = m->hasPath(it->second);
181 //if the user has not given a path then, add inputdir. else leave path alone.
182 if (path == "") { parameters["name"] = inputDir + it->second; }
185 it = parameters.find("group");
186 //user has given a template file
187 if(it != parameters.end()){
188 path = m->hasPath(it->second);
189 //if the user has not given a path then, add inputdir. else leave path alone.
190 if (path == "") { parameters["group"] = inputDir + it->second; }
195 //if the user changes the output directory command factory will send this info to us in the output parameter
196 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
198 //check for required parameters
199 fastafile = validParameter.validFile(parameters, "fasta", true);
200 if (fastafile == "not found") {
201 fastafile = m->getFastaFile();
202 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
203 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
205 else if (fastafile == "not open") { abort = true; }
206 else { m->setFastaFile(fastafile); }
208 listfile = validParameter.validFile(parameters, "list", true);
209 if (listfile == "not found") {
210 listfile = m->getListFile();
211 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
212 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
214 else if (listfile == "not open") { abort = true; }
215 else { m->setListFile(listfile); }
217 phylipfile = validParameter.validFile(parameters, "phylip", true);
218 if (phylipfile == "not found") { phylipfile = ""; }
219 else if (phylipfile == "not open") { abort = true; }
220 else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
222 columnfile = validParameter.validFile(parameters, "column", true);
223 if (columnfile == "not found") { columnfile = ""; }
224 else if (columnfile == "not open") { abort = true; }
225 else { distFile = columnfile; format = "column"; m->setColumnFile(columnfile); }
227 namefile = validParameter.validFile(parameters, "name", true);
228 if (namefile == "not open") { abort = true; }
229 else if (namefile == "not found") { namefile = ""; }
230 else { m->setNameFile(namefile); }
232 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
233 //give priority to column, then phylip
234 columnfile = m->getColumnFile();
235 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
237 phylipfile = m->getPhylipFile();
238 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
240 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
244 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
246 if (columnfile != "") {
247 if (namefile == "") {
248 namefile = m->getNameFile();
249 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
251 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
257 //check for optional parameter and set defaults
258 // ...at some point should added some additional type checking...
259 label = validParameter.validFile(parameters, "label", false);
260 if (label == "not found") { label = ""; allLines = 1; }
262 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
263 else { allLines = 1; }
266 groupfile = validParameter.validFile(parameters, "group", true);
267 if (groupfile == "not open") { groupfile = ""; abort = true; }
268 else if (groupfile == "not found") { groupfile = ""; }
269 else { m->setGroupFile(groupfile); }
271 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
272 if (sorted == "none") { sorted=""; }
273 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
274 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
278 if ((sorted == "group") && (groupfile == "")) {
279 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
283 groups = validParameter.validFile(parameters, "groups", false);
284 if (groups == "not found") { groups = ""; }
286 if (groupfile == "") {
287 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
290 m->splitAtDash(groups, Groups);
295 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
296 large = m->isTrue(temp);
298 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
299 weighted = m->isTrue(temp);
301 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
303 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
304 convert(temp, precision);
306 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
307 convert(temp, cutoff);
308 cutoff += (5 / (precision * 10.0));
311 catch(exception& e) {
312 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
317 //**********************************************************************************************************************
319 int GetOTURepCommand::execute(){
322 if (abort == true) { if (calledHelp) { return 0; } return 2; }
327 //read distance files
328 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
329 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
330 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
332 readMatrix->setCutoff(cutoff);
335 nameMap = new NameAssignment(namefile);
337 }else{ nameMap = NULL; }
339 readMatrix->read(nameMap);
341 if (m->control_pressed) { delete readMatrix; return 0; }
343 list = readMatrix->getListVector();
345 SparseMatrix* matrix = readMatrix->getMatrix();
347 // Create a data structure to quickly access the distance information.
348 // It consists of a vector of distance maps, where each map contains
349 // all distances of a certain sequence. Vector and maps are accessed
350 // via the index of a sequence in the distance matrix
351 seqVec = vector<SeqMap>(list->size());
352 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
353 if (m->control_pressed) { delete readMatrix; return 0; }
354 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
356 //add dummy map for unweighted calc
358 seqVec.push_back(dummy);
364 if (m->control_pressed) { return 0; }
366 //process file and set up indexes
367 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
368 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
369 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
371 formatMatrix->setCutoff(cutoff);
374 nameMap = new NameAssignment(namefile);
376 }else{ nameMap = NULL; }
378 formatMatrix->read(nameMap);
380 if (m->control_pressed) { delete formatMatrix; return 0; }
382 list = formatMatrix->getListVector();
384 distFile = formatMatrix->getFormattedFileName();
386 //positions in file where the distances for each sequence begin
387 //rowPositions[1] = position in file where distance related to sequence 1 start.
388 rowPositions = formatMatrix->getRowPositions();
389 rowPositions.push_back(-1); //dummy row for unweighted calc
394 //openfile for getMap to use
395 m->openInputFile(distFile, inRow);
397 if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
401 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
403 vector<string> names;
405 //map names to rows in sparsematrix
406 for (int i = 0; i < list->size(); i++) {
408 binnames = list->get(i);
410 m->splitAtComma(binnames, names);
412 for (int j = 0; j < names.size(); j++) {
413 nameToIndex[names[j]] = i;
416 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
419 if (m->control_pressed) {
420 if (large) { inRow.close(); remove(distFile.c_str()); }
424 if (groupfile != "") {
425 //read in group map info.
426 groupMap = new GroupMap(groupfile);
427 int error = groupMap->readMap();
428 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
430 if (Groups.size() != 0) {
431 SharedUtil* util = new SharedUtil();
432 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
437 //done with listvector from matrix
438 if (list != NULL) { delete list; }
440 input = new InputData(listfile, "list");
441 list = input->getListVector();
442 string lastLabel = list->getLabel();
444 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
445 set<string> processedLabels;
446 set<string> userLabels = labels;
448 if (m->control_pressed) {
449 if (large) { inRow.close(); remove(distFile.c_str()); }
450 delete input; delete list; return 0;
453 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
455 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
457 if (allLines == 1 || labels.count(list->getLabel()) == 1){
458 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
459 error = process(list);
460 if (error == 1) { return 0; } //there is an error in hte input files, abort command
462 if (m->control_pressed) {
463 if (large) { inRow.close(); remove(distFile.c_str()); }
464 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
465 delete input; delete list; return 0;
468 processedLabels.insert(list->getLabel());
469 userLabels.erase(list->getLabel());
472 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
473 string saveLabel = list->getLabel();
476 list = input->getListVector(lastLabel);
477 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
478 error = process(list);
479 if (error == 1) { return 0; } //there is an error in hte input files, abort command
481 if (m->control_pressed) {
482 if (large) { inRow.close(); remove(distFile.c_str()); }
483 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
484 delete input; delete list; return 0;
487 processedLabels.insert(list->getLabel());
488 userLabels.erase(list->getLabel());
490 //restore real lastlabel to save below
491 list->setLabel(saveLabel);
494 lastLabel = list->getLabel();
497 list = input->getListVector();
500 //output error messages about any remaining user labels
501 bool needToRun = false;
502 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
503 m->mothurOut("Your file does not include the label " + (*it));
504 if (processedLabels.count(lastLabel) != 1) {
505 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
508 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
512 //run last label if you need to
513 if (needToRun == true) {
514 if (list != NULL) { delete list; }
515 list = input->getListVector(lastLabel);
516 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
517 error = process(list);
519 if (error == 1) { return 0; } //there is an error in hte input files, abort command
521 if (m->control_pressed) {
522 if (large) { inRow.close(); remove(distFile.c_str()); }
523 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
524 delete input; delete list; return 0;
528 //close and remove formatted matrix file
531 remove(distFile.c_str());
536 if (!weighted) { nameFileMap.clear(); }
539 fasta = new FastaMap();
540 fasta->readFastaFile(fastafile);
542 //if user gave a namesfile then use it
543 if (namefile != "") { readNamesFile(); }
545 //output create and output the .rep.fasta files
546 map<string, string>::iterator itNameFile;
547 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
548 processNames(itNameFile->first, itNameFile->second);
552 if (groupfile != "") { delete groupMap; }
554 if (m->control_pressed) { return 0; }
556 //set fasta file as new current fastafile - use first one??
558 itTypes = outputTypes.find("fasta");
559 if (itTypes != outputTypes.end()) {
560 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
563 itTypes = outputTypes.find("name");
564 if (itTypes != outputTypes.end()) {
565 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
568 m->mothurOutEndLine();
569 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
570 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
571 m->mothurOutEndLine();
575 catch(exception& e) {
576 m->errorOut(e, "GetOTURepCommand", "execute");
581 //**********************************************************************************************************************
582 void GetOTURepCommand::readNamesFile() {
585 vector<string> dupNames;
586 m->openInputFile(namefile, in);
588 string name, names, sequence;
591 in >> name; //read from first column A
592 in >> names; //read from second column A,B,C,D
596 //parse names into vector
597 m->splitAtComma(names, dupNames);
599 //store names in fasta map
600 sequence = fasta->getSequence(name);
601 for (int i = 0; i < dupNames.size(); i++) {
602 fasta->push_back(dupNames[i], sequence);
610 catch(exception& e) {
611 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
615 //**********************************************************************************************************************
616 //read names file to find the weighted rep for each bin
617 void GetOTURepCommand::readNamesFile(bool w) {
620 vector<string> dupNames;
621 m->openInputFile(namefile, in);
623 string name, names, sequence;
626 in >> name; m->gobble(in); //read from first column A
627 in >> names; //read from second column A,B,C,D
631 //parse names into vector
632 m->splitAtComma(names, dupNames);
634 for (int i = 0; i < dupNames.size(); i++) {
635 nameFileMap[dupNames[i]] = name;
643 catch(exception& e) {
644 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
648 //**********************************************************************************************************************
649 string GetOTURepCommand::findRep(vector<string> names) {
651 // if only 1 sequence in bin or processing the "unique" label, then
652 // the first sequence of the OTU is the representative one
653 if ((names.size() == 1)) {
656 vector<int> seqIndex(names.size());
657 vector<float> max_dist(names.size());
658 vector<float> total_dist(names.size());
659 map<string, string>::iterator itNameFile;
660 map<string, int>::iterator itNameIndex;
662 //fill seqIndex and initialize sums
663 for (size_t i = 0; i < names.size(); i++) {
665 seqIndex[i] = nameToIndex[names[i]];
667 if (namefile == "") {
668 itNameIndex = nameToIndex.find(names[i]);
670 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
671 if (large) { seqIndex[i] = (rowPositions.size()-1); }
672 else { seqIndex[i] = (seqVec.size()-1); }
674 seqIndex[i] = itNameIndex->second;
678 itNameFile = nameFileMap.find(names[i]);
680 if (itNameFile == nameFileMap.end()) {
681 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
683 string name1 = itNameFile->first;
684 string name2 = itNameFile->second;
686 if (name1 == name2) { //then you are unique so add your real dists
687 seqIndex[i] = nameToIndex[names[i]];
689 if (large) { seqIndex[i] = (rowPositions.size()-1); }
690 else { seqIndex[i] = (seqVec.size()-1); }
699 // loop through all entries in seqIndex
702 for (size_t i=0; i < seqIndex.size(); i++) {
703 if (m->control_pressed) { return "control"; }
705 if (!large) { currMap = seqVec[seqIndex[i]]; }
706 else { currMap = getMap(seqIndex[i]); }
708 for (size_t j=0; j < seqIndex.size(); j++) {
709 it = currMap.find(seqIndex[j]);
710 if (it != currMap.end()) {
711 max_dist[i] = max(max_dist[i], it->second);
712 max_dist[j] = max(max_dist[j], it->second);
713 total_dist[i] += it->second;
714 total_dist[j] += it->second;
715 }else{ //if you can't find the distance make it the cutoff
716 max_dist[i] = max(max_dist[i], cutoff);
717 max_dist[j] = max(max_dist[j], cutoff);
718 total_dist[i] += cutoff;
719 total_dist[j] += cutoff;
724 // sequence with the smallest maximum distance is the representative
725 //if tie occurs pick sequence with smallest average distance
728 for (size_t i=0; i < max_dist.size(); i++) {
729 if (m->control_pressed) { return "control"; }
730 if (max_dist[i] < min) {
733 }else if (max_dist[i] == min) {
734 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
735 float newAverage = total_dist[i] / (float) total_dist.size();
737 if (newAverage < currentAverage) {
744 return(names[minIndex]);
747 catch(exception& e) {
748 m->errorOut(e, "GetOTURepCommand", "FindRep");
753 //**********************************************************************************************************************
754 int GetOTURepCommand::process(ListVector* processList) {
756 string name, sequence;
760 if (outputDir == "") { outputDir += m->hasPath(listfile); }
762 ofstream newNamesOutput;
763 string outputNamesFile;
764 map<string, ofstream*> filehandles;
766 if (Groups.size() == 0) { //you don't want to use groups
767 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
768 m->openOutputFile(outputNamesFile, newNamesOutput);
769 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
770 outputNameFiles[outputNamesFile] = processList->getLabel();
771 }else{ //you want to use groups
773 for (int i=0; i<Groups.size(); i++) {
775 filehandles[Groups[i]] = temp;
776 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
778 m->openOutputFile(outputNamesFile, *(temp));
779 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
780 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
784 //for each bin in the list vector
785 for (int i = 0; i < processList->size(); i++) {
786 if (m->control_pressed) {
788 if (Groups.size() == 0) { //you don't want to use groups
789 newNamesOutput.close();
791 for (int j=0; j<Groups.size(); j++) {
792 (*(filehandles[Groups[j]])).close();
793 delete filehandles[Groups[j]];
799 string temp = processList->get(i);
800 vector<string> namesInBin;
801 m->splitAtComma(temp, namesInBin);
803 if (Groups.size() == 0) {
804 nameRep = findRep(namesInBin);
805 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
807 map<string, vector<string> > NamesInGroup;
808 for (int j=0; j<Groups.size(); j++) { //initialize groups
809 NamesInGroup[Groups[j]].resize(0);
812 for (int j=0; j<namesInBin.size(); j++) {
813 string thisgroup = groupMap->getGroup(namesInBin[j]);
815 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
817 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
818 NamesInGroup[thisgroup].push_back(namesInBin[j]);
822 //get rep for each group in otu
823 for (int j=0; j<Groups.size(); j++) {
824 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
825 //get rep for each group
826 nameRep = findRep(NamesInGroup[Groups[j]]);
828 //output group rep and other members of this group
829 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
831 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
832 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
835 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
841 if (Groups.size() == 0) { //you don't want to use groups
842 newNamesOutput.close();
844 for (int i=0; i<Groups.size(); i++) {
845 (*(filehandles[Groups[i]])).close();
846 delete filehandles[Groups[i]];
853 catch(exception& e) {
854 m->errorOut(e, "GetOTURepCommand", "process");
858 //**********************************************************************************************************************
859 int GetOTURepCommand::processNames(string filename, string label) {
863 if (outputDir == "") { outputDir += m->hasPath(listfile); }
864 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
865 m->openOutputFile(outputFileName, out);
866 vector<repStruct> reps;
867 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
870 string tempNameFile = filename + ".temp";
871 m->openOutputFile(tempNameFile, out2);
874 m->openInputFile(filename, in);
878 string rep, binnames;
879 in >> i >> rep >> binnames; m->gobble(in);
880 out2 << rep << '\t' << binnames << endl;
882 vector<string> names;
883 m->splitAtComma(binnames, names);
884 int binsize = names.size();
886 //if you have a groupfile
888 if (groupfile != "") {
889 map<string, string> groups;
890 map<string, string>::iterator groupIt;
892 //find the groups that are in this bin
893 for (size_t i = 0; i < names.size(); i++) {
894 string groupName = groupMap->getGroup(names[i]);
895 if (groupName == "not found") {
896 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
899 groups[groupName] = groupName;
903 //turn the groups into a string
904 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
905 group += groupIt->first + "-";
908 group = group.substr(0, group.length()-1);
912 //print out name and sequence for that bin
913 string sequence = fasta->getSequence(rep);
915 if (sequence != "not found") {
916 if (sorted == "") { //print them out
917 rep = rep + "\t" + toString(i+1);
918 rep = rep + "|" + toString(binsize);
919 if (groupfile != "") {
920 rep = rep + "|" + group;
922 out << ">" << rep << endl;
923 out << sequence << endl;
925 repStruct newRep(rep, i+1, binsize, group);
926 reps.push_back(newRep);
929 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
934 if (sorted != "") { //then sort them and print them
935 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
936 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
937 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
938 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
941 for (int i = 0; i < reps.size(); i++) {
942 string sequence = fasta->getSequence(reps[i].name);
943 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
944 outputName = outputName + "|" + toString(reps[i].size);
945 if (groupfile != "") {
946 outputName = outputName + "|" + reps[i].group;
948 out << ">" << outputName << endl;
949 out << sequence << endl;
957 remove(filename.c_str());
958 rename(tempNameFile.c_str(), filename.c_str());
963 catch(exception& e) {
964 m->errorOut(e, "GetOTURepCommand", "processNames");
968 //**********************************************************************************************************************
969 SeqMap GetOTURepCommand::getMap(int row) {
973 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
974 if (rowPositions[row] != -1){
976 inRow.seekg(rowPositions[row]);
978 int rowNum, numDists, colNum;
981 inRow >> rowNum >> numDists;
983 for(int i = 0; i < numDists; i++) {
984 inRow >> colNum >> dist;
985 rowMap[colNum] = dist;
992 catch(exception& e) {
993 m->errorOut(e, "GetOTURepCommand", "getMap");
997 //**********************************************************************************************************************