5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
44 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
58 vector<string> myArray;
59 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
63 m->errorOut(e, "GetOTURepCommand", "setParameters");
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){
70 string helpString = "";
71 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name, unless you have valid current files.\n";
72 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n";
74 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77 helpString += "The default value for label is all labels in your inputfile.\n";
78 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
80 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
81 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85 helpString += "The group parameter allows you provide a group file.\n";
86 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
93 m->errorOut(e, "GetOTURepCommand", "getHelpString");
97 //**********************************************************************************************************************
98 GetOTURepCommand::GetOTURepCommand(){
100 abort = true; calledHelp = true;
102 vector<string> tempOutNames;
103 outputTypes["fasta"] = tempOutNames;
104 outputTypes["name"] = tempOutNames;
106 catch(exception& e) {
107 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
111 //**********************************************************************************************************************
112 GetOTURepCommand::GetOTURepCommand(string option) {
114 abort = false; calledHelp = false;
117 //allow user to run help
118 if (option == "help") {
119 help(); abort = true; calledHelp = true;
121 vector<string> myArray = setParameters();
123 OptionParser parser(option);
124 map<string, string> parameters = parser.getParameters();
126 ValidParameters validParameter;
127 map<string, string>::iterator it;
129 //check to make sure all parameters are valid for command
130 for (it = parameters.begin(); it != parameters.end(); it++) {
131 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
134 //initialize outputTypes
135 vector<string> tempOutNames;
136 outputTypes["fasta"] = tempOutNames;
137 outputTypes["name"] = tempOutNames;
139 //if the user changes the input directory command factory will send this info to us in the output parameter
140 string inputDir = validParameter.validFile(parameters, "inputdir", false);
141 if (inputDir == "not found"){ inputDir = ""; }
144 it = parameters.find("list");
145 //user has given a template file
146 if(it != parameters.end()){
147 path = m->hasPath(it->second);
148 //if the user has not given a path then, add inputdir. else leave path alone.
149 if (path == "") { parameters["list"] = inputDir + it->second; }
152 it = parameters.find("fasta");
153 //user has given a template file
154 if(it != parameters.end()){
155 path = m->hasPath(it->second);
156 //if the user has not given a path then, add inputdir. else leave path alone.
157 if (path == "") { parameters["fasta"] = inputDir + it->second; }
160 it = parameters.find("phylip");
161 //user has given a template file
162 if(it != parameters.end()){
163 path = m->hasPath(it->second);
164 //if the user has not given a path then, add inputdir. else leave path alone.
165 if (path == "") { parameters["phylip"] = inputDir + it->second; }
168 it = parameters.find("column");
169 //user has given a template file
170 if(it != parameters.end()){
171 path = m->hasPath(it->second);
172 //if the user has not given a path then, add inputdir. else leave path alone.
173 if (path == "") { parameters["column"] = inputDir + it->second; }
176 it = parameters.find("name");
177 //user has given a template file
178 if(it != parameters.end()){
179 path = m->hasPath(it->second);
180 //if the user has not given a path then, add inputdir. else leave path alone.
181 if (path == "") { parameters["name"] = inputDir + it->second; }
184 it = parameters.find("group");
185 //user has given a template file
186 if(it != parameters.end()){
187 path = m->hasPath(it->second);
188 //if the user has not given a path then, add inputdir. else leave path alone.
189 if (path == "") { parameters["group"] = inputDir + it->second; }
194 //if the user changes the output directory command factory will send this info to us in the output parameter
195 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
197 //check for required parameters
198 fastafile = validParameter.validFile(parameters, "fasta", true);
199 if (fastafile == "not found") {
200 fastafile = m->getFastaFile();
201 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
202 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
204 else if (fastafile == "not open") { abort = true; }
206 listfile = validParameter.validFile(parameters, "list", true);
207 if (listfile == "not found") {
208 listfile = m->getListFile();
209 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
210 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
212 else if (listfile == "not open") { abort = true; }
214 phylipfile = validParameter.validFile(parameters, "phylip", true);
215 if (phylipfile == "not found") { phylipfile = ""; }
216 else if (phylipfile == "not open") { abort = true; }
217 else { distFile = phylipfile; format = "phylip"; }
219 columnfile = validParameter.validFile(parameters, "column", true);
220 if (columnfile == "not found") { columnfile = ""; }
221 else if (columnfile == "not open") { abort = true; }
222 else { distFile = columnfile; format = "column"; }
224 namefile = validParameter.validFile(parameters, "name", true);
225 if (namefile == "not open") { abort = true; }
226 else if (namefile == "not found") { namefile = ""; }
228 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
229 //give priority to column, then phylip
230 columnfile = m->getColumnFile();
231 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
233 phylipfile = m->getPhylipFile();
234 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
236 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
240 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
242 if (columnfile != "") {
243 if (namefile == "") {
244 namefile = m->getNameFile();
245 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
247 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
253 //check for optional parameter and set defaults
254 // ...at some point should added some additional type checking...
255 label = validParameter.validFile(parameters, "label", false);
256 if (label == "not found") { label = ""; allLines = 1; }
258 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
259 else { allLines = 1; }
262 groupfile = validParameter.validFile(parameters, "group", true);
263 if (groupfile == "not open") { groupfile = ""; abort = true; }
264 else if (groupfile == "not found") { groupfile = ""; }
266 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
267 if (sorted == "none") { sorted=""; }
268 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
269 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
273 if ((sorted == "group") && (groupfile == "")) {
274 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
278 groups = validParameter.validFile(parameters, "groups", false);
279 if (groups == "not found") { groups = ""; }
281 if (groupfile == "") {
282 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
285 m->splitAtDash(groups, Groups);
290 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
291 large = m->isTrue(temp);
293 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
294 weighted = m->isTrue(temp);
296 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
298 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
299 convert(temp, precision);
301 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
302 convert(temp, cutoff);
303 cutoff += (5 / (precision * 10.0));
306 catch(exception& e) {
307 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
312 //**********************************************************************************************************************
314 int GetOTURepCommand::execute(){
317 if (abort == true) { if (calledHelp) { return 0; } return 2; }
322 //read distance files
323 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
324 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
325 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
327 readMatrix->setCutoff(cutoff);
330 nameMap = new NameAssignment(namefile);
332 }else{ nameMap = NULL; }
334 readMatrix->read(nameMap);
336 if (m->control_pressed) { delete readMatrix; return 0; }
338 list = readMatrix->getListVector();
340 SparseMatrix* matrix = readMatrix->getMatrix();
342 // Create a data structure to quickly access the distance information.
343 // It consists of a vector of distance maps, where each map contains
344 // all distances of a certain sequence. Vector and maps are accessed
345 // via the index of a sequence in the distance matrix
346 seqVec = vector<SeqMap>(list->size());
347 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
348 if (m->control_pressed) { delete readMatrix; return 0; }
349 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
351 //add dummy map for unweighted calc
353 seqVec.push_back(dummy);
359 if (m->control_pressed) { return 0; }
361 //process file and set up indexes
362 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
363 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
364 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
366 formatMatrix->setCutoff(cutoff);
369 nameMap = new NameAssignment(namefile);
371 }else{ nameMap = NULL; }
373 formatMatrix->read(nameMap);
375 if (m->control_pressed) { delete formatMatrix; return 0; }
377 list = formatMatrix->getListVector();
379 distFile = formatMatrix->getFormattedFileName();
381 //positions in file where the distances for each sequence begin
382 //rowPositions[1] = position in file where distance related to sequence 1 start.
383 rowPositions = formatMatrix->getRowPositions();
384 rowPositions.push_back(-1); //dummy row for unweighted calc
389 //openfile for getMap to use
390 m->openInputFile(distFile, inRow);
392 if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
396 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
398 vector<string> names;
400 //map names to rows in sparsematrix
401 for (int i = 0; i < list->size(); i++) {
403 binnames = list->get(i);
405 m->splitAtComma(binnames, names);
407 for (int j = 0; j < names.size(); j++) {
408 nameToIndex[names[j]] = i;
411 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
414 if (m->control_pressed) {
415 if (large) { inRow.close(); remove(distFile.c_str()); }
419 if (groupfile != "") {
420 //read in group map info.
421 groupMap = new GroupMap(groupfile);
422 int error = groupMap->readMap();
423 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
425 if (Groups.size() != 0) {
426 SharedUtil* util = new SharedUtil();
427 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
432 //done with listvector from matrix
433 if (list != NULL) { delete list; }
435 input = new InputData(listfile, "list");
436 list = input->getListVector();
437 string lastLabel = list->getLabel();
439 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
440 set<string> processedLabels;
441 set<string> userLabels = labels;
443 if (m->control_pressed) {
444 if (large) { inRow.close(); remove(distFile.c_str()); }
445 delete input; delete list; return 0;
448 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
450 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
452 if (allLines == 1 || labels.count(list->getLabel()) == 1){
453 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
454 error = process(list);
455 if (error == 1) { return 0; } //there is an error in hte input files, abort command
457 if (m->control_pressed) {
458 if (large) { inRow.close(); remove(distFile.c_str()); }
459 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
460 delete input; delete list; return 0;
463 processedLabels.insert(list->getLabel());
464 userLabels.erase(list->getLabel());
467 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
468 string saveLabel = list->getLabel();
471 list = input->getListVector(lastLabel);
472 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
473 error = process(list);
474 if (error == 1) { return 0; } //there is an error in hte input files, abort command
476 if (m->control_pressed) {
477 if (large) { inRow.close(); remove(distFile.c_str()); }
478 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
479 delete input; delete list; return 0;
482 processedLabels.insert(list->getLabel());
483 userLabels.erase(list->getLabel());
485 //restore real lastlabel to save below
486 list->setLabel(saveLabel);
489 lastLabel = list->getLabel();
492 list = input->getListVector();
495 //output error messages about any remaining user labels
496 bool needToRun = false;
497 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
498 m->mothurOut("Your file does not include the label " + (*it));
499 if (processedLabels.count(lastLabel) != 1) {
500 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
503 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
507 //run last label if you need to
508 if (needToRun == true) {
509 if (list != NULL) { delete list; }
510 list = input->getListVector(lastLabel);
511 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
512 error = process(list);
514 if (error == 1) { return 0; } //there is an error in hte input files, abort command
516 if (m->control_pressed) {
517 if (large) { inRow.close(); remove(distFile.c_str()); }
518 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
519 delete input; delete list; return 0;
523 //close and remove formatted matrix file
526 remove(distFile.c_str());
531 if (!weighted) { nameFileMap.clear(); }
534 fasta = new FastaMap();
535 fasta->readFastaFile(fastafile);
537 //if user gave a namesfile then use it
538 if (namefile != "") { readNamesFile(); }
540 //output create and output the .rep.fasta files
541 map<string, string>::iterator itNameFile;
542 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
543 processNames(itNameFile->first, itNameFile->second);
547 if (groupfile != "") { delete groupMap; }
549 if (m->control_pressed) { return 0; }
551 //set fasta file as new current fastafile - use first one??
553 itTypes = outputTypes.find("fasta");
554 if (itTypes != outputTypes.end()) {
555 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
558 itTypes = outputTypes.find("name");
559 if (itTypes != outputTypes.end()) {
560 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
563 m->mothurOutEndLine();
564 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
565 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
566 m->mothurOutEndLine();
570 catch(exception& e) {
571 m->errorOut(e, "GetOTURepCommand", "execute");
576 //**********************************************************************************************************************
577 void GetOTURepCommand::readNamesFile() {
580 vector<string> dupNames;
581 m->openInputFile(namefile, in);
583 string name, names, sequence;
586 in >> name; //read from first column A
587 in >> names; //read from second column A,B,C,D
591 //parse names into vector
592 m->splitAtComma(names, dupNames);
594 //store names in fasta map
595 sequence = fasta->getSequence(name);
596 for (int i = 0; i < dupNames.size(); i++) {
597 fasta->push_back(dupNames[i], sequence);
605 catch(exception& e) {
606 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
610 //**********************************************************************************************************************
611 //read names file to find the weighted rep for each bin
612 void GetOTURepCommand::readNamesFile(bool w) {
615 vector<string> dupNames;
616 m->openInputFile(namefile, in);
618 string name, names, sequence;
621 in >> name; m->gobble(in); //read from first column A
622 in >> names; //read from second column A,B,C,D
626 //parse names into vector
627 m->splitAtComma(names, dupNames);
629 for (int i = 0; i < dupNames.size(); i++) {
630 nameFileMap[dupNames[i]] = name;
638 catch(exception& e) {
639 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
643 //**********************************************************************************************************************
644 string GetOTURepCommand::findRep(vector<string> names) {
646 // if only 1 sequence in bin or processing the "unique" label, then
647 // the first sequence of the OTU is the representative one
648 if ((names.size() == 1) || (list->getLabel() == "unique")) {
651 vector<int> seqIndex(names.size());
652 vector<float> max_dist(names.size());
653 vector<float> total_dist(names.size());
654 map<string, string>::iterator itNameFile;
655 map<string, int>::iterator itNameIndex;
657 //fill seqIndex and initialize sums
658 for (size_t i = 0; i < names.size(); i++) {
660 seqIndex[i] = nameToIndex[names[i]];
662 if (namefile == "") {
663 itNameIndex = nameToIndex.find(names[i]);
665 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
666 if (large) { seqIndex[i] = (rowPositions.size()-1); }
667 else { seqIndex[i] = (seqVec.size()-1); }
669 seqIndex[i] = itNameIndex->second;
673 itNameFile = nameFileMap.find(names[i]);
675 if (itNameFile == nameFileMap.end()) {
676 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
678 string name1 = itNameFile->first;
679 string name2 = itNameFile->second;
681 if (name1 == name2) { //then you are unique so add your real dists
682 seqIndex[i] = nameToIndex[names[i]];
684 if (large) { seqIndex[i] = (rowPositions.size()-1); }
685 else { seqIndex[i] = (seqVec.size()-1); }
694 // loop through all entries in seqIndex
697 for (size_t i=0; i < seqIndex.size(); i++) {
698 if (m->control_pressed) { return "control"; }
700 if (!large) { currMap = seqVec[seqIndex[i]]; }
701 else { currMap = getMap(seqIndex[i]); }
703 for (size_t j=0; j < seqIndex.size(); j++) {
704 it = currMap.find(seqIndex[j]);
705 if (it != currMap.end()) {
706 max_dist[i] = max(max_dist[i], it->second);
707 max_dist[j] = max(max_dist[j], it->second);
708 total_dist[i] += it->second;
709 total_dist[j] += it->second;
710 }else{ //if you can't find the distance make it the cutoff
711 max_dist[i] = max(max_dist[i], cutoff);
712 max_dist[j] = max(max_dist[j], cutoff);
713 total_dist[i] += cutoff;
714 total_dist[j] += cutoff;
719 // sequence with the smallest maximum distance is the representative
720 //if tie occurs pick sequence with smallest average distance
723 for (size_t i=0; i < max_dist.size(); i++) {
724 if (m->control_pressed) { return "control"; }
725 if (max_dist[i] < min) {
728 }else if (max_dist[i] == min) {
729 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
730 float newAverage = total_dist[i] / (float) total_dist.size();
732 if (newAverage < currentAverage) {
739 return(names[minIndex]);
742 catch(exception& e) {
743 m->errorOut(e, "GetOTURepCommand", "FindRep");
748 //**********************************************************************************************************************
749 int GetOTURepCommand::process(ListVector* processList) {
751 string name, sequence;
755 if (outputDir == "") { outputDir += m->hasPath(listfile); }
757 ofstream newNamesOutput;
758 string outputNamesFile;
759 map<string, ofstream*> filehandles;
761 if (Groups.size() == 0) { //you don't want to use groups
762 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
763 m->openOutputFile(outputNamesFile, newNamesOutput);
764 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
765 outputNameFiles[outputNamesFile] = processList->getLabel();
766 }else{ //you want to use groups
768 for (int i=0; i<Groups.size(); i++) {
770 filehandles[Groups[i]] = temp;
771 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
773 m->openOutputFile(outputNamesFile, *(temp));
774 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
775 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
779 //for each bin in the list vector
780 for (int i = 0; i < processList->size(); i++) {
781 if (m->control_pressed) {
783 if (Groups.size() == 0) { //you don't want to use groups
784 newNamesOutput.close();
786 for (int j=0; j<Groups.size(); j++) {
787 (*(filehandles[Groups[j]])).close();
788 delete filehandles[Groups[j]];
794 string temp = processList->get(i);
795 vector<string> namesInBin;
796 m->splitAtComma(temp, namesInBin);
798 if (Groups.size() == 0) {
799 nameRep = findRep(namesInBin);
800 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
802 map<string, vector<string> > NamesInGroup;
803 for (int j=0; j<Groups.size(); j++) { //initialize groups
804 NamesInGroup[Groups[j]].resize(0);
807 for (int j=0; j<namesInBin.size(); j++) {
808 string thisgroup = groupMap->getGroup(namesInBin[j]);
810 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
812 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
813 NamesInGroup[thisgroup].push_back(namesInBin[j]);
817 //get rep for each group in otu
818 for (int j=0; j<Groups.size(); j++) {
819 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
820 //get rep for each group
821 nameRep = findRep(NamesInGroup[Groups[j]]);
823 //output group rep and other members of this group
824 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
826 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
827 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
830 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
836 if (Groups.size() == 0) { //you don't want to use groups
837 newNamesOutput.close();
839 for (int i=0; i<Groups.size(); i++) {
840 (*(filehandles[Groups[i]])).close();
841 delete filehandles[Groups[i]];
848 catch(exception& e) {
849 m->errorOut(e, "GetOTURepCommand", "process");
853 //**********************************************************************************************************************
854 int GetOTURepCommand::processNames(string filename, string label) {
858 if (outputDir == "") { outputDir += m->hasPath(listfile); }
859 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
860 m->openOutputFile(outputFileName, out);
861 vector<repStruct> reps;
862 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
865 string tempNameFile = filename + ".temp";
866 m->openOutputFile(tempNameFile, out2);
869 m->openInputFile(filename, in);
873 string rep, binnames;
874 in >> i >> rep >> binnames; m->gobble(in);
875 out2 << rep << '\t' << binnames << endl;
877 vector<string> names;
878 m->splitAtComma(binnames, names);
879 int binsize = names.size();
881 //if you have a groupfile
883 if (groupfile != "") {
884 map<string, string> groups;
885 map<string, string>::iterator groupIt;
887 //find the groups that are in this bin
888 for (size_t i = 0; i < names.size(); i++) {
889 string groupName = groupMap->getGroup(names[i]);
890 if (groupName == "not found") {
891 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
894 groups[groupName] = groupName;
898 //turn the groups into a string
899 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
900 group += groupIt->first + "-";
903 group = group.substr(0, group.length()-1);
907 //print out name and sequence for that bin
908 string sequence = fasta->getSequence(rep);
910 if (sequence != "not found") {
911 if (sorted == "") { //print them out
912 rep = rep + "\t" + toString(i+1);
913 rep = rep + "|" + toString(binsize);
914 if (groupfile != "") {
915 rep = rep + "|" + group;
917 out << ">" << rep << endl;
918 out << sequence << endl;
920 repStruct newRep(rep, i+1, binsize, group);
921 reps.push_back(newRep);
924 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
929 if (sorted != "") { //then sort them and print them
930 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
931 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
932 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
933 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
936 for (int i = 0; i < reps.size(); i++) {
937 string sequence = fasta->getSequence(reps[i].name);
938 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
939 outputName = outputName + "|" + toString(reps[i].size);
940 if (groupfile != "") {
941 outputName = outputName + "|" + reps[i].group;
943 out << ">" << outputName << endl;
944 out << sequence << endl;
952 remove(filename.c_str());
953 rename(tempNameFile.c_str(), filename.c_str());
958 catch(exception& e) {
959 m->errorOut(e, "GetOTURepCommand", "processNames");
963 //**********************************************************************************************************************
964 SeqMap GetOTURepCommand::getMap(int row) {
968 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
969 if (rowPositions[row] != -1){
971 inRow.seekg(rowPositions[row]);
973 int rowNum, numDists, colNum;
976 inRow >> rowNum >> numDists;
978 for(int i = 0; i < numDists; i++) {
979 inRow >> colNum >> dist;
980 rowMap[colNum] = dist;
987 catch(exception& e) {
988 m->errorOut(e, "GetOTURepCommand", "getMap");
992 //**********************************************************************************************************************