5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
44 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
58 vector<string> myArray;
59 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
63 m->errorOut(e, "GetOTURepCommand", "setParameters");
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){
70 string helpString = "";
71 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name, unless you have valid current files.\n";
72 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n";
74 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77 helpString += "The default value for label is all labels in your inputfile.\n";
78 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
80 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
81 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85 helpString += "The group parameter allows you provide a group file.\n";
86 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
93 m->errorOut(e, "GetOTURepCommand", "getHelpString");
97 //**********************************************************************************************************************
98 GetOTURepCommand::GetOTURepCommand(){
100 abort = true; calledHelp = true;
102 vector<string> tempOutNames;
103 outputTypes["fasta"] = tempOutNames;
104 outputTypes["name"] = tempOutNames;
106 catch(exception& e) {
107 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
111 //**********************************************************************************************************************
112 GetOTURepCommand::GetOTURepCommand(string option) {
114 abort = false; calledHelp = false;
117 //allow user to run help
118 if (option == "help") {
119 help(); abort = true; calledHelp = true;
120 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
122 vector<string> myArray = setParameters();
124 OptionParser parser(option);
125 map<string, string> parameters = parser.getParameters();
127 ValidParameters validParameter;
128 map<string, string>::iterator it;
130 //check to make sure all parameters are valid for command
131 for (it = parameters.begin(); it != parameters.end(); it++) {
132 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
135 //initialize outputTypes
136 vector<string> tempOutNames;
137 outputTypes["fasta"] = tempOutNames;
138 outputTypes["name"] = tempOutNames;
140 //if the user changes the input directory command factory will send this info to us in the output parameter
141 string inputDir = validParameter.validFile(parameters, "inputdir", false);
142 if (inputDir == "not found"){ inputDir = ""; }
145 it = parameters.find("list");
146 //user has given a template file
147 if(it != parameters.end()){
148 path = m->hasPath(it->second);
149 //if the user has not given a path then, add inputdir. else leave path alone.
150 if (path == "") { parameters["list"] = inputDir + it->second; }
153 it = parameters.find("fasta");
154 //user has given a template file
155 if(it != parameters.end()){
156 path = m->hasPath(it->second);
157 //if the user has not given a path then, add inputdir. else leave path alone.
158 if (path == "") { parameters["fasta"] = inputDir + it->second; }
161 it = parameters.find("phylip");
162 //user has given a template file
163 if(it != parameters.end()){
164 path = m->hasPath(it->second);
165 //if the user has not given a path then, add inputdir. else leave path alone.
166 if (path == "") { parameters["phylip"] = inputDir + it->second; }
169 it = parameters.find("column");
170 //user has given a template file
171 if(it != parameters.end()){
172 path = m->hasPath(it->second);
173 //if the user has not given a path then, add inputdir. else leave path alone.
174 if (path == "") { parameters["column"] = inputDir + it->second; }
177 it = parameters.find("name");
178 //user has given a template file
179 if(it != parameters.end()){
180 path = m->hasPath(it->second);
181 //if the user has not given a path then, add inputdir. else leave path alone.
182 if (path == "") { parameters["name"] = inputDir + it->second; }
185 it = parameters.find("group");
186 //user has given a template file
187 if(it != parameters.end()){
188 path = m->hasPath(it->second);
189 //if the user has not given a path then, add inputdir. else leave path alone.
190 if (path == "") { parameters["group"] = inputDir + it->second; }
195 //if the user changes the output directory command factory will send this info to us in the output parameter
196 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
198 //check for required parameters
199 fastafile = validParameter.validFile(parameters, "fasta", true);
200 if (fastafile == "not found") {
201 fastafile = m->getFastaFile();
202 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
203 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
205 else if (fastafile == "not open") { abort = true; }
206 else { m->setFastaFile(fastafile); }
208 listfile = validParameter.validFile(parameters, "list", true);
209 if (listfile == "not found") {
210 listfile = m->getListFile();
211 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
212 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
214 else if (listfile == "not open") { abort = true; }
215 else { m->setListFile(listfile); }
217 phylipfile = validParameter.validFile(parameters, "phylip", true);
218 if (phylipfile == "not found") { phylipfile = ""; }
219 else if (phylipfile == "not open") { abort = true; }
220 else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
222 columnfile = validParameter.validFile(parameters, "column", true);
223 if (columnfile == "not found") { columnfile = ""; }
224 else if (columnfile == "not open") { abort = true; }
225 else { distFile = columnfile; format = "column"; m->setColumnFile(columnfile); }
227 namefile = validParameter.validFile(parameters, "name", true);
228 if (namefile == "not open") { abort = true; }
229 else if (namefile == "not found") { namefile = ""; }
230 else { m->setNameFile(namefile); }
232 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
233 //give priority to column, then phylip
234 columnfile = m->getColumnFile();
235 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
237 phylipfile = m->getPhylipFile();
238 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
240 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
244 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
246 if (columnfile != "") {
247 if (namefile == "") {
248 namefile = m->getNameFile();
249 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
251 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
257 //check for optional parameter and set defaults
258 // ...at some point should added some additional type checking...
259 label = validParameter.validFile(parameters, "label", false);
260 if (label == "not found") { label = ""; allLines = 1; }
262 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
263 else { allLines = 1; }
266 groupfile = validParameter.validFile(parameters, "group", true);
267 if (groupfile == "not open") { groupfile = ""; abort = true; }
268 else if (groupfile == "not found") { groupfile = ""; }
269 else { m->setGroupFile(groupfile); }
271 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
272 if (sorted == "none") { sorted=""; }
273 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
274 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
278 if ((sorted == "group") && (groupfile == "")) {
279 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
283 groups = validParameter.validFile(parameters, "groups", false);
284 if (groups == "not found") { groups = ""; }
286 if (groupfile == "") {
287 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
290 m->splitAtDash(groups, Groups);
293 m->setGroups(Groups);
295 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
296 large = m->isTrue(temp);
298 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
299 weighted = m->isTrue(temp);
301 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
303 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
304 convert(temp, precision);
306 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
307 convert(temp, cutoff);
308 cutoff += (5 / (precision * 10.0));
311 catch(exception& e) {
312 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
317 //**********************************************************************************************************************
319 int GetOTURepCommand::execute(){
322 if (abort == true) { if (calledHelp) { return 0; } return 2; }
327 //read distance files
328 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
329 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
330 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
332 readMatrix->setCutoff(cutoff);
335 nameMap = new NameAssignment(namefile);
337 }else{ nameMap = NULL; }
339 readMatrix->read(nameMap);
341 if (m->control_pressed) { delete readMatrix; return 0; }
343 list = readMatrix->getListVector();
345 SparseMatrix* matrix = readMatrix->getMatrix();
347 // Create a data structure to quickly access the distance information.
348 // It consists of a vector of distance maps, where each map contains
349 // all distances of a certain sequence. Vector and maps are accessed
350 // via the index of a sequence in the distance matrix
351 seqVec = vector<SeqMap>(list->size());
352 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
353 if (m->control_pressed) { delete readMatrix; return 0; }
354 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
356 //add dummy map for unweighted calc
358 seqVec.push_back(dummy);
364 if (m->control_pressed) { return 0; }
366 //process file and set up indexes
367 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
368 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
369 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
371 formatMatrix->setCutoff(cutoff);
374 nameMap = new NameAssignment(namefile);
376 }else{ nameMap = NULL; }
378 formatMatrix->read(nameMap);
380 if (m->control_pressed) { delete formatMatrix; return 0; }
382 list = formatMatrix->getListVector();
384 distFile = formatMatrix->getFormattedFileName();
386 //positions in file where the distances for each sequence begin
387 //rowPositions[1] = position in file where distance related to sequence 1 start.
388 rowPositions = formatMatrix->getRowPositions();
389 rowPositions.push_back(-1); //dummy row for unweighted calc
394 //openfile for getMap to use
395 m->openInputFile(distFile, inRow);
397 if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
401 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
403 vector<string> names;
405 //map names to rows in sparsematrix
406 for (int i = 0; i < list->size(); i++) {
408 binnames = list->get(i);
410 m->splitAtComma(binnames, names);
412 for (int j = 0; j < names.size(); j++) {
413 nameToIndex[names[j]] = i;
416 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
419 if (m->control_pressed) {
420 if (large) { inRow.close(); m->mothurRemove(distFile); }
424 if (groupfile != "") {
425 //read in group map info.
426 groupMap = new GroupMap(groupfile);
427 int error = groupMap->readMap();
428 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
430 if (Groups.size() != 0) {
431 SharedUtil* util = new SharedUtil();
432 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
433 util->setGroups(Groups, gNamesOfGroups, "getoturep");
434 groupMap->setNamesOfGroups(gNamesOfGroups);
439 //done with listvector from matrix
440 if (list != NULL) { delete list; }
442 input = new InputData(listfile, "list");
443 list = input->getListVector();
444 string lastLabel = list->getLabel();
446 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
447 set<string> processedLabels;
448 set<string> userLabels = labels;
450 if (m->control_pressed) {
451 if (large) { inRow.close(); m->mothurRemove(distFile); }
452 delete input; delete list; return 0;
455 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
457 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
459 if (allLines == 1 || labels.count(list->getLabel()) == 1){
460 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
461 error = process(list);
462 if (error == 1) { return 0; } //there is an error in hte input files, abort command
464 if (m->control_pressed) {
465 if (large) { inRow.close(); m->mothurRemove(distFile); }
466 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
467 delete input; delete list; return 0;
470 processedLabels.insert(list->getLabel());
471 userLabels.erase(list->getLabel());
474 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
475 string saveLabel = list->getLabel();
478 list = input->getListVector(lastLabel);
479 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
480 error = process(list);
481 if (error == 1) { return 0; } //there is an error in hte input files, abort command
483 if (m->control_pressed) {
484 if (large) { inRow.close(); m->mothurRemove(distFile); }
485 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
486 delete input; delete list; return 0;
489 processedLabels.insert(list->getLabel());
490 userLabels.erase(list->getLabel());
492 //restore real lastlabel to save below
493 list->setLabel(saveLabel);
496 lastLabel = list->getLabel();
499 list = input->getListVector();
502 //output error messages about any remaining user labels
503 bool needToRun = false;
504 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
505 m->mothurOut("Your file does not include the label " + (*it));
506 if (processedLabels.count(lastLabel) != 1) {
507 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
510 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
514 //run last label if you need to
515 if (needToRun == true) {
516 if (list != NULL) { delete list; }
517 list = input->getListVector(lastLabel);
518 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
519 error = process(list);
521 if (error == 1) { return 0; } //there is an error in hte input files, abort command
523 if (m->control_pressed) {
524 if (large) { inRow.close(); m->mothurRemove(distFile); }
525 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
526 delete input; delete list; return 0;
530 //close and remove formatted matrix file
533 m->mothurRemove(distFile);
538 if (!weighted) { nameFileMap.clear(); }
541 fasta = new FastaMap();
542 fasta->readFastaFile(fastafile);
544 //if user gave a namesfile then use it
545 if (namefile != "") { readNamesFile(); }
547 //output create and output the .rep.fasta files
548 map<string, string>::iterator itNameFile;
549 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
550 processNames(itNameFile->first, itNameFile->second);
554 if (groupfile != "") { delete groupMap; }
556 if (m->control_pressed) { return 0; }
558 //set fasta file as new current fastafile - use first one??
560 itTypes = outputTypes.find("fasta");
561 if (itTypes != outputTypes.end()) {
562 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
565 itTypes = outputTypes.find("name");
566 if (itTypes != outputTypes.end()) {
567 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
570 m->mothurOutEndLine();
571 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
572 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
573 m->mothurOutEndLine();
577 catch(exception& e) {
578 m->errorOut(e, "GetOTURepCommand", "execute");
583 //**********************************************************************************************************************
584 void GetOTURepCommand::readNamesFile() {
587 vector<string> dupNames;
588 m->openInputFile(namefile, in);
590 string name, names, sequence;
593 in >> name; //read from first column A
594 in >> names; //read from second column A,B,C,D
598 //parse names into vector
599 m->splitAtComma(names, dupNames);
601 //store names in fasta map
602 sequence = fasta->getSequence(name);
603 for (int i = 0; i < dupNames.size(); i++) {
604 fasta->push_back(dupNames[i], sequence);
612 catch(exception& e) {
613 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
617 //**********************************************************************************************************************
618 //read names file to find the weighted rep for each bin
619 void GetOTURepCommand::readNamesFile(bool w) {
622 vector<string> dupNames;
623 m->openInputFile(namefile, in);
625 string name, names, sequence;
628 in >> name; m->gobble(in); //read from first column A
629 in >> names; //read from second column A,B,C,D
633 //parse names into vector
634 m->splitAtComma(names, dupNames);
636 for (int i = 0; i < dupNames.size(); i++) {
637 nameFileMap[dupNames[i]] = name;
645 catch(exception& e) {
646 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
650 //**********************************************************************************************************************
651 string GetOTURepCommand::findRep(vector<string> names) {
653 // if only 1 sequence in bin or processing the "unique" label, then
654 // the first sequence of the OTU is the representative one
655 if ((names.size() == 1)) {
658 vector<int> seqIndex(names.size());
659 vector<float> max_dist(names.size());
660 vector<float> total_dist(names.size());
661 map<string, string>::iterator itNameFile;
662 map<string, int>::iterator itNameIndex;
664 //fill seqIndex and initialize sums
665 for (size_t i = 0; i < names.size(); i++) {
667 seqIndex[i] = nameToIndex[names[i]];
669 if (namefile == "") {
670 itNameIndex = nameToIndex.find(names[i]);
672 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
673 if (large) { seqIndex[i] = (rowPositions.size()-1); }
674 else { seqIndex[i] = (seqVec.size()-1); }
676 seqIndex[i] = itNameIndex->second;
680 itNameFile = nameFileMap.find(names[i]);
682 if (itNameFile == nameFileMap.end()) {
683 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
685 string name1 = itNameFile->first;
686 string name2 = itNameFile->second;
688 if (name1 == name2) { //then you are unique so add your real dists
689 seqIndex[i] = nameToIndex[names[i]];
691 if (large) { seqIndex[i] = (rowPositions.size()-1); }
692 else { seqIndex[i] = (seqVec.size()-1); }
701 // loop through all entries in seqIndex
704 for (size_t i=0; i < seqIndex.size(); i++) {
705 if (m->control_pressed) { return "control"; }
707 if (!large) { currMap = seqVec[seqIndex[i]]; }
708 else { currMap = getMap(seqIndex[i]); }
710 for (size_t j=0; j < seqIndex.size(); j++) {
711 it = currMap.find(seqIndex[j]);
712 if (it != currMap.end()) {
713 max_dist[i] = max(max_dist[i], it->second);
714 max_dist[j] = max(max_dist[j], it->second);
715 total_dist[i] += it->second;
716 total_dist[j] += it->second;
717 }else{ //if you can't find the distance make it the cutoff
718 max_dist[i] = max(max_dist[i], cutoff);
719 max_dist[j] = max(max_dist[j], cutoff);
720 total_dist[i] += cutoff;
721 total_dist[j] += cutoff;
726 // sequence with the smallest maximum distance is the representative
727 //if tie occurs pick sequence with smallest average distance
730 for (size_t i=0; i < max_dist.size(); i++) {
731 if (m->control_pressed) { return "control"; }
732 if (max_dist[i] < min) {
735 }else if (max_dist[i] == min) {
736 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
737 float newAverage = total_dist[i] / (float) total_dist.size();
739 if (newAverage < currentAverage) {
746 return(names[minIndex]);
749 catch(exception& e) {
750 m->errorOut(e, "GetOTURepCommand", "FindRep");
755 //**********************************************************************************************************************
756 int GetOTURepCommand::process(ListVector* processList) {
758 string name, sequence;
762 if (outputDir == "") { outputDir += m->hasPath(listfile); }
764 ofstream newNamesOutput;
765 string outputNamesFile;
766 map<string, ofstream*> filehandles;
768 if (Groups.size() == 0) { //you don't want to use groups
769 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
770 m->openOutputFile(outputNamesFile, newNamesOutput);
771 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
772 outputNameFiles[outputNamesFile] = processList->getLabel();
773 }else{ //you want to use groups
775 for (int i=0; i<Groups.size(); i++) {
777 filehandles[Groups[i]] = temp;
778 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
780 m->openOutputFile(outputNamesFile, *(temp));
781 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
782 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
786 //for each bin in the list vector
787 for (int i = 0; i < processList->size(); i++) {
788 if (m->control_pressed) {
790 if (Groups.size() == 0) { //you don't want to use groups
791 newNamesOutput.close();
793 for (int j=0; j<Groups.size(); j++) {
794 (*(filehandles[Groups[j]])).close();
795 delete filehandles[Groups[j]];
801 string temp = processList->get(i);
802 vector<string> namesInBin;
803 m->splitAtComma(temp, namesInBin);
805 if (Groups.size() == 0) {
806 nameRep = findRep(namesInBin);
807 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
809 map<string, vector<string> > NamesInGroup;
810 for (int j=0; j<Groups.size(); j++) { //initialize groups
811 NamesInGroup[Groups[j]].resize(0);
814 for (int j=0; j<namesInBin.size(); j++) {
815 string thisgroup = groupMap->getGroup(namesInBin[j]);
817 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
819 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
820 NamesInGroup[thisgroup].push_back(namesInBin[j]);
824 //get rep for each group in otu
825 for (int j=0; j<Groups.size(); j++) {
826 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
827 //get rep for each group
828 nameRep = findRep(NamesInGroup[Groups[j]]);
830 //output group rep and other members of this group
831 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
833 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
834 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
837 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
843 if (Groups.size() == 0) { //you don't want to use groups
844 newNamesOutput.close();
846 for (int i=0; i<Groups.size(); i++) {
847 (*(filehandles[Groups[i]])).close();
848 delete filehandles[Groups[i]];
855 catch(exception& e) {
856 m->errorOut(e, "GetOTURepCommand", "process");
860 //**********************************************************************************************************************
861 int GetOTURepCommand::processNames(string filename, string label) {
865 if (outputDir == "") { outputDir += m->hasPath(listfile); }
866 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
867 m->openOutputFile(outputFileName, out);
868 vector<repStruct> reps;
869 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
872 string tempNameFile = filename + ".temp";
873 m->openOutputFile(tempNameFile, out2);
876 m->openInputFile(filename, in);
880 string rep, binnames;
881 in >> i >> rep >> binnames; m->gobble(in);
882 out2 << rep << '\t' << binnames << endl;
884 vector<string> names;
885 m->splitAtComma(binnames, names);
886 int binsize = names.size();
888 //if you have a groupfile
890 if (groupfile != "") {
891 map<string, string> groups;
892 map<string, string>::iterator groupIt;
894 //find the groups that are in this bin
895 for (size_t i = 0; i < names.size(); i++) {
896 string groupName = groupMap->getGroup(names[i]);
897 if (groupName == "not found") {
898 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
901 groups[groupName] = groupName;
905 //turn the groups into a string
906 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
907 group += groupIt->first + "-";
910 group = group.substr(0, group.length()-1);
914 //print out name and sequence for that bin
915 string sequence = fasta->getSequence(rep);
917 if (sequence != "not found") {
918 if (sorted == "") { //print them out
919 rep = rep + "\t" + toString(i+1);
920 rep = rep + "|" + toString(binsize);
921 if (groupfile != "") {
922 rep = rep + "|" + group;
924 out << ">" << rep << endl;
925 out << sequence << endl;
927 repStruct newRep(rep, i+1, binsize, group);
928 reps.push_back(newRep);
931 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
936 if (sorted != "") { //then sort them and print them
937 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
938 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
939 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
940 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
943 for (int i = 0; i < reps.size(); i++) {
944 string sequence = fasta->getSequence(reps[i].name);
945 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
946 outputName = outputName + "|" + toString(reps[i].size);
947 if (groupfile != "") {
948 outputName = outputName + "|" + reps[i].group;
950 out << ">" << outputName << endl;
951 out << sequence << endl;
959 m->mothurRemove(filename);
960 rename(tempNameFile.c_str(), filename.c_str());
965 catch(exception& e) {
966 m->errorOut(e, "GetOTURepCommand", "processNames");
970 //**********************************************************************************************************************
971 SeqMap GetOTURepCommand::getMap(int row) {
975 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
976 if (rowPositions[row] != -1){
978 inRow.seekg(rowPositions[row]);
980 int rowNum, numDists, colNum;
983 inRow >> rowNum >> numDists;
985 for(int i = 0; i < numDists; i++) {
986 inRow >> colNum >> dist;
987 rowMap[colNum] = dist;
994 catch(exception& e) {
995 m->errorOut(e, "GetOTURepCommand", "getMap");
999 //**********************************************************************************************************************