5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
44 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
58 vector<string> myArray;
59 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
63 m->errorOut(e, "GetOTURepCommand", "setParameters");
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){
70 string helpString = "";
71 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
72 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n";
74 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77 helpString += "The default value for label is all labels in your inputfile.\n";
78 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
80 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
81 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85 helpString += "The group parameter allows you provide a group file.\n";
86 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
93 m->errorOut(e, "GetOTURepCommand", "getHelpString");
97 //**********************************************************************************************************************
98 string GetOTURepCommand::getOutputFileNameTag(string type, string inputName=""){
100 string outputFileName = "";
101 map<string, vector<string> >::iterator it;
103 //is this a type this command creates
104 it = outputTypes.find(type);
105 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
107 if (type == "fasta") { outputFileName = "rep.fasta"; }
108 else if (type == "name") { outputFileName = "rep.names"; }
109 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
111 return outputFileName;
113 catch(exception& e) {
114 m->errorOut(e, "GetOTURepCommand", "getOutputFileNameTag");
118 //**********************************************************************************************************************
119 GetOTURepCommand::GetOTURepCommand(){
121 abort = true; calledHelp = true;
123 vector<string> tempOutNames;
124 outputTypes["fasta"] = tempOutNames;
125 outputTypes["name"] = tempOutNames;
127 catch(exception& e) {
128 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
132 //**********************************************************************************************************************
133 GetOTURepCommand::GetOTURepCommand(string option) {
135 abort = false; calledHelp = false;
138 //allow user to run help
139 if (option == "help") {
140 help(); abort = true; calledHelp = true;
141 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
143 vector<string> myArray = setParameters();
145 OptionParser parser(option);
146 map<string, string> parameters = parser.getParameters();
148 ValidParameters validParameter;
149 map<string, string>::iterator it;
151 //check to make sure all parameters are valid for command
152 for (it = parameters.begin(); it != parameters.end(); it++) {
153 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
156 //initialize outputTypes
157 vector<string> tempOutNames;
158 outputTypes["fasta"] = tempOutNames;
159 outputTypes["name"] = tempOutNames;
161 //if the user changes the input directory command factory will send this info to us in the output parameter
162 string inputDir = validParameter.validFile(parameters, "inputdir", false);
163 if (inputDir == "not found"){ inputDir = ""; }
166 it = parameters.find("list");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["list"] = inputDir + it->second; }
174 it = parameters.find("fasta");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["fasta"] = inputDir + it->second; }
182 it = parameters.find("phylip");
183 //user has given a template file
184 if(it != parameters.end()){
185 path = m->hasPath(it->second);
186 //if the user has not given a path then, add inputdir. else leave path alone.
187 if (path == "") { parameters["phylip"] = inputDir + it->second; }
190 it = parameters.find("column");
191 //user has given a template file
192 if(it != parameters.end()){
193 path = m->hasPath(it->second);
194 //if the user has not given a path then, add inputdir. else leave path alone.
195 if (path == "") { parameters["column"] = inputDir + it->second; }
198 it = parameters.find("name");
199 //user has given a template file
200 if(it != parameters.end()){
201 path = m->hasPath(it->second);
202 //if the user has not given a path then, add inputdir. else leave path alone.
203 if (path == "") { parameters["name"] = inputDir + it->second; }
206 it = parameters.find("group");
207 //user has given a template file
208 if(it != parameters.end()){
209 path = m->hasPath(it->second);
210 //if the user has not given a path then, add inputdir. else leave path alone.
211 if (path == "") { parameters["group"] = inputDir + it->second; }
216 //if the user changes the output directory command factory will send this info to us in the output parameter
217 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
219 //check for required parameters
220 fastafile = validParameter.validFile(parameters, "fasta", true);
221 if (fastafile == "not found") { fastafile = ""; }
222 else if (fastafile == "not open") { abort = true; }
223 else { m->setFastaFile(fastafile); }
225 listfile = validParameter.validFile(parameters, "list", true);
226 if (listfile == "not found") {
227 listfile = m->getListFile();
228 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
229 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
231 else if (listfile == "not open") { abort = true; }
232 else { m->setListFile(listfile); }
234 phylipfile = validParameter.validFile(parameters, "phylip", true);
235 if (phylipfile == "not found") { phylipfile = ""; }
236 else if (phylipfile == "not open") { abort = true; }
237 else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
239 columnfile = validParameter.validFile(parameters, "column", true);
240 if (columnfile == "not found") { columnfile = ""; }
241 else if (columnfile == "not open") { abort = true; }
242 else { distFile = columnfile; format = "column"; m->setColumnFile(columnfile); }
244 namefile = validParameter.validFile(parameters, "name", true);
245 if (namefile == "not open") { abort = true; }
246 else if (namefile == "not found") { namefile = ""; }
247 else { m->setNameFile(namefile); }
249 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
250 //give priority to column, then phylip
251 columnfile = m->getColumnFile();
252 if (columnfile != "") { distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
254 phylipfile = m->getPhylipFile();
255 if (phylipfile != "") { distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
257 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
261 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
263 if (columnfile != "") {
264 if (namefile == "") {
265 namefile = m->getNameFile();
266 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
268 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
274 //check for optional parameter and set defaults
275 // ...at some point should added some additional type checking...
276 label = validParameter.validFile(parameters, "label", false);
277 if (label == "not found") { label = ""; allLines = 1; }
279 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
280 else { allLines = 1; }
283 groupfile = validParameter.validFile(parameters, "group", true);
284 if (groupfile == "not open") { groupfile = ""; abort = true; }
285 else if (groupfile == "not found") { groupfile = ""; }
286 else { m->setGroupFile(groupfile); }
288 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
289 if (sorted == "none") { sorted=""; }
290 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
291 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
295 if ((sorted == "group") && (groupfile == "")) {
296 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
300 groups = validParameter.validFile(parameters, "groups", false);
301 if (groups == "not found") { groups = ""; }
303 if (groupfile == "") {
304 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
307 m->splitAtDash(groups, Groups);
310 m->setGroups(Groups);
312 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
313 large = m->isTrue(temp);
315 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
316 weighted = m->isTrue(temp);
318 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
320 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
321 m->mothurConvert(temp, precision);
323 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
324 m->mothurConvert(temp, cutoff);
325 cutoff += (5 / (precision * 10.0));
328 catch(exception& e) {
329 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
334 //**********************************************************************************************************************
336 int GetOTURepCommand::execute(){
339 if (abort == true) { if (calledHelp) { return 0; } return 2; }
344 //read distance files
345 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
346 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
347 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
349 readMatrix->setCutoff(cutoff);
352 nameMap = new NameAssignment(namefile);
354 }else{ nameMap = NULL; }
356 readMatrix->read(nameMap);
358 if (m->control_pressed) { delete readMatrix; return 0; }
360 list = readMatrix->getListVector();
362 SparseDistanceMatrix* matrix = readMatrix->getDMatrix();
364 // Create a data structure to quickly access the distance information.
365 // It consists of a vector of distance maps, where each map contains
366 // all distances of a certain sequence. Vector and maps are accessed
367 // via the index of a sequence in the distance matrix
368 seqVec = vector<SeqMap>(list->size());
369 for (int i = 0; i < matrix->seqVec.size(); i++) {
370 for (int j = 0; j < matrix->seqVec[i].size(); j++) {
371 if (m->control_pressed) { delete readMatrix; return 0; }
372 //already added everyone else in row
373 if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; }
376 //add dummy map for unweighted calc
378 seqVec.push_back(dummy);
384 if (m->control_pressed) { return 0; }
386 //process file and set up indexes
387 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
388 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
389 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
391 formatMatrix->setCutoff(cutoff);
394 nameMap = new NameAssignment(namefile);
396 }else{ nameMap = NULL; }
398 formatMatrix->read(nameMap);
400 if (m->control_pressed) { delete formatMatrix; return 0; }
402 list = formatMatrix->getListVector();
404 distFile = formatMatrix->getFormattedFileName();
406 //positions in file where the distances for each sequence begin
407 //rowPositions[1] = position in file where distance related to sequence 1 start.
408 rowPositions = formatMatrix->getRowPositions();
409 rowPositions.push_back(-1); //dummy row for unweighted calc
414 //openfile for getMap to use
415 m->openInputFile(distFile, inRow);
417 if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
421 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
423 vector<string> names;
425 //map names to rows in sparsematrix
426 for (int i = 0; i < list->size(); i++) {
428 binnames = list->get(i);
430 m->splitAtComma(binnames, names);
432 for (int j = 0; j < names.size(); j++) {
433 nameToIndex[names[j]] = i;
436 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
439 if (m->control_pressed) {
440 if (large) { inRow.close(); m->mothurRemove(distFile); }
444 if (groupfile != "") {
445 //read in group map info.
446 groupMap = new GroupMap(groupfile);
447 int error = groupMap->readMap();
448 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
450 if (Groups.size() != 0) {
451 SharedUtil* util = new SharedUtil();
452 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
453 util->setGroups(Groups, gNamesOfGroups, "getoturep");
454 groupMap->setNamesOfGroups(gNamesOfGroups);
459 //done with listvector from matrix
460 if (list != NULL) { delete list; }
462 input = new InputData(listfile, "list");
463 list = input->getListVector();
464 string lastLabel = list->getLabel();
466 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
467 set<string> processedLabels;
468 set<string> userLabels = labels;
470 if (m->control_pressed) {
471 if (large) { inRow.close(); m->mothurRemove(distFile); }
472 delete input; delete list; return 0;
475 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
477 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
479 if (allLines == 1 || labels.count(list->getLabel()) == 1){
480 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
481 error = process(list);
482 if (error == 1) { return 0; } //there is an error in hte input files, abort command
484 if (m->control_pressed) {
485 if (large) { inRow.close(); m->mothurRemove(distFile); }
486 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
487 delete input; delete list; return 0;
490 processedLabels.insert(list->getLabel());
491 userLabels.erase(list->getLabel());
494 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
495 string saveLabel = list->getLabel();
498 list = input->getListVector(lastLabel);
499 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
500 error = process(list);
501 if (error == 1) { return 0; } //there is an error in hte input files, abort command
503 if (m->control_pressed) {
504 if (large) { inRow.close(); m->mothurRemove(distFile); }
505 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
506 delete input; delete list; return 0;
509 processedLabels.insert(list->getLabel());
510 userLabels.erase(list->getLabel());
512 //restore real lastlabel to save below
513 list->setLabel(saveLabel);
516 lastLabel = list->getLabel();
519 list = input->getListVector();
522 //output error messages about any remaining user labels
523 bool needToRun = false;
524 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
525 m->mothurOut("Your file does not include the label " + (*it));
526 if (processedLabels.count(lastLabel) != 1) {
527 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
530 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
534 //run last label if you need to
535 if (needToRun == true) {
536 if (list != NULL) { delete list; }
537 list = input->getListVector(lastLabel);
538 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
539 error = process(list);
541 if (error == 1) { return 0; } //there is an error in hte input files, abort command
543 if (m->control_pressed) {
544 if (large) { inRow.close(); m->mothurRemove(distFile); }
545 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
546 delete input; delete list; return 0;
550 //close and remove formatted matrix file
553 m->mothurRemove(distFile);
558 if (!weighted) { nameFileMap.clear(); }
561 if (fastafile != "") {
563 fasta = new FastaMap();
564 fasta->readFastaFile(fastafile);
566 //if user gave a namesfile then use it
567 if (namefile != "") { readNamesFile(); }
569 //output create and output the .rep.fasta files
570 map<string, string>::iterator itNameFile;
571 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
572 processFastaNames(itNameFile->first, itNameFile->second);
575 //output create and output the .rep.fasta files
576 map<string, string>::iterator itNameFile;
577 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
578 processNames(itNameFile->first, itNameFile->second);
583 if (groupfile != "") { delete groupMap; }
585 if (m->control_pressed) { return 0; }
587 //set fasta file as new current fastafile - use first one??
589 itTypes = outputTypes.find("fasta");
590 if (itTypes != outputTypes.end()) {
591 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
594 itTypes = outputTypes.find("name");
595 if (itTypes != outputTypes.end()) {
596 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
599 m->mothurOutEndLine();
600 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
601 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
602 m->mothurOutEndLine();
606 catch(exception& e) {
607 m->errorOut(e, "GetOTURepCommand", "execute");
612 //**********************************************************************************************************************
613 void GetOTURepCommand::readNamesFile() {
616 vector<string> dupNames;
617 m->openInputFile(namefile, in);
619 string name, names, sequence;
622 in >> name; //read from first column A
623 in >> names; //read from second column A,B,C,D
627 //parse names into vector
628 m->splitAtComma(names, dupNames);
630 //store names in fasta map
631 sequence = fasta->getSequence(name);
632 for (int i = 0; i < dupNames.size(); i++) {
633 fasta->push_back(dupNames[i], sequence);
641 catch(exception& e) {
642 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
646 //**********************************************************************************************************************
647 //read names file to find the weighted rep for each bin
648 void GetOTURepCommand::readNamesFile(bool w) {
651 vector<string> dupNames;
652 m->openInputFile(namefile, in);
654 string name, names, sequence;
657 in >> name; m->gobble(in); //read from first column A
658 in >> names; //read from second column A,B,C,D
662 //parse names into vector
663 m->splitAtComma(names, dupNames);
665 for (int i = 0; i < dupNames.size(); i++) {
666 nameFileMap[dupNames[i]] = name;
674 catch(exception& e) {
675 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
679 //**********************************************************************************************************************
680 string GetOTURepCommand::findRep(vector<string> names) {
682 // if only 1 sequence in bin or processing the "unique" label, then
683 // the first sequence of the OTU is the representative one
684 if ((names.size() == 1)) {
687 vector<int> seqIndex(names.size());
688 vector<float> max_dist(names.size());
689 vector<float> total_dist(names.size());
690 map<string, string>::iterator itNameFile;
691 map<string, int>::iterator itNameIndex;
693 //fill seqIndex and initialize sums
694 for (size_t i = 0; i < names.size(); i++) {
696 seqIndex[i] = nameToIndex[names[i]];
698 if (namefile == "") {
699 itNameIndex = nameToIndex.find(names[i]);
701 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
702 if (large) { seqIndex[i] = (rowPositions.size()-1); }
703 else { seqIndex[i] = (seqVec.size()-1); }
705 seqIndex[i] = itNameIndex->second;
709 itNameFile = nameFileMap.find(names[i]);
711 if (itNameFile == nameFileMap.end()) {
712 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
714 string name1 = itNameFile->first;
715 string name2 = itNameFile->second;
717 if (name1 == name2) { //then you are unique so add your real dists
718 seqIndex[i] = nameToIndex[names[i]];
720 if (large) { seqIndex[i] = (rowPositions.size()-1); }
721 else { seqIndex[i] = (seqVec.size()-1); }
730 // loop through all entries in seqIndex
733 for (size_t i=0; i < seqIndex.size(); i++) {
734 if (m->control_pressed) { return "control"; }
736 if (!large) { currMap = seqVec[seqIndex[i]]; }
737 else { currMap = getMap(seqIndex[i]); }
739 for (size_t j=0; j < seqIndex.size(); j++) {
740 it = currMap.find(seqIndex[j]);
741 if (it != currMap.end()) {
742 max_dist[i] = max(max_dist[i], it->second);
743 max_dist[j] = max(max_dist[j], it->second);
744 total_dist[i] += it->second;
745 total_dist[j] += it->second;
746 }else{ //if you can't find the distance make it the cutoff
747 max_dist[i] = max(max_dist[i], cutoff);
748 max_dist[j] = max(max_dist[j], cutoff);
749 total_dist[i] += cutoff;
750 total_dist[j] += cutoff;
755 // sequence with the smallest maximum distance is the representative
756 //if tie occurs pick sequence with smallest average distance
759 for (size_t i=0; i < max_dist.size(); i++) {
760 if (m->control_pressed) { return "control"; }
761 if (max_dist[i] < min) {
764 }else if (max_dist[i] == min) {
765 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
766 float newAverage = total_dist[i] / (float) total_dist.size();
768 if (newAverage < currentAverage) {
775 return(names[minIndex]);
778 catch(exception& e) {
779 m->errorOut(e, "GetOTURepCommand", "FindRep");
784 //**********************************************************************************************************************
785 int GetOTURepCommand::process(ListVector* processList) {
787 string name, sequence;
791 if (outputDir == "") { outputDir += m->hasPath(listfile); }
793 ofstream newNamesOutput;
794 string outputNamesFile;
795 map<string, ofstream*> filehandles;
797 if (Groups.size() == 0) { //you don't want to use groups
798 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + getOutputFileNameTag("name");
799 m->openOutputFile(outputNamesFile, newNamesOutput);
800 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
801 outputNameFiles[outputNamesFile] = processList->getLabel();
802 }else{ //you want to use groups
804 for (int i=0; i<Groups.size(); i++) {
806 filehandles[Groups[i]] = temp;
807 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + "." + getOutputFileNameTag("name");
809 m->openOutputFile(outputNamesFile, *(temp));
810 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
811 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
815 //for each bin in the list vector
816 for (int i = 0; i < processList->size(); i++) {
817 if (m->control_pressed) {
819 if (Groups.size() == 0) { //you don't want to use groups
820 newNamesOutput.close();
822 for (int j=0; j<Groups.size(); j++) {
823 (*(filehandles[Groups[j]])).close();
824 delete filehandles[Groups[j]];
830 string temp = processList->get(i);
831 vector<string> namesInBin;
832 m->splitAtComma(temp, namesInBin);
834 if (Groups.size() == 0) {
835 nameRep = findRep(namesInBin);
836 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
838 map<string, vector<string> > NamesInGroup;
839 for (int j=0; j<Groups.size(); j++) { //initialize groups
840 NamesInGroup[Groups[j]].resize(0);
843 for (int j=0; j<namesInBin.size(); j++) {
844 string thisgroup = groupMap->getGroup(namesInBin[j]);
846 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
848 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
849 NamesInGroup[thisgroup].push_back(namesInBin[j]);
853 //get rep for each group in otu
854 for (int j=0; j<Groups.size(); j++) {
855 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
856 //get rep for each group
857 nameRep = findRep(NamesInGroup[Groups[j]]);
859 //output group rep and other members of this group
860 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
862 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
863 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
866 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
872 if (Groups.size() == 0) { //you don't want to use groups
873 newNamesOutput.close();
875 for (int i=0; i<Groups.size(); i++) {
876 (*(filehandles[Groups[i]])).close();
877 delete filehandles[Groups[i]];
884 catch(exception& e) {
885 m->errorOut(e, "GetOTURepCommand", "process");
889 //**********************************************************************************************************************
890 int GetOTURepCommand::processFastaNames(string filename, string label) {
894 if (outputDir == "") { outputDir += m->hasPath(listfile); }
895 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + "." + getOutputFileNameTag("fasta");
896 m->openOutputFile(outputFileName, out);
897 vector<repStruct> reps;
898 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
901 string tempNameFile = filename + ".temp";
902 m->openOutputFile(tempNameFile, out2);
905 m->openInputFile(filename, in);
909 string rep, binnames;
910 in >> i >> rep >> binnames; m->gobble(in);
911 out2 << rep << '\t' << binnames << endl;
913 vector<string> names;
914 m->splitAtComma(binnames, names);
915 int binsize = names.size();
917 //if you have a groupfile
919 if (groupfile != "") {
920 map<string, string> groups;
921 map<string, string>::iterator groupIt;
923 //find the groups that are in this bin
924 for (size_t i = 0; i < names.size(); i++) {
925 string groupName = groupMap->getGroup(names[i]);
926 if (groupName == "not found") {
927 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
930 groups[groupName] = groupName;
934 //turn the groups into a string
935 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
936 group += groupIt->first + "-";
939 group = group.substr(0, group.length()-1);
943 //print out name and sequence for that bin
944 string sequence = fasta->getSequence(rep);
946 if (sequence != "not found") {
947 if (sorted == "") { //print them out
948 rep = rep + "\t" + toString(i+1);
949 rep = rep + "|" + toString(binsize);
950 if (groupfile != "") {
951 rep = rep + "|" + group;
953 out << ">" << rep << endl;
954 out << sequence << endl;
956 repStruct newRep(rep, i+1, binsize, group);
957 reps.push_back(newRep);
960 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
965 if (sorted != "") { //then sort them and print them
966 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
967 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
968 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
969 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
972 for (int i = 0; i < reps.size(); i++) {
973 string sequence = fasta->getSequence(reps[i].name);
974 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
975 outputName = outputName + "|" + toString(reps[i].size);
976 if (groupfile != "") {
977 outputName = outputName + "|" + reps[i].group;
979 out << ">" << outputName << endl;
980 out << sequence << endl;
988 m->mothurRemove(filename);
989 rename(tempNameFile.c_str(), filename.c_str());
994 catch(exception& e) {
995 m->errorOut(e, "GetOTURepCommand", "processFastaNames");
999 //**********************************************************************************************************************
1000 int GetOTURepCommand::processNames(string filename, string label) {
1003 //create output file
1004 if (outputDir == "") { outputDir += m->hasPath(listfile); }
1007 string tempNameFile = filename + ".temp";
1008 m->openOutputFile(tempNameFile, out2);
1011 m->openInputFile(filename, in);
1014 string rep, binnames;
1016 if (m->control_pressed) { break; }
1017 in >> i >> rep >> binnames; m->gobble(in);
1018 out2 << rep << '\t' << binnames << endl;
1023 m->mothurRemove(filename);
1024 rename(tempNameFile.c_str(), filename.c_str());
1028 catch(exception& e) {
1029 m->errorOut(e, "GetOTURepCommand", "processNames");
1033 //**********************************************************************************************************************
1034 SeqMap GetOTURepCommand::getMap(int row) {
1038 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
1039 if (rowPositions[row] != -1){
1041 inRow.seekg(rowPositions[row]);
1043 int rowNum, numDists, colNum;
1046 inRow >> rowNum >> numDists;
1048 for(int i = 0; i < numDists; i++) {
1049 inRow >> colNum >> dist;
1050 rowMap[colNum] = dist;
1057 catch(exception& e) {
1058 m->errorOut(e, "GetOTURepCommand", "getMap");
1062 //**********************************************************************************************************************