5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
44 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
58 vector<string> myArray;
59 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
63 m->errorOut(e, "GetOTURepCommand", "setParameters");
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){
70 string helpString = "";
71 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
72 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n";
74 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77 helpString += "The default value for label is all labels in your inputfile.\n";
78 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
80 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
81 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85 helpString += "The group parameter allows you provide a group file.\n";
86 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
93 m->errorOut(e, "GetOTURepCommand", "getHelpString");
97 //**********************************************************************************************************************
98 string GetOTURepCommand::getOutputFileNameTag(string type, string inputName=""){
100 string outputFileName = "";
101 map<string, vector<string> >::iterator it;
103 //is this a type this command creates
104 it = outputTypes.find(type);
105 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
107 if (type == "fasta") { outputFileName = "rep.fasta"; }
108 else if (type == "name") { outputFileName = "rep.names"; }
109 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
111 return outputFileName;
113 catch(exception& e) {
114 m->errorOut(e, "GetOTURepCommand", "getOutputFileNameTag");
118 //**********************************************************************************************************************
119 GetOTURepCommand::GetOTURepCommand(){
121 abort = true; calledHelp = true;
123 vector<string> tempOutNames;
124 outputTypes["fasta"] = tempOutNames;
125 outputTypes["name"] = tempOutNames;
127 catch(exception& e) {
128 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
132 //**********************************************************************************************************************
133 GetOTURepCommand::GetOTURepCommand(string option) {
135 abort = false; calledHelp = false;
138 //allow user to run help
139 if (option == "help") {
140 help(); abort = true; calledHelp = true;
141 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
143 vector<string> myArray = setParameters();
145 OptionParser parser(option);
146 map<string, string> parameters = parser.getParameters();
148 ValidParameters validParameter;
149 map<string, string>::iterator it;
151 //check to make sure all parameters are valid for command
152 for (it = parameters.begin(); it != parameters.end(); it++) {
153 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
156 //initialize outputTypes
157 vector<string> tempOutNames;
158 outputTypes["fasta"] = tempOutNames;
159 outputTypes["name"] = tempOutNames;
161 //if the user changes the input directory command factory will send this info to us in the output parameter
162 string inputDir = validParameter.validFile(parameters, "inputdir", false);
163 if (inputDir == "not found"){ inputDir = ""; }
166 it = parameters.find("list");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["list"] = inputDir + it->second; }
174 it = parameters.find("fasta");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["fasta"] = inputDir + it->second; }
182 it = parameters.find("phylip");
183 //user has given a template file
184 if(it != parameters.end()){
185 path = m->hasPath(it->second);
186 //if the user has not given a path then, add inputdir. else leave path alone.
187 if (path == "") { parameters["phylip"] = inputDir + it->second; }
190 it = parameters.find("column");
191 //user has given a template file
192 if(it != parameters.end()){
193 path = m->hasPath(it->second);
194 //if the user has not given a path then, add inputdir. else leave path alone.
195 if (path == "") { parameters["column"] = inputDir + it->second; }
198 it = parameters.find("name");
199 //user has given a template file
200 if(it != parameters.end()){
201 path = m->hasPath(it->second);
202 //if the user has not given a path then, add inputdir. else leave path alone.
203 if (path == "") { parameters["name"] = inputDir + it->second; }
206 it = parameters.find("group");
207 //user has given a template file
208 if(it != parameters.end()){
209 path = m->hasPath(it->second);
210 //if the user has not given a path then, add inputdir. else leave path alone.
211 if (path == "") { parameters["group"] = inputDir + it->second; }
216 //if the user changes the output directory command factory will send this info to us in the output parameter
217 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
219 //check for required parameters
220 fastafile = validParameter.validFile(parameters, "fasta", true);
221 if (fastafile == "not found") { fastafile = ""; }
222 else if (fastafile == "not open") { abort = true; }
223 else { m->setFastaFile(fastafile); }
225 listfile = validParameter.validFile(parameters, "list", true);
226 if (listfile == "not found") {
227 listfile = m->getListFile();
228 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
229 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
231 else if (listfile == "not open") { abort = true; }
232 else { m->setListFile(listfile); }
234 phylipfile = validParameter.validFile(parameters, "phylip", true);
235 if (phylipfile == "not found") { phylipfile = ""; }
236 else if (phylipfile == "not open") { abort = true; }
237 else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
239 columnfile = validParameter.validFile(parameters, "column", true);
240 if (columnfile == "not found") { columnfile = ""; }
241 else if (columnfile == "not open") { abort = true; }
242 else { distFile = columnfile; format = "column"; m->setColumnFile(columnfile); }
244 namefile = validParameter.validFile(parameters, "name", true);
245 if (namefile == "not open") { abort = true; }
246 else if (namefile == "not found") { namefile = ""; }
247 else { m->setNameFile(namefile); }
249 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
250 //give priority to column, then phylip
251 columnfile = m->getColumnFile();
252 if (columnfile != "") { distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
254 phylipfile = m->getPhylipFile();
255 if (phylipfile != "") { distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
257 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
261 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
263 if (columnfile != "") {
264 if (namefile == "") {
265 namefile = m->getNameFile();
266 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
268 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
274 //check for optional parameter and set defaults
275 // ...at some point should added some additional type checking...
276 label = validParameter.validFile(parameters, "label", false);
277 if (label == "not found") { label = ""; allLines = 1; }
279 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
280 else { allLines = 1; }
283 groupfile = validParameter.validFile(parameters, "group", true);
284 if (groupfile == "not open") { groupfile = ""; abort = true; }
285 else if (groupfile == "not found") { groupfile = ""; }
286 else { m->setGroupFile(groupfile); }
288 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
289 if (sorted == "none") { sorted=""; }
290 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
291 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
295 if ((sorted == "group") && (groupfile == "")) {
296 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
300 groups = validParameter.validFile(parameters, "groups", false);
301 if (groups == "not found") { groups = ""; }
303 if (groupfile == "") {
304 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
307 m->splitAtDash(groups, Groups);
310 m->setGroups(Groups);
312 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
313 large = m->isTrue(temp);
315 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
316 weighted = m->isTrue(temp);
318 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
320 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
321 m->mothurConvert(temp, precision);
323 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
324 m->mothurConvert(temp, cutoff);
325 cutoff += (5 / (precision * 10.0));
328 catch(exception& e) {
329 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
334 //**********************************************************************************************************************
336 int GetOTURepCommand::execute(){
339 if (abort == true) { if (calledHelp) { return 0; } return 2; }
344 //read distance files
345 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
346 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
347 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
349 readMatrix->setCutoff(cutoff);
352 nameMap = new NameAssignment(namefile);
354 }else{ nameMap = NULL; }
356 readMatrix->read(nameMap);
358 if (m->control_pressed) { delete readMatrix; return 0; }
360 list = readMatrix->getListVector();
362 SparseMatrix* matrix = readMatrix->getMatrix();
364 // Create a data structure to quickly access the distance information.
365 // It consists of a vector of distance maps, where each map contains
366 // all distances of a certain sequence. Vector and maps are accessed
367 // via the index of a sequence in the distance matrix
368 seqVec = vector<SeqMap>(list->size());
369 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
370 if (m->control_pressed) { delete readMatrix; return 0; }
371 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
373 //add dummy map for unweighted calc
375 seqVec.push_back(dummy);
381 if (m->control_pressed) { return 0; }
383 //process file and set up indexes
384 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
385 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
386 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
388 formatMatrix->setCutoff(cutoff);
391 nameMap = new NameAssignment(namefile);
393 }else{ nameMap = NULL; }
395 formatMatrix->read(nameMap);
397 if (m->control_pressed) { delete formatMatrix; return 0; }
399 list = formatMatrix->getListVector();
401 distFile = formatMatrix->getFormattedFileName();
403 //positions in file where the distances for each sequence begin
404 //rowPositions[1] = position in file where distance related to sequence 1 start.
405 rowPositions = formatMatrix->getRowPositions();
406 rowPositions.push_back(-1); //dummy row for unweighted calc
411 //openfile for getMap to use
412 m->openInputFile(distFile, inRow);
414 if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
418 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
420 vector<string> names;
422 //map names to rows in sparsematrix
423 for (int i = 0; i < list->size(); i++) {
425 binnames = list->get(i);
427 m->splitAtComma(binnames, names);
429 for (int j = 0; j < names.size(); j++) {
430 nameToIndex[names[j]] = i;
433 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
436 if (m->control_pressed) {
437 if (large) { inRow.close(); m->mothurRemove(distFile); }
441 if (groupfile != "") {
442 //read in group map info.
443 groupMap = new GroupMap(groupfile);
444 int error = groupMap->readMap();
445 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
447 if (Groups.size() != 0) {
448 SharedUtil* util = new SharedUtil();
449 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
450 util->setGroups(Groups, gNamesOfGroups, "getoturep");
451 groupMap->setNamesOfGroups(gNamesOfGroups);
456 //done with listvector from matrix
457 if (list != NULL) { delete list; }
459 input = new InputData(listfile, "list");
460 list = input->getListVector();
461 string lastLabel = list->getLabel();
463 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
464 set<string> processedLabels;
465 set<string> userLabels = labels;
467 if (m->control_pressed) {
468 if (large) { inRow.close(); m->mothurRemove(distFile); }
469 delete input; delete list; return 0;
472 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
474 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
476 if (allLines == 1 || labels.count(list->getLabel()) == 1){
477 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
478 error = process(list);
479 if (error == 1) { return 0; } //there is an error in hte input files, abort command
481 if (m->control_pressed) {
482 if (large) { inRow.close(); m->mothurRemove(distFile); }
483 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
484 delete input; delete list; return 0;
487 processedLabels.insert(list->getLabel());
488 userLabels.erase(list->getLabel());
491 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
492 string saveLabel = list->getLabel();
495 list = input->getListVector(lastLabel);
496 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
497 error = process(list);
498 if (error == 1) { return 0; } //there is an error in hte input files, abort command
500 if (m->control_pressed) {
501 if (large) { inRow.close(); m->mothurRemove(distFile); }
502 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
503 delete input; delete list; return 0;
506 processedLabels.insert(list->getLabel());
507 userLabels.erase(list->getLabel());
509 //restore real lastlabel to save below
510 list->setLabel(saveLabel);
513 lastLabel = list->getLabel();
516 list = input->getListVector();
519 //output error messages about any remaining user labels
520 bool needToRun = false;
521 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
522 m->mothurOut("Your file does not include the label " + (*it));
523 if (processedLabels.count(lastLabel) != 1) {
524 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
527 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
531 //run last label if you need to
532 if (needToRun == true) {
533 if (list != NULL) { delete list; }
534 list = input->getListVector(lastLabel);
535 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
536 error = process(list);
538 if (error == 1) { return 0; } //there is an error in hte input files, abort command
540 if (m->control_pressed) {
541 if (large) { inRow.close(); m->mothurRemove(distFile); }
542 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
543 delete input; delete list; return 0;
547 //close and remove formatted matrix file
550 m->mothurRemove(distFile);
555 if (!weighted) { nameFileMap.clear(); }
558 if (fastafile != "") {
560 fasta = new FastaMap();
561 fasta->readFastaFile(fastafile);
563 //if user gave a namesfile then use it
564 if (namefile != "") { readNamesFile(); }
566 //output create and output the .rep.fasta files
567 map<string, string>::iterator itNameFile;
568 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
569 processFastaNames(itNameFile->first, itNameFile->second);
572 //output create and output the .rep.fasta files
573 map<string, string>::iterator itNameFile;
574 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
575 processNames(itNameFile->first, itNameFile->second);
580 if (groupfile != "") { delete groupMap; }
582 if (m->control_pressed) { return 0; }
584 //set fasta file as new current fastafile - use first one??
586 itTypes = outputTypes.find("fasta");
587 if (itTypes != outputTypes.end()) {
588 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
591 itTypes = outputTypes.find("name");
592 if (itTypes != outputTypes.end()) {
593 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
596 m->mothurOutEndLine();
597 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
598 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
599 m->mothurOutEndLine();
603 catch(exception& e) {
604 m->errorOut(e, "GetOTURepCommand", "execute");
609 //**********************************************************************************************************************
610 void GetOTURepCommand::readNamesFile() {
613 vector<string> dupNames;
614 m->openInputFile(namefile, in);
616 string name, names, sequence;
619 in >> name; //read from first column A
620 in >> names; //read from second column A,B,C,D
624 //parse names into vector
625 m->splitAtComma(names, dupNames);
627 //store names in fasta map
628 sequence = fasta->getSequence(name);
629 for (int i = 0; i < dupNames.size(); i++) {
630 fasta->push_back(dupNames[i], sequence);
638 catch(exception& e) {
639 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
643 //**********************************************************************************************************************
644 //read names file to find the weighted rep for each bin
645 void GetOTURepCommand::readNamesFile(bool w) {
648 vector<string> dupNames;
649 m->openInputFile(namefile, in);
651 string name, names, sequence;
654 in >> name; m->gobble(in); //read from first column A
655 in >> names; //read from second column A,B,C,D
659 //parse names into vector
660 m->splitAtComma(names, dupNames);
662 for (int i = 0; i < dupNames.size(); i++) {
663 nameFileMap[dupNames[i]] = name;
671 catch(exception& e) {
672 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
676 //**********************************************************************************************************************
677 string GetOTURepCommand::findRep(vector<string> names) {
679 // if only 1 sequence in bin or processing the "unique" label, then
680 // the first sequence of the OTU is the representative one
681 if ((names.size() == 1)) {
684 vector<int> seqIndex(names.size());
685 vector<float> max_dist(names.size());
686 vector<float> total_dist(names.size());
687 map<string, string>::iterator itNameFile;
688 map<string, int>::iterator itNameIndex;
690 //fill seqIndex and initialize sums
691 for (size_t i = 0; i < names.size(); i++) {
693 seqIndex[i] = nameToIndex[names[i]];
695 if (namefile == "") {
696 itNameIndex = nameToIndex.find(names[i]);
698 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
699 if (large) { seqIndex[i] = (rowPositions.size()-1); }
700 else { seqIndex[i] = (seqVec.size()-1); }
702 seqIndex[i] = itNameIndex->second;
706 itNameFile = nameFileMap.find(names[i]);
708 if (itNameFile == nameFileMap.end()) {
709 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
711 string name1 = itNameFile->first;
712 string name2 = itNameFile->second;
714 if (name1 == name2) { //then you are unique so add your real dists
715 seqIndex[i] = nameToIndex[names[i]];
717 if (large) { seqIndex[i] = (rowPositions.size()-1); }
718 else { seqIndex[i] = (seqVec.size()-1); }
727 // loop through all entries in seqIndex
730 for (size_t i=0; i < seqIndex.size(); i++) {
731 if (m->control_pressed) { return "control"; }
733 if (!large) { currMap = seqVec[seqIndex[i]]; }
734 else { currMap = getMap(seqIndex[i]); }
736 for (size_t j=0; j < seqIndex.size(); j++) {
737 it = currMap.find(seqIndex[j]);
738 if (it != currMap.end()) {
739 max_dist[i] = max(max_dist[i], it->second);
740 max_dist[j] = max(max_dist[j], it->second);
741 total_dist[i] += it->second;
742 total_dist[j] += it->second;
743 }else{ //if you can't find the distance make it the cutoff
744 max_dist[i] = max(max_dist[i], cutoff);
745 max_dist[j] = max(max_dist[j], cutoff);
746 total_dist[i] += cutoff;
747 total_dist[j] += cutoff;
752 // sequence with the smallest maximum distance is the representative
753 //if tie occurs pick sequence with smallest average distance
756 for (size_t i=0; i < max_dist.size(); i++) {
757 if (m->control_pressed) { return "control"; }
758 if (max_dist[i] < min) {
761 }else if (max_dist[i] == min) {
762 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
763 float newAverage = total_dist[i] / (float) total_dist.size();
765 if (newAverage < currentAverage) {
772 return(names[minIndex]);
775 catch(exception& e) {
776 m->errorOut(e, "GetOTURepCommand", "FindRep");
781 //**********************************************************************************************************************
782 int GetOTURepCommand::process(ListVector* processList) {
784 string name, sequence;
788 if (outputDir == "") { outputDir += m->hasPath(listfile); }
790 ofstream newNamesOutput;
791 string outputNamesFile;
792 map<string, ofstream*> filehandles;
794 if (Groups.size() == 0) { //you don't want to use groups
795 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + getOutputFileNameTag("name");
796 m->openOutputFile(outputNamesFile, newNamesOutput);
797 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
798 outputNameFiles[outputNamesFile] = processList->getLabel();
799 }else{ //you want to use groups
801 for (int i=0; i<Groups.size(); i++) {
803 filehandles[Groups[i]] = temp;
804 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + "." + getOutputFileNameTag("name");
806 m->openOutputFile(outputNamesFile, *(temp));
807 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
808 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
812 //for each bin in the list vector
813 for (int i = 0; i < processList->size(); i++) {
814 if (m->control_pressed) {
816 if (Groups.size() == 0) { //you don't want to use groups
817 newNamesOutput.close();
819 for (int j=0; j<Groups.size(); j++) {
820 (*(filehandles[Groups[j]])).close();
821 delete filehandles[Groups[j]];
827 string temp = processList->get(i);
828 vector<string> namesInBin;
829 m->splitAtComma(temp, namesInBin);
831 if (Groups.size() == 0) {
832 nameRep = findRep(namesInBin);
833 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
835 map<string, vector<string> > NamesInGroup;
836 for (int j=0; j<Groups.size(); j++) { //initialize groups
837 NamesInGroup[Groups[j]].resize(0);
840 for (int j=0; j<namesInBin.size(); j++) {
841 string thisgroup = groupMap->getGroup(namesInBin[j]);
843 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
845 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
846 NamesInGroup[thisgroup].push_back(namesInBin[j]);
850 //get rep for each group in otu
851 for (int j=0; j<Groups.size(); j++) {
852 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
853 //get rep for each group
854 nameRep = findRep(NamesInGroup[Groups[j]]);
856 //output group rep and other members of this group
857 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
859 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
860 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
863 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
869 if (Groups.size() == 0) { //you don't want to use groups
870 newNamesOutput.close();
872 for (int i=0; i<Groups.size(); i++) {
873 (*(filehandles[Groups[i]])).close();
874 delete filehandles[Groups[i]];
881 catch(exception& e) {
882 m->errorOut(e, "GetOTURepCommand", "process");
886 //**********************************************************************************************************************
887 int GetOTURepCommand::processFastaNames(string filename, string label) {
891 if (outputDir == "") { outputDir += m->hasPath(listfile); }
892 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + "." + getOutputFileNameTag("fasta");
893 m->openOutputFile(outputFileName, out);
894 vector<repStruct> reps;
895 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
898 string tempNameFile = filename + ".temp";
899 m->openOutputFile(tempNameFile, out2);
902 m->openInputFile(filename, in);
906 string rep, binnames;
907 in >> i >> rep >> binnames; m->gobble(in);
908 out2 << rep << '\t' << binnames << endl;
910 vector<string> names;
911 m->splitAtComma(binnames, names);
912 int binsize = names.size();
914 //if you have a groupfile
916 if (groupfile != "") {
917 map<string, string> groups;
918 map<string, string>::iterator groupIt;
920 //find the groups that are in this bin
921 for (size_t i = 0; i < names.size(); i++) {
922 string groupName = groupMap->getGroup(names[i]);
923 if (groupName == "not found") {
924 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
927 groups[groupName] = groupName;
931 //turn the groups into a string
932 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
933 group += groupIt->first + "-";
936 group = group.substr(0, group.length()-1);
940 //print out name and sequence for that bin
941 string sequence = fasta->getSequence(rep);
943 if (sequence != "not found") {
944 if (sorted == "") { //print them out
945 rep = rep + "\t" + toString(i+1);
946 rep = rep + "|" + toString(binsize);
947 if (groupfile != "") {
948 rep = rep + "|" + group;
950 out << ">" << rep << endl;
951 out << sequence << endl;
953 repStruct newRep(rep, i+1, binsize, group);
954 reps.push_back(newRep);
957 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
962 if (sorted != "") { //then sort them and print them
963 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
964 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
965 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
966 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
969 for (int i = 0; i < reps.size(); i++) {
970 string sequence = fasta->getSequence(reps[i].name);
971 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
972 outputName = outputName + "|" + toString(reps[i].size);
973 if (groupfile != "") {
974 outputName = outputName + "|" + reps[i].group;
976 out << ">" << outputName << endl;
977 out << sequence << endl;
985 m->mothurRemove(filename);
986 rename(tempNameFile.c_str(), filename.c_str());
991 catch(exception& e) {
992 m->errorOut(e, "GetOTURepCommand", "processFastaNames");
996 //**********************************************************************************************************************
997 int GetOTURepCommand::processNames(string filename, string label) {
1000 //create output file
1001 if (outputDir == "") { outputDir += m->hasPath(listfile); }
1004 string tempNameFile = filename + ".temp";
1005 m->openOutputFile(tempNameFile, out2);
1008 m->openInputFile(filename, in);
1011 string rep, binnames;
1013 if (m->control_pressed) { break; }
1014 in >> i >> rep >> binnames; m->gobble(in);
1015 out2 << rep << '\t' << binnames << endl;
1020 m->mothurRemove(filename);
1021 rename(tempNameFile.c_str(), filename.c_str());
1025 catch(exception& e) {
1026 m->errorOut(e, "GetOTURepCommand", "processNames");
1030 //**********************************************************************************************************************
1031 SeqMap GetOTURepCommand::getMap(int row) {
1035 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
1036 if (rowPositions[row] != -1){
1038 inRow.seekg(rowPositions[row]);
1040 int rowNum, numDists, colNum;
1043 inRow >> rowNum >> numDists;
1045 for(int i = 0; i < numDists; i++) {
1046 inRow >> colNum >> dist;
1047 rowMap[colNum] = dist;
1054 catch(exception& e) {
1055 m->errorOut(e, "GetOTURepCommand", "getMap");
1059 //**********************************************************************************************************************