5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
44 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
58 vector<string> myArray;
59 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
63 m->errorOut(e, "GetOTURepCommand", "setParameters");
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){
70 string helpString = "";
71 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name, unless you have valid current files.\n";
72 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n";
74 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77 helpString += "The default value for label is all labels in your inputfile.\n";
78 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
80 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
81 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85 helpString += "The group parameter allows you provide a group file.\n";
86 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
93 m->errorOut(e, "GetOTURepCommand", "getHelpString");
97 //**********************************************************************************************************************
98 GetOTURepCommand::GetOTURepCommand(){
100 abort = true; calledHelp = true;
102 vector<string> tempOutNames;
103 outputTypes["fasta"] = tempOutNames;
104 outputTypes["name"] = tempOutNames;
106 catch(exception& e) {
107 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
111 //**********************************************************************************************************************
112 GetOTURepCommand::GetOTURepCommand(string option) {
114 abort = false; calledHelp = false;
117 //allow user to run help
118 if (option == "help") {
119 help(); abort = true; calledHelp = true;
120 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
122 vector<string> myArray = setParameters();
124 OptionParser parser(option);
125 map<string, string> parameters = parser.getParameters();
127 ValidParameters validParameter;
128 map<string, string>::iterator it;
130 //check to make sure all parameters are valid for command
131 for (it = parameters.begin(); it != parameters.end(); it++) {
132 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
135 //initialize outputTypes
136 vector<string> tempOutNames;
137 outputTypes["fasta"] = tempOutNames;
138 outputTypes["name"] = tempOutNames;
140 //if the user changes the input directory command factory will send this info to us in the output parameter
141 string inputDir = validParameter.validFile(parameters, "inputdir", false);
142 if (inputDir == "not found"){ inputDir = ""; }
145 it = parameters.find("list");
146 //user has given a template file
147 if(it != parameters.end()){
148 path = m->hasPath(it->second);
149 //if the user has not given a path then, add inputdir. else leave path alone.
150 if (path == "") { parameters["list"] = inputDir + it->second; }
153 it = parameters.find("fasta");
154 //user has given a template file
155 if(it != parameters.end()){
156 path = m->hasPath(it->second);
157 //if the user has not given a path then, add inputdir. else leave path alone.
158 if (path == "") { parameters["fasta"] = inputDir + it->second; }
161 it = parameters.find("phylip");
162 //user has given a template file
163 if(it != parameters.end()){
164 path = m->hasPath(it->second);
165 //if the user has not given a path then, add inputdir. else leave path alone.
166 if (path == "") { parameters["phylip"] = inputDir + it->second; }
169 it = parameters.find("column");
170 //user has given a template file
171 if(it != parameters.end()){
172 path = m->hasPath(it->second);
173 //if the user has not given a path then, add inputdir. else leave path alone.
174 if (path == "") { parameters["column"] = inputDir + it->second; }
177 it = parameters.find("name");
178 //user has given a template file
179 if(it != parameters.end()){
180 path = m->hasPath(it->second);
181 //if the user has not given a path then, add inputdir. else leave path alone.
182 if (path == "") { parameters["name"] = inputDir + it->second; }
185 it = parameters.find("group");
186 //user has given a template file
187 if(it != parameters.end()){
188 path = m->hasPath(it->second);
189 //if the user has not given a path then, add inputdir. else leave path alone.
190 if (path == "") { parameters["group"] = inputDir + it->second; }
195 //if the user changes the output directory command factory will send this info to us in the output parameter
196 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
198 //check for required parameters
199 fastafile = validParameter.validFile(parameters, "fasta", true);
200 if (fastafile == "not found") {
201 fastafile = m->getFastaFile();
202 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
203 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
205 else if (fastafile == "not open") { abort = true; }
207 listfile = validParameter.validFile(parameters, "list", true);
208 if (listfile == "not found") {
209 listfile = m->getListFile();
210 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
211 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
213 else if (listfile == "not open") { abort = true; }
215 phylipfile = validParameter.validFile(parameters, "phylip", true);
216 if (phylipfile == "not found") { phylipfile = ""; }
217 else if (phylipfile == "not open") { abort = true; }
218 else { distFile = phylipfile; format = "phylip"; }
220 columnfile = validParameter.validFile(parameters, "column", true);
221 if (columnfile == "not found") { columnfile = ""; }
222 else if (columnfile == "not open") { abort = true; }
223 else { distFile = columnfile; format = "column"; }
225 namefile = validParameter.validFile(parameters, "name", true);
226 if (namefile == "not open") { abort = true; }
227 else if (namefile == "not found") { namefile = ""; }
229 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
230 //give priority to column, then phylip
231 columnfile = m->getColumnFile();
232 if (columnfile != "") { m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
234 phylipfile = m->getPhylipFile();
235 if (phylipfile != "") { m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
237 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
241 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
243 if (columnfile != "") {
244 if (namefile == "") {
245 namefile = m->getNameFile();
246 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
248 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine();
254 //check for optional parameter and set defaults
255 // ...at some point should added some additional type checking...
256 label = validParameter.validFile(parameters, "label", false);
257 if (label == "not found") { label = ""; allLines = 1; }
259 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
260 else { allLines = 1; }
263 groupfile = validParameter.validFile(parameters, "group", true);
264 if (groupfile == "not open") { groupfile = ""; abort = true; }
265 else if (groupfile == "not found") { groupfile = ""; }
267 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
268 if (sorted == "none") { sorted=""; }
269 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
270 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
274 if ((sorted == "group") && (groupfile == "")) {
275 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
279 groups = validParameter.validFile(parameters, "groups", false);
280 if (groups == "not found") { groups = ""; }
282 if (groupfile == "") {
283 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
286 m->splitAtDash(groups, Groups);
291 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
292 large = m->isTrue(temp);
294 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
295 weighted = m->isTrue(temp);
297 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
299 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
300 convert(temp, precision);
302 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
303 convert(temp, cutoff);
304 cutoff += (5 / (precision * 10.0));
307 catch(exception& e) {
308 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
313 //**********************************************************************************************************************
315 int GetOTURepCommand::execute(){
318 if (abort == true) { if (calledHelp) { return 0; } return 2; }
323 //read distance files
324 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
325 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
326 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
328 readMatrix->setCutoff(cutoff);
331 nameMap = new NameAssignment(namefile);
333 }else{ nameMap = NULL; }
335 readMatrix->read(nameMap);
337 if (m->control_pressed) { delete readMatrix; return 0; }
339 list = readMatrix->getListVector();
341 SparseMatrix* matrix = readMatrix->getMatrix();
343 // Create a data structure to quickly access the distance information.
344 // It consists of a vector of distance maps, where each map contains
345 // all distances of a certain sequence. Vector and maps are accessed
346 // via the index of a sequence in the distance matrix
347 seqVec = vector<SeqMap>(list->size());
348 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
349 if (m->control_pressed) { delete readMatrix; return 0; }
350 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
352 //add dummy map for unweighted calc
354 seqVec.push_back(dummy);
360 if (m->control_pressed) { return 0; }
362 //process file and set up indexes
363 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
364 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
365 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
367 formatMatrix->setCutoff(cutoff);
370 nameMap = new NameAssignment(namefile);
372 }else{ nameMap = NULL; }
374 formatMatrix->read(nameMap);
376 if (m->control_pressed) { delete formatMatrix; return 0; }
378 list = formatMatrix->getListVector();
380 distFile = formatMatrix->getFormattedFileName();
382 //positions in file where the distances for each sequence begin
383 //rowPositions[1] = position in file where distance related to sequence 1 start.
384 rowPositions = formatMatrix->getRowPositions();
385 rowPositions.push_back(-1); //dummy row for unweighted calc
390 //openfile for getMap to use
391 m->openInputFile(distFile, inRow);
393 if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
397 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
399 vector<string> names;
401 //map names to rows in sparsematrix
402 for (int i = 0; i < list->size(); i++) {
404 binnames = list->get(i);
406 m->splitAtComma(binnames, names);
408 for (int j = 0; j < names.size(); j++) {
409 nameToIndex[names[j]] = i;
412 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
415 if (m->control_pressed) {
416 if (large) { inRow.close(); remove(distFile.c_str()); }
420 if (groupfile != "") {
421 //read in group map info.
422 groupMap = new GroupMap(groupfile);
423 int error = groupMap->readMap();
424 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
426 if (Groups.size() != 0) {
427 SharedUtil* util = new SharedUtil();
428 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
433 //done with listvector from matrix
434 if (list != NULL) { delete list; }
436 input = new InputData(listfile, "list");
437 list = input->getListVector();
438 string lastLabel = list->getLabel();
440 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
441 set<string> processedLabels;
442 set<string> userLabels = labels;
444 if (m->control_pressed) {
445 if (large) { inRow.close(); remove(distFile.c_str()); }
446 delete input; delete list; return 0;
449 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
451 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
453 if (allLines == 1 || labels.count(list->getLabel()) == 1){
454 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
455 error = process(list);
456 if (error == 1) { return 0; } //there is an error in hte input files, abort command
458 if (m->control_pressed) {
459 if (large) { inRow.close(); remove(distFile.c_str()); }
460 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
461 delete input; delete list; return 0;
464 processedLabels.insert(list->getLabel());
465 userLabels.erase(list->getLabel());
468 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
469 string saveLabel = list->getLabel();
472 list = input->getListVector(lastLabel);
473 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
474 error = process(list);
475 if (error == 1) { return 0; } //there is an error in hte input files, abort command
477 if (m->control_pressed) {
478 if (large) { inRow.close(); remove(distFile.c_str()); }
479 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
480 delete input; delete list; return 0;
483 processedLabels.insert(list->getLabel());
484 userLabels.erase(list->getLabel());
486 //restore real lastlabel to save below
487 list->setLabel(saveLabel);
490 lastLabel = list->getLabel();
493 list = input->getListVector();
496 //output error messages about any remaining user labels
497 bool needToRun = false;
498 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
499 m->mothurOut("Your file does not include the label " + (*it));
500 if (processedLabels.count(lastLabel) != 1) {
501 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
504 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
508 //run last label if you need to
509 if (needToRun == true) {
510 if (list != NULL) { delete list; }
511 list = input->getListVector(lastLabel);
512 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
513 error = process(list);
515 if (error == 1) { return 0; } //there is an error in hte input files, abort command
517 if (m->control_pressed) {
518 if (large) { inRow.close(); remove(distFile.c_str()); }
519 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
520 delete input; delete list; return 0;
524 //close and remove formatted matrix file
527 remove(distFile.c_str());
532 if (!weighted) { nameFileMap.clear(); }
535 fasta = new FastaMap();
536 fasta->readFastaFile(fastafile);
538 //if user gave a namesfile then use it
539 if (namefile != "") { readNamesFile(); }
541 //output create and output the .rep.fasta files
542 map<string, string>::iterator itNameFile;
543 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
544 processNames(itNameFile->first, itNameFile->second);
548 if (groupfile != "") { delete groupMap; }
550 if (m->control_pressed) { return 0; }
552 //set fasta file as new current fastafile - use first one??
554 itTypes = outputTypes.find("fasta");
555 if (itTypes != outputTypes.end()) {
556 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
559 itTypes = outputTypes.find("name");
560 if (itTypes != outputTypes.end()) {
561 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
564 m->mothurOutEndLine();
565 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
566 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
567 m->mothurOutEndLine();
571 catch(exception& e) {
572 m->errorOut(e, "GetOTURepCommand", "execute");
577 //**********************************************************************************************************************
578 void GetOTURepCommand::readNamesFile() {
581 vector<string> dupNames;
582 m->openInputFile(namefile, in);
584 string name, names, sequence;
587 in >> name; //read from first column A
588 in >> names; //read from second column A,B,C,D
592 //parse names into vector
593 m->splitAtComma(names, dupNames);
595 //store names in fasta map
596 sequence = fasta->getSequence(name);
597 for (int i = 0; i < dupNames.size(); i++) {
598 fasta->push_back(dupNames[i], sequence);
606 catch(exception& e) {
607 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
611 //**********************************************************************************************************************
612 //read names file to find the weighted rep for each bin
613 void GetOTURepCommand::readNamesFile(bool w) {
616 vector<string> dupNames;
617 m->openInputFile(namefile, in);
619 string name, names, sequence;
622 in >> name; m->gobble(in); //read from first column A
623 in >> names; //read from second column A,B,C,D
627 //parse names into vector
628 m->splitAtComma(names, dupNames);
630 for (int i = 0; i < dupNames.size(); i++) {
631 nameFileMap[dupNames[i]] = name;
639 catch(exception& e) {
640 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
644 //**********************************************************************************************************************
645 string GetOTURepCommand::findRep(vector<string> names) {
647 // if only 1 sequence in bin or processing the "unique" label, then
648 // the first sequence of the OTU is the representative one
649 if ((names.size() == 1)) {
652 vector<int> seqIndex(names.size());
653 vector<float> max_dist(names.size());
654 vector<float> total_dist(names.size());
655 map<string, string>::iterator itNameFile;
656 map<string, int>::iterator itNameIndex;
658 //fill seqIndex and initialize sums
659 for (size_t i = 0; i < names.size(); i++) {
661 seqIndex[i] = nameToIndex[names[i]];
663 if (namefile == "") {
664 itNameIndex = nameToIndex.find(names[i]);
666 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
667 if (large) { seqIndex[i] = (rowPositions.size()-1); }
668 else { seqIndex[i] = (seqVec.size()-1); }
670 seqIndex[i] = itNameIndex->second;
674 itNameFile = nameFileMap.find(names[i]);
676 if (itNameFile == nameFileMap.end()) {
677 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
679 string name1 = itNameFile->first;
680 string name2 = itNameFile->second;
682 if (name1 == name2) { //then you are unique so add your real dists
683 seqIndex[i] = nameToIndex[names[i]];
685 if (large) { seqIndex[i] = (rowPositions.size()-1); }
686 else { seqIndex[i] = (seqVec.size()-1); }
695 // loop through all entries in seqIndex
698 for (size_t i=0; i < seqIndex.size(); i++) {
699 if (m->control_pressed) { return "control"; }
701 if (!large) { currMap = seqVec[seqIndex[i]]; }
702 else { currMap = getMap(seqIndex[i]); }
704 for (size_t j=0; j < seqIndex.size(); j++) {
705 it = currMap.find(seqIndex[j]);
706 if (it != currMap.end()) {
707 max_dist[i] = max(max_dist[i], it->second);
708 max_dist[j] = max(max_dist[j], it->second);
709 total_dist[i] += it->second;
710 total_dist[j] += it->second;
711 }else{ //if you can't find the distance make it the cutoff
712 max_dist[i] = max(max_dist[i], cutoff);
713 max_dist[j] = max(max_dist[j], cutoff);
714 total_dist[i] += cutoff;
715 total_dist[j] += cutoff;
720 // sequence with the smallest maximum distance is the representative
721 //if tie occurs pick sequence with smallest average distance
724 for (size_t i=0; i < max_dist.size(); i++) {
725 if (m->control_pressed) { return "control"; }
726 if (max_dist[i] < min) {
729 }else if (max_dist[i] == min) {
730 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
731 float newAverage = total_dist[i] / (float) total_dist.size();
733 if (newAverage < currentAverage) {
740 return(names[minIndex]);
743 catch(exception& e) {
744 m->errorOut(e, "GetOTURepCommand", "FindRep");
749 //**********************************************************************************************************************
750 int GetOTURepCommand::process(ListVector* processList) {
752 string name, sequence;
756 if (outputDir == "") { outputDir += m->hasPath(listfile); }
758 ofstream newNamesOutput;
759 string outputNamesFile;
760 map<string, ofstream*> filehandles;
762 if (Groups.size() == 0) { //you don't want to use groups
763 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
764 m->openOutputFile(outputNamesFile, newNamesOutput);
765 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
766 outputNameFiles[outputNamesFile] = processList->getLabel();
767 }else{ //you want to use groups
769 for (int i=0; i<Groups.size(); i++) {
771 filehandles[Groups[i]] = temp;
772 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
774 m->openOutputFile(outputNamesFile, *(temp));
775 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
776 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
780 //for each bin in the list vector
781 for (int i = 0; i < processList->size(); i++) {
782 if (m->control_pressed) {
784 if (Groups.size() == 0) { //you don't want to use groups
785 newNamesOutput.close();
787 for (int j=0; j<Groups.size(); j++) {
788 (*(filehandles[Groups[j]])).close();
789 delete filehandles[Groups[j]];
795 string temp = processList->get(i);
796 vector<string> namesInBin;
797 m->splitAtComma(temp, namesInBin);
799 if (Groups.size() == 0) {
800 nameRep = findRep(namesInBin);
801 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
803 map<string, vector<string> > NamesInGroup;
804 for (int j=0; j<Groups.size(); j++) { //initialize groups
805 NamesInGroup[Groups[j]].resize(0);
808 for (int j=0; j<namesInBin.size(); j++) {
809 string thisgroup = groupMap->getGroup(namesInBin[j]);
811 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
813 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
814 NamesInGroup[thisgroup].push_back(namesInBin[j]);
818 //get rep for each group in otu
819 for (int j=0; j<Groups.size(); j++) {
820 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
821 //get rep for each group
822 nameRep = findRep(NamesInGroup[Groups[j]]);
824 //output group rep and other members of this group
825 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
827 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
828 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
831 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
837 if (Groups.size() == 0) { //you don't want to use groups
838 newNamesOutput.close();
840 for (int i=0; i<Groups.size(); i++) {
841 (*(filehandles[Groups[i]])).close();
842 delete filehandles[Groups[i]];
849 catch(exception& e) {
850 m->errorOut(e, "GetOTURepCommand", "process");
854 //**********************************************************************************************************************
855 int GetOTURepCommand::processNames(string filename, string label) {
859 if (outputDir == "") { outputDir += m->hasPath(listfile); }
860 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
861 m->openOutputFile(outputFileName, out);
862 vector<repStruct> reps;
863 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
866 string tempNameFile = filename + ".temp";
867 m->openOutputFile(tempNameFile, out2);
870 m->openInputFile(filename, in);
874 string rep, binnames;
875 in >> i >> rep >> binnames; m->gobble(in);
876 out2 << rep << '\t' << binnames << endl;
878 vector<string> names;
879 m->splitAtComma(binnames, names);
880 int binsize = names.size();
882 //if you have a groupfile
884 if (groupfile != "") {
885 map<string, string> groups;
886 map<string, string>::iterator groupIt;
888 //find the groups that are in this bin
889 for (size_t i = 0; i < names.size(); i++) {
890 string groupName = groupMap->getGroup(names[i]);
891 if (groupName == "not found") {
892 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
895 groups[groupName] = groupName;
899 //turn the groups into a string
900 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
901 group += groupIt->first + "-";
904 group = group.substr(0, group.length()-1);
908 //print out name and sequence for that bin
909 string sequence = fasta->getSequence(rep);
911 if (sequence != "not found") {
912 if (sorted == "") { //print them out
913 rep = rep + "\t" + toString(i+1);
914 rep = rep + "|" + toString(binsize);
915 if (groupfile != "") {
916 rep = rep + "|" + group;
918 out << ">" << rep << endl;
919 out << sequence << endl;
921 repStruct newRep(rep, i+1, binsize, group);
922 reps.push_back(newRep);
925 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
930 if (sorted != "") { //then sort them and print them
931 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
932 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
933 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
934 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
937 for (int i = 0; i < reps.size(); i++) {
938 string sequence = fasta->getSequence(reps[i].name);
939 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
940 outputName = outputName + "|" + toString(reps[i].size);
941 if (groupfile != "") {
942 outputName = outputName + "|" + reps[i].group;
944 out << ">" << outputName << endl;
945 out << sequence << endl;
953 remove(filename.c_str());
954 rename(tempNameFile.c_str(), filename.c_str());
959 catch(exception& e) {
960 m->errorOut(e, "GetOTURepCommand", "processNames");
964 //**********************************************************************************************************************
965 SeqMap GetOTURepCommand::getMap(int row) {
969 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
970 if (rowPositions[row] != -1){
972 inRow.seekg(rowPositions[row]);
974 int rowNum, numDists, colNum;
977 inRow >> rowNum >> numDists;
979 for(int i = 0; i < numDists; i++) {
980 inRow >> colNum >> dist;
981 rowMap[colNum] = dist;
988 catch(exception& e) {
989 m->errorOut(e, "GetOTURepCommand", "getMap");
993 //**********************************************************************************************************************