5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(){
42 //initialize outputTypes
43 vector<string> tempOutNames;
44 outputTypes["fasta"] = tempOutNames;
45 outputTypes["name"] = tempOutNames;
48 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
52 //**********************************************************************************************************************
53 vector<string> GetOTURepCommand::getValidParameters(){
55 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
56 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
60 m->errorOut(e, "GetOTURepCommand", "getValidParameters");
64 //**********************************************************************************************************************
65 vector<string> GetOTURepCommand::getRequiredParameters(){
67 string Array[] = {"fasta","list"};
68 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
72 m->errorOut(e, "GetOTURepCommand", "getRequiredParameters");
76 //**********************************************************************************************************************
77 vector<string> GetOTURepCommand::getRequiredFiles(){
79 vector<string> myArray;
83 m->errorOut(e, "GetOTURepCommand", "getRequiredFiles");
87 //**********************************************************************************************************************
88 GetOTURepCommand::GetOTURepCommand(string option) {
90 globaldata = GlobalData::getInstance();
95 //allow user to run help
96 if (option == "help") {
99 //valid paramters for this command
100 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
101 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
103 OptionParser parser(option);
104 map<string, string> parameters = parser.getParameters();
106 ValidParameters validParameter;
107 map<string, string>::iterator it;
109 //check to make sure all parameters are valid for command
110 for (it = parameters.begin(); it != parameters.end(); it++) {
111 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
114 //initialize outputTypes
115 vector<string> tempOutNames;
116 outputTypes["fasta"] = tempOutNames;
117 outputTypes["name"] = tempOutNames;
119 //if the user changes the input directory command factory will send this info to us in the output parameter
120 string inputDir = validParameter.validFile(parameters, "inputdir", false);
121 if (inputDir == "not found"){ inputDir = ""; }
124 it = parameters.find("list");
125 //user has given a template file
126 if(it != parameters.end()){
127 path = m->hasPath(it->second);
128 //if the user has not given a path then, add inputdir. else leave path alone.
129 if (path == "") { parameters["list"] = inputDir + it->second; }
132 it = parameters.find("fasta");
133 //user has given a template file
134 if(it != parameters.end()){
135 path = m->hasPath(it->second);
136 //if the user has not given a path then, add inputdir. else leave path alone.
137 if (path == "") { parameters["fasta"] = inputDir + it->second; }
140 it = parameters.find("phylip");
141 //user has given a template file
142 if(it != parameters.end()){
143 path = m->hasPath(it->second);
144 //if the user has not given a path then, add inputdir. else leave path alone.
145 if (path == "") { parameters["phylip"] = inputDir + it->second; }
148 it = parameters.find("column");
149 //user has given a template file
150 if(it != parameters.end()){
151 path = m->hasPath(it->second);
152 //if the user has not given a path then, add inputdir. else leave path alone.
153 if (path == "") { parameters["column"] = inputDir + it->second; }
156 it = parameters.find("name");
157 //user has given a template file
158 if(it != parameters.end()){
159 path = m->hasPath(it->second);
160 //if the user has not given a path then, add inputdir. else leave path alone.
161 if (path == "") { parameters["name"] = inputDir + it->second; }
164 it = parameters.find("group");
165 //user has given a template file
166 if(it != parameters.end()){
167 path = m->hasPath(it->second);
168 //if the user has not given a path then, add inputdir. else leave path alone.
169 if (path == "") { parameters["group"] = inputDir + it->second; }
174 //if the user changes the output directory command factory will send this info to us in the output parameter
175 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
177 //check for required parameters
178 fastafile = validParameter.validFile(parameters, "fasta", true);
179 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
180 else if (fastafile == "not open") { abort = true; }
182 listfile = validParameter.validFile(parameters, "list", true);
183 if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
184 else if (listfile == "not open") { abort = true; }
186 phylipfile = validParameter.validFile(parameters, "phylip", true);
187 if (phylipfile == "not found") { phylipfile = ""; }
188 else if (phylipfile == "not open") { abort = true; }
189 else { distFile = phylipfile; format = "phylip"; }
191 columnfile = validParameter.validFile(parameters, "column", true);
192 if (columnfile == "not found") { columnfile = ""; }
193 else if (columnfile == "not open") { abort = true; }
194 else { distFile = columnfile; format = "column"; }
196 namefile = validParameter.validFile(parameters, "name", true);
197 if (namefile == "not open") { abort = true; }
198 else if (namefile == "not found") { namefile = ""; }
200 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
201 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
203 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
205 //check for optional parameter and set defaults
206 // ...at some point should added some additional type checking...
207 label = validParameter.validFile(parameters, "label", false);
208 if (label == "not found") { label = ""; allLines = 1; }
210 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
211 else { allLines = 1; }
214 groupfile = validParameter.validFile(parameters, "group", true);
215 if (groupfile == "not open") { groupfile = ""; abort = true; }
216 else if (groupfile == "not found") { groupfile = ""; }
218 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
219 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
220 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
224 if ((sorted == "group") && (groupfile == "")) {
225 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
229 groups = validParameter.validFile(parameters, "groups", false);
230 if (groups == "not found") { groups = ""; }
232 if (groupfile == "") {
233 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
236 m->splitAtDash(groups, Groups);
239 globaldata->Groups = Groups;
241 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
242 large = m->isTrue(temp);
244 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
245 convert(temp, precision);
247 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
248 convert(temp, cutoff);
249 cutoff += (5 / (precision * 10.0));
252 catch(exception& e) {
253 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
258 //**********************************************************************************************************************
260 void GetOTURepCommand::help(){
262 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n");
263 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
264 m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
265 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
266 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
267 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
268 m->mothurOut("The default value for label is all labels in your inputfile.\n");
269 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
270 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
271 m->mothurOut("The group parameter allows you provide a group file.\n");
272 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
273 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
274 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
275 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
277 catch(exception& e) {
278 m->errorOut(e, "GetOTURepCommand", "help");
283 //**********************************************************************************************************************
285 GetOTURepCommand::~GetOTURepCommand(){}
287 //**********************************************************************************************************************
289 int GetOTURepCommand::execute(){
292 if (abort == true) { return 0; }
296 //read distance files
297 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
298 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
299 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
301 readMatrix->setCutoff(cutoff);
304 nameMap = new NameAssignment(namefile);
306 }else{ nameMap = NULL; }
308 readMatrix->read(nameMap);
310 if (m->control_pressed) { delete readMatrix; return 0; }
313 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
314 globaldata->gListVector = readMatrix->getListVector();
316 SparseMatrix* matrix = readMatrix->getMatrix();
318 // Create a data structure to quickly access the distance information.
319 // It consists of a vector of distance maps, where each map contains
320 // all distances of a certain sequence. Vector and maps are accessed
321 // via the index of a sequence in the distance matrix
322 seqVec = vector<SeqMap>(globaldata->gListVector->size());
323 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
324 if (m->control_pressed) { delete readMatrix; return 0; }
325 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
332 if (m->control_pressed) { return 0; }
334 //process file and set up indexes
335 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
336 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
337 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
339 formatMatrix->setCutoff(cutoff);
342 nameMap = new NameAssignment(namefile);
344 }else{ nameMap = NULL; }
346 formatMatrix->read(nameMap);
348 if (m->control_pressed) { delete formatMatrix; return 0; }
351 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
352 globaldata->gListVector = formatMatrix->getListVector();
354 distFile = formatMatrix->getFormattedFileName();
356 //positions in file where the distances for each sequence begin
357 //rowPositions[1] = position in file where distance related to sequence 1 start.
358 rowPositions = formatMatrix->getRowPositions();
363 //openfile for getMap to use
364 m->openInputFile(distFile, inRow);
366 if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
370 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
371 if (globaldata->gListVector != NULL) {
372 vector<string> names;
374 //map names to rows in sparsematrix
375 for (int i = 0; i < globaldata->gListVector->size(); i++) {
377 binnames = globaldata->gListVector->get(i);
379 m->splitAtComma(binnames, names);
381 for (int j = 0; j < names.size(); j++) {
382 nameToIndex[names[j]] = i;
385 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
388 if (m->control_pressed) {
389 if (large) { inRow.close(); remove(distFile.c_str()); }
393 if (groupfile != "") {
394 //read in group map info.
395 groupMap = new GroupMap(groupfile);
396 int error = groupMap->readMap();
397 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
399 if (Groups.size() != 0) {
400 SharedUtil* util = new SharedUtil();
401 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
406 //set format to list so input can get listvector
407 globaldata->setFormat("list");
410 read = new ReadOTUFile(listfile);
411 read->read(&*globaldata);
413 input = globaldata->ginput;
414 list = globaldata->gListVector;
415 string lastLabel = list->getLabel();
417 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
418 set<string> processedLabels;
419 set<string> userLabels = labels;
421 if (m->control_pressed) {
422 if (large) { inRow.close(); remove(distFile.c_str()); }
423 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
426 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
428 if (allLines == 1 || labels.count(list->getLabel()) == 1){
429 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
430 error = process(list);
431 if (error == 1) { return 0; } //there is an error in hte input files, abort command
433 if (m->control_pressed) {
434 if (large) { inRow.close(); remove(distFile.c_str()); }
435 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
436 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
439 processedLabels.insert(list->getLabel());
440 userLabels.erase(list->getLabel());
443 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
444 string saveLabel = list->getLabel();
447 list = input->getListVector(lastLabel);
448 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
449 error = process(list);
450 if (error == 1) { return 0; } //there is an error in hte input files, abort command
452 if (m->control_pressed) {
453 if (large) { inRow.close(); remove(distFile.c_str()); }
454 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
455 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
458 processedLabels.insert(list->getLabel());
459 userLabels.erase(list->getLabel());
461 //restore real lastlabel to save below
462 list->setLabel(saveLabel);
465 lastLabel = list->getLabel();
468 list = input->getListVector();
471 //output error messages about any remaining user labels
472 bool needToRun = false;
473 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
474 m->mothurOut("Your file does not include the label " + (*it));
475 if (processedLabels.count(lastLabel) != 1) {
476 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
479 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
483 //run last label if you need to
484 if (needToRun == true) {
485 if (list != NULL) { delete list; }
486 list = input->getListVector(lastLabel);
487 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
488 error = process(list);
490 if (error == 1) { return 0; } //there is an error in hte input files, abort command
492 if (m->control_pressed) {
493 if (large) { inRow.close(); remove(distFile.c_str()); }
494 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
495 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
499 //close and remove formatted matrix file
502 remove(distFile.c_str());
505 globaldata->gListVector = NULL;
506 delete input; globaldata->ginput = NULL;
510 fasta = new FastaMap();
511 fasta->readFastaFile(fastafile);
513 //if user gave a namesfile then use it
514 if (namefile != "") { readNamesFile(); }
516 //output create and output the .rep.fasta files
517 map<string, string>::iterator itNameFile;
518 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
519 processNames(itNameFile->first, itNameFile->second);
523 if (groupfile != "") {
524 delete groupMap; globaldata->gGroupmap = NULL;
527 if (m->control_pressed) { return 0; }
529 m->mothurOutEndLine();
530 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
531 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
532 m->mothurOutEndLine();
536 catch(exception& e) {
537 m->errorOut(e, "GetOTURepCommand", "execute");
542 //**********************************************************************************************************************
543 void GetOTURepCommand::readNamesFile() {
545 vector<string> dupNames;
546 m->openInputFile(namefile, inNames);
548 string name, names, sequence;
551 inNames >> name; //read from first column A
552 inNames >> names; //read from second column A,B,C,D
556 //parse names into vector
557 m->splitAtComma(names, dupNames);
559 //store names in fasta map
560 sequence = fasta->getSequence(name);
561 for (int i = 0; i < dupNames.size(); i++) {
562 fasta->push_back(dupNames[i], sequence);
570 catch(exception& e) {
571 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
575 //**********************************************************************************************************************
576 string GetOTURepCommand::findRep(vector<string> names) {
578 // if only 1 sequence in bin or processing the "unique" label, then
579 // the first sequence of the OTU is the representative one
580 if ((names.size() == 2) || (names.size() == 1) || (list->getLabel() == "unique")) {
583 vector<int> seqIndex(names.size());
584 vector<float> max_dist(names.size());
585 vector<float> total_dist(names.size());
587 //fill seqIndex and initialize sums
588 for (size_t i = 0; i < names.size(); i++) {
589 seqIndex[i] = nameToIndex[names[i]];
594 // loop through all entries in seqIndex
597 for (size_t i=0; i < seqIndex.size(); i++) {
598 if (m->control_pressed) { return "control"; }
600 if (!large) { currMap = seqVec[seqIndex[i]]; }
601 else { currMap = getMap(seqIndex[i]); }
603 for (size_t j=0; j < seqIndex.size(); j++) {
604 it = currMap.find(seqIndex[j]);
605 if (it != currMap.end()) {
606 max_dist[i] = max(max_dist[i], it->second);
607 max_dist[j] = max(max_dist[j], it->second);
608 total_dist[i] += it->second;
609 total_dist[j] += it->second;
610 }else{ //if you can't find the distance make it the cutoff
611 max_dist[i] = max(max_dist[i], cutoff);
612 max_dist[j] = max(max_dist[j], cutoff);
613 total_dist[i] += cutoff;
614 total_dist[j] += cutoff;
619 // sequence with the smallest maximum distance is the representative
620 //if tie occurs pick sequence with smallest average distance
623 for (size_t i=0; i < max_dist.size(); i++) {
624 if (m->control_pressed) { return "control"; }
625 if (max_dist[i] < min) {
628 }else if (max_dist[i] == min) {
629 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
630 float newAverage = total_dist[i] / (float) total_dist.size();
632 if (newAverage < currentAverage) {
639 return(names[minIndex]);
642 catch(exception& e) {
643 m->errorOut(e, "GetOTURepCommand", "FindRep");
648 //**********************************************************************************************************************
649 int GetOTURepCommand::process(ListVector* processList) {
651 string name, sequence;
655 if (outputDir == "") { outputDir += m->hasPath(listfile); }
657 ofstream newNamesOutput;
658 string outputNamesFile;
659 map<string, ofstream*> filehandles;
661 if (Groups.size() == 0) { //you don't want to use groups
662 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
663 m->openOutputFile(outputNamesFile, newNamesOutput);
664 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
665 outputNameFiles[outputNamesFile] = processList->getLabel();
666 }else{ //you want to use groups
668 for (int i=0; i<Groups.size(); i++) {
670 filehandles[Groups[i]] = temp;
671 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
673 m->openOutputFile(outputNamesFile, *(temp));
674 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
675 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
679 //for each bin in the list vector
680 for (int i = 0; i < processList->size(); i++) {
681 if (m->control_pressed) {
683 if (Groups.size() == 0) { //you don't want to use groups
684 newNamesOutput.close();
686 for (int j=0; j<Groups.size(); j++) {
687 (*(filehandles[Groups[j]])).close();
688 delete filehandles[Groups[j]];
694 string temp = processList->get(i);
695 vector<string> namesInBin;
696 m->splitAtComma(temp, namesInBin);
698 if (Groups.size() == 0) {
699 nameRep = findRep(namesInBin);
700 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
702 map<string, vector<string> > NamesInGroup;
703 for (int j=0; j<Groups.size(); j++) { //initialize groups
704 NamesInGroup[Groups[j]].resize(0);
707 for (int j=0; j<namesInBin.size(); j++) {
708 string thisgroup = groupMap->getGroup(namesInBin[j]);
710 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
712 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
713 NamesInGroup[thisgroup].push_back(namesInBin[j]);
717 //get rep for each group in otu
718 for (int j=0; j<Groups.size(); j++) {
719 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
720 //get rep for each group
721 nameRep = findRep(NamesInGroup[Groups[j]]);
723 //output group rep and other members of this group
724 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
726 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
727 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
730 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
736 if (Groups.size() == 0) { //you don't want to use groups
737 newNamesOutput.close();
739 for (int i=0; i<Groups.size(); i++) {
740 (*(filehandles[Groups[i]])).close();
741 delete filehandles[Groups[i]];
748 catch(exception& e) {
749 m->errorOut(e, "GetOTURepCommand", "process");
753 //**********************************************************************************************************************
754 int GetOTURepCommand::processNames(string filename, string label) {
758 if (outputDir == "") { outputDir += m->hasPath(listfile); }
759 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
760 m->openOutputFile(outputFileName, out);
761 vector<repStruct> reps;
762 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
765 string tempNameFile = filename + ".temp";
766 m->openOutputFile(tempNameFile, out2);
769 m->openInputFile(filename, in);
773 string rep, binnames;
774 in >> i >> rep >> binnames; m->gobble(in);
775 out2 << rep << '\t' << binnames << endl;
777 vector<string> names;
778 m->splitAtComma(binnames, names);
779 int binsize = names.size();
781 //if you have a groupfile
783 if (groupfile != "") {
784 map<string, string> groups;
785 map<string, string>::iterator groupIt;
787 //find the groups that are in this bin
788 for (size_t i = 0; i < names.size(); i++) {
789 string groupName = groupMap->getGroup(names[i]);
790 if (groupName == "not found") {
791 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
794 groups[groupName] = groupName;
798 //turn the groups into a string
799 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
800 group += groupIt->first + "-";
803 group = group.substr(0, group.length()-1);
807 //print out name and sequence for that bin
808 string sequence = fasta->getSequence(rep);
810 if (sequence != "not found") {
811 if (sorted == "") { //print them out
812 rep = rep + "\t" + toString(i+1);
813 rep = rep + "|" + toString(binsize);
814 if (groupfile != "") {
815 rep = rep + "|" + group;
817 out << ">" << rep << endl;
818 out << sequence << endl;
820 repStruct newRep(rep, i+1, binsize, group);
821 reps.push_back(newRep);
824 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
829 if (sorted != "") { //then sort them and print them
830 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
831 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
832 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
833 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
836 for (int i = 0; i < reps.size(); i++) {
837 string sequence = fasta->getSequence(reps[i].name);
838 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
839 outputName = outputName + "|" + toString(reps[i].size);
840 if (groupfile != "") {
841 outputName = outputName + "|" + reps[i].group;
843 out << ">" << outputName << endl;
844 out << sequence << endl;
852 remove(filename.c_str());
853 rename(tempNameFile.c_str(), filename.c_str());
858 catch(exception& e) {
859 m->errorOut(e, "GetOTURepCommand", "processNames");
863 //**********************************************************************************************************************
864 SeqMap GetOTURepCommand::getMap(int row) {
868 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
869 if (rowPositions[row] != -1){
871 inRow.seekg(rowPositions[row]);
873 int rowNum, numDists, colNum;
876 inRow >> rowNum >> numDists;
878 for(int i = 0; i < numDists; i++) {
879 inRow >> colNum >> dist;
880 rowMap[colNum] = dist;
887 catch(exception& e) {
888 m->errorOut(e, "GetOTURepCommand", "getMap");
892 //**********************************************************************************************************************