5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(){
41 //initialize outputTypes
42 vector<string> tempOutNames;
43 outputTypes["fasta"] = tempOutNames;
44 outputTypes["name"] = tempOutNames;
47 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
51 //**********************************************************************************************************************
52 vector<string> GetOTURepCommand::getValidParameters(){
54 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
55 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
59 m->errorOut(e, "GetOTURepCommand", "getValidParameters");
63 //**********************************************************************************************************************
64 vector<string> GetOTURepCommand::getRequiredParameters(){
66 string Array[] = {"fasta","list"};
67 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
71 m->errorOut(e, "GetOTURepCommand", "getRequiredParameters");
75 //**********************************************************************************************************************
76 vector<string> GetOTURepCommand::getRequiredFiles(){
78 vector<string> myArray;
82 m->errorOut(e, "GetOTURepCommand", "getRequiredFiles");
86 //**********************************************************************************************************************
87 GetOTURepCommand::GetOTURepCommand(string option) {
89 globaldata = GlobalData::getInstance();
94 //allow user to run help
95 if (option == "help") {
98 //valid paramters for this command
99 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
100 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
102 OptionParser parser(option);
103 map<string, string> parameters = parser.getParameters();
105 ValidParameters validParameter;
106 map<string, string>::iterator it;
108 //check to make sure all parameters are valid for command
109 for (it = parameters.begin(); it != parameters.end(); it++) {
110 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
113 //initialize outputTypes
114 vector<string> tempOutNames;
115 outputTypes["fasta"] = tempOutNames;
116 outputTypes["name"] = tempOutNames;
118 //if the user changes the input directory command factory will send this info to us in the output parameter
119 string inputDir = validParameter.validFile(parameters, "inputdir", false);
120 if (inputDir == "not found"){ inputDir = ""; }
123 it = parameters.find("list");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["list"] = inputDir + it->second; }
131 it = parameters.find("fasta");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["fasta"] = inputDir + it->second; }
139 it = parameters.find("phylip");
140 //user has given a template file
141 if(it != parameters.end()){
142 path = m->hasPath(it->second);
143 //if the user has not given a path then, add inputdir. else leave path alone.
144 if (path == "") { parameters["phylip"] = inputDir + it->second; }
147 it = parameters.find("column");
148 //user has given a template file
149 if(it != parameters.end()){
150 path = m->hasPath(it->second);
151 //if the user has not given a path then, add inputdir. else leave path alone.
152 if (path == "") { parameters["column"] = inputDir + it->second; }
155 it = parameters.find("name");
156 //user has given a template file
157 if(it != parameters.end()){
158 path = m->hasPath(it->second);
159 //if the user has not given a path then, add inputdir. else leave path alone.
160 if (path == "") { parameters["name"] = inputDir + it->second; }
163 it = parameters.find("group");
164 //user has given a template file
165 if(it != parameters.end()){
166 path = m->hasPath(it->second);
167 //if the user has not given a path then, add inputdir. else leave path alone.
168 if (path == "") { parameters["group"] = inputDir + it->second; }
173 //if the user changes the output directory command factory will send this info to us in the output parameter
174 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
176 //check for required parameters
177 fastafile = validParameter.validFile(parameters, "fasta", true);
178 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
179 else if (fastafile == "not open") { abort = true; }
181 listfile = validParameter.validFile(parameters, "list", true);
182 if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
183 else if (listfile == "not open") { abort = true; }
185 phylipfile = validParameter.validFile(parameters, "phylip", true);
186 if (phylipfile == "not found") { phylipfile = ""; }
187 else if (phylipfile == "not open") { abort = true; }
188 else { distFile = phylipfile; format = "phylip"; }
190 columnfile = validParameter.validFile(parameters, "column", true);
191 if (columnfile == "not found") { columnfile = ""; }
192 else if (columnfile == "not open") { abort = true; }
193 else { distFile = columnfile; format = "column"; }
195 namefile = validParameter.validFile(parameters, "name", true);
196 if (namefile == "not open") { abort = true; }
197 else if (namefile == "not found") { namefile = ""; }
199 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
200 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
202 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
204 //check for optional parameter and set defaults
205 // ...at some point should added some additional type checking...
206 label = validParameter.validFile(parameters, "label", false);
207 if (label == "not found") { label = ""; allLines = 1; }
209 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
210 else { allLines = 1; }
213 groupfile = validParameter.validFile(parameters, "group", true);
214 if (groupfile == "not open") { groupfile = ""; abort = true; }
215 else if (groupfile == "not found") { groupfile = ""; }
217 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
218 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
219 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
223 if ((sorted == "group") && (groupfile == "")) {
224 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
228 groups = validParameter.validFile(parameters, "groups", false);
229 if (groups == "not found") { groups = ""; }
231 if (groupfile == "") {
232 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
235 m->splitAtDash(groups, Groups);
238 globaldata->Groups = Groups;
240 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
241 large = m->isTrue(temp);
243 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
244 convert(temp, precision);
246 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
247 convert(temp, cutoff);
248 cutoff += (5 / (precision * 10.0));
251 catch(exception& e) {
252 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
257 //**********************************************************************************************************************
259 void GetOTURepCommand::help(){
261 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n");
262 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
263 m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
264 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
265 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
266 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
267 m->mothurOut("The default value for label is all labels in your inputfile.\n");
268 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
269 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
270 m->mothurOut("The group parameter allows you provide a group file.\n");
271 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
272 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
273 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
274 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
276 catch(exception& e) {
277 m->errorOut(e, "GetOTURepCommand", "help");
282 //**********************************************************************************************************************
284 GetOTURepCommand::~GetOTURepCommand(){}
286 //**********************************************************************************************************************
288 int GetOTURepCommand::execute(){
291 if (abort == true) { return 0; }
295 //read distance files
296 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
297 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
298 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
300 readMatrix->setCutoff(cutoff);
303 nameMap = new NameAssignment(namefile);
305 }else{ nameMap = NULL; }
307 readMatrix->read(nameMap);
309 if (m->control_pressed) { delete readMatrix; return 0; }
312 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
313 globaldata->gListVector = readMatrix->getListVector();
315 SparseMatrix* matrix = readMatrix->getMatrix();
317 // Create a data structure to quickly access the distance information.
318 // It consists of a vector of distance maps, where each map contains
319 // all distances of a certain sequence. Vector and maps are accessed
320 // via the index of a sequence in the distance matrix
321 seqVec = vector<SeqMap>(globaldata->gListVector->size());
322 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
323 if (m->control_pressed) { delete readMatrix; return 0; }
324 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
331 if (m->control_pressed) { return 0; }
333 //process file and set up indexes
334 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
335 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
336 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
338 formatMatrix->setCutoff(cutoff);
341 nameMap = new NameAssignment(namefile);
343 }else{ nameMap = NULL; }
345 formatMatrix->read(nameMap);
347 if (m->control_pressed) { delete formatMatrix; return 0; }
350 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
351 globaldata->gListVector = formatMatrix->getListVector();
353 distFile = formatMatrix->getFormattedFileName();
355 //positions in file where the distances for each sequence begin
356 //rowPositions[1] = position in file where distance related to sequence 1 start.
357 rowPositions = formatMatrix->getRowPositions();
362 //openfile for getMap to use
363 m->openInputFile(distFile, inRow);
365 if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
369 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
370 if (globaldata->gListVector != NULL) {
371 vector<string> names;
373 //map names to rows in sparsematrix
374 for (int i = 0; i < globaldata->gListVector->size(); i++) {
376 binnames = globaldata->gListVector->get(i);
378 m->splitAtComma(binnames, names);
380 for (int j = 0; j < names.size(); j++) {
381 nameToIndex[names[j]] = i;
384 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
387 if (m->control_pressed) {
388 if (large) { inRow.close(); remove(distFile.c_str()); }
392 if (groupfile != "") {
393 //read in group map info.
394 groupMap = new GroupMap(groupfile);
395 int error = groupMap->readMap();
396 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
398 if (Groups.size() != 0) {
399 SharedUtil* util = new SharedUtil();
400 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
405 //set format to list so input can get listvector
406 globaldata->setFormat("list");
409 read = new ReadOTUFile(listfile);
410 read->read(&*globaldata);
412 input = globaldata->ginput;
413 list = globaldata->gListVector;
414 string lastLabel = list->getLabel();
416 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
417 set<string> processedLabels;
418 set<string> userLabels = labels;
420 if (m->control_pressed) {
421 if (large) { inRow.close(); remove(distFile.c_str()); }
422 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
425 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
427 if (allLines == 1 || labels.count(list->getLabel()) == 1){
428 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
429 error = process(list);
430 if (error == 1) { return 0; } //there is an error in hte input files, abort command
432 if (m->control_pressed) {
433 if (large) { inRow.close(); remove(distFile.c_str()); }
434 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
435 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
438 processedLabels.insert(list->getLabel());
439 userLabels.erase(list->getLabel());
442 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
443 string saveLabel = list->getLabel();
446 list = input->getListVector(lastLabel);
447 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
448 error = process(list);
449 if (error == 1) { return 0; } //there is an error in hte input files, abort command
451 if (m->control_pressed) {
452 if (large) { inRow.close(); remove(distFile.c_str()); }
453 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
454 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
457 processedLabels.insert(list->getLabel());
458 userLabels.erase(list->getLabel());
460 //restore real lastlabel to save below
461 list->setLabel(saveLabel);
464 lastLabel = list->getLabel();
467 list = input->getListVector();
470 //output error messages about any remaining user labels
471 bool needToRun = false;
472 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
473 m->mothurOut("Your file does not include the label " + (*it));
474 if (processedLabels.count(lastLabel) != 1) {
475 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
478 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
482 //run last label if you need to
483 if (needToRun == true) {
484 if (list != NULL) { delete list; }
485 list = input->getListVector(lastLabel);
486 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
487 error = process(list);
489 if (error == 1) { return 0; } //there is an error in hte input files, abort command
491 if (m->control_pressed) {
492 if (large) { inRow.close(); remove(distFile.c_str()); }
493 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
494 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
498 //close and remove formatted matrix file
501 remove(distFile.c_str());
504 globaldata->gListVector = NULL;
505 delete input; globaldata->ginput = NULL;
509 fasta = new FastaMap();
510 fasta->readFastaFile(fastafile);
512 //if user gave a namesfile then use it
513 if (namefile != "") { readNamesFile(); }
515 //output create and output the .rep.fasta files
516 map<string, string>::iterator itNameFile;
517 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
518 processNames(itNameFile->first, itNameFile->second);
522 if (groupfile != "") {
523 delete groupMap; globaldata->gGroupmap = NULL;
526 if (m->control_pressed) { return 0; }
528 m->mothurOutEndLine();
529 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
530 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
531 m->mothurOutEndLine();
535 catch(exception& e) {
536 m->errorOut(e, "GetOTURepCommand", "execute");
541 //**********************************************************************************************************************
542 void GetOTURepCommand::readNamesFile() {
544 vector<string> dupNames;
545 m->openInputFile(namefile, inNames);
547 string name, names, sequence;
550 inNames >> name; //read from first column A
551 inNames >> names; //read from second column A,B,C,D
555 //parse names into vector
556 m->splitAtComma(names, dupNames);
558 //store names in fasta map
559 sequence = fasta->getSequence(name);
560 for (int i = 0; i < dupNames.size(); i++) {
561 fasta->push_back(dupNames[i], sequence);
569 catch(exception& e) {
570 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
574 //**********************************************************************************************************************
575 string GetOTURepCommand::findRep(vector<string> names) {
577 // if only 1 sequence in bin or processing the "unique" label, then
578 // the first sequence of the OTU is the representative one
579 if ((names.size() == 2) || (names.size() == 1) || (list->getLabel() == "unique")) {
582 vector<int> seqIndex(names.size());
583 vector<float> max_dist(names.size());
584 vector<float> total_dist(names.size());
586 //fill seqIndex and initialize sums
587 for (size_t i = 0; i < names.size(); i++) {
588 seqIndex[i] = nameToIndex[names[i]];
593 // loop through all entries in seqIndex
596 for (size_t i=0; i < seqIndex.size(); i++) {
597 if (m->control_pressed) { return "control"; }
599 if (!large) { currMap = seqVec[seqIndex[i]]; }
600 else { currMap = getMap(seqIndex[i]); }
602 for (size_t j=0; j < seqIndex.size(); j++) {
603 it = currMap.find(seqIndex[j]);
604 if (it != currMap.end()) {
605 max_dist[i] = max(max_dist[i], it->second);
606 max_dist[j] = max(max_dist[j], it->second);
607 total_dist[i] += it->second;
608 total_dist[j] += it->second;
609 }else{ //if you can't find the distance make it the cutoff
610 max_dist[i] = max(max_dist[i], cutoff);
611 max_dist[j] = max(max_dist[j], cutoff);
612 total_dist[i] += cutoff;
613 total_dist[j] += cutoff;
618 // sequence with the smallest maximum distance is the representative
619 //if tie occurs pick sequence with smallest average distance
622 for (size_t i=0; i < max_dist.size(); i++) {
623 if (m->control_pressed) { return "control"; }
624 if (max_dist[i] < min) {
627 }else if (max_dist[i] == min) {
628 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
629 float newAverage = total_dist[i] / (float) total_dist.size();
631 if (newAverage < currentAverage) {
638 return(names[minIndex]);
641 catch(exception& e) {
642 m->errorOut(e, "GetOTURepCommand", "FindRep");
647 //**********************************************************************************************************************
648 int GetOTURepCommand::process(ListVector* processList) {
650 string name, sequence;
654 if (outputDir == "") { outputDir += m->hasPath(listfile); }
656 ofstream newNamesOutput;
657 string outputNamesFile;
658 map<string, ofstream*> filehandles;
660 if (Groups.size() == 0) { //you don't want to use groups
661 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
662 m->openOutputFile(outputNamesFile, newNamesOutput);
663 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
664 outputNameFiles[outputNamesFile] = processList->getLabel();
665 }else{ //you want to use groups
667 for (int i=0; i<Groups.size(); i++) {
669 filehandles[Groups[i]] = temp;
670 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
672 m->openOutputFile(outputNamesFile, *(temp));
673 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
674 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
678 //for each bin in the list vector
679 for (int i = 0; i < processList->size(); i++) {
680 if (m->control_pressed) {
682 if (Groups.size() == 0) { //you don't want to use groups
683 newNamesOutput.close();
685 for (int j=0; j<Groups.size(); j++) {
686 (*(filehandles[Groups[j]])).close();
687 delete filehandles[Groups[j]];
693 string temp = processList->get(i);
694 vector<string> namesInBin;
695 m->splitAtComma(temp, namesInBin);
697 if (Groups.size() == 0) {
698 nameRep = findRep(namesInBin);
699 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
701 map<string, vector<string> > NamesInGroup;
702 for (int j=0; j<Groups.size(); j++) { //initialize groups
703 NamesInGroup[Groups[j]].resize(0);
706 for (int j=0; j<namesInBin.size(); j++) {
707 string thisgroup = groupMap->getGroup(namesInBin[j]);
709 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
711 if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
712 NamesInGroup[thisgroup].push_back(namesInBin[j]);
716 //get rep for each group in otu
717 for (int j=0; j<Groups.size(); j++) {
718 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
719 //get rep for each group
720 nameRep = findRep(NamesInGroup[Groups[j]]);
722 //output group rep and other members of this group
723 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
725 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
726 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
729 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
735 if (Groups.size() == 0) { //you don't want to use groups
736 newNamesOutput.close();
738 for (int i=0; i<Groups.size(); i++) {
739 (*(filehandles[Groups[i]])).close();
740 delete filehandles[Groups[i]];
747 catch(exception& e) {
748 m->errorOut(e, "GetOTURepCommand", "process");
752 //**********************************************************************************************************************
753 int GetOTURepCommand::processNames(string filename, string label) {
757 if (outputDir == "") { outputDir += m->hasPath(listfile); }
758 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
759 m->openOutputFile(outputFileName, out);
760 vector<repStruct> reps;
761 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
764 string tempNameFile = filename + ".temp";
765 m->openOutputFile(tempNameFile, out2);
768 m->openInputFile(filename, in);
772 string rep, binnames;
773 in >> i >> rep >> binnames; m->gobble(in);
774 out2 << rep << '\t' << binnames << endl;
776 vector<string> names;
777 m->splitAtComma(binnames, names);
778 int binsize = names.size();
780 //if you have a groupfile
782 if (groupfile != "") {
783 map<string, string> groups;
784 map<string, string>::iterator groupIt;
786 //find the groups that are in this bin
787 for (size_t i = 0; i < names.size(); i++) {
788 string groupName = groupMap->getGroup(names[i]);
789 if (groupName == "not found") {
790 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
793 groups[groupName] = groupName;
797 //turn the groups into a string
798 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
799 group += groupIt->first + "-";
802 group = group.substr(0, group.length()-1);
806 //print out name and sequence for that bin
807 string sequence = fasta->getSequence(rep);
809 if (sequence != "not found") {
810 if (sorted == "") { //print them out
811 rep = rep + "\t" + toString(i+1);
812 rep = rep + "|" + toString(binsize);
813 if (groupfile != "") {
814 rep = rep + "|" + group;
816 out << ">" << rep << endl;
817 out << sequence << endl;
819 repStruct newRep(rep, i+1, binsize, group);
820 reps.push_back(newRep);
823 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
828 if (sorted != "") { //then sort them and print them
829 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
830 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
831 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
832 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
835 for (int i = 0; i < reps.size(); i++) {
836 string sequence = fasta->getSequence(reps[i].name);
837 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
838 outputName = outputName + "|" + toString(reps[i].size);
839 if (groupfile != "") {
840 outputName = outputName + "|" + reps[i].group;
842 out << ">" << outputName << endl;
843 out << sequence << endl;
851 remove(filename.c_str());
852 rename(tempNameFile.c_str(), filename.c_str());
857 catch(exception& e) {
858 m->errorOut(e, "GetOTURepCommand", "processNames");
862 //**********************************************************************************************************************
863 SeqMap GetOTURepCommand::getMap(int row) {
867 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
868 if (rowPositions[row] != -1){
870 inRow.seekg(rowPositions[row]);
872 int rowNum, numDists, colNum;
875 inRow >> rowNum >> numDists;
877 for(int i = 0; i < numDists; i++) {
878 inRow >> colNum >> dist;
879 rowMap[colNum] = dist;
886 catch(exception& e) {
887 m->errorOut(e, "GetOTURepCommand", "getMap");
891 //**********************************************************************************************************************