]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
some bug fixes
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option)  {
40         try{
41                 globaldata = GlobalData::getInstance();
42                 abort = false;
43                 allLines = 1;
44                 labels.clear();
45                                 
46                 //allow user to run help
47                 if (option == "help") { 
48                         help(); abort = true;
49                 } else {
50                         //valid paramters for this command
51                         string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
52                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
53                         
54                         OptionParser parser(option);
55                         map<string, string> parameters = parser.getParameters();
56                         
57                         ValidParameters validParameter;
58                         map<string, string>::iterator it;
59                 
60                         //check to make sure all parameters are valid for command
61                         for (it = parameters.begin(); it != parameters.end(); it++) { 
62                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
63                         }
64                         
65                         //if the user changes the input directory command factory will send this info to us in the output parameter 
66                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
67                         if (inputDir == "not found"){   inputDir = "";          }
68                         else {
69                                 string path;
70                                 it = parameters.find("list");
71                                 //user has given a template file
72                                 if(it != parameters.end()){ 
73                                         path = hasPath(it->second);
74                                         //if the user has not given a path then, add inputdir. else leave path alone.
75                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
76                                 }
77                                 
78                                 it = parameters.find("fasta");
79                                 //user has given a template file
80                                 if(it != parameters.end()){ 
81                                         path = hasPath(it->second);
82                                         //if the user has not given a path then, add inputdir. else leave path alone.
83                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
84                                 }
85                                 
86                                 it = parameters.find("phylip");
87                                 //user has given a template file
88                                 if(it != parameters.end()){ 
89                                         path = hasPath(it->second);
90                                         //if the user has not given a path then, add inputdir. else leave path alone.
91                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
92                                 }
93                                 
94                                 it = parameters.find("column");
95                                 //user has given a template file
96                                 if(it != parameters.end()){ 
97                                         path = hasPath(it->second);
98                                         //if the user has not given a path then, add inputdir. else leave path alone.
99                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
100                                 }
101                                 
102                                 it = parameters.find("name");
103                                 //user has given a template file
104                                 if(it != parameters.end()){ 
105                                         path = hasPath(it->second);
106                                         //if the user has not given a path then, add inputdir. else leave path alone.
107                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
108                                 }
109                                 
110                                 it = parameters.find("group");
111                                 //user has given a template file
112                                 if(it != parameters.end()){ 
113                                         path = hasPath(it->second);
114                                         //if the user has not given a path then, add inputdir. else leave path alone.
115                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
116                                 }
117                         }
118
119                         
120                         //if the user changes the output directory command factory will send this info to us in the output parameter 
121                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
122                         
123                         //check for required parameters
124                         fastafile = validParameter.validFile(parameters, "fasta", true);
125                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
126                         else if (fastafile == "not open") { abort = true; }     
127                 
128                         listfile = validParameter.validFile(parameters, "list", true);
129                         if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
130                         else if (listfile == "not open") { abort = true; }      
131                         
132                         phylipfile = validParameter.validFile(parameters, "phylip", true);
133                         if (phylipfile == "not found") { phylipfile = "";  }
134                         else if (phylipfile == "not open") { abort = true; }    
135                         else { distFile = phylipfile; format = "phylip";   }
136                         
137                         columnfile = validParameter.validFile(parameters, "column", true);
138                         if (columnfile == "not found") { columnfile = ""; }
139                         else if (columnfile == "not open") { abort = true; }    
140                         else { distFile = columnfile; format = "column";   }
141                         
142                         namefile = validParameter.validFile(parameters, "name", true);
143                         if (namefile == "not open") { abort = true; }   
144                         else if (namefile == "not found") { namefile = ""; }
145                         
146                         if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
147                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
148                 
149                         if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
150
151                         //check for optional parameter and set defaults
152                         // ...at some point should added some additional type checking...
153                         label = validParameter.validFile(parameters, "label", false);                   
154                         if (label == "not found") { label = ""; allLines = 1;  }
155                         else { 
156                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
157                                 else { allLines = 1;  }
158                         }
159                         
160                         groupfile = validParameter.validFile(parameters, "group", true);
161                         if (groupfile == "not open") { groupfile = ""; abort = true; }
162                         else if (groupfile == "not found") { groupfile = ""; }
163                                                 
164                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
165                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
166                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
167                                 sorted = "";
168                         }
169                         
170                         if ((sorted == "group") && (groupfile == "")) {
171                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
172                                 sorted = "";
173                         }
174                         
175                         groups = validParameter.validFile(parameters, "groups", false);                 
176                         if (groups == "not found") { groups = ""; }
177                         else { 
178                                 if (groupfile == "") {
179                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
180                                         abort = true;
181                                 }else { 
182                                         splitAtDash(groups, Groups);
183                                 }
184                         }
185                         globaldata->Groups = Groups;
186                         
187                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
188                         large = isTrue(temp);
189                         
190                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
191                         convert(temp, precision); 
192                         
193                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
194                         convert(temp, cutoff); 
195                         cutoff += (5 / (precision * 10.0));
196                 }
197         }
198         catch(exception& e) {
199                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
200                 exit(1);
201         }
202 }
203
204 //**********************************************************************************************************************
205
206 void GetOTURepCommand::help(){
207         try {
208                 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
209                 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
210                 m->mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
211                 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
212                 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
213                 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
214                 m->mothurOut("The default value for label is all labels in your inputfile.\n");
215                 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
216                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
217                 m->mothurOut("The group parameter allows you provide a group file.\n");
218                 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
219                 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
220                 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
221                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
222         }
223         catch(exception& e) {
224                 m->errorOut(e, "GetOTURepCommand", "help");
225                 exit(1);
226         }
227 }
228
229 //**********************************************************************************************************************
230
231 GetOTURepCommand::~GetOTURepCommand(){}
232
233 //**********************************************************************************************************************
234
235 int GetOTURepCommand::execute(){
236         try {
237         
238                 if (abort == true) { return 0; }
239                 int error;
240                 
241                 if (!large) {
242                         //read distance files
243                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
244                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
245                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
246                         
247                         readMatrix->setCutoff(cutoff);
248         
249                         if(namefile != ""){     
250                                 nameMap = new NameAssignment(namefile);
251                                 nameMap->readMap();
252                         }else{  nameMap = NULL;         }
253                         
254                         readMatrix->read(nameMap);
255                         
256                         if (m->control_pressed) { delete readMatrix; return 0; }
257
258                         //get matrix
259                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
260                         globaldata->gListVector = readMatrix->getListVector();
261
262                         SparseMatrix* matrix = readMatrix->getMatrix();
263                         
264                         // Create a data structure to quickly access the distance information.
265                         // It consists of a vector of distance maps, where each map contains
266                         // all distances of a certain sequence. Vector and maps are accessed
267                         // via the index of a sequence in the distance matrix
268                         seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
269                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
270                                 if (m->control_pressed) { delete readMatrix; return 0; }
271                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
272                         }
273                         
274                         delete matrix;
275                         delete readMatrix;
276                         delete nameMap;
277                         
278                         if (m->control_pressed) { return 0; }
279                 }else {
280                         //process file and set up indexes
281                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
282                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
283                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
284                         
285                         formatMatrix->setCutoff(cutoff);
286         
287                         if(namefile != ""){     
288                                 nameMap = new NameAssignment(namefile);
289                                 nameMap->readMap();
290                         }else{  nameMap = NULL;         }
291                         
292                         formatMatrix->read(nameMap);
293                         
294                         if (m->control_pressed) { delete formatMatrix;  return 0; }
295
296                         //get matrix
297                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
298                         globaldata->gListVector = formatMatrix->getListVector();
299                         
300                         distFile = formatMatrix->getFormattedFileName();
301                         
302                         //positions in file where the distances for each sequence begin
303                         //rowPositions[1] = position in file where distance related to sequence 1 start.
304                         rowPositions = formatMatrix->getRowPositions();
305                         
306                         delete formatMatrix;
307                         delete nameMap;
308                         
309                         //openfile for getMap to use
310                         openInputFile(distFile, inRow);
311                         
312                         if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
313                 }
314                 
315                 
316                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
317                 if (globaldata->gListVector != NULL) {
318                         vector<string> names;
319                         string binnames;
320                         //map names to rows in sparsematrix
321                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
322                                 names.clear();
323                                 binnames = globaldata->gListVector->get(i);
324                                 
325                                 splitAtComma(binnames, names);
326                                 
327                                 for (int j = 0; j < names.size(); j++) {
328                                         nameToIndex[names[j]] = i;
329                                 }
330                         }
331                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
332                 
333                                 
334                 if (m->control_pressed) { 
335                         if (large) {  inRow.close(); remove(distFile.c_str());  }
336                         return 0; 
337                 }
338                 
339                 if (groupfile != "") {
340                         //read in group map info.
341                         groupMap = new GroupMap(groupfile);
342                         int error = groupMap->readMap();
343                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
344                         
345                         if (Groups.size() != 0) {
346                                 SharedUtil* util = new SharedUtil();
347                                 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
348                                 delete util;
349                         }
350                 }
351                                                                 
352                 //set format to list so input can get listvector
353                 globaldata->setFormat("list");
354         
355                 //read list file
356                 read = new ReadOTUFile(listfile);
357                 read->read(&*globaldata); 
358                 
359                 input = globaldata->ginput;
360                 list = globaldata->gListVector;
361                 string lastLabel = list->getLabel();
362
363                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
364                 set<string> processedLabels;
365                 set<string> userLabels = labels;
366                 
367                 if (m->control_pressed) { 
368                         if (large) {  inRow.close(); remove(distFile.c_str());  }
369                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
370                 }
371         
372                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
373                         
374                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
375                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
376                                         error = process(list);
377                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
378                                         
379                                         if (m->control_pressed) { 
380                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
381                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  }
382                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
383                                         }
384                                         
385                                         processedLabels.insert(list->getLabel());
386                                         userLabels.erase(list->getLabel());
387                         }
388                         
389                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
390                                         string saveLabel = list->getLabel();
391                                         
392                                         delete list;
393                                         list = input->getListVector(lastLabel);
394                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
395                                         error = process(list);
396                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
397                                         
398                                         if (m->control_pressed) { 
399                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
400                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  }
401                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
402                                         }
403                                         
404                                         processedLabels.insert(list->getLabel());
405                                         userLabels.erase(list->getLabel());
406                                         
407                                         //restore real lastlabel to save below
408                                         list->setLabel(saveLabel);
409                         }
410                         
411                         lastLabel = list->getLabel();
412         
413                         delete list;
414                         list = input->getListVector();
415                 }
416                 
417                 //output error messages about any remaining user labels
418                 bool needToRun = false;
419                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
420                         m->mothurOut("Your file does not include the label " + (*it)); 
421                         if (processedLabels.count(lastLabel) != 1) {
422                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
423                                 needToRun = true;
424                         }else {
425                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
426                         }
427                 }
428                 
429                 //run last label if you need to
430                 if (needToRun == true)  {
431                         if (list != NULL) {     delete list;    }
432                         list = input->getListVector(lastLabel);
433                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
434                         error = process(list);
435                         delete list;
436                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
437                         
438                         if (m->control_pressed) { 
439                                         if (large) {  inRow.close(); remove(distFile.c_str());  }
440                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  }
441                                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
442                         }
443                 }
444                 
445                 //close and remove formatted matrix file
446                 if (large) {
447                         inRow.close();
448                         remove(distFile.c_str());
449                 }
450                 
451                 globaldata->gListVector = NULL;
452                 delete input;  globaldata->ginput = NULL;
453                 delete read;
454                 
455                 //read fastafile
456                 fasta = new FastaMap();
457                 fasta->readFastaFile(fastafile);
458                 
459                 //if user gave a namesfile then use it
460                 if (namefile != "") {   readNamesFile();        }
461                 
462                 //output create and output the .rep.fasta files
463                 map<string, string>::iterator itNameFile;
464                 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
465                         processNames(itNameFile->first, itNameFile->second);
466                 }
467                 
468                 delete fasta;
469                 if (groupfile != "") {
470                         delete groupMap;  globaldata->gGroupmap = NULL;
471                 }
472                 
473                 if (m->control_pressed) {  return 0; }
474                 
475                 m->mothurOutEndLine();
476                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
477                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
478                 m->mothurOutEndLine();
479                 
480                 return 0;
481         }
482         catch(exception& e) {
483                 m->errorOut(e, "GetOTURepCommand", "execute");
484                 exit(1);
485         }
486 }
487
488 //**********************************************************************************************************************
489 void GetOTURepCommand::readNamesFile() {
490         try {
491                 vector<string> dupNames;
492                 openInputFile(namefile, inNames);
493                 
494                 string name, names, sequence;
495         
496                 while(inNames){
497                         inNames >> name;                        //read from first column  A
498                         inNames >> names;               //read from second column  A,B,C,D
499                         
500                         dupNames.clear();
501                         
502                         //parse names into vector
503                         splitAtComma(names, dupNames);
504                         
505                         //store names in fasta map
506                         sequence = fasta->getSequence(name);
507                         for (int i = 0; i < dupNames.size(); i++) {
508                                 fasta->push_back(dupNames[i], sequence);
509                         }
510                 
511                         gobble(inNames);
512                 }
513                 inNames.close();
514
515         }
516         catch(exception& e) {
517                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
518                 exit(1);
519         }
520 }
521 //**********************************************************************************************************************
522 string GetOTURepCommand::findRep(vector<string> names) {
523         try{
524                 // if only 1 sequence in bin or processing the "unique" label, then 
525                 // the first sequence of the OTU is the representative one
526                 if ((names.size() == 2) || (names.size() == 1) || (list->getLabel() == "unique")) {
527                         return names[0];
528                 }else{
529                         vector<int> seqIndex(names.size());
530                         vector<float> max_dist(names.size());
531                         vector<float> total_dist(names.size());
532
533                         //fill seqIndex and initialize sums
534                         for (size_t i = 0; i < names.size(); i++) {
535                                 seqIndex[i] = nameToIndex[names[i]];
536                                 max_dist[i] = 0.0;
537                                 total_dist[i] = 0.0;
538                         }
539                         
540                         // loop through all entries in seqIndex
541                         SeqMap::iterator it;
542                         SeqMap currMap;
543                         for (size_t i=0; i < seqIndex.size(); i++) {
544                                 if (m->control_pressed) {  return  "control"; }
545                         
546                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
547                                 else            {       currMap = getMap(seqIndex[i]);  }
548                                 
549                                 for (size_t j=0; j < seqIndex.size(); j++) {
550                                         it = currMap.find(seqIndex[j]);         
551                                         if (it != currMap.end()) {
552                                                 max_dist[i] = max(max_dist[i], it->second);
553                                                 max_dist[j] = max(max_dist[j], it->second);
554                                                 total_dist[i] += it->second;
555                                                 total_dist[j] += it->second;
556                                         }else{ //if you can't find the distance make it the cutoff
557                                                 max_dist[i] = max(max_dist[i], cutoff);
558                                                 max_dist[j] = max(max_dist[j], cutoff);
559                                                 total_dist[i] += cutoff;
560                                                 total_dist[j] += cutoff;
561                                         }
562                                 }
563                         }
564                         
565                         // sequence with the smallest maximum distance is the representative
566                         //if tie occurs pick sequence with smallest average distance
567                         float min = 10000;
568                         int minIndex;
569                         for (size_t i=0; i < max_dist.size(); i++) {
570                                 if (m->control_pressed) {  return  "control"; }
571                                 if (max_dist[i] < min) {
572                                         min = max_dist[i];
573                                         minIndex = i;
574                                 }else if (max_dist[i] == min) {
575                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
576                                         float newAverage = total_dist[i] / (float) total_dist.size();
577                                         
578                                         if (newAverage < currentAverage) {
579                                                 min = max_dist[i];
580                                                 minIndex = i;
581                                         }
582                                 }
583                         }
584                         
585                         return(names[minIndex]);
586                 }
587         }
588         catch(exception& e) {
589                 m->errorOut(e, "GetOTURepCommand", "FindRep");
590                 exit(1);
591         }
592 }
593
594 //**********************************************************************************************************************
595 int GetOTURepCommand::process(ListVector* processList) {
596         try{
597                 string name, sequence;
598                 string nameRep;
599
600                 //create output file
601                 if (outputDir == "") { outputDir += hasPath(listfile); }
602                                 
603                 ofstream newNamesOutput;
604                 string outputNamesFile;
605                 map<string, ofstream*> filehandles;
606                 
607                 if (Groups.size() == 0) { //you don't want to use groups
608                         outputNamesFile  = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
609                         openOutputFile(outputNamesFile, newNamesOutput);
610                         outputNames.push_back(outputNamesFile);
611                         outputNameFiles[outputNamesFile] = processList->getLabel();
612                 }else{ //you want to use groups
613                         ofstream* temp;
614                         for (int i=0; i<Groups.size(); i++) {
615                                 temp = new ofstream;
616                                 filehandles[Groups[i]] = temp;
617                                 outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
618                                 
619                                 openOutputFile(outputNamesFile, *(temp));
620                                 outputNames.push_back(outputNamesFile);
621                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
622                         }
623                 }
624                 
625                 //for each bin in the list vector
626                 for (int i = 0; i < processList->size(); i++) {
627                         if (m->control_pressed) { 
628                                 out.close();  
629                                 if (Groups.size() == 0) { //you don't want to use groups
630                                         newNamesOutput.close();
631                                 }else{
632                                         for (int j=0; j<Groups.size(); j++) {
633                                                 (*(filehandles[Groups[j]])).close();
634                                                 delete filehandles[Groups[j]];
635                                         }
636                                 }
637                                 return 0; 
638                         }
639                         
640                         string temp = processList->get(i);
641                         vector<string> namesInBin;
642                         splitAtComma(temp, namesInBin);
643                         
644                         if (Groups.size() == 0) {
645                                 nameRep = findRep(namesInBin);
646                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
647                         }else{
648                                 map<string, vector<string> > NamesInGroup;
649                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
650                                         NamesInGroup[Groups[j]].resize(0);
651                                 }
652                                 
653                                 for (int j=0; j<namesInBin.size(); j++) {
654                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
655                                         
656                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
657                                         
658                                         if (inUsersGroups(thisgroup, Groups)) { //add this name to correct group
659                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
660                                         }
661                                 }
662                                 
663                                 //get rep for each group in otu
664                                 for (int j=0; j<Groups.size(); j++) {
665                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
666                                                 //get rep for each group
667                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
668                                                 
669                                                 //output group rep and other members of this group
670                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
671                                                 
672                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
673                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
674                                                 }
675                                                 //output last name
676                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
677                                         }
678                                 }
679                         }
680                 }
681                 
682                 if (Groups.size() == 0) { //you don't want to use groups
683                         newNamesOutput.close();
684                 }else{
685                         for (int i=0; i<Groups.size(); i++) {
686                                 (*(filehandles[Groups[i]])).close();
687                                 delete filehandles[Groups[i]];
688                         }
689                 }
690                 
691                 return 0;
692
693         }
694         catch(exception& e) {
695                 m->errorOut(e, "GetOTURepCommand", "process");
696                 exit(1);
697         }
698 }
699 //**********************************************************************************************************************
700 int GetOTURepCommand::processNames(string filename, string label) {
701         try{
702
703                 //create output file
704                 if (outputDir == "") { outputDir += hasPath(listfile); }
705                 string outputFileName = outputDir + getRootName(getSimpleName(listfile)) + label + ".rep.fasta";
706                 openOutputFile(outputFileName, out);
707                 vector<repStruct> reps;
708                 outputNames.push_back(outputFileName);
709                 
710                 ofstream out2;
711                 string tempNameFile = filename + ".temp";
712                 openOutputFile(tempNameFile, out2);
713                 
714                 ifstream in;
715                 openInputFile(filename, in);
716                 
717                 int i = 0;
718                 while (!in.eof()) {
719                         string rep, binnames;
720                         in >> i >> rep >> binnames; gobble(in);
721                         out2 << rep << '\t' << binnames << endl;
722                         
723                         vector<string> names;
724                         splitAtComma(binnames, names);
725                         int binsize = names.size();
726                         
727                         //if you have a groupfile
728                         string group = "";
729                         if (groupfile != "") {
730                                 map<string, string> groups;
731                                 map<string, string>::iterator groupIt;
732                                 
733                                 //find the groups that are in this bin
734                                 for (size_t i = 0; i < names.size(); i++) {
735                                         string groupName = groupMap->getGroup(names[i]);
736                                         if (groupName == "not found") {  
737                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
738                                                 groupError = true;
739                                         } else {
740                                                 groups[groupName] = groupName;
741                                         }
742                                 }
743                                 
744                                 //turn the groups into a string
745                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
746                                         group += groupIt->first + "-";
747                                 }
748                                 //rip off last dash
749                                 group = group.substr(0, group.length()-1);
750                         }else{ group = ""; }
751
752                         
753                         //print out name and sequence for that bin
754                         string sequence = fasta->getSequence(rep);
755
756                         if (sequence != "not found") {
757                                 if (sorted == "") { //print them out
758                                         rep = rep + "\t" + toString(i+1);
759                                         rep = rep + "|" + toString(binsize);
760                                         if (groupfile != "") {
761                                                 rep = rep + "|" + group;
762                                         }
763                                         out << ">" << rep << endl;
764                                         out << sequence << endl;
765                                 }else { //save them
766                                         repStruct newRep(rep, i+1, binsize, group);
767                                         reps.push_back(newRep);
768                                 }
769                         }else { 
770                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
771                         }
772                 }
773                 
774                         
775                 if (sorted != "") { //then sort them and print them
776                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
777                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
778                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
779                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
780                         
781                         //print them
782                         for (int i = 0; i < reps.size(); i++) {
783                                 string sequence = fasta->getSequence(reps[i].name);
784                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
785                                 outputName = outputName + "|" + toString(reps[i].size);
786                                 if (groupfile != "") {
787                                         outputName = outputName + "|" + reps[i].group;
788                                 }
789                                 out << ">" << outputName << endl;
790                                 out << sequence << endl;
791                         }
792                 }
793                         
794                 out.close();
795                 out2.close();
796                 
797                 remove(filename.c_str());
798                 rename(tempNameFile.c_str(), filename.c_str());
799                 
800                 return 0;
801
802         }
803         catch(exception& e) {
804                 m->errorOut(e, "GetOTURepCommand", "processNames");
805                 exit(1);
806         }
807 }
808 //**********************************************************************************************************************
809 SeqMap GetOTURepCommand::getMap(int row) {
810         try {
811                 SeqMap rowMap;
812                 
813                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
814                 if (rowPositions[row] != -1){
815                         //go to row in file
816                         inRow.seekg(rowPositions[row]);
817                         
818                         int rowNum, numDists, colNum;
819                         float dist;
820                         
821                         inRow >> rowNum >> numDists;
822                         
823                         for(int i = 0; i < numDists; i++) {
824                                 inRow >> colNum >> dist;
825                                 rowMap[colNum] = dist;
826                                 
827                         }
828                 }
829                 
830                 return rowMap;
831         }
832         catch(exception& e) {
833                 m->errorOut(e, "GetOTURepCommand", "getMap");
834                 exit(1);
835         }
836 }
837 //**********************************************************************************************************************
838