]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
made classifier faster
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option)  {
40         try{
41                 globaldata = GlobalData::getInstance();
42                 abort = false;
43                 allLines = 1;
44                 labels.clear();
45                                 
46                 //allow user to run help
47                 if (option == "help") { 
48                         help(); abort = true;
49                 } else {
50                         //valid paramters for this command
51                         string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","outputdir","inputdir"};
52                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
53                         
54                         OptionParser parser(option);
55                         map<string, string> parameters = parser.getParameters();
56                         
57                         ValidParameters validParameter;
58                         map<string, string>::iterator it;
59                 
60                         //check to make sure all parameters are valid for command
61                         for (it = parameters.begin(); it != parameters.end(); it++) { 
62                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
63                         }
64                         
65                         //if the user changes the input directory command factory will send this info to us in the output parameter 
66                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
67                         if (inputDir == "not found"){   inputDir = "";          }
68                         else {
69                                 string path;
70                                 it = parameters.find("list");
71                                 //user has given a template file
72                                 if(it != parameters.end()){ 
73                                         path = hasPath(it->second);
74                                         //if the user has not given a path then, add inputdir. else leave path alone.
75                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
76                                 }
77                                 
78                                 it = parameters.find("fasta");
79                                 //user has given a template file
80                                 if(it != parameters.end()){ 
81                                         path = hasPath(it->second);
82                                         //if the user has not given a path then, add inputdir. else leave path alone.
83                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
84                                 }
85                                 
86                                 it = parameters.find("phylip");
87                                 //user has given a template file
88                                 if(it != parameters.end()){ 
89                                         path = hasPath(it->second);
90                                         //if the user has not given a path then, add inputdir. else leave path alone.
91                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
92                                 }
93                                 
94                                 it = parameters.find("column");
95                                 //user has given a template file
96                                 if(it != parameters.end()){ 
97                                         path = hasPath(it->second);
98                                         //if the user has not given a path then, add inputdir. else leave path alone.
99                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
100                                 }
101                                 
102                                 it = parameters.find("name");
103                                 //user has given a template file
104                                 if(it != parameters.end()){ 
105                                         path = hasPath(it->second);
106                                         //if the user has not given a path then, add inputdir. else leave path alone.
107                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
108                                 }
109                                 
110                                 it = parameters.find("group");
111                                 //user has given a template file
112                                 if(it != parameters.end()){ 
113                                         path = hasPath(it->second);
114                                         //if the user has not given a path then, add inputdir. else leave path alone.
115                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
116                                 }
117                         }
118
119                         
120                         //if the user changes the output directory command factory will send this info to us in the output parameter 
121                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
122                         
123                         //check for required parameters
124                         fastafile = validParameter.validFile(parameters, "fasta", true);
125                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
126                         else if (fastafile == "not open") { abort = true; }     
127                 
128                         listfile = validParameter.validFile(parameters, "list", true);
129                         if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
130                         else if (listfile == "not open") { abort = true; }      
131                         
132                         phylipfile = validParameter.validFile(parameters, "phylip", true);
133                         if (phylipfile == "not found") { phylipfile = "";  }
134                         else if (phylipfile == "not open") { abort = true; }    
135                         else { distFile = phylipfile; format = "phylip";   }
136                         
137                         columnfile = validParameter.validFile(parameters, "column", true);
138                         if (columnfile == "not found") { columnfile = ""; }
139                         else if (columnfile == "not open") { abort = true; }    
140                         else { distFile = columnfile; format = "column";   }
141                         
142                         namefile = validParameter.validFile(parameters, "name", true);
143                         if (namefile == "not open") { abort = true; }   
144                         else if (namefile == "not found") { namefile = ""; }
145                         
146                         if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
147                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
148                 
149                         if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
150
151                         //check for optional parameter and set defaults
152                         // ...at some point should added some additional type checking...
153                         label = validParameter.validFile(parameters, "label", false);                   
154                         if (label == "not found") { label = ""; allLines = 1;  }
155                         else { 
156                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
157                                 else { allLines = 1;  }
158                         }
159                         
160                         groupfile = validParameter.validFile(parameters, "group", true);
161                         if (groupfile == "not open") { groupfile = ""; abort = true; }
162                         else if (groupfile == "not found") { groupfile = ""; }
163                         else {
164                                 //read in group map info.
165                                 groupMap = new GroupMap(groupfile);
166                                 int error = groupMap->readMap();
167                                 if (error == 1) { delete groupMap; abort = true; }
168                         }
169                         
170                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
171                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
172                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
173                                 sorted = "";
174                         }
175                         
176                         if ((sorted == "group") && (groupfile == "")) {
177                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
178                                 sorted = "";
179                         }
180                         
181                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
182                         large = isTrue(temp);
183                         
184                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
185                         convert(temp, precision); 
186                         
187                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
188                         convert(temp, cutoff); 
189                         cutoff += (5 / (precision * 10.0));
190                 }
191         }
192         catch(exception& e) {
193                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
194                 exit(1);
195         }
196 }
197
198 //**********************************************************************************************************************
199
200 void GetOTURepCommand::help(){
201         try {
202                 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
203                 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
204                 m->mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
205                 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
206                 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
207                 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
208                 m->mothurOut("The default value for label is all labels in your inputfile.\n");
209                 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
210                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
211                 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
212                 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
213                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
214         }
215         catch(exception& e) {
216                 m->errorOut(e, "GetOTURepCommand", "help");
217                 exit(1);
218         }
219 }
220
221 //**********************************************************************************************************************
222
223 GetOTURepCommand::~GetOTURepCommand(){}
224
225 //**********************************************************************************************************************
226
227 int GetOTURepCommand::execute(){
228         try {
229         
230                 if (abort == true) { return 0; }
231                 int error;
232                 
233                 if (!large) {
234                         //read distance files
235                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
236                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
237                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
238                         
239                         readMatrix->setCutoff(cutoff);
240         
241                         if(namefile != ""){     
242                                 nameMap = new NameAssignment(namefile);
243                                 nameMap->readMap();
244                         }else{  nameMap = NULL;         }
245                         
246                         readMatrix->read(nameMap);
247                         
248                         if (m->control_pressed) { delete readMatrix; delete groupMap; return 0; }
249
250                         //get matrix
251                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
252                         globaldata->gListVector = readMatrix->getListVector();
253
254                         SparseMatrix* matrix = readMatrix->getMatrix();
255                         
256                         // Create a data structure to quickly access the distance information.
257                         // It consists of a vector of distance maps, where each map contains
258                         // all distances of a certain sequence. Vector and maps are accessed
259                         // via the index of a sequence in the distance matrix
260                         seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
261                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
262                                 if (m->control_pressed) { delete readMatrix; delete groupMap; return 0; }
263                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
264                         }
265                         
266                         delete matrix;
267                         delete readMatrix;
268                         
269                         if (m->control_pressed) {  delete groupMap; return 0; }
270                 }else {
271                         //process file and set up indexes
272                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
273                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
274                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
275                         
276                         formatMatrix->setCutoff(cutoff);
277         
278                         if(namefile != ""){     
279                                 nameMap = new NameAssignment(namefile);
280                                 nameMap->readMap();
281                         }else{  nameMap = NULL;         }
282                         
283                         formatMatrix->read(nameMap);
284                         
285                         if (m->control_pressed) { delete formatMatrix; delete groupMap; return 0; }
286
287                         //get matrix
288                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
289                         globaldata->gListVector = formatMatrix->getListVector();
290                         
291                         distFile = formatMatrix->getFormattedFileName();
292                         
293                         //positions in file where the distances for each sequence begin
294                         //rowPositions[1] = position in file where distance related to sequence 1 start.
295                         rowPositions = formatMatrix->getRowPositions();
296                         
297                         delete formatMatrix;
298                         
299                         //openfile for getMap to use
300                         openInputFile(distFile, inRow);
301                         
302                         if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); delete groupMap; return 0; }
303                 }
304                 
305                 
306                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
307                 if (globaldata->gListVector != NULL) {
308                         vector<string> names;
309                         string binnames;
310                         //map names to rows in sparsematrix
311                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
312                                 names.clear();
313                                 binnames = globaldata->gListVector->get(i);
314                                 
315                                 splitAtComma(binnames, names);
316                                 
317                                 for (int j = 0; j < names.size(); j++) {
318                                         nameToIndex[names[j]] = i;
319                                 }
320                         }
321                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
322                 
323                 fasta = new FastaMap();
324                 
325                 //read fastafile
326                 fasta->readFastaFile(fastafile);
327                 
328                 if (m->control_pressed) { 
329                         if (large) {  inRow.close(); remove(distFile.c_str());  }
330                         delete fasta; return 0; 
331                 }
332                                 
333                 //if user gave a namesfile then use it
334                 if (namefile != "") {   readNamesFile();        }
335                 
336                 //set format to list so input can get listvector
337                 globaldata->setFormat("list");
338
339                 //read list file
340                 read = new ReadOTUFile(listfile);
341                 read->read(&*globaldata); 
342                 
343                 input = globaldata->ginput;
344                 list = globaldata->gListVector;
345                 string lastLabel = list->getLabel();
346                 
347                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
348                 set<string> processedLabels;
349                 set<string> userLabels = labels;
350                 
351                 if (m->control_pressed) { 
352                         if (large) {  inRow.close(); remove(distFile.c_str());  }
353                         delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
354                 }
355         
356                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
357                         
358                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
359                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
360                                         error = process(list);
361                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
362                                         
363                                         if (m->control_pressed) { 
364                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
365                                                 if (groupfile != "") {  delete groupMap;  globaldata->gGroupmap = NULL; }
366                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  }
367                                                 delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
368                                         }
369                                         
370                                         processedLabels.insert(list->getLabel());
371                                         userLabels.erase(list->getLabel());
372                         }
373                         
374                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
375                                         string saveLabel = list->getLabel();
376                                         
377                                         delete list;
378                                         list = input->getListVector(lastLabel);
379                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
380                                         error = process(list);
381                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
382                                         
383                                         if (m->control_pressed) { 
384                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
385                                                 if (groupfile != "") {  delete groupMap;  globaldata->gGroupmap = NULL; }
386                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  }
387                                                 delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
388                                         }
389                                         
390                                         processedLabels.insert(list->getLabel());
391                                         userLabels.erase(list->getLabel());
392                                         
393                                         //restore real lastlabel to save below
394                                         list->setLabel(saveLabel);
395                         }
396                         
397                         lastLabel = list->getLabel();
398                         
399                         delete list;
400                         list = input->getListVector();
401                 }
402                 
403                 //output error messages about any remaining user labels
404                 bool needToRun = false;
405                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
406                         m->mothurOut("Your file does not include the label " + *it); 
407                         if (processedLabels.count(list->getLabel()) != 1) {
408                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
409                                 needToRun = true;
410                         }else {
411                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
412                         }
413                 }
414                 
415                 //run last label if you need to
416                 if (needToRun == true)  {
417                         if (list != NULL) {     delete list;    }
418                         list = input->getListVector(lastLabel);
419                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
420                         error = process(list);
421                         delete list;
422                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
423                         
424                         if (m->control_pressed) { 
425                                         if (large) {  inRow.close(); remove(distFile.c_str());  }
426                                         if (groupfile != "") {  delete groupMap;  globaldata->gGroupmap = NULL; }
427                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  }
428                                         delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
429                         }
430                 }
431                 
432                 //close and remove formatted matrix file
433                 if (large) {
434                         inRow.close();
435                         remove(distFile.c_str());
436                 }
437                 
438                 globaldata->gListVector = NULL;
439                 delete input;  globaldata->ginput = NULL;
440                 delete read;
441                 delete fasta;
442                 if (groupfile != "") {
443                         delete groupMap;  globaldata->gGroupmap = NULL;
444                 }
445                 
446                 if (m->control_pressed) {  return 0; }
447                 
448                 m->mothurOutEndLine();
449                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
450                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
451                 m->mothurOutEndLine();
452                 
453                 return 0;
454         }
455         catch(exception& e) {
456                 m->errorOut(e, "GetOTURepCommand", "execute");
457                 exit(1);
458         }
459 }
460
461 //**********************************************************************************************************************
462 void GetOTURepCommand::readNamesFile() {
463         try {
464                 vector<string> dupNames;
465                 openInputFile(namefile, inNames);
466                 
467                 string name, names, sequence;
468         
469                 while(inNames){
470                         inNames >> name;                        //read from first column  A
471                         inNames >> names;               //read from second column  A,B,C,D
472                         
473                         dupNames.clear();
474                         
475                         //parse names into vector
476                         splitAtComma(names, dupNames);
477                         
478                         //store names in fasta map
479                         sequence = fasta->getSequence(name);
480                         for (int i = 0; i < dupNames.size(); i++) {
481                                 fasta->push_back(dupNames[i], sequence);
482                         }
483                 
484                         gobble(inNames);
485                 }
486                 inNames.close();
487
488         }
489         catch(exception& e) {
490                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
491                 exit(1);
492         }
493 }
494 //**********************************************************************************************************************
495 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
496         try{
497                 vector<string> names;
498                 map<string, string> groups;
499                 map<string, string>::iterator groupIt;
500
501                 //parse names into vector
502                 string binnames = thisList->get(bin);
503                 splitAtComma(binnames, names);
504                 binsize = names.size();
505
506                 //if you have a groupfile
507                 if (groupfile != "") {
508                         //find the groups that are in this bin
509                         for (size_t i = 0; i < names.size(); i++) {
510                                 string groupName = groupMap->getGroup(names[i]);
511                                 if (groupName == "not found") {  
512                                         m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
513                                         groupError = true;
514                                 } else {
515                                         groups[groupName] = groupName;
516                                 }
517                         }
518                         
519                         //turn the groups into a string
520                         for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
521                                 group += groupIt->first + "-";
522                         }
523                         //rip off last dash
524                         group = group.substr(0, group.length()-1);
525                 }else{ group = ""; }
526
527                 // if only 1 sequence in bin or processing the "unique" label, then 
528                 // the first sequence of the OTU is the representative one
529                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
530                         return names[0];
531                 }else{
532                         vector<int> seqIndex(names.size());
533                         vector<float> max_dist(names.size());
534                         vector<float> total_dist(names.size());
535
536                         //fill seqIndex and initialize sums
537                         for (size_t i = 0; i < names.size(); i++) {
538                                 seqIndex[i] = nameToIndex[names[i]];
539                                 max_dist[i] = 0.0;
540                                 total_dist[i] = 0.0;
541                         }
542                         
543                         // loop through all entries in seqIndex
544                         SeqMap::iterator it;
545                         SeqMap currMap;
546                         for (size_t i=0; i < seqIndex.size(); i++) {
547                                 if (m->control_pressed) {  return  "control"; }
548                         
549                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
550                                 else            {       currMap = getMap(seqIndex[i]);  }
551                                 
552                                 for (size_t j=0; j < seqIndex.size(); j++) {
553                                         it = currMap.find(seqIndex[j]);         
554                                         if (it != currMap.end()) {
555                                                 max_dist[i] = max(max_dist[i], it->second);
556                                                 max_dist[j] = max(max_dist[j], it->second);
557                                                 total_dist[i] += it->second;
558                                                 total_dist[j] += it->second;
559                                         }else{ //if you can't find the distance make it the cutoff
560                                                 max_dist[i] = max(max_dist[i], cutoff);
561                                                 max_dist[j] = max(max_dist[j], cutoff);
562                                                 total_dist[i] += cutoff;
563                                                 total_dist[j] += cutoff;
564                                         }
565                                 }
566                         }
567                         
568                         // sequence with the smallest maximum distance is the representative
569                         //if tie occurs pick sequence with smallest average distance
570                         float min = 10000;
571                         int minIndex;
572                         for (size_t i=0; i < max_dist.size(); i++) {
573                                 if (m->control_pressed) {  return  "control"; }
574                                 if (max_dist[i] < min) {
575                                         min = max_dist[i];
576                                         minIndex = i;
577                                 }else if (max_dist[i] == min) {
578                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
579                                         float newAverage = total_dist[i] / (float) total_dist.size();
580                                         
581                                         if (newAverage < currentAverage) {
582                                                 min = max_dist[i];
583                                                 minIndex = i;
584                                         }
585                                 }
586                         }
587
588                         return(names[minIndex]);
589                 }
590         }
591         catch(exception& e) {
592                 m->errorOut(e, "GetOTURepCommand", "FindRep");
593                 exit(1);
594         }
595 }
596
597 //**********************************************************************************************************************
598 int GetOTURepCommand::process(ListVector* processList) {
599         try{
600                 string nameRep, name, sequence;
601
602                 //create output file
603                 if (outputDir == "") { outputDir += hasPath(listfile); }
604                 string outputFileName = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.fasta";
605                 openOutputFile(outputFileName, out);
606                 vector<repStruct> reps;
607                 outputNames.push_back(outputFileName);
608                 
609                 ofstream newNamesOutput;
610                 string outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
611                 openOutputFile(outputNamesFile, newNamesOutput);
612                 outputNames.push_back(outputNamesFile);
613                 
614                 //for each bin in the list vector
615                 for (int i = 0; i < processList->size(); i++) {
616                         string groups;
617                         int binsize;
618                         
619                         if (m->control_pressed) { out.close();  newNamesOutput.close(); return 0; }
620                         
621                         nameRep = findRep(i, groups, processList, binsize);
622                         
623                         if (m->control_pressed) { out.close();  newNamesOutput.close(); return 0; }
624                         
625                         //output to new names file
626                         newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
627
628                         //print out name and sequence for that bin
629                         sequence = fasta->getSequence(nameRep);
630
631                         if (sequence != "not found") {
632                                 if (sorted == "") { //print them out
633                                         nameRep = nameRep + "|" + toString(i+1);
634                                         nameRep = nameRep + "|" + toString(binsize);
635                                         if (groupfile != "") {
636                                                 nameRep = nameRep + "|" + groups;
637                                         }
638                                         out << ">" << nameRep << endl;
639                                         out << sequence << endl;
640                                 }else { //save them
641                                         repStruct newRep(nameRep, i+1, binsize, groups);
642                                         reps.push_back(newRep);
643                                 }
644                         }else { 
645                                 m->mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine(); 
646                                 remove(outputFileName.c_str());
647                                 remove(outputNamesFile.c_str());
648                                 return 1;
649                         }
650                 }
651                 
652                 if (sorted != "") { //then sort them and print them
653                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
654                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
655                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
656                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
657                         
658                         //print them
659                         for (int i = 0; i < reps.size(); i++) {
660                                 string sequence = fasta->getSequence(reps[i].name);
661                                 string outputName = reps[i].name + "|" + toString(reps[i].bin);
662                                 outputName = outputName + "|" + toString(reps[i].size);
663                                 if (groupfile != "") {
664                                         outputName = outputName + "|" + reps[i].group;
665                                 }
666                                 out << ">" << outputName << endl;
667                                 out << sequence << endl;
668                         }
669                 }
670
671                 out.close();
672                 newNamesOutput.close();
673                 return 0;
674
675         }
676         catch(exception& e) {
677                 m->errorOut(e, "GetOTURepCommand", "process");
678                 exit(1);
679         }
680 }
681
682 //**********************************************************************************************************************
683 SeqMap GetOTURepCommand::getMap(int row) {
684         try {
685                 SeqMap rowMap;
686                 
687                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
688                 if (rowPositions[row] != -1){
689                         //go to row in file
690                         inRow.seekg(rowPositions[row]);
691                         
692                         int rowNum, numDists, colNum;
693                         float dist;
694                         
695                         inRow >> rowNum >> numDists;
696                         
697                         for(int i = 0; i < numDists; i++) {
698                                 inRow >> colNum >> dist;
699                                 rowMap[colNum] = dist;
700                                 
701                         }
702                 }
703                 
704                 return rowMap;
705         }
706         catch(exception& e) {
707                 m->errorOut(e, "GetOTURepCommand", "getMap");
708                 exit(1);
709         }
710 }
711 //**********************************************************************************************************************
712