]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
rechecking in
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option)  {
40         try{
41                 globaldata = GlobalData::getInstance();
42                 abort = false;
43                 allLines = 1;
44                 labels.clear();
45                                 
46                 //allow user to run help
47                 if (option == "help") { 
48                         help(); abort = true;
49                 } else {
50                         //valid paramters for this command
51                         string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
52                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
53                         
54                         OptionParser parser(option);
55                         map<string, string> parameters = parser.getParameters();
56                         
57                         ValidParameters validParameter;
58                         map<string, string>::iterator it;
59                 
60                         //check to make sure all parameters are valid for command
61                         for (it = parameters.begin(); it != parameters.end(); it++) { 
62                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
63                         }
64                         
65                         //if the user changes the input directory command factory will send this info to us in the output parameter 
66                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
67                         if (inputDir == "not found"){   inputDir = "";          }
68                         else {
69                                 string path;
70                                 it = parameters.find("list");
71                                 //user has given a template file
72                                 if(it != parameters.end()){ 
73                                         path = hasPath(it->second);
74                                         //if the user has not given a path then, add inputdir. else leave path alone.
75                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
76                                 }
77                                 
78                                 it = parameters.find("fasta");
79                                 //user has given a template file
80                                 if(it != parameters.end()){ 
81                                         path = hasPath(it->second);
82                                         //if the user has not given a path then, add inputdir. else leave path alone.
83                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
84                                 }
85                                 
86                                 it = parameters.find("phylip");
87                                 //user has given a template file
88                                 if(it != parameters.end()){ 
89                                         path = hasPath(it->second);
90                                         //if the user has not given a path then, add inputdir. else leave path alone.
91                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
92                                 }
93                                 
94                                 it = parameters.find("column");
95                                 //user has given a template file
96                                 if(it != parameters.end()){ 
97                                         path = hasPath(it->second);
98                                         //if the user has not given a path then, add inputdir. else leave path alone.
99                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
100                                 }
101                                 
102                                 it = parameters.find("name");
103                                 //user has given a template file
104                                 if(it != parameters.end()){ 
105                                         path = hasPath(it->second);
106                                         //if the user has not given a path then, add inputdir. else leave path alone.
107                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
108                                 }
109                                 
110                                 it = parameters.find("group");
111                                 //user has given a template file
112                                 if(it != parameters.end()){ 
113                                         path = hasPath(it->second);
114                                         //if the user has not given a path then, add inputdir. else leave path alone.
115                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
116                                 }
117                         }
118
119                         
120                         //if the user changes the output directory command factory will send this info to us in the output parameter 
121                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
122                         
123                         //check for required parameters
124                         fastafile = validParameter.validFile(parameters, "fasta", true);
125                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
126                         else if (fastafile == "not open") { abort = true; }     
127                 
128                         listfile = validParameter.validFile(parameters, "list", true);
129                         if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
130                         else if (listfile == "not open") { abort = true; }      
131                         
132                         phylipfile = validParameter.validFile(parameters, "phylip", true);
133                         if (phylipfile == "not found") { phylipfile = "";  }
134                         else if (phylipfile == "not open") { abort = true; }    
135                         else { distFile = phylipfile; format = "phylip";   }
136                         
137                         columnfile = validParameter.validFile(parameters, "column", true);
138                         if (columnfile == "not found") { columnfile = ""; }
139                         else if (columnfile == "not open") { abort = true; }    
140                         else { distFile = columnfile; format = "column";   }
141                         
142                         namefile = validParameter.validFile(parameters, "name", true);
143                         if (namefile == "not open") { abort = true; }   
144                         else if (namefile == "not found") { namefile = ""; }
145                         
146                         if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
147                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
148                 
149                         if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
150
151                         //check for optional parameter and set defaults
152                         // ...at some point should added some additional type checking...
153                         label = validParameter.validFile(parameters, "label", false);                   
154                         if (label == "not found") { label = ""; allLines = 1;  }
155                         else { 
156                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
157                                 else { allLines = 1;  }
158                         }
159                         
160                         groupfile = validParameter.validFile(parameters, "group", true);
161                         if (groupfile == "not open") { groupfile = ""; abort = true; }
162                         else if (groupfile == "not found") { groupfile = ""; }
163                                                 
164                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
165                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
166                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
167                                 sorted = "";
168                         }
169                         
170                         if ((sorted == "group") && (groupfile == "")) {
171                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
172                                 sorted = "";
173                         }
174                         
175                         groups = validParameter.validFile(parameters, "groups", false);                 
176                         if (groups == "not found") { groups = ""; }
177                         else { 
178                                 if (groupfile == "") {
179                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
180                                         abort = true;
181                                 }else { 
182                                         splitAtDash(groups, Groups);
183                                 }
184                         }
185                         globaldata->Groups = Groups;
186                         
187                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
188                         large = isTrue(temp);
189                         
190                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
191                         convert(temp, precision); 
192                         
193                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
194                         convert(temp, cutoff); 
195                         cutoff += (5 / (precision * 10.0));
196                 }
197         }
198         catch(exception& e) {
199                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
200                 exit(1);
201         }
202 }
203
204 //**********************************************************************************************************************
205
206 void GetOTURepCommand::help(){
207         try {
208                 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
209                 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
210                 m->mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
211                 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
212                 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
213                 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
214                 m->mothurOut("The default value for label is all labels in your inputfile.\n");
215                 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
216                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
217                 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
218                 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
219                 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
220                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
221         }
222         catch(exception& e) {
223                 m->errorOut(e, "GetOTURepCommand", "help");
224                 exit(1);
225         }
226 }
227
228 //**********************************************************************************************************************
229
230 GetOTURepCommand::~GetOTURepCommand(){}
231
232 //**********************************************************************************************************************
233
234 int GetOTURepCommand::execute(){
235         try {
236         
237                 if (abort == true) { return 0; }
238                 int error;
239                 
240                 if (!large) {
241                         //read distance files
242                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
243                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
244                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
245                         
246                         readMatrix->setCutoff(cutoff);
247         
248                         if(namefile != ""){     
249                                 nameMap = new NameAssignment(namefile);
250                                 nameMap->readMap();
251                         }else{  nameMap = NULL;         }
252                         
253                         readMatrix->read(nameMap);
254                         
255                         if (m->control_pressed) { delete readMatrix; return 0; }
256
257                         //get matrix
258                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
259                         globaldata->gListVector = readMatrix->getListVector();
260
261                         SparseMatrix* matrix = readMatrix->getMatrix();
262                         
263                         // Create a data structure to quickly access the distance information.
264                         // It consists of a vector of distance maps, where each map contains
265                         // all distances of a certain sequence. Vector and maps are accessed
266                         // via the index of a sequence in the distance matrix
267                         seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
268                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
269                                 if (m->control_pressed) { delete readMatrix; return 0; }
270                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
271                         }
272                         
273                         delete matrix;
274                         delete readMatrix;
275                         delete nameMap;
276                         
277                         if (m->control_pressed) { return 0; }
278                 }else {
279                         //process file and set up indexes
280                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
281                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
282                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
283                         
284                         formatMatrix->setCutoff(cutoff);
285         
286                         if(namefile != ""){     
287                                 nameMap = new NameAssignment(namefile);
288                                 nameMap->readMap();
289                         }else{  nameMap = NULL;         }
290                         
291                         formatMatrix->read(nameMap);
292                         
293                         if (m->control_pressed) { delete formatMatrix;  return 0; }
294
295                         //get matrix
296                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
297                         globaldata->gListVector = formatMatrix->getListVector();
298                         
299                         distFile = formatMatrix->getFormattedFileName();
300                         
301                         //positions in file where the distances for each sequence begin
302                         //rowPositions[1] = position in file where distance related to sequence 1 start.
303                         rowPositions = formatMatrix->getRowPositions();
304                         
305                         delete formatMatrix;
306                         delete nameMap;
307                         
308                         //openfile for getMap to use
309                         openInputFile(distFile, inRow);
310                         
311                         if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
312                 }
313                 
314                 
315                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
316                 if (globaldata->gListVector != NULL) {
317                         vector<string> names;
318                         string binnames;
319                         //map names to rows in sparsematrix
320                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
321                                 names.clear();
322                                 binnames = globaldata->gListVector->get(i);
323                                 
324                                 splitAtComma(binnames, names);
325                                 
326                                 for (int j = 0; j < names.size(); j++) {
327                                         nameToIndex[names[j]] = i;
328                                 }
329                         }
330                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
331                 
332                                 
333                 if (m->control_pressed) { 
334                         if (large) {  inRow.close(); remove(distFile.c_str());  }
335                         return 0; 
336                 }
337                 
338                 if (groupfile != "") {
339                         //read in group map info.
340                         groupMap = new GroupMap(groupfile);
341                         int error = groupMap->readMap();
342                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
343                         
344                         if (Groups.size() != 0) {
345                                 SharedUtil* util = new SharedUtil();
346                                 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
347                                 delete util;
348                         }
349                 }
350                                                                 
351                 //set format to list so input can get listvector
352                 globaldata->setFormat("list");
353         
354                 //read list file
355                 read = new ReadOTUFile(listfile);
356                 read->read(&*globaldata); 
357                 
358                 input = globaldata->ginput;
359                 list = globaldata->gListVector;
360                 string lastLabel = list->getLabel();
361
362                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
363                 set<string> processedLabels;
364                 set<string> userLabels = labels;
365                 
366                 if (m->control_pressed) { 
367                         if (large) {  inRow.close(); remove(distFile.c_str());  }
368                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
369                 }
370         
371                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
372                         
373                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
374                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
375                                         error = process(list);
376                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
377                                         
378                                         if (m->control_pressed) { 
379                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
380                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  }
381                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
382                                         }
383                                         
384                                         processedLabels.insert(list->getLabel());
385                                         userLabels.erase(list->getLabel());
386                         }
387                         
388                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
389                                         string saveLabel = list->getLabel();
390                                         
391                                         delete list;
392                                         list = input->getListVector(lastLabel);
393                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
394                                         error = process(list);
395                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
396                                         
397                                         if (m->control_pressed) { 
398                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
399                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  }
400                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
401                                         }
402                                         
403                                         processedLabels.insert(list->getLabel());
404                                         userLabels.erase(list->getLabel());
405                                         
406                                         //restore real lastlabel to save below
407                                         list->setLabel(saveLabel);
408                         }
409                         
410                         lastLabel = list->getLabel();
411         
412                         delete list;
413                         list = input->getListVector();
414                 }
415                 
416                 //output error messages about any remaining user labels
417                 bool needToRun = false;
418                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
419                         m->mothurOut("Your file does not include the label " + (*it)); 
420                         if (processedLabels.count(lastLabel) != 1) {
421                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
422                                 needToRun = true;
423                         }else {
424                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
425                         }
426                 }
427                 
428                 //run last label if you need to
429                 if (needToRun == true)  {
430                         if (list != NULL) {     delete list;    }
431                         list = input->getListVector(lastLabel);
432                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
433                         error = process(list);
434                         delete list;
435                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
436                         
437                         if (m->control_pressed) { 
438                                         if (large) {  inRow.close(); remove(distFile.c_str());  }
439                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  }
440                                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
441                         }
442                 }
443                 
444                 //close and remove formatted matrix file
445                 if (large) {
446                         inRow.close();
447                         remove(distFile.c_str());
448                 }
449                 
450                 globaldata->gListVector = NULL;
451                 delete input;  globaldata->ginput = NULL;
452                 delete read;
453                 
454                 //read fastafile
455                 fasta = new FastaMap();
456                 fasta->readFastaFile(fastafile);
457                 
458                 //if user gave a namesfile then use it
459                 if (namefile != "") {   readNamesFile();        }
460                 
461                 //output create and output the .rep.fasta files
462                 map<string, string>::iterator itNameFile;
463                 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
464                         processNames(itNameFile->first, itNameFile->second);
465                 }
466                 
467                 delete fasta;
468                 if (groupfile != "") {
469                         delete groupMap;  globaldata->gGroupmap = NULL;
470                 }
471                 
472                 if (m->control_pressed) {  return 0; }
473                 
474                 m->mothurOutEndLine();
475                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
476                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
477                 m->mothurOutEndLine();
478                 
479                 return 0;
480         }
481         catch(exception& e) {
482                 m->errorOut(e, "GetOTURepCommand", "execute");
483                 exit(1);
484         }
485 }
486
487 //**********************************************************************************************************************
488 void GetOTURepCommand::readNamesFile() {
489         try {
490                 vector<string> dupNames;
491                 openInputFile(namefile, inNames);
492                 
493                 string name, names, sequence;
494         
495                 while(inNames){
496                         inNames >> name;                        //read from first column  A
497                         inNames >> names;               //read from second column  A,B,C,D
498                         
499                         dupNames.clear();
500                         
501                         //parse names into vector
502                         splitAtComma(names, dupNames);
503                         
504                         //store names in fasta map
505                         sequence = fasta->getSequence(name);
506                         for (int i = 0; i < dupNames.size(); i++) {
507                                 fasta->push_back(dupNames[i], sequence);
508                         }
509                 
510                         gobble(inNames);
511                 }
512                 inNames.close();
513
514         }
515         catch(exception& e) {
516                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
517                 exit(1);
518         }
519 }
520 //**********************************************************************************************************************
521 string GetOTURepCommand::findRep(vector<string> names) {
522         try{
523                 // if only 1 sequence in bin or processing the "unique" label, then 
524                 // the first sequence of the OTU is the representative one
525                 if ((names.size() == 2) || (names.size() == 1) || (list->getLabel() == "unique")) {
526                         return names[0];
527                 }else{
528                         vector<int> seqIndex(names.size());
529                         vector<float> max_dist(names.size());
530                         vector<float> total_dist(names.size());
531
532                         //fill seqIndex and initialize sums
533                         for (size_t i = 0; i < names.size(); i++) {
534                                 seqIndex[i] = nameToIndex[names[i]];
535                                 max_dist[i] = 0.0;
536                                 total_dist[i] = 0.0;
537                         }
538                         
539                         // loop through all entries in seqIndex
540                         SeqMap::iterator it;
541                         SeqMap currMap;
542                         for (size_t i=0; i < seqIndex.size(); i++) {
543                                 if (m->control_pressed) {  return  "control"; }
544                         
545                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
546                                 else            {       currMap = getMap(seqIndex[i]);  }
547                                 
548                                 for (size_t j=0; j < seqIndex.size(); j++) {
549                                         it = currMap.find(seqIndex[j]);         
550                                         if (it != currMap.end()) {
551                                                 max_dist[i] = max(max_dist[i], it->second);
552                                                 max_dist[j] = max(max_dist[j], it->second);
553                                                 total_dist[i] += it->second;
554                                                 total_dist[j] += it->second;
555                                         }else{ //if you can't find the distance make it the cutoff
556                                                 max_dist[i] = max(max_dist[i], cutoff);
557                                                 max_dist[j] = max(max_dist[j], cutoff);
558                                                 total_dist[i] += cutoff;
559                                                 total_dist[j] += cutoff;
560                                         }
561                                 }
562                         }
563                         
564                         // sequence with the smallest maximum distance is the representative
565                         //if tie occurs pick sequence with smallest average distance
566                         float min = 10000;
567                         int minIndex;
568                         for (size_t i=0; i < max_dist.size(); i++) {
569                                 if (m->control_pressed) {  return  "control"; }
570                                 if (max_dist[i] < min) {
571                                         min = max_dist[i];
572                                         minIndex = i;
573                                 }else if (max_dist[i] == min) {
574                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
575                                         float newAverage = total_dist[i] / (float) total_dist.size();
576                                         
577                                         if (newAverage < currentAverage) {
578                                                 min = max_dist[i];
579                                                 minIndex = i;
580                                         }
581                                 }
582                         }
583                         
584                         return(names[minIndex]);
585                 }
586         }
587         catch(exception& e) {
588                 m->errorOut(e, "GetOTURepCommand", "FindRep");
589                 exit(1);
590         }
591 }
592
593 //**********************************************************************************************************************
594 int GetOTURepCommand::process(ListVector* processList) {
595         try{
596                 string name, sequence;
597                 string nameRep;
598
599                 //create output file
600                 if (outputDir == "") { outputDir += hasPath(listfile); }
601                                 
602                 ofstream newNamesOutput;
603                 string outputNamesFile;
604                 map<string, ofstream*> filehandles;
605                 
606                 if (Groups.size() == 0) { //you don't want to use groups
607                         outputNamesFile  = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
608                         openOutputFile(outputNamesFile, newNamesOutput);
609                         outputNames.push_back(outputNamesFile);
610                         outputNameFiles[outputNamesFile] = processList->getLabel();
611                 }else{ //you want to use groups
612                         ofstream* temp;
613                         for (int i=0; i<Groups.size(); i++) {
614                                 temp = new ofstream;
615                                 filehandles[Groups[i]] = temp;
616                                 outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
617                                 
618                                 openOutputFile(outputNamesFile, *(temp));
619                                 outputNames.push_back(outputNamesFile);
620                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
621                         }
622                 }
623                 
624                 //for each bin in the list vector
625                 for (int i = 0; i < processList->size(); i++) {
626                         if (m->control_pressed) { 
627                                 out.close();  
628                                 if (Groups.size() == 0) { //you don't want to use groups
629                                         newNamesOutput.close();
630                                 }else{
631                                         for (int j=0; j<Groups.size(); j++) {
632                                                 (*(filehandles[Groups[j]])).close();
633                                                 delete filehandles[Groups[j]];
634                                         }
635                                 }
636                                 return 0; 
637                         }
638                         
639                         string temp = processList->get(i);
640                         vector<string> namesInBin;
641                         splitAtComma(temp, namesInBin);
642                         
643                         if (Groups.size() == 0) {
644                                 nameRep = findRep(namesInBin);
645                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
646                         }else{
647                                 map<string, vector<string> > NamesInGroup;
648                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
649                                         NamesInGroup[Groups[j]].resize(0);
650                                 }
651                                 
652                                 for (int j=0; j<namesInBin.size(); j++) {
653                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
654                                         
655                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
656                                         
657                                         if (inUsersGroups(thisgroup, Groups)) { //add this name to correct group
658                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
659                                         }
660                                 }
661                                 
662                                 //get rep for each group in otu
663                                 for (int j=0; j<Groups.size(); j++) {
664                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
665                                                 //get rep for each group
666                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
667                                                 
668                                                 //output group rep and other members of this group
669                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
670                                                 
671                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
672                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
673                                                 }
674                                                 //output last name
675                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
676                                         }
677                                 }
678                         }
679                 }
680                 
681                 if (Groups.size() == 0) { //you don't want to use groups
682                         newNamesOutput.close();
683                 }else{
684                         for (int i=0; i<Groups.size(); i++) {
685                                 (*(filehandles[Groups[i]])).close();
686                                 delete filehandles[Groups[i]];
687                         }
688                 }
689                 
690                 return 0;
691
692         }
693         catch(exception& e) {
694                 m->errorOut(e, "GetOTURepCommand", "process");
695                 exit(1);
696         }
697 }
698 //**********************************************************************************************************************
699 int GetOTURepCommand::processNames(string filename, string label) {
700         try{
701
702                 //create output file
703                 if (outputDir == "") { outputDir += hasPath(listfile); }
704                 string outputFileName = outputDir + getRootName(getSimpleName(listfile)) + label + ".rep.fasta";
705                 openOutputFile(outputFileName, out);
706                 vector<repStruct> reps;
707                 outputNames.push_back(outputFileName);
708                 
709                 ofstream out2;
710                 string tempNameFile = filename + ".temp";
711                 openOutputFile(tempNameFile, out2);
712                 
713                 ifstream in;
714                 openInputFile(filename, in);
715                 
716                 int i = 0;
717                 while (!in.eof()) {
718                         string rep, binnames;
719                         in >> i >> rep >> binnames; gobble(in);
720                         out2 << rep << '\t' << binnames << endl;
721                         
722                         vector<string> names;
723                         splitAtComma(binnames, names);
724                         int binsize = names.size();
725                         
726                         //if you have a groupfile
727                         string group = "";
728                         if (groupfile != "") {
729                                 map<string, string> groups;
730                                 map<string, string>::iterator groupIt;
731                                 
732                                 //find the groups that are in this bin
733                                 for (size_t i = 0; i < names.size(); i++) {
734                                         string groupName = groupMap->getGroup(names[i]);
735                                         if (groupName == "not found") {  
736                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
737                                                 groupError = true;
738                                         } else {
739                                                 groups[groupName] = groupName;
740                                         }
741                                 }
742                                 
743                                 //turn the groups into a string
744                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
745                                         group += groupIt->first + "-";
746                                 }
747                                 //rip off last dash
748                                 group = group.substr(0, group.length()-1);
749                         }else{ group = ""; }
750
751                         
752                         //print out name and sequence for that bin
753                         string sequence = fasta->getSequence(rep);
754
755                         if (sequence != "not found") {
756                                 if (sorted == "") { //print them out
757                                         rep = rep + "|" + toString(i+1);
758                                         rep = rep + "|" + toString(binsize);
759                                         if (groupfile != "") {
760                                                 rep = rep + "|" + group;
761                                         }
762                                         out << ">" << rep << endl;
763                                         out << sequence << endl;
764                                 }else { //save them
765                                         repStruct newRep(rep, i+1, binsize, group);
766                                         reps.push_back(newRep);
767                                 }
768                         }else { 
769                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
770                         }
771                 }
772                 
773                         
774                 if (sorted != "") { //then sort them and print them
775                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
776                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
777                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
778                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
779                         
780                         //print them
781                         for (int i = 0; i < reps.size(); i++) {
782                                 string sequence = fasta->getSequence(reps[i].name);
783                                 string outputName = reps[i].name + "|" + toString(reps[i].bin);
784                                 outputName = outputName + "|" + toString(reps[i].size);
785                                 if (groupfile != "") {
786                                         outputName = outputName + "|" + reps[i].group;
787                                 }
788                                 out << ">" << outputName << endl;
789                                 out << sequence << endl;
790                         }
791                 }
792                         
793                 out.close();
794                 out2.close();
795                 
796                 rename(tempNameFile.c_str(), filename.c_str());
797                 
798                 return 0;
799
800         }
801         catch(exception& e) {
802                 m->errorOut(e, "GetOTURepCommand", "processNames");
803                 exit(1);
804         }
805 }
806 //**********************************************************************************************************************
807 SeqMap GetOTURepCommand::getMap(int row) {
808         try {
809                 SeqMap rowMap;
810                 
811                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
812                 if (rowPositions[row] != -1){
813                         //go to row in file
814                         inRow.seekg(rowPositions[row]);
815                         
816                         int rowNum, numDists, colNum;
817                         float dist;
818                         
819                         inRow >> rowNum >> numDists;
820                         
821                         for(int i = 0; i < numDists; i++) {
822                                 inRow >> colNum >> dist;
823                                 rowMap[colNum] = dist;
824                                 
825                         }
826                 }
827                 
828                 return rowMap;
829         }
830         catch(exception& e) {
831                 m->errorOut(e, "GetOTURepCommand", "getMap");
832                 exit(1);
833         }
834 }
835 //**********************************************************************************************************************
836