]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added [ERROR] flag if command aborts
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(){   
40         try {
41                 abort = true; calledHelp = true; 
42                 vector<string> tempOutNames;
43                 outputTypes["fasta"] = tempOutNames;
44                 outputTypes["name"] = tempOutNames;
45         }
46         catch(exception& e) {
47                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
48                 exit(1);
49         }
50 }
51 //**********************************************************************************************************************
52 vector<string> GetOTURepCommand::getValidParameters(){  
53         try {
54                 string Array[] =  {"fasta","list","label","name", "group", "weighted","sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
55                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
56                 return myArray;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "GetOTURepCommand", "getValidParameters");
60                 exit(1);
61         }
62 }
63 //**********************************************************************************************************************
64 vector<string> GetOTURepCommand::getRequiredParameters(){       
65         try {
66                 string Array[] =  {"fasta","list"};
67                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
68                 return myArray;
69         }
70         catch(exception& e) {
71                 m->errorOut(e, "GetOTURepCommand", "getRequiredParameters");
72                 exit(1);
73         }
74 }
75 //**********************************************************************************************************************
76 vector<string> GetOTURepCommand::getRequiredFiles(){    
77         try {
78                 vector<string> myArray;
79                 return myArray;
80         }
81         catch(exception& e) {
82                 m->errorOut(e, "GetOTURepCommand", "getRequiredFiles");
83                 exit(1);
84         }
85 }
86 //**********************************************************************************************************************
87 GetOTURepCommand::GetOTURepCommand(string option)  {
88         try{
89                 globaldata = GlobalData::getInstance();
90                 abort = false; calledHelp = false;   
91                 allLines = 1;
92                 labels.clear();
93                                 
94                 //allow user to run help
95                 if (option == "help") { 
96                         help(); abort = true; calledHelp = true;
97                 } else {
98                         //valid paramters for this command
99                         string Array[] =  {"fasta","list","label","name","weighted", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
100                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
101                         
102                         OptionParser parser(option);
103                         map<string, string> parameters = parser.getParameters();
104                         
105                         ValidParameters validParameter;
106                         map<string, string>::iterator it;
107                 
108                         //check to make sure all parameters are valid for command
109                         for (it = parameters.begin(); it != parameters.end(); it++) { 
110                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
111                         }
112                         
113                         //initialize outputTypes
114                         vector<string> tempOutNames;
115                         outputTypes["fasta"] = tempOutNames;
116                         outputTypes["name"] = tempOutNames;
117                         
118                         //if the user changes the input directory command factory will send this info to us in the output parameter 
119                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
120                         if (inputDir == "not found"){   inputDir = "";          }
121                         else {
122                                 string path;
123                                 it = parameters.find("list");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
129                                 }
130                                 
131                                 it = parameters.find("fasta");
132                                 //user has given a template file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
137                                 }
138                                 
139                                 it = parameters.find("phylip");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
145                                 }
146                                 
147                                 it = parameters.find("column");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
153                                 }
154                                 
155                                 it = parameters.find("name");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
161                                 }
162                                 
163                                 it = parameters.find("group");
164                                 //user has given a template file
165                                 if(it != parameters.end()){ 
166                                         path = m->hasPath(it->second);
167                                         //if the user has not given a path then, add inputdir. else leave path alone.
168                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
169                                 }
170                         }
171
172                         
173                         //if the user changes the output directory command factory will send this info to us in the output parameter 
174                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
175                         
176                         //check for required parameters
177                         fastafile = validParameter.validFile(parameters, "fasta", true);
178                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
179                         else if (fastafile == "not open") { abort = true; }     
180                 
181                         listfile = validParameter.validFile(parameters, "list", true);
182                         if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
183                         else if (listfile == "not open") { abort = true; }      
184                         
185                         phylipfile = validParameter.validFile(parameters, "phylip", true);
186                         if (phylipfile == "not found") { phylipfile = "";  }
187                         else if (phylipfile == "not open") { abort = true; }    
188                         else { distFile = phylipfile; format = "phylip";   }
189                         
190                         columnfile = validParameter.validFile(parameters, "column", true);
191                         if (columnfile == "not found") { columnfile = ""; }
192                         else if (columnfile == "not open") { abort = true; }    
193                         else { distFile = columnfile; format = "column";   }
194                         
195                         namefile = validParameter.validFile(parameters, "name", true);
196                         if (namefile == "not open") { abort = true; }   
197                         else if (namefile == "not found") { namefile = ""; }
198                         
199                         if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
200                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
201                 
202                         if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
203
204                         //check for optional parameter and set defaults
205                         // ...at some point should added some additional type checking...
206                         label = validParameter.validFile(parameters, "label", false);                   
207                         if (label == "not found") { label = ""; allLines = 1;  }
208                         else { 
209                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
210                                 else { allLines = 1;  }
211                         }
212                         
213                         groupfile = validParameter.validFile(parameters, "group", true);
214                         if (groupfile == "not open") { groupfile = ""; abort = true; }
215                         else if (groupfile == "not found") { groupfile = ""; }
216                                                 
217                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
218                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
219                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
220                                 sorted = "";
221                         }
222                         
223                         if ((sorted == "group") && (groupfile == "")) {
224                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
225                                 sorted = "";
226                         }
227                         
228                         groups = validParameter.validFile(parameters, "groups", false);                 
229                         if (groups == "not found") { groups = ""; }
230                         else { 
231                                 if (groupfile == "") {
232                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
233                                         abort = true;
234                                 }else { 
235                                         m->splitAtDash(groups, Groups);
236                                 }
237                         }
238                         globaldata->Groups = Groups;
239                         
240                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
241                         large = m->isTrue(temp);
242                         
243                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {      if (namefile == "") { temp = "F"; } else { temp = "t"; }        }
244                         weighted = m->isTrue(temp);
245                         
246                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
247                         
248                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
249                         convert(temp, precision); 
250                         
251                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
252                         convert(temp, cutoff); 
253                         cutoff += (5 / (precision * 10.0));
254                 }
255         }
256         catch(exception& e) {
257                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
258                 exit(1);
259         }
260 }
261
262 //**********************************************************************************************************************
263
264 void GetOTURepCommand::help(){
265         try {
266                 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
267                 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
268                 m->mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
269                 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
270                 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
271                 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
272                 m->mothurOut("The default value for label is all labels in your inputfile.\n");
273                 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
274                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
275                 m->mothurOut("The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false with no namesfile and true when a name file is provided.\n");
276                 m->mothurOut("The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n");
277                 m->mothurOut("For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n");
278                 m->mothurOut("For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n");
279                 m->mothurOut("If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n");
280                 m->mothurOut("The group parameter allows you provide a group file.\n");
281                 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
282                 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
283                 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
284                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
285         }
286         catch(exception& e) {
287                 m->errorOut(e, "GetOTURepCommand", "help");
288                 exit(1);
289         }
290 }
291
292 //**********************************************************************************************************************
293
294 GetOTURepCommand::~GetOTURepCommand(){}
295
296 //**********************************************************************************************************************
297
298 int GetOTURepCommand::execute(){
299         try {
300         
301                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
302                 int error;
303                 
304                 if (!large) {
305                         //read distance files
306                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
307                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
308                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
309                         
310                         readMatrix->setCutoff(cutoff);
311         
312                         if(namefile != ""){     
313                                 nameMap = new NameAssignment(namefile);
314                                 nameMap->readMap();
315                         }else{  nameMap = NULL;         }
316                         
317                         readMatrix->read(nameMap);
318                         
319                         if (m->control_pressed) { delete readMatrix; return 0; }
320
321                         //get matrix
322                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
323                         globaldata->gListVector = readMatrix->getListVector();
324
325                         SparseMatrix* matrix = readMatrix->getMatrix();
326                         
327                         // Create a data structure to quickly access the distance information.
328                         // It consists of a vector of distance maps, where each map contains
329                         // all distances of a certain sequence. Vector and maps are accessed
330                         // via the index of a sequence in the distance matrix
331                         seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
332                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
333                                 if (m->control_pressed) { delete readMatrix; return 0; }
334                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
335                         }
336                         //add dummy map for unweighted calc
337                         SeqMap dummy;
338                         seqVec.push_back(dummy);
339                         
340                         delete matrix;
341                         delete readMatrix;
342                         delete nameMap;
343                         
344                         if (m->control_pressed) { return 0; }
345                 }else {
346                         //process file and set up indexes
347                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
348                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
349                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
350                         
351                         formatMatrix->setCutoff(cutoff);
352         
353                         if(namefile != ""){     
354                                 nameMap = new NameAssignment(namefile);
355                                 nameMap->readMap();
356                         }else{  nameMap = NULL;         }
357                         
358                         formatMatrix->read(nameMap);
359                         
360                         if (m->control_pressed) { delete formatMatrix;  return 0; }
361
362                         //get matrix
363                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
364                         globaldata->gListVector = formatMatrix->getListVector();
365                         
366                         distFile = formatMatrix->getFormattedFileName();
367                         
368                         //positions in file where the distances for each sequence begin
369                         //rowPositions[1] = position in file where distance related to sequence 1 start.
370                         rowPositions = formatMatrix->getRowPositions();
371                         rowPositions.push_back(-1); //dummy row for unweighted calc
372                         
373                         delete formatMatrix;
374                         delete nameMap;
375                         
376                         //openfile for getMap to use
377                         m->openInputFile(distFile, inRow);
378                         
379                         if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
380                 }
381                 
382                 
383                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
384                 if (globaldata->gListVector != NULL) {
385                         vector<string> names;
386                         string binnames;
387                         //map names to rows in sparsematrix
388                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
389                                 names.clear();
390                                 binnames = globaldata->gListVector->get(i);
391                                 
392                                 m->splitAtComma(binnames, names);
393                                 
394                                 for (int j = 0; j < names.size(); j++) {
395                                         nameToIndex[names[j]] = i;
396                                 }
397                         }
398                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
399                 
400                                 
401                 if (m->control_pressed) { 
402                         if (large) {  inRow.close(); remove(distFile.c_str());  }
403                         return 0; 
404                 }
405                 
406                 if (groupfile != "") {
407                         //read in group map info.
408                         groupMap = new GroupMap(groupfile);
409                         int error = groupMap->readMap();
410                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
411                         
412                         if (Groups.size() != 0) {
413                                 SharedUtil* util = new SharedUtil();
414                                 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
415                                 delete util;
416                         }
417                 }
418                                                                 
419                 //set format to list so input can get listvector
420                 globaldata->setFormat("list");
421         
422                 //read list file
423                 read = new ReadOTUFile(listfile);
424                 read->read(&*globaldata); 
425                 
426                 input = globaldata->ginput;
427                 list = globaldata->gListVector;
428                 string lastLabel = list->getLabel();
429
430                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
431                 set<string> processedLabels;
432                 set<string> userLabels = labels;
433                 
434                 if (m->control_pressed) { 
435                         if (large) {  inRow.close(); remove(distFile.c_str());  }
436                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
437                 }
438                 
439                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
440                 
441                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
442                         
443                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
444                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
445                                         error = process(list);
446                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
447                                         
448                                         if (m->control_pressed) { 
449                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
450                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
451                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
452                                         }
453                                         
454                                         processedLabels.insert(list->getLabel());
455                                         userLabels.erase(list->getLabel());
456                         }
457                         
458                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
459                                         string saveLabel = list->getLabel();
460                                         
461                                         delete list;
462                                         list = input->getListVector(lastLabel);
463                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
464                                         error = process(list);
465                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
466                                         
467                                         if (m->control_pressed) { 
468                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
469                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
470                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
471                                         }
472                                         
473                                         processedLabels.insert(list->getLabel());
474                                         userLabels.erase(list->getLabel());
475                                         
476                                         //restore real lastlabel to save below
477                                         list->setLabel(saveLabel);
478                         }
479                         
480                         lastLabel = list->getLabel();
481         
482                         delete list;
483                         list = input->getListVector();
484                 }
485                 
486                 //output error messages about any remaining user labels
487                 bool needToRun = false;
488                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
489                         m->mothurOut("Your file does not include the label " + (*it)); 
490                         if (processedLabels.count(lastLabel) != 1) {
491                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
492                                 needToRun = true;
493                         }else {
494                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
495                         }
496                 }
497                 
498                 //run last label if you need to
499                 if (needToRun == true)  {
500                         if (list != NULL) {     delete list;    }
501                         list = input->getListVector(lastLabel);
502                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
503                         error = process(list);
504                         delete list;
505                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
506                         
507                         if (m->control_pressed) { 
508                                         if (large) {  inRow.close(); remove(distFile.c_str());  }
509                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
510                                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
511                         }
512                 }
513                 
514                 //close and remove formatted matrix file
515                 if (large) {
516                         inRow.close();
517                         remove(distFile.c_str());
518                 }
519                 
520                 globaldata->gListVector = NULL;
521                 delete input;  globaldata->ginput = NULL;
522                 delete read;
523                 
524                 if (!weighted) { nameFileMap.clear(); }
525                 
526                 //read fastafile
527                 fasta = new FastaMap();
528                 fasta->readFastaFile(fastafile);
529                 
530                 //if user gave a namesfile then use it
531                 if (namefile != "") {   readNamesFile();        }
532                 
533                 //output create and output the .rep.fasta files
534                 map<string, string>::iterator itNameFile;
535                 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
536                         processNames(itNameFile->first, itNameFile->second);
537                 }
538                 
539                 delete fasta;
540                 if (groupfile != "") {
541                         delete groupMap;  globaldata->gGroupmap = NULL;
542                 }
543                 
544                 if (m->control_pressed) {  return 0; }
545                 
546                 m->mothurOutEndLine();
547                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
548                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
549                 m->mothurOutEndLine();
550                 
551                 return 0;
552         }
553         catch(exception& e) {
554                 m->errorOut(e, "GetOTURepCommand", "execute");
555                 exit(1);
556         }
557 }
558
559 //**********************************************************************************************************************
560 void GetOTURepCommand::readNamesFile() {
561         try {
562                 ifstream in;
563                 vector<string> dupNames;
564                 m->openInputFile(namefile, in);
565                 
566                 string name, names, sequence;
567         
568                 while(!in.eof()){
569                         in >> name;                     //read from first column  A
570                         in >> names;            //read from second column  A,B,C,D
571                         
572                         dupNames.clear();
573                         
574                         //parse names into vector
575                         m->splitAtComma(names, dupNames);
576                         
577                         //store names in fasta map
578                         sequence = fasta->getSequence(name);
579                         for (int i = 0; i < dupNames.size(); i++) {
580                                 fasta->push_back(dupNames[i], sequence);
581                         }
582                 
583                         m->gobble(in);
584                 }
585                 in.close();
586
587         }
588         catch(exception& e) {
589                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
590                 exit(1);
591         }
592 }
593 //**********************************************************************************************************************
594 //read names file to find the weighted rep for each bin
595 void GetOTURepCommand::readNamesFile(bool w) {
596         try {
597                 ifstream in;
598                 vector<string> dupNames;
599                 m->openInputFile(namefile, in);
600                 
601                 string name, names, sequence;
602                 
603                 while(!in.eof()){
604                         in >> name;     m->gobble(in);          //read from first column  A
605                         in >> names;                                                    //read from second column  A,B,C,D
606                         
607                         dupNames.clear();
608                         
609                         //parse names into vector
610                         m->splitAtComma(names, dupNames);
611                         
612                         for (int i = 0; i < dupNames.size(); i++) {
613                                 nameFileMap[dupNames[i]] = name;
614                         }
615                         
616                         m->gobble(in);
617                 }
618                 in.close();
619                 
620         }
621         catch(exception& e) {
622                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
623                 exit(1);
624         }
625 }
626 //**********************************************************************************************************************
627 string GetOTURepCommand::findRep(vector<string> names) {
628         try{
629                 // if only 1 sequence in bin or processing the "unique" label, then 
630                 // the first sequence of the OTU is the representative one
631                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
632                         return names[0];
633                 }else{
634                         vector<int> seqIndex(names.size());
635                         vector<float> max_dist(names.size());
636                         vector<float> total_dist(names.size());
637                         map<string, string>::iterator itNameFile;
638                         map<string, int>::iterator itNameIndex;
639
640                         //fill seqIndex and initialize sums
641                         for (size_t i = 0; i < names.size(); i++) {
642                                 if (weighted) {
643                                         seqIndex[i] = nameToIndex[names[i]];
644                                 }else { 
645                                         if (namefile == "") {
646                                                 itNameIndex = nameToIndex.find(names[i]);
647                                                 
648                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
649                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
650                                                         else {  seqIndex[i] = (seqVec.size()-1); }
651                                                 }else {
652                                                         seqIndex[i] = itNameIndex->second;
653                                                 }
654                                                 
655                                         }else {
656                                                 itNameFile = nameFileMap.find(names[i]);
657                                                 
658                                                 if (itNameFile == nameFileMap.end()) {
659                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
660                                                 }else{
661                                                         string name1 = itNameFile->first;
662                                                         string name2 = itNameFile->second;
663                                                         
664                                                         if (name1 == name2) { //then you are unique so add your real dists
665                                                                 seqIndex[i] = nameToIndex[names[i]];
666                                                         }else { //add dummy
667                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
668                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
669                                                         }
670                                                 }
671                                         }
672                                 }
673                                 max_dist[i] = 0.0;
674                                 total_dist[i] = 0.0;
675                         }
676                         
677                         // loop through all entries in seqIndex
678                         SeqMap::iterator it;
679                         SeqMap currMap;
680                         for (size_t i=0; i < seqIndex.size(); i++) {
681                                 if (m->control_pressed) {  return  "control"; }
682                         
683                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
684                                 else            {       currMap = getMap(seqIndex[i]);  }
685                                 
686                                 for (size_t j=0; j < seqIndex.size(); j++) {
687                                         it = currMap.find(seqIndex[j]);         
688                                         if (it != currMap.end()) {
689                                                 max_dist[i] = max(max_dist[i], it->second);
690                                                 max_dist[j] = max(max_dist[j], it->second);
691                                                 total_dist[i] += it->second;
692                                                 total_dist[j] += it->second;
693                                         }else{ //if you can't find the distance make it the cutoff
694                                                 max_dist[i] = max(max_dist[i], cutoff);
695                                                 max_dist[j] = max(max_dist[j], cutoff);
696                                                 total_dist[i] += cutoff;
697                                                 total_dist[j] += cutoff;
698                                         }
699                                 }
700                         }
701                         
702                         // sequence with the smallest maximum distance is the representative
703                         //if tie occurs pick sequence with smallest average distance
704                         float min = 10000;
705                         int minIndex;
706                         for (size_t i=0; i < max_dist.size(); i++) {
707                                 if (m->control_pressed) {  return  "control"; }
708                                 if (max_dist[i] < min) {
709                                         min = max_dist[i];
710                                         minIndex = i;
711                                 }else if (max_dist[i] == min) {
712                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
713                                         float newAverage = total_dist[i] / (float) total_dist.size();
714                                         
715                                         if (newAverage < currentAverage) {
716                                                 min = max_dist[i];
717                                                 minIndex = i;
718                                         }
719                                 }
720                         }
721                         
722                         return(names[minIndex]);
723                 }
724         }
725         catch(exception& e) {
726                 m->errorOut(e, "GetOTURepCommand", "FindRep");
727                 exit(1);
728         }
729 }
730
731 //**********************************************************************************************************************
732 int GetOTURepCommand::process(ListVector* processList) {
733         try{
734                 string name, sequence;
735                 string nameRep;
736
737                 //create output file
738                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
739                                 
740                 ofstream newNamesOutput;
741                 string outputNamesFile;
742                 map<string, ofstream*> filehandles;
743                 
744                 if (Groups.size() == 0) { //you don't want to use groups
745                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
746                         m->openOutputFile(outputNamesFile, newNamesOutput);
747                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
748                         outputNameFiles[outputNamesFile] = processList->getLabel();
749                 }else{ //you want to use groups
750                         ofstream* temp;
751                         for (int i=0; i<Groups.size(); i++) {
752                                 temp = new ofstream;
753                                 filehandles[Groups[i]] = temp;
754                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
755                                 
756                                 m->openOutputFile(outputNamesFile, *(temp));
757                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
758                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
759                         }
760                 }
761                 
762                 //for each bin in the list vector
763                 for (int i = 0; i < processList->size(); i++) {
764                         if (m->control_pressed) { 
765                                 out.close();  
766                                 if (Groups.size() == 0) { //you don't want to use groups
767                                         newNamesOutput.close();
768                                 }else{
769                                         for (int j=0; j<Groups.size(); j++) {
770                                                 (*(filehandles[Groups[j]])).close();
771                                                 delete filehandles[Groups[j]];
772                                         }
773                                 }
774                                 return 0; 
775                         }
776                         
777                         string temp = processList->get(i);
778                         vector<string> namesInBin;
779                         m->splitAtComma(temp, namesInBin);
780                         
781                         if (Groups.size() == 0) {
782                                 nameRep = findRep(namesInBin);
783                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
784                         }else{
785                                 map<string, vector<string> > NamesInGroup;
786                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
787                                         NamesInGroup[Groups[j]].resize(0);
788                                 }
789                                 
790                                 for (int j=0; j<namesInBin.size(); j++) {
791                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
792                                         
793                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
794                                         
795                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
796                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
797                                         }
798                                 }
799                                 
800                                 //get rep for each group in otu
801                                 for (int j=0; j<Groups.size(); j++) {
802                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
803                                                 //get rep for each group
804                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
805                                                 
806                                                 //output group rep and other members of this group
807                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
808                                                 
809                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
810                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
811                                                 }
812                                                 //output last name
813                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
814                                         }
815                                 }
816                         }
817                 }
818                 
819                 if (Groups.size() == 0) { //you don't want to use groups
820                         newNamesOutput.close();
821                 }else{
822                         for (int i=0; i<Groups.size(); i++) {
823                                 (*(filehandles[Groups[i]])).close();
824                                 delete filehandles[Groups[i]];
825                         }
826                 }
827                 
828                 return 0;
829
830         }
831         catch(exception& e) {
832                 m->errorOut(e, "GetOTURepCommand", "process");
833                 exit(1);
834         }
835 }
836 //**********************************************************************************************************************
837 int GetOTURepCommand::processNames(string filename, string label) {
838         try{
839
840                 //create output file
841                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
842                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
843                 m->openOutputFile(outputFileName, out);
844                 vector<repStruct> reps;
845                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
846                 
847                 ofstream out2;
848                 string tempNameFile = filename + ".temp";
849                 m->openOutputFile(tempNameFile, out2);
850                 
851                 ifstream in;
852                 m->openInputFile(filename, in);
853                 
854                 int i = 0;
855                 while (!in.eof()) {
856                         string rep, binnames;
857                         in >> i >> rep >> binnames; m->gobble(in);
858                         out2 << rep << '\t' << binnames << endl;
859                         
860                         vector<string> names;
861                         m->splitAtComma(binnames, names);
862                         int binsize = names.size();
863                         
864                         //if you have a groupfile
865                         string group = "";
866                         if (groupfile != "") {
867                                 map<string, string> groups;
868                                 map<string, string>::iterator groupIt;
869                                 
870                                 //find the groups that are in this bin
871                                 for (size_t i = 0; i < names.size(); i++) {
872                                         string groupName = groupMap->getGroup(names[i]);
873                                         if (groupName == "not found") {  
874                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
875                                                 groupError = true;
876                                         } else {
877                                                 groups[groupName] = groupName;
878                                         }
879                                 }
880                                 
881                                 //turn the groups into a string
882                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
883                                         group += groupIt->first + "-";
884                                 }
885                                 //rip off last dash
886                                 group = group.substr(0, group.length()-1);
887                         }else{ group = ""; }
888
889                         
890                         //print out name and sequence for that bin
891                         string sequence = fasta->getSequence(rep);
892
893                         if (sequence != "not found") {
894                                 if (sorted == "") { //print them out
895                                         rep = rep + "\t" + toString(i+1);
896                                         rep = rep + "|" + toString(binsize);
897                                         if (groupfile != "") {
898                                                 rep = rep + "|" + group;
899                                         }
900                                         out << ">" << rep << endl;
901                                         out << sequence << endl;
902                                 }else { //save them
903                                         repStruct newRep(rep, i+1, binsize, group);
904                                         reps.push_back(newRep);
905                                 }
906                         }else { 
907                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
908                         }
909                 }
910                 
911                         
912                 if (sorted != "") { //then sort them and print them
913                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
914                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
915                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
916                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
917                         
918                         //print them
919                         for (int i = 0; i < reps.size(); i++) {
920                                 string sequence = fasta->getSequence(reps[i].name);
921                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
922                                 outputName = outputName + "|" + toString(reps[i].size);
923                                 if (groupfile != "") {
924                                         outputName = outputName + "|" + reps[i].group;
925                                 }
926                                 out << ">" << outputName << endl;
927                                 out << sequence << endl;
928                         }
929                 }
930                 
931                 in.close();
932                 out.close();
933                 out2.close();
934                 
935                 remove(filename.c_str());
936                 rename(tempNameFile.c_str(), filename.c_str());
937                 
938                 return 0;
939
940         }
941         catch(exception& e) {
942                 m->errorOut(e, "GetOTURepCommand", "processNames");
943                 exit(1);
944         }
945 }
946 //**********************************************************************************************************************
947 SeqMap GetOTURepCommand::getMap(int row) {
948         try {
949                 SeqMap rowMap;
950                 
951                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
952                 if (rowPositions[row] != -1){
953                         //go to row in file
954                         inRow.seekg(rowPositions[row]);
955                         
956                         int rowNum, numDists, colNum;
957                         float dist;
958                         
959                         inRow >> rowNum >> numDists;
960                         
961                         for(int i = 0; i < numDists; i++) {
962                                 inRow >> colNum >> dist;
963                                 rowMap[colNum] = dist;
964                                 
965                         }
966                 }
967                 
968                 return rowMap;
969         }
970         catch(exception& e) {
971                 m->errorOut(e, "GetOTURepCommand", "getMap");
972                 exit(1);
973         }
974 }
975 //**********************************************************************************************************************
976