]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added set.current and get.current commands and modified existing commands to update...
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(){   
40         try {
41                 abort = true; calledHelp = true; 
42                 vector<string> tempOutNames;
43                 outputTypes["fasta"] = tempOutNames;
44                 outputTypes["name"] = tempOutNames;
45         }
46         catch(exception& e) {
47                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
48                 exit(1);
49         }
50 }
51 //**********************************************************************************************************************
52 vector<string> GetOTURepCommand::getValidParameters(){  
53         try {
54                 string Array[] =  {"fasta","list","label","name", "group", "weighted","sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
55                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
56                 return myArray;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "GetOTURepCommand", "getValidParameters");
60                 exit(1);
61         }
62 }
63 //**********************************************************************************************************************
64 vector<string> GetOTURepCommand::getRequiredParameters(){       
65         try {
66                 string Array[] =  {"fasta","list"};
67                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
68                 return myArray;
69         }
70         catch(exception& e) {
71                 m->errorOut(e, "GetOTURepCommand", "getRequiredParameters");
72                 exit(1);
73         }
74 }
75 //**********************************************************************************************************************
76 vector<string> GetOTURepCommand::getRequiredFiles(){    
77         try {
78                 vector<string> myArray;
79                 return myArray;
80         }
81         catch(exception& e) {
82                 m->errorOut(e, "GetOTURepCommand", "getRequiredFiles");
83                 exit(1);
84         }
85 }
86 //**********************************************************************************************************************
87 GetOTURepCommand::GetOTURepCommand(string option)  {
88         try{
89                 globaldata = GlobalData::getInstance();
90                 abort = false; calledHelp = false;   
91                 allLines = 1;
92                 labels.clear();
93                                 
94                 //allow user to run help
95                 if (option == "help") { 
96                         help(); abort = true; calledHelp = true;
97                 } else {
98                         //valid paramters for this command
99                         string Array[] =  {"fasta","list","label","name","weighted", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
100                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
101                         
102                         OptionParser parser(option);
103                         map<string, string> parameters = parser.getParameters();
104                         
105                         ValidParameters validParameter;
106                         map<string, string>::iterator it;
107                 
108                         //check to make sure all parameters are valid for command
109                         for (it = parameters.begin(); it != parameters.end(); it++) { 
110                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
111                         }
112                         
113                         //initialize outputTypes
114                         vector<string> tempOutNames;
115                         outputTypes["fasta"] = tempOutNames;
116                         outputTypes["name"] = tempOutNames;
117                         
118                         //if the user changes the input directory command factory will send this info to us in the output parameter 
119                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
120                         if (inputDir == "not found"){   inputDir = "";          }
121                         else {
122                                 string path;
123                                 it = parameters.find("list");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
129                                 }
130                                 
131                                 it = parameters.find("fasta");
132                                 //user has given a template file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
137                                 }
138                                 
139                                 it = parameters.find("phylip");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
145                                 }
146                                 
147                                 it = parameters.find("column");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
153                                 }
154                                 
155                                 it = parameters.find("name");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
161                                 }
162                                 
163                                 it = parameters.find("group");
164                                 //user has given a template file
165                                 if(it != parameters.end()){ 
166                                         path = m->hasPath(it->second);
167                                         //if the user has not given a path then, add inputdir. else leave path alone.
168                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
169                                 }
170                         }
171
172                         
173                         //if the user changes the output directory command factory will send this info to us in the output parameter 
174                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
175                         
176                         //check for required parameters
177                         fastafile = validParameter.validFile(parameters, "fasta", true);
178                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
179                         else if (fastafile == "not open") { abort = true; }     
180                 
181                         listfile = validParameter.validFile(parameters, "list", true);
182                         if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
183                         else if (listfile == "not open") { abort = true; }      
184                         
185                         phylipfile = validParameter.validFile(parameters, "phylip", true);
186                         if (phylipfile == "not found") { phylipfile = "";  }
187                         else if (phylipfile == "not open") { abort = true; }    
188                         else { distFile = phylipfile; format = "phylip";   }
189                         
190                         columnfile = validParameter.validFile(parameters, "column", true);
191                         if (columnfile == "not found") { columnfile = ""; }
192                         else if (columnfile == "not open") { abort = true; }    
193                         else { distFile = columnfile; format = "column";   }
194                         
195                         namefile = validParameter.validFile(parameters, "name", true);
196                         if (namefile == "not open") { abort = true; }   
197                         else if (namefile == "not found") { namefile = ""; }
198                         
199                         if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
200                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
201                 
202                         if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
203
204                         //check for optional parameter and set defaults
205                         // ...at some point should added some additional type checking...
206                         label = validParameter.validFile(parameters, "label", false);                   
207                         if (label == "not found") { label = ""; allLines = 1;  }
208                         else { 
209                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
210                                 else { allLines = 1;  }
211                         }
212                         
213                         groupfile = validParameter.validFile(parameters, "group", true);
214                         if (groupfile == "not open") { groupfile = ""; abort = true; }
215                         else if (groupfile == "not found") { groupfile = ""; }
216                                                 
217                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
218                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
219                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
220                                 sorted = "";
221                         }
222                         
223                         if ((sorted == "group") && (groupfile == "")) {
224                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
225                                 sorted = "";
226                         }
227                         
228                         groups = validParameter.validFile(parameters, "groups", false);                 
229                         if (groups == "not found") { groups = ""; }
230                         else { 
231                                 if (groupfile == "") {
232                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
233                                         abort = true;
234                                 }else { 
235                                         m->splitAtDash(groups, Groups);
236                                 }
237                         }
238                         globaldata->Groups = Groups;
239                         
240                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
241                         large = m->isTrue(temp);
242                         
243                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {      if (namefile == "") { temp = "F"; } else { temp = "t"; }        }
244                         weighted = m->isTrue(temp);
245                         
246                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
247                         
248                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
249                         convert(temp, precision); 
250                         
251                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
252                         convert(temp, cutoff); 
253                         cutoff += (5 / (precision * 10.0));
254                 }
255         }
256         catch(exception& e) {
257                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
258                 exit(1);
259         }
260 }
261
262 //**********************************************************************************************************************
263
264 void GetOTURepCommand::help(){
265         try {
266                 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
267                 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
268                 m->mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
269                 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
270                 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
271                 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
272                 m->mothurOut("The default value for label is all labels in your inputfile.\n");
273                 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
274                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
275                 m->mothurOut("The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false with no namesfile and true when a name file is provided.\n");
276                 m->mothurOut("The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n");
277                 m->mothurOut("For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n");
278                 m->mothurOut("For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n");
279                 m->mothurOut("If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n");
280                 m->mothurOut("The group parameter allows you provide a group file.\n");
281                 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
282                 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
283                 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
284                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
285         }
286         catch(exception& e) {
287                 m->errorOut(e, "GetOTURepCommand", "help");
288                 exit(1);
289         }
290 }
291
292 //**********************************************************************************************************************
293
294 GetOTURepCommand::~GetOTURepCommand(){}
295
296 //**********************************************************************************************************************
297
298 int GetOTURepCommand::execute(){
299         try {
300         
301                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
302                 int error;
303                 
304                 if (!large) {
305                         //read distance files
306                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
307                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
308                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
309                         
310                         readMatrix->setCutoff(cutoff);
311         
312                         if(namefile != ""){     
313                                 nameMap = new NameAssignment(namefile);
314                                 nameMap->readMap();
315                         }else{  nameMap = NULL;         }
316                         
317                         readMatrix->read(nameMap);
318                         
319                         if (m->control_pressed) { delete readMatrix; return 0; }
320
321                         //get matrix
322                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
323                         globaldata->gListVector = readMatrix->getListVector();
324
325                         SparseMatrix* matrix = readMatrix->getMatrix();
326                         
327                         // Create a data structure to quickly access the distance information.
328                         // It consists of a vector of distance maps, where each map contains
329                         // all distances of a certain sequence. Vector and maps are accessed
330                         // via the index of a sequence in the distance matrix
331                         seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
332                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
333                                 if (m->control_pressed) { delete readMatrix; return 0; }
334                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
335                         }
336                         //add dummy map for unweighted calc
337                         SeqMap dummy;
338                         seqVec.push_back(dummy);
339                         
340                         delete matrix;
341                         delete readMatrix;
342                         delete nameMap;
343                         
344                         if (m->control_pressed) { return 0; }
345                 }else {
346                         //process file and set up indexes
347                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
348                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
349                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
350                         
351                         formatMatrix->setCutoff(cutoff);
352         
353                         if(namefile != ""){     
354                                 nameMap = new NameAssignment(namefile);
355                                 nameMap->readMap();
356                         }else{  nameMap = NULL;         }
357                         
358                         formatMatrix->read(nameMap);
359                         
360                         if (m->control_pressed) { delete formatMatrix;  return 0; }
361
362                         //get matrix
363                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
364                         globaldata->gListVector = formatMatrix->getListVector();
365                         
366                         distFile = formatMatrix->getFormattedFileName();
367                         
368                         //positions in file where the distances for each sequence begin
369                         //rowPositions[1] = position in file where distance related to sequence 1 start.
370                         rowPositions = formatMatrix->getRowPositions();
371                         rowPositions.push_back(-1); //dummy row for unweighted calc
372                         
373                         delete formatMatrix;
374                         delete nameMap;
375                         
376                         //openfile for getMap to use
377                         m->openInputFile(distFile, inRow);
378                         
379                         if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
380                 }
381                 
382                 
383                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
384                 if (globaldata->gListVector != NULL) {
385                         vector<string> names;
386                         string binnames;
387                         //map names to rows in sparsematrix
388                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
389                                 names.clear();
390                                 binnames = globaldata->gListVector->get(i);
391                                 
392                                 m->splitAtComma(binnames, names);
393                                 
394                                 for (int j = 0; j < names.size(); j++) {
395                                         nameToIndex[names[j]] = i;
396                                 }
397                         }
398                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
399                 
400                                 
401                 if (m->control_pressed) { 
402                         if (large) {  inRow.close(); remove(distFile.c_str());  }
403                         return 0; 
404                 }
405                 
406                 if (groupfile != "") {
407                         //read in group map info.
408                         groupMap = new GroupMap(groupfile);
409                         int error = groupMap->readMap();
410                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
411                         
412                         if (Groups.size() != 0) {
413                                 SharedUtil* util = new SharedUtil();
414                                 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
415                                 delete util;
416                         }
417                 }
418                                                                 
419                 //set format to list so input can get listvector
420                 globaldata->setFormat("list");
421         
422                 //read list file
423                 read = new ReadOTUFile(listfile);
424                 read->read(&*globaldata); 
425                 
426                 input = globaldata->ginput;
427                 list = globaldata->gListVector;
428                 string lastLabel = list->getLabel();
429
430                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
431                 set<string> processedLabels;
432                 set<string> userLabels = labels;
433                 
434                 if (m->control_pressed) { 
435                         if (large) {  inRow.close(); remove(distFile.c_str());  }
436                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
437                 }
438                 
439                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
440                 
441                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
442                         
443                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
444                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
445                                         error = process(list);
446                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
447                                         
448                                         if (m->control_pressed) { 
449                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
450                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
451                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
452                                         }
453                                         
454                                         processedLabels.insert(list->getLabel());
455                                         userLabels.erase(list->getLabel());
456                         }
457                         
458                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
459                                         string saveLabel = list->getLabel();
460                                         
461                                         delete list;
462                                         list = input->getListVector(lastLabel);
463                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
464                                         error = process(list);
465                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
466                                         
467                                         if (m->control_pressed) { 
468                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
469                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
470                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
471                                         }
472                                         
473                                         processedLabels.insert(list->getLabel());
474                                         userLabels.erase(list->getLabel());
475                                         
476                                         //restore real lastlabel to save below
477                                         list->setLabel(saveLabel);
478                         }
479                         
480                         lastLabel = list->getLabel();
481         
482                         delete list;
483                         list = input->getListVector();
484                 }
485                 
486                 //output error messages about any remaining user labels
487                 bool needToRun = false;
488                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
489                         m->mothurOut("Your file does not include the label " + (*it)); 
490                         if (processedLabels.count(lastLabel) != 1) {
491                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
492                                 needToRun = true;
493                         }else {
494                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
495                         }
496                 }
497                 
498                 //run last label if you need to
499                 if (needToRun == true)  {
500                         if (list != NULL) {     delete list;    }
501                         list = input->getListVector(lastLabel);
502                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
503                         error = process(list);
504                         delete list;
505                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
506                         
507                         if (m->control_pressed) { 
508                                         if (large) {  inRow.close(); remove(distFile.c_str());  }
509                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
510                                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
511                         }
512                 }
513                 
514                 //close and remove formatted matrix file
515                 if (large) {
516                         inRow.close();
517                         remove(distFile.c_str());
518                 }
519                 
520                 globaldata->gListVector = NULL;
521                 delete input;  globaldata->ginput = NULL;
522                 delete read;
523                 
524                 if (!weighted) { nameFileMap.clear(); }
525                 
526                 //read fastafile
527                 fasta = new FastaMap();
528                 fasta->readFastaFile(fastafile);
529                 
530                 //if user gave a namesfile then use it
531                 if (namefile != "") {   readNamesFile();        }
532                 
533                 //output create and output the .rep.fasta files
534                 map<string, string>::iterator itNameFile;
535                 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
536                         processNames(itNameFile->first, itNameFile->second);
537                 }
538                 
539                 delete fasta;
540                 if (groupfile != "") {
541                         delete groupMap;  globaldata->gGroupmap = NULL;
542                 }
543                 
544                 if (m->control_pressed) {  return 0; }
545                 
546                 //set fasta file as new current fastafile - use first one??
547                 string current = "";
548                 itTypes = outputTypes.find("fasta");
549                 if (itTypes != outputTypes.end()) {
550                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
551                 }
552                 
553                 itTypes = outputTypes.find("name");
554                 if (itTypes != outputTypes.end()) {
555                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
556                 }
557                 
558                 m->mothurOutEndLine();
559                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
560                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
561                 m->mothurOutEndLine();
562                 
563                 return 0;
564         }
565         catch(exception& e) {
566                 m->errorOut(e, "GetOTURepCommand", "execute");
567                 exit(1);
568         }
569 }
570
571 //**********************************************************************************************************************
572 void GetOTURepCommand::readNamesFile() {
573         try {
574                 ifstream in;
575                 vector<string> dupNames;
576                 m->openInputFile(namefile, in);
577                 
578                 string name, names, sequence;
579         
580                 while(!in.eof()){
581                         in >> name;                     //read from first column  A
582                         in >> names;            //read from second column  A,B,C,D
583                         
584                         dupNames.clear();
585                         
586                         //parse names into vector
587                         m->splitAtComma(names, dupNames);
588                         
589                         //store names in fasta map
590                         sequence = fasta->getSequence(name);
591                         for (int i = 0; i < dupNames.size(); i++) {
592                                 fasta->push_back(dupNames[i], sequence);
593                         }
594                 
595                         m->gobble(in);
596                 }
597                 in.close();
598
599         }
600         catch(exception& e) {
601                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
602                 exit(1);
603         }
604 }
605 //**********************************************************************************************************************
606 //read names file to find the weighted rep for each bin
607 void GetOTURepCommand::readNamesFile(bool w) {
608         try {
609                 ifstream in;
610                 vector<string> dupNames;
611                 m->openInputFile(namefile, in);
612                 
613                 string name, names, sequence;
614                 
615                 while(!in.eof()){
616                         in >> name;     m->gobble(in);          //read from first column  A
617                         in >> names;                                                    //read from second column  A,B,C,D
618                         
619                         dupNames.clear();
620                         
621                         //parse names into vector
622                         m->splitAtComma(names, dupNames);
623                         
624                         for (int i = 0; i < dupNames.size(); i++) {
625                                 nameFileMap[dupNames[i]] = name;
626                         }
627                         
628                         m->gobble(in);
629                 }
630                 in.close();
631                 
632         }
633         catch(exception& e) {
634                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
635                 exit(1);
636         }
637 }
638 //**********************************************************************************************************************
639 string GetOTURepCommand::findRep(vector<string> names) {
640         try{
641                 // if only 1 sequence in bin or processing the "unique" label, then 
642                 // the first sequence of the OTU is the representative one
643                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
644                         return names[0];
645                 }else{
646                         vector<int> seqIndex(names.size());
647                         vector<float> max_dist(names.size());
648                         vector<float> total_dist(names.size());
649                         map<string, string>::iterator itNameFile;
650                         map<string, int>::iterator itNameIndex;
651
652                         //fill seqIndex and initialize sums
653                         for (size_t i = 0; i < names.size(); i++) {
654                                 if (weighted) {
655                                         seqIndex[i] = nameToIndex[names[i]];
656                                 }else { 
657                                         if (namefile == "") {
658                                                 itNameIndex = nameToIndex.find(names[i]);
659                                                 
660                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
661                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
662                                                         else {  seqIndex[i] = (seqVec.size()-1); }
663                                                 }else {
664                                                         seqIndex[i] = itNameIndex->second;
665                                                 }
666                                                 
667                                         }else {
668                                                 itNameFile = nameFileMap.find(names[i]);
669                                                 
670                                                 if (itNameFile == nameFileMap.end()) {
671                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
672                                                 }else{
673                                                         string name1 = itNameFile->first;
674                                                         string name2 = itNameFile->second;
675                                                         
676                                                         if (name1 == name2) { //then you are unique so add your real dists
677                                                                 seqIndex[i] = nameToIndex[names[i]];
678                                                         }else { //add dummy
679                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
680                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
681                                                         }
682                                                 }
683                                         }
684                                 }
685                                 max_dist[i] = 0.0;
686                                 total_dist[i] = 0.0;
687                         }
688                         
689                         // loop through all entries in seqIndex
690                         SeqMap::iterator it;
691                         SeqMap currMap;
692                         for (size_t i=0; i < seqIndex.size(); i++) {
693                                 if (m->control_pressed) {  return  "control"; }
694                         
695                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
696                                 else            {       currMap = getMap(seqIndex[i]);  }
697                                 
698                                 for (size_t j=0; j < seqIndex.size(); j++) {
699                                         it = currMap.find(seqIndex[j]);         
700                                         if (it != currMap.end()) {
701                                                 max_dist[i] = max(max_dist[i], it->second);
702                                                 max_dist[j] = max(max_dist[j], it->second);
703                                                 total_dist[i] += it->second;
704                                                 total_dist[j] += it->second;
705                                         }else{ //if you can't find the distance make it the cutoff
706                                                 max_dist[i] = max(max_dist[i], cutoff);
707                                                 max_dist[j] = max(max_dist[j], cutoff);
708                                                 total_dist[i] += cutoff;
709                                                 total_dist[j] += cutoff;
710                                         }
711                                 }
712                         }
713                         
714                         // sequence with the smallest maximum distance is the representative
715                         //if tie occurs pick sequence with smallest average distance
716                         float min = 10000;
717                         int minIndex;
718                         for (size_t i=0; i < max_dist.size(); i++) {
719                                 if (m->control_pressed) {  return  "control"; }
720                                 if (max_dist[i] < min) {
721                                         min = max_dist[i];
722                                         minIndex = i;
723                                 }else if (max_dist[i] == min) {
724                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
725                                         float newAverage = total_dist[i] / (float) total_dist.size();
726                                         
727                                         if (newAverage < currentAverage) {
728                                                 min = max_dist[i];
729                                                 minIndex = i;
730                                         }
731                                 }
732                         }
733                         
734                         return(names[minIndex]);
735                 }
736         }
737         catch(exception& e) {
738                 m->errorOut(e, "GetOTURepCommand", "FindRep");
739                 exit(1);
740         }
741 }
742
743 //**********************************************************************************************************************
744 int GetOTURepCommand::process(ListVector* processList) {
745         try{
746                 string name, sequence;
747                 string nameRep;
748
749                 //create output file
750                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
751                                 
752                 ofstream newNamesOutput;
753                 string outputNamesFile;
754                 map<string, ofstream*> filehandles;
755                 
756                 if (Groups.size() == 0) { //you don't want to use groups
757                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
758                         m->openOutputFile(outputNamesFile, newNamesOutput);
759                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
760                         outputNameFiles[outputNamesFile] = processList->getLabel();
761                 }else{ //you want to use groups
762                         ofstream* temp;
763                         for (int i=0; i<Groups.size(); i++) {
764                                 temp = new ofstream;
765                                 filehandles[Groups[i]] = temp;
766                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
767                                 
768                                 m->openOutputFile(outputNamesFile, *(temp));
769                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
770                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
771                         }
772                 }
773                 
774                 //for each bin in the list vector
775                 for (int i = 0; i < processList->size(); i++) {
776                         if (m->control_pressed) { 
777                                 out.close();  
778                                 if (Groups.size() == 0) { //you don't want to use groups
779                                         newNamesOutput.close();
780                                 }else{
781                                         for (int j=0; j<Groups.size(); j++) {
782                                                 (*(filehandles[Groups[j]])).close();
783                                                 delete filehandles[Groups[j]];
784                                         }
785                                 }
786                                 return 0; 
787                         }
788                         
789                         string temp = processList->get(i);
790                         vector<string> namesInBin;
791                         m->splitAtComma(temp, namesInBin);
792                         
793                         if (Groups.size() == 0) {
794                                 nameRep = findRep(namesInBin);
795                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
796                         }else{
797                                 map<string, vector<string> > NamesInGroup;
798                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
799                                         NamesInGroup[Groups[j]].resize(0);
800                                 }
801                                 
802                                 for (int j=0; j<namesInBin.size(); j++) {
803                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
804                                         
805                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
806                                         
807                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
808                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
809                                         }
810                                 }
811                                 
812                                 //get rep for each group in otu
813                                 for (int j=0; j<Groups.size(); j++) {
814                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
815                                                 //get rep for each group
816                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
817                                                 
818                                                 //output group rep and other members of this group
819                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
820                                                 
821                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
822                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
823                                                 }
824                                                 //output last name
825                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
826                                         }
827                                 }
828                         }
829                 }
830                 
831                 if (Groups.size() == 0) { //you don't want to use groups
832                         newNamesOutput.close();
833                 }else{
834                         for (int i=0; i<Groups.size(); i++) {
835                                 (*(filehandles[Groups[i]])).close();
836                                 delete filehandles[Groups[i]];
837                         }
838                 }
839                 
840                 return 0;
841
842         }
843         catch(exception& e) {
844                 m->errorOut(e, "GetOTURepCommand", "process");
845                 exit(1);
846         }
847 }
848 //**********************************************************************************************************************
849 int GetOTURepCommand::processNames(string filename, string label) {
850         try{
851
852                 //create output file
853                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
854                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
855                 m->openOutputFile(outputFileName, out);
856                 vector<repStruct> reps;
857                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
858                 
859                 ofstream out2;
860                 string tempNameFile = filename + ".temp";
861                 m->openOutputFile(tempNameFile, out2);
862                 
863                 ifstream in;
864                 m->openInputFile(filename, in);
865                 
866                 int i = 0;
867                 while (!in.eof()) {
868                         string rep, binnames;
869                         in >> i >> rep >> binnames; m->gobble(in);
870                         out2 << rep << '\t' << binnames << endl;
871                         
872                         vector<string> names;
873                         m->splitAtComma(binnames, names);
874                         int binsize = names.size();
875                         
876                         //if you have a groupfile
877                         string group = "";
878                         if (groupfile != "") {
879                                 map<string, string> groups;
880                                 map<string, string>::iterator groupIt;
881                                 
882                                 //find the groups that are in this bin
883                                 for (size_t i = 0; i < names.size(); i++) {
884                                         string groupName = groupMap->getGroup(names[i]);
885                                         if (groupName == "not found") {  
886                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
887                                                 groupError = true;
888                                         } else {
889                                                 groups[groupName] = groupName;
890                                         }
891                                 }
892                                 
893                                 //turn the groups into a string
894                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
895                                         group += groupIt->first + "-";
896                                 }
897                                 //rip off last dash
898                                 group = group.substr(0, group.length()-1);
899                         }else{ group = ""; }
900
901                         
902                         //print out name and sequence for that bin
903                         string sequence = fasta->getSequence(rep);
904
905                         if (sequence != "not found") {
906                                 if (sorted == "") { //print them out
907                                         rep = rep + "\t" + toString(i+1);
908                                         rep = rep + "|" + toString(binsize);
909                                         if (groupfile != "") {
910                                                 rep = rep + "|" + group;
911                                         }
912                                         out << ">" << rep << endl;
913                                         out << sequence << endl;
914                                 }else { //save them
915                                         repStruct newRep(rep, i+1, binsize, group);
916                                         reps.push_back(newRep);
917                                 }
918                         }else { 
919                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
920                         }
921                 }
922                 
923                         
924                 if (sorted != "") { //then sort them and print them
925                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
926                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
927                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
928                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
929                         
930                         //print them
931                         for (int i = 0; i < reps.size(); i++) {
932                                 string sequence = fasta->getSequence(reps[i].name);
933                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
934                                 outputName = outputName + "|" + toString(reps[i].size);
935                                 if (groupfile != "") {
936                                         outputName = outputName + "|" + reps[i].group;
937                                 }
938                                 out << ">" << outputName << endl;
939                                 out << sequence << endl;
940                         }
941                 }
942                 
943                 in.close();
944                 out.close();
945                 out2.close();
946                 
947                 remove(filename.c_str());
948                 rename(tempNameFile.c_str(), filename.c_str());
949                 
950                 return 0;
951
952         }
953         catch(exception& e) {
954                 m->errorOut(e, "GetOTURepCommand", "processNames");
955                 exit(1);
956         }
957 }
958 //**********************************************************************************************************************
959 SeqMap GetOTURepCommand::getMap(int row) {
960         try {
961                 SeqMap rowMap;
962                 
963                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
964                 if (rowPositions[row] != -1){
965                         //go to row in file
966                         inRow.seekg(rowPositions[row]);
967                         
968                         int rowNum, numDists, colNum;
969                         float dist;
970                         
971                         inRow >> rowNum >> numDists;
972                         
973                         for(int i = 0; i < numDists; i++) {
974                                 inRow >> colNum >> dist;
975                                 rowMap[colNum] = dist;
976                                 
977                         }
978                 }
979                 
980                 return rowMap;
981         }
982         catch(exception& e) {
983                 m->errorOut(e, "GetOTURepCommand", "getMap");
984                 exit(1);
985         }
986 }
987 //**********************************************************************************************************************
988