]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
1.23.0
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){       
41         try {
42                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
44                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52                 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53                 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
57                 
58                 vector<string> myArray;
59                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
60                 return myArray;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "GetOTURepCommand", "setParameters");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){       
69         try {
70                 string helpString = "";
71                 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
72                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73                 helpString += "The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n";
74                 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75                 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76                 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77                 helpString += "The default value for label is all labels in your inputfile.\n";
78                 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
80                 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false.\n";
81                 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82                 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83                 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84                 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85                 helpString += "The group parameter allows you provide a group file.\n";
86                 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87                 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88                 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
90                 return helpString;
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "GetOTURepCommand", "getHelpString");
94                 exit(1);
95         }
96 }
97 //**********************************************************************************************************************
98 GetOTURepCommand::GetOTURepCommand(){   
99         try {
100                 abort = true; calledHelp = true; 
101                 setParameters();
102                 vector<string> tempOutNames;
103                 outputTypes["fasta"] = tempOutNames;
104                 outputTypes["name"] = tempOutNames;
105         }
106         catch(exception& e) {
107                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
108                 exit(1);
109         }
110 }
111 //**********************************************************************************************************************
112 GetOTURepCommand::GetOTURepCommand(string option)  {
113         try{
114                 abort = false; calledHelp = false;   
115                 allLines = 1;
116                                 
117                 //allow user to run help
118                 if (option == "help") { 
119                         help(); abort = true; calledHelp = true;
120                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
121                 } else {
122                         vector<string> myArray = setParameters();
123                         
124                         OptionParser parser(option);
125                         map<string, string> parameters = parser.getParameters();
126                         
127                         ValidParameters validParameter;
128                         map<string, string>::iterator it;
129                 
130                         //check to make sure all parameters are valid for command
131                         for (it = parameters.begin(); it != parameters.end(); it++) { 
132                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
133                         }
134                         
135                         //initialize outputTypes
136                         vector<string> tempOutNames;
137                         outputTypes["fasta"] = tempOutNames;
138                         outputTypes["name"] = tempOutNames;
139                         
140                         //if the user changes the input directory command factory will send this info to us in the output parameter 
141                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
142                         if (inputDir == "not found"){   inputDir = "";          }
143                         else {
144                                 string path;
145                                 it = parameters.find("list");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
151                                 }
152                                 
153                                 it = parameters.find("fasta");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
159                                 }
160                                 
161                                 it = parameters.find("phylip");
162                                 //user has given a template file
163                                 if(it != parameters.end()){ 
164                                         path = m->hasPath(it->second);
165                                         //if the user has not given a path then, add inputdir. else leave path alone.
166                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
167                                 }
168                                 
169                                 it = parameters.find("column");
170                                 //user has given a template file
171                                 if(it != parameters.end()){ 
172                                         path = m->hasPath(it->second);
173                                         //if the user has not given a path then, add inputdir. else leave path alone.
174                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
175                                 }
176                                 
177                                 it = parameters.find("name");
178                                 //user has given a template file
179                                 if(it != parameters.end()){ 
180                                         path = m->hasPath(it->second);
181                                         //if the user has not given a path then, add inputdir. else leave path alone.
182                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
183                                 }
184                                 
185                                 it = parameters.find("group");
186                                 //user has given a template file
187                                 if(it != parameters.end()){ 
188                                         path = m->hasPath(it->second);
189                                         //if the user has not given a path then, add inputdir. else leave path alone.
190                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
191                                 }
192                         }
193
194                         
195                         //if the user changes the output directory command factory will send this info to us in the output parameter 
196                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
197                         
198                         //check for required parameters
199                         fastafile = validParameter.validFile(parameters, "fasta", true);
200                         if (fastafile == "not found") { fastafile = ""; }
201                         else if (fastafile == "not open") { abort = true; }     
202                         else { m->setFastaFile(fastafile); }
203                 
204                         listfile = validParameter.validFile(parameters, "list", true);
205                         if (listfile == "not found") {                  
206                                 listfile = m->getListFile(); 
207                                 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
208                                 else {  m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
209                         }
210                         else if (listfile == "not open") { abort = true; }      
211                         else { m->setListFile(listfile); }
212                         
213                         phylipfile = validParameter.validFile(parameters, "phylip", true);
214                         if (phylipfile == "not found") { phylipfile = "";  }
215                         else if (phylipfile == "not open") { abort = true; }    
216                         else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile);   }
217                         
218                         columnfile = validParameter.validFile(parameters, "column", true);
219                         if (columnfile == "not found") { columnfile = ""; }
220                         else if (columnfile == "not open") { abort = true; }    
221                         else { distFile = columnfile; format = "column";  m->setColumnFile(columnfile); }
222                         
223                         namefile = validParameter.validFile(parameters, "name", true);
224                         if (namefile == "not open") { abort = true; }   
225                         else if (namefile == "not found") { namefile = ""; }
226                         else { m->setNameFile(namefile); }
227                         
228                         if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
229                                 //give priority to column, then phylip
230                                 columnfile = m->getColumnFile(); 
231                                 if (columnfile != "") {  distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
232                                 else { 
233                                         phylipfile = m->getPhylipFile(); 
234                                         if (phylipfile != "") {  distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
235                                         else { 
236                                                 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine(); 
237                                                 abort = true;
238                                         }
239                                 }
240                         }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
241                 
242                         if (columnfile != "") {  
243                                 if (namefile == "") {  
244                                         namefile = m->getNameFile(); 
245                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
246                                         else { 
247                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
248                                                 abort = true; 
249                                         }       
250                                 } 
251                         }
252
253                         //check for optional parameter and set defaults
254                         // ...at some point should added some additional type checking...
255                         label = validParameter.validFile(parameters, "label", false);                   
256                         if (label == "not found") { label = ""; allLines = 1;  }
257                         else { 
258                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
259                                 else { allLines = 1;  }
260                         }
261                         
262                         groupfile = validParameter.validFile(parameters, "group", true);
263                         if (groupfile == "not open") { groupfile = ""; abort = true; }
264                         else if (groupfile == "not found") { groupfile = ""; }
265                         else { m->setGroupFile(groupfile); }
266                         
267                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
268                         if (sorted == "none") { sorted=""; }
269                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
270                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
271                                 sorted = "";
272                         }
273                         
274                         if ((sorted == "group") && (groupfile == "")) {
275                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
276                                 sorted = "";
277                         }
278                         
279                         groups = validParameter.validFile(parameters, "groups", false);                 
280                         if (groups == "not found") { groups = ""; }
281                         else { 
282                                 if (groupfile == "") {
283                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
284                                         abort = true;
285                                 }else { 
286                                         m->splitAtDash(groups, Groups);
287                                 }
288                         }
289                         m->setGroups(Groups);
290                         
291                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
292                         large = m->isTrue(temp);
293                         
294                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {       temp = "f";    }
295                         weighted = m->isTrue(temp);
296                         
297                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
298                         
299                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
300                         m->mothurConvert(temp, precision); 
301                         
302                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
303                         m->mothurConvert(temp, cutoff); 
304                         cutoff += (5 / (precision * 10.0));
305                 }
306         }
307         catch(exception& e) {
308                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
309                 exit(1);
310         }
311 }
312
313 //**********************************************************************************************************************
314
315 int GetOTURepCommand::execute(){
316         try {
317         
318                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
319                 int error;
320                 list = NULL;
321                 
322                 if (!large) {
323                         //read distance files
324                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
325                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
326                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
327                         
328                         readMatrix->setCutoff(cutoff);
329         
330                         if(namefile != ""){     
331                                 nameMap = new NameAssignment(namefile);
332                                 nameMap->readMap();
333                         }else{  nameMap = NULL;         }
334                         
335                         readMatrix->read(nameMap);
336                         
337                         if (m->control_pressed) { delete readMatrix; return 0; }
338
339                         list = readMatrix->getListVector();
340
341                         SparseMatrix* matrix = readMatrix->getMatrix();
342                         
343                         // Create a data structure to quickly access the distance information.
344                         // It consists of a vector of distance maps, where each map contains
345                         // all distances of a certain sequence. Vector and maps are accessed
346                         // via the index of a sequence in the distance matrix
347                         seqVec = vector<SeqMap>(list->size()); 
348                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
349                                 if (m->control_pressed) { delete readMatrix; return 0; }
350                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
351                         }
352                         //add dummy map for unweighted calc
353                         SeqMap dummy;
354                         seqVec.push_back(dummy);
355                         
356                         delete matrix;
357                         delete readMatrix;
358                         delete nameMap;
359                         
360                         if (m->control_pressed) { return 0; }
361                 }else {
362                         //process file and set up indexes
363                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
364                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
365                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
366                         
367                         formatMatrix->setCutoff(cutoff);
368         
369                         if(namefile != ""){     
370                                 nameMap = new NameAssignment(namefile);
371                                 nameMap->readMap();
372                         }else{  nameMap = NULL;         }
373                         
374                         formatMatrix->read(nameMap);
375                         
376                         if (m->control_pressed) { delete formatMatrix;  return 0; }
377
378                         list = formatMatrix->getListVector();
379                         
380                         distFile = formatMatrix->getFormattedFileName();
381                         
382                         //positions in file where the distances for each sequence begin
383                         //rowPositions[1] = position in file where distance related to sequence 1 start.
384                         rowPositions = formatMatrix->getRowPositions();
385                         rowPositions.push_back(-1); //dummy row for unweighted calc
386                         
387                         delete formatMatrix;
388                         delete nameMap;
389                         
390                         //openfile for getMap to use
391                         m->openInputFile(distFile, inRow);
392                         
393                         if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
394                 }
395                 
396                 
397                 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
398                 if (list != NULL) {
399                         vector<string> names;
400                         string binnames;
401                         //map names to rows in sparsematrix
402                         for (int i = 0; i < list->size(); i++) {
403                                 names.clear();
404                                 binnames = list->get(i);
405                                 
406                                 m->splitAtComma(binnames, names);
407                                 
408                                 for (int j = 0; j < names.size(); j++) {
409                                         nameToIndex[names[j]] = i;
410                                 }
411                         }
412                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
413                 
414                                 
415                 if (m->control_pressed) { 
416                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
417                         return 0; 
418                 }
419                 
420                 if (groupfile != "") {
421                         //read in group map info.
422                         groupMap = new GroupMap(groupfile);
423                         int error = groupMap->readMap();
424                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
425                         
426                         if (Groups.size() != 0) {
427                                 SharedUtil* util = new SharedUtil();
428                                 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
429                                 util->setGroups(Groups, gNamesOfGroups, "getoturep");
430                                 groupMap->setNamesOfGroups(gNamesOfGroups);
431                                 delete util;
432                         }
433                 }
434                 
435                 //done with listvector from matrix
436                 if (list != NULL) { delete list; }
437                 
438                 input = new InputData(listfile, "list");
439                 list = input->getListVector();
440                 string lastLabel = list->getLabel();
441
442                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
443                 set<string> processedLabels;
444                 set<string> userLabels = labels;
445                 
446                 if (m->control_pressed) { 
447                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
448                         delete input; delete list; return 0; 
449                 }
450                 
451                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
452                 
453                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
454                         
455                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
456                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
457                                         error = process(list);
458                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
459                                         
460                                         if (m->control_pressed) { 
461                                                 if (large) {  inRow.close(); m->mothurRemove(distFile);  }
462                                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
463                                                 delete input; delete list; return 0; 
464                                         }
465                                         
466                                         processedLabels.insert(list->getLabel());
467                                         userLabels.erase(list->getLabel());
468                         }
469                         
470                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
471                                         string saveLabel = list->getLabel();
472                                         
473                                         delete list;
474                                         list = input->getListVector(lastLabel);
475                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
476                                         error = process(list);
477                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
478                                         
479                                         if (m->control_pressed) { 
480                                                 if (large) {  inRow.close(); m->mothurRemove(distFile);  }
481                                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
482                                                 delete input; delete list; return 0; 
483                                         }
484                                         
485                                         processedLabels.insert(list->getLabel());
486                                         userLabels.erase(list->getLabel());
487                                         
488                                         //restore real lastlabel to save below
489                                         list->setLabel(saveLabel);
490                         }
491                         
492                         lastLabel = list->getLabel();
493         
494                         delete list;
495                         list = input->getListVector();
496                 }
497                 
498                 //output error messages about any remaining user labels
499                 bool needToRun = false;
500                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
501                         m->mothurOut("Your file does not include the label " + (*it)); 
502                         if (processedLabels.count(lastLabel) != 1) {
503                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
504                                 needToRun = true;
505                         }else {
506                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
507                         }
508                 }
509                 
510                 //run last label if you need to
511                 if (needToRun == true)  {
512                         if (list != NULL) {     delete list;    }
513                         list = input->getListVector(lastLabel);
514                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
515                         error = process(list);
516                         delete list;
517                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
518                         
519                         if (m->control_pressed) { 
520                                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
521                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
522                                         delete input; delete list; return 0; 
523                         }
524                 }
525                 
526                 //close and remove formatted matrix file
527                 if (large) {
528                         inRow.close();
529                         m->mothurRemove(distFile);
530                 }
531                 
532                 delete input;  
533                 
534                 if (!weighted) { nameFileMap.clear(); }
535                 
536                                 
537                 if (fastafile != "") {
538                         //read fastafile
539                         fasta = new FastaMap();
540                         fasta->readFastaFile(fastafile);
541                         
542                         //if user gave a namesfile then use it
543                         if (namefile != "") {   readNamesFile();        }
544                         
545                         //output create and output the .rep.fasta files
546                         map<string, string>::iterator itNameFile;
547                         for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
548                                 processFastaNames(itNameFile->first, itNameFile->second);
549                         }
550                 }else {
551                         //output create and output the .rep.fasta files
552                         map<string, string>::iterator itNameFile;
553                         for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
554                                 processNames(itNameFile->first, itNameFile->second);
555                         }
556                 }
557                 
558                                 
559                 if (groupfile != "") { delete groupMap; }
560                 
561                 if (m->control_pressed) {  return 0; }
562                 
563                 //set fasta file as new current fastafile - use first one??
564                 string current = "";
565                 itTypes = outputTypes.find("fasta");
566                 if (itTypes != outputTypes.end()) {
567                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
568                 }
569                 
570                 itTypes = outputTypes.find("name");
571                 if (itTypes != outputTypes.end()) {
572                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
573                 }
574                 
575                 m->mothurOutEndLine();
576                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
577                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
578                 m->mothurOutEndLine();
579                 
580                 return 0;
581         }
582         catch(exception& e) {
583                 m->errorOut(e, "GetOTURepCommand", "execute");
584                 exit(1);
585         }
586 }
587
588 //**********************************************************************************************************************
589 void GetOTURepCommand::readNamesFile() {
590         try {
591                 ifstream in;
592                 vector<string> dupNames;
593                 m->openInputFile(namefile, in);
594                 
595                 string name, names, sequence;
596         
597                 while(!in.eof()){
598                         in >> name;                     //read from first column  A
599                         in >> names;            //read from second column  A,B,C,D
600                         
601                         dupNames.clear();
602                         
603                         //parse names into vector
604                         m->splitAtComma(names, dupNames);
605                         
606                         //store names in fasta map
607                         sequence = fasta->getSequence(name);
608                         for (int i = 0; i < dupNames.size(); i++) {
609                                 fasta->push_back(dupNames[i], sequence);
610                         }
611                 
612                         m->gobble(in);
613                 }
614                 in.close();
615
616         }
617         catch(exception& e) {
618                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
619                 exit(1);
620         }
621 }
622 //**********************************************************************************************************************
623 //read names file to find the weighted rep for each bin
624 void GetOTURepCommand::readNamesFile(bool w) {
625         try {
626                 ifstream in;
627                 vector<string> dupNames;
628                 m->openInputFile(namefile, in);
629                 
630                 string name, names, sequence;
631                 
632                 while(!in.eof()){
633                         in >> name;     m->gobble(in);          //read from first column  A
634                         in >> names;                                                    //read from second column  A,B,C,D
635                         
636                         dupNames.clear();
637                         
638                         //parse names into vector
639                         m->splitAtComma(names, dupNames);
640                         
641                         for (int i = 0; i < dupNames.size(); i++) {
642                                 nameFileMap[dupNames[i]] = name;
643                         }
644                         
645                         m->gobble(in);
646                 }
647                 in.close();
648                 
649         }
650         catch(exception& e) {
651                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
652                 exit(1);
653         }
654 }
655 //**********************************************************************************************************************
656 string GetOTURepCommand::findRep(vector<string> names) {
657         try{
658                 // if only 1 sequence in bin or processing the "unique" label, then 
659                 // the first sequence of the OTU is the representative one
660                 if ((names.size() == 1)) {
661                         return names[0];
662                 }else{
663                         vector<int> seqIndex(names.size());
664                         vector<float> max_dist(names.size());
665                         vector<float> total_dist(names.size());
666                         map<string, string>::iterator itNameFile;
667                         map<string, int>::iterator itNameIndex;
668
669                         //fill seqIndex and initialize sums
670                         for (size_t i = 0; i < names.size(); i++) {
671                                 if (weighted) {
672                                         seqIndex[i] = nameToIndex[names[i]];
673                                 }else { 
674                                         if (namefile == "") {
675                                                 itNameIndex = nameToIndex.find(names[i]);
676                                                 
677                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
678                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
679                                                         else {  seqIndex[i] = (seqVec.size()-1); }
680                                                 }else {
681                                                         seqIndex[i] = itNameIndex->second;
682                                                 }
683                                                 
684                                         }else {
685                                                 itNameFile = nameFileMap.find(names[i]);
686                                                 
687                                                 if (itNameFile == nameFileMap.end()) {
688                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
689                                                 }else{
690                                                         string name1 = itNameFile->first;
691                                                         string name2 = itNameFile->second;
692                                                         
693                                                         if (name1 == name2) { //then you are unique so add your real dists
694                                                                 seqIndex[i] = nameToIndex[names[i]];
695                                                         }else { //add dummy
696                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
697                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
698                                                         }
699                                                 }
700                                         }
701                                 }
702                                 max_dist[i] = 0.0;
703                                 total_dist[i] = 0.0;
704                         }
705                         
706                         // loop through all entries in seqIndex
707                         SeqMap::iterator it;
708                         SeqMap currMap;
709                         for (size_t i=0; i < seqIndex.size(); i++) {
710                                 if (m->control_pressed) {  return  "control"; }
711                         
712                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
713                                 else            {       currMap = getMap(seqIndex[i]);  }
714                                 
715                                 for (size_t j=0; j < seqIndex.size(); j++) {
716                                         it = currMap.find(seqIndex[j]);         
717                                         if (it != currMap.end()) {
718                                                 max_dist[i] = max(max_dist[i], it->second);
719                                                 max_dist[j] = max(max_dist[j], it->second);
720                                                 total_dist[i] += it->second;
721                                                 total_dist[j] += it->second;
722                                         }else{ //if you can't find the distance make it the cutoff
723                                                 max_dist[i] = max(max_dist[i], cutoff);
724                                                 max_dist[j] = max(max_dist[j], cutoff);
725                                                 total_dist[i] += cutoff;
726                                                 total_dist[j] += cutoff;
727                                         }
728                                 }
729                         }
730                         
731                         // sequence with the smallest maximum distance is the representative
732                         //if tie occurs pick sequence with smallest average distance
733                         float min = 10000;
734                         int minIndex;
735                         for (size_t i=0; i < max_dist.size(); i++) {
736                                 if (m->control_pressed) {  return  "control"; }
737                                 if (max_dist[i] < min) {
738                                         min = max_dist[i];
739                                         minIndex = i;
740                                 }else if (max_dist[i] == min) {
741                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
742                                         float newAverage = total_dist[i] / (float) total_dist.size();
743                                         
744                                         if (newAverage < currentAverage) {
745                                                 min = max_dist[i];
746                                                 minIndex = i;
747                                         }
748                                 }
749                         }
750                         
751                         return(names[minIndex]);
752                 }
753         }
754         catch(exception& e) {
755                 m->errorOut(e, "GetOTURepCommand", "FindRep");
756                 exit(1);
757         }
758 }
759
760 //**********************************************************************************************************************
761 int GetOTURepCommand::process(ListVector* processList) {
762         try{
763                 string name, sequence;
764                 string nameRep;
765
766                 //create output file
767                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
768                                 
769                 ofstream newNamesOutput;
770                 string outputNamesFile;
771                 map<string, ofstream*> filehandles;
772                 
773                 if (Groups.size() == 0) { //you don't want to use groups
774                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
775                         m->openOutputFile(outputNamesFile, newNamesOutput);
776                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
777                         outputNameFiles[outputNamesFile] = processList->getLabel();
778                 }else{ //you want to use groups
779                         ofstream* temp;
780                         for (int i=0; i<Groups.size(); i++) {
781                                 temp = new ofstream;
782                                 filehandles[Groups[i]] = temp;
783                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
784                                 
785                                 m->openOutputFile(outputNamesFile, *(temp));
786                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
787                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
788                         }
789                 }
790                 
791                 //for each bin in the list vector
792                 for (int i = 0; i < processList->size(); i++) {
793                         if (m->control_pressed) { 
794                                 out.close();  
795                                 if (Groups.size() == 0) { //you don't want to use groups
796                                         newNamesOutput.close();
797                                 }else{
798                                         for (int j=0; j<Groups.size(); j++) {
799                                                 (*(filehandles[Groups[j]])).close();
800                                                 delete filehandles[Groups[j]];
801                                         }
802                                 }
803                                 return 0; 
804                         }
805                         
806                         string temp = processList->get(i);
807                         vector<string> namesInBin;
808                         m->splitAtComma(temp, namesInBin);
809                         
810                         if (Groups.size() == 0) {
811                                 nameRep = findRep(namesInBin);
812                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
813                         }else{
814                                 map<string, vector<string> > NamesInGroup;
815                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
816                                         NamesInGroup[Groups[j]].resize(0);
817                                 }
818                                 
819                                 for (int j=0; j<namesInBin.size(); j++) {
820                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
821                                         
822                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
823                                         
824                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
825                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
826                                         }
827                                 }
828                                 
829                                 //get rep for each group in otu
830                                 for (int j=0; j<Groups.size(); j++) {
831                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
832                                                 //get rep for each group
833                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
834                                                 
835                                                 //output group rep and other members of this group
836                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
837                                                 
838                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
839                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
840                                                 }
841                                                 //output last name
842                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
843                                         }
844                                 }
845                         }
846                 }
847                 
848                 if (Groups.size() == 0) { //you don't want to use groups
849                         newNamesOutput.close();
850                 }else{
851                         for (int i=0; i<Groups.size(); i++) {
852                                 (*(filehandles[Groups[i]])).close();
853                                 delete filehandles[Groups[i]];
854                         }
855                 }
856                 
857                 return 0;
858
859         }
860         catch(exception& e) {
861                 m->errorOut(e, "GetOTURepCommand", "process");
862                 exit(1);
863         }
864 }
865 //**********************************************************************************************************************
866 int GetOTURepCommand::processFastaNames(string filename, string label) {
867         try{
868
869                 //create output file
870                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
871                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
872                 m->openOutputFile(outputFileName, out);
873                 vector<repStruct> reps;
874                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
875                 
876                 ofstream out2;
877                 string tempNameFile = filename + ".temp";
878                 m->openOutputFile(tempNameFile, out2);
879                 
880                 ifstream in;
881                 m->openInputFile(filename, in);
882                 
883                 int i = 0;
884                 while (!in.eof()) {
885                         string rep, binnames;
886                         in >> i >> rep >> binnames; m->gobble(in);
887                         out2 << rep << '\t' << binnames << endl;
888                         
889                         vector<string> names;
890                         m->splitAtComma(binnames, names);
891                         int binsize = names.size();
892                         
893                         //if you have a groupfile
894                         string group = "";
895                         if (groupfile != "") {
896                                 map<string, string> groups;
897                                 map<string, string>::iterator groupIt;
898                                 
899                                 //find the groups that are in this bin
900                                 for (size_t i = 0; i < names.size(); i++) {
901                                         string groupName = groupMap->getGroup(names[i]);
902                                         if (groupName == "not found") {  
903                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
904                                                 groupError = true;
905                                         } else {
906                                                 groups[groupName] = groupName;
907                                         }
908                                 }
909                                 
910                                 //turn the groups into a string
911                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
912                                         group += groupIt->first + "-";
913                                 }
914                                 //rip off last dash
915                                 group = group.substr(0, group.length()-1);
916                         }else{ group = ""; }
917
918                         
919                         //print out name and sequence for that bin
920                         string sequence = fasta->getSequence(rep);
921
922                         if (sequence != "not found") {
923                                 if (sorted == "") { //print them out
924                                         rep = rep + "\t" + toString(i+1);
925                                         rep = rep + "|" + toString(binsize);
926                                         if (groupfile != "") {
927                                                 rep = rep + "|" + group;
928                                         }
929                                         out << ">" << rep << endl;
930                                         out << sequence << endl;
931                                 }else { //save them
932                                         repStruct newRep(rep, i+1, binsize, group);
933                                         reps.push_back(newRep);
934                                 }
935                         }else { 
936                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
937                         }
938                 }
939                 
940                         
941                 if (sorted != "") { //then sort them and print them
942                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
943                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
944                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
945                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
946                         
947                         //print them
948                         for (int i = 0; i < reps.size(); i++) {
949                                 string sequence = fasta->getSequence(reps[i].name);
950                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
951                                 outputName = outputName + "|" + toString(reps[i].size);
952                                 if (groupfile != "") {
953                                         outputName = outputName + "|" + reps[i].group;
954                                 }
955                                 out << ">" << outputName << endl;
956                                 out << sequence << endl;
957                         }
958                 }
959                 
960                 in.close();
961                 out.close();
962                 out2.close();
963                 
964                 m->mothurRemove(filename);
965                 rename(tempNameFile.c_str(), filename.c_str());
966                 
967                 return 0;
968
969         }
970         catch(exception& e) {
971                 m->errorOut(e, "GetOTURepCommand", "processFastaNames");
972                 exit(1);
973         }
974 }
975 //**********************************************************************************************************************
976 int GetOTURepCommand::processNames(string filename, string label) {
977         try{
978                 
979                 //create output file
980                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
981                 
982                 ofstream out2;
983                 string tempNameFile = filename + ".temp";
984                 m->openOutputFile(tempNameFile, out2);
985                 
986                 ifstream in;
987                 m->openInputFile(filename, in);
988                 
989                 int i = 0;
990                 string rep, binnames;
991                 while (!in.eof()) {
992                         if (m->control_pressed) { break; }
993                         in >> i >> rep >> binnames; m->gobble(in);
994                         out2 << rep << '\t' << binnames << endl;
995                 }
996                 in.close();
997                 out2.close();
998                 
999                 m->mothurRemove(filename);
1000                 rename(tempNameFile.c_str(), filename.c_str());
1001                 
1002                 return 0;
1003         }
1004         catch(exception& e) {
1005                 m->errorOut(e, "GetOTURepCommand", "processNames");
1006                 exit(1);
1007         }
1008 }
1009 //**********************************************************************************************************************
1010 SeqMap GetOTURepCommand::getMap(int row) {
1011         try {
1012                 SeqMap rowMap;
1013                 
1014                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
1015                 if (rowPositions[row] != -1){
1016                         //go to row in file
1017                         inRow.seekg(rowPositions[row]);
1018                         
1019                         int rowNum, numDists, colNum;
1020                         float dist;
1021                         
1022                         inRow >> rowNum >> numDists;
1023                         
1024                         for(int i = 0; i < numDists; i++) {
1025                                 inRow >> colNum >> dist;
1026                                 rowMap[colNum] = dist;
1027                                 
1028                         }
1029                 }
1030                 
1031                 return rowMap;
1032         }
1033         catch(exception& e) {
1034                 m->errorOut(e, "GetOTURepCommand", "getMap");
1035                 exit(1);
1036         }
1037 }
1038 //**********************************************************************************************************************
1039