]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added load.logfile command. changed summary.single output for subsample=t.
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){       
41         try {
42                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
44                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52                 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53                 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
57                 
58                 vector<string> myArray;
59                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
60                 return myArray;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "GetOTURepCommand", "setParameters");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){       
69         try {
70                 string helpString = "";
71                 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
72                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73                 helpString += "The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n";
74                 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75                 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76                 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77                 helpString += "The default value for label is all labels in your inputfile.\n";
78                 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
80                 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false.\n";
81                 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82                 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83                 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84                 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85                 helpString += "The group parameter allows you provide a group file.\n";
86                 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87                 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88                 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
90                 return helpString;
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "GetOTURepCommand", "getHelpString");
94                 exit(1);
95         }
96 }
97 //**********************************************************************************************************************
98 string GetOTURepCommand::getOutputFileNameTag(string type, string inputName=""){        
99         try {
100         string outputFileName = "";
101                 map<string, vector<string> >::iterator it;
102         
103         //is this a type this command creates
104         it = outputTypes.find(type);
105         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
106         else {
107             if (type == "fasta")            {   outputFileName =  "rep.fasta";   }
108             else if (type == "name")        {   outputFileName =  "rep.names";   }
109             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
110         }
111         return outputFileName;
112         }
113         catch(exception& e) {
114                 m->errorOut(e, "GetOTURepCommand", "getOutputFileNameTag");
115                 exit(1);
116         }
117 }
118 //**********************************************************************************************************************
119 GetOTURepCommand::GetOTURepCommand(){   
120         try {
121                 abort = true; calledHelp = true; 
122                 setParameters();
123                 vector<string> tempOutNames;
124                 outputTypes["fasta"] = tempOutNames;
125                 outputTypes["name"] = tempOutNames;
126         }
127         catch(exception& e) {
128                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
129                 exit(1);
130         }
131 }
132 //**********************************************************************************************************************
133 GetOTURepCommand::GetOTURepCommand(string option)  {
134         try{
135                 abort = false; calledHelp = false;   
136                 allLines = 1;
137                                 
138                 //allow user to run help
139                 if (option == "help") { 
140                         help(); abort = true; calledHelp = true;
141                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
142                 } else {
143                         vector<string> myArray = setParameters();
144                         
145                         OptionParser parser(option);
146                         map<string, string> parameters = parser.getParameters();
147                         
148                         ValidParameters validParameter;
149                         map<string, string>::iterator it;
150                 
151                         //check to make sure all parameters are valid for command
152                         for (it = parameters.begin(); it != parameters.end(); it++) { 
153                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
154                         }
155                         
156                         //initialize outputTypes
157                         vector<string> tempOutNames;
158                         outputTypes["fasta"] = tempOutNames;
159                         outputTypes["name"] = tempOutNames;
160                         
161                         //if the user changes the input directory command factory will send this info to us in the output parameter 
162                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
163                         if (inputDir == "not found"){   inputDir = "";          }
164                         else {
165                                 string path;
166                                 it = parameters.find("list");
167                                 //user has given a template file
168                                 if(it != parameters.end()){ 
169                                         path = m->hasPath(it->second);
170                                         //if the user has not given a path then, add inputdir. else leave path alone.
171                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
172                                 }
173                                 
174                                 it = parameters.find("fasta");
175                                 //user has given a template file
176                                 if(it != parameters.end()){ 
177                                         path = m->hasPath(it->second);
178                                         //if the user has not given a path then, add inputdir. else leave path alone.
179                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
180                                 }
181                                 
182                                 it = parameters.find("phylip");
183                                 //user has given a template file
184                                 if(it != parameters.end()){ 
185                                         path = m->hasPath(it->second);
186                                         //if the user has not given a path then, add inputdir. else leave path alone.
187                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
188                                 }
189                                 
190                                 it = parameters.find("column");
191                                 //user has given a template file
192                                 if(it != parameters.end()){ 
193                                         path = m->hasPath(it->second);
194                                         //if the user has not given a path then, add inputdir. else leave path alone.
195                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
196                                 }
197                                 
198                                 it = parameters.find("name");
199                                 //user has given a template file
200                                 if(it != parameters.end()){ 
201                                         path = m->hasPath(it->second);
202                                         //if the user has not given a path then, add inputdir. else leave path alone.
203                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
204                                 }
205                                 
206                                 it = parameters.find("group");
207                                 //user has given a template file
208                                 if(it != parameters.end()){ 
209                                         path = m->hasPath(it->second);
210                                         //if the user has not given a path then, add inputdir. else leave path alone.
211                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
212                                 }
213                         }
214
215                         
216                         //if the user changes the output directory command factory will send this info to us in the output parameter 
217                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
218                         
219                         //check for required parameters
220                         fastafile = validParameter.validFile(parameters, "fasta", true);
221                         if (fastafile == "not found") { fastafile = ""; }
222                         else if (fastafile == "not open") { abort = true; }     
223                         else { m->setFastaFile(fastafile); }
224                 
225                         listfile = validParameter.validFile(parameters, "list", true);
226                         if (listfile == "not found") {                  
227                                 listfile = m->getListFile(); 
228                                 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
229                                 else {  m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
230                         }
231                         else if (listfile == "not open") { abort = true; }      
232                         else { m->setListFile(listfile); }
233                         
234                         phylipfile = validParameter.validFile(parameters, "phylip", true);
235                         if (phylipfile == "not found") { phylipfile = "";  }
236                         else if (phylipfile == "not open") { abort = true; }    
237                         else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile);   }
238                         
239                         columnfile = validParameter.validFile(parameters, "column", true);
240                         if (columnfile == "not found") { columnfile = ""; }
241                         else if (columnfile == "not open") { abort = true; }    
242                         else { distFile = columnfile; format = "column";  m->setColumnFile(columnfile); }
243                         
244                         namefile = validParameter.validFile(parameters, "name", true);
245                         if (namefile == "not open") { abort = true; }   
246                         else if (namefile == "not found") { namefile = ""; }
247                         else { m->setNameFile(namefile); }
248                         
249                         if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
250                                 //give priority to column, then phylip
251                                 columnfile = m->getColumnFile(); 
252                                 if (columnfile != "") {  distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
253                                 else { 
254                                         phylipfile = m->getPhylipFile(); 
255                                         if (phylipfile != "") {  distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
256                                         else { 
257                                                 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine(); 
258                                                 abort = true;
259                                         }
260                                 }
261                         }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
262                 
263                         if (columnfile != "") {  
264                                 if (namefile == "") {  
265                                         namefile = m->getNameFile(); 
266                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
267                                         else { 
268                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
269                                                 abort = true; 
270                                         }       
271                                 } 
272                         }
273
274                         //check for optional parameter and set defaults
275                         // ...at some point should added some additional type checking...
276                         label = validParameter.validFile(parameters, "label", false);                   
277                         if (label == "not found") { label = ""; allLines = 1;  }
278                         else { 
279                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
280                                 else { allLines = 1;  }
281                         }
282                         
283                         groupfile = validParameter.validFile(parameters, "group", true);
284                         if (groupfile == "not open") { groupfile = ""; abort = true; }
285                         else if (groupfile == "not found") { groupfile = ""; }
286                         else { m->setGroupFile(groupfile); }
287                         
288                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
289                         if (sorted == "none") { sorted=""; }
290                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
291                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
292                                 sorted = "";
293                         }
294                         
295                         if ((sorted == "group") && (groupfile == "")) {
296                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
297                                 sorted = "";
298                         }
299                         
300                         groups = validParameter.validFile(parameters, "groups", false);                 
301                         if (groups == "not found") { groups = ""; }
302                         else { 
303                                 if (groupfile == "") {
304                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
305                                         abort = true;
306                                 }else { 
307                                         m->splitAtDash(groups, Groups);
308                                 }
309                         }
310                         m->setGroups(Groups);
311                         
312                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
313                         large = m->isTrue(temp);
314                         
315                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {       temp = "f";    }
316                         weighted = m->isTrue(temp);
317                         
318                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
319                         
320                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
321                         m->mothurConvert(temp, precision); 
322                         
323                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
324                         m->mothurConvert(temp, cutoff); 
325                         cutoff += (5 / (precision * 10.0));
326                 }
327         }
328         catch(exception& e) {
329                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
330                 exit(1);
331         }
332 }
333
334 //**********************************************************************************************************************
335
336 int GetOTURepCommand::execute(){
337         try {
338         
339                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
340                 int error;
341                 list = NULL;
342                 
343                 if (!large) {
344                         //read distance files
345                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
346                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
347                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
348                         
349                         readMatrix->setCutoff(cutoff);
350         
351                         if(namefile != ""){     
352                                 nameMap = new NameAssignment(namefile);
353                                 nameMap->readMap();
354                         }else{  nameMap = NULL;         }
355                         
356                         readMatrix->read(nameMap);
357                         
358                         if (m->control_pressed) { delete readMatrix; return 0; }
359
360                         list = readMatrix->getListVector();
361
362                         SparseMatrix* matrix = readMatrix->getMatrix();
363                         
364                         // Create a data structure to quickly access the distance information.
365                         // It consists of a vector of distance maps, where each map contains
366                         // all distances of a certain sequence. Vector and maps are accessed
367                         // via the index of a sequence in the distance matrix
368                         seqVec = vector<SeqMap>(list->size()); 
369                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
370                                 if (m->control_pressed) { delete readMatrix; return 0; }
371                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
372                         }
373                         //add dummy map for unweighted calc
374                         SeqMap dummy;
375                         seqVec.push_back(dummy);
376                         
377                         delete matrix;
378                         delete readMatrix;
379                         delete nameMap;
380                         
381                         if (m->control_pressed) { return 0; }
382                 }else {
383                         //process file and set up indexes
384                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
385                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
386                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
387                         
388                         formatMatrix->setCutoff(cutoff);
389         
390                         if(namefile != ""){     
391                                 nameMap = new NameAssignment(namefile);
392                                 nameMap->readMap();
393                         }else{  nameMap = NULL;         }
394                         
395                         formatMatrix->read(nameMap);
396                         
397                         if (m->control_pressed) { delete formatMatrix;  return 0; }
398
399                         list = formatMatrix->getListVector();
400                         
401                         distFile = formatMatrix->getFormattedFileName();
402                         
403                         //positions in file where the distances for each sequence begin
404                         //rowPositions[1] = position in file where distance related to sequence 1 start.
405                         rowPositions = formatMatrix->getRowPositions();
406                         rowPositions.push_back(-1); //dummy row for unweighted calc
407                         
408                         delete formatMatrix;
409                         delete nameMap;
410                         
411                         //openfile for getMap to use
412                         m->openInputFile(distFile, inRow);
413                         
414                         if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
415                 }
416                 
417                 
418                 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
419                 if (list != NULL) {
420                         vector<string> names;
421                         string binnames;
422                         //map names to rows in sparsematrix
423                         for (int i = 0; i < list->size(); i++) {
424                                 names.clear();
425                                 binnames = list->get(i);
426                                 
427                                 m->splitAtComma(binnames, names);
428                                 
429                                 for (int j = 0; j < names.size(); j++) {
430                                         nameToIndex[names[j]] = i;
431                                 }
432                         }
433                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
434                 
435                                 
436                 if (m->control_pressed) { 
437                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
438                         return 0; 
439                 }
440                 
441                 if (groupfile != "") {
442                         //read in group map info.
443                         groupMap = new GroupMap(groupfile);
444                         int error = groupMap->readMap();
445                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
446                         
447                         if (Groups.size() != 0) {
448                                 SharedUtil* util = new SharedUtil();
449                                 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
450                                 util->setGroups(Groups, gNamesOfGroups, "getoturep");
451                                 groupMap->setNamesOfGroups(gNamesOfGroups);
452                                 delete util;
453                         }
454                 }
455                 
456                 //done with listvector from matrix
457                 if (list != NULL) { delete list; }
458                 
459                 input = new InputData(listfile, "list");
460                 list = input->getListVector();
461                 string lastLabel = list->getLabel();
462
463                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
464                 set<string> processedLabels;
465                 set<string> userLabels = labels;
466                 
467                 if (m->control_pressed) { 
468                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
469                         delete input; delete list; return 0; 
470                 }
471                 
472                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
473                 
474                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
475                         
476                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
477                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
478                                         error = process(list);
479                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
480                                         
481                                         if (m->control_pressed) { 
482                                                 if (large) {  inRow.close(); m->mothurRemove(distFile);  }
483                                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
484                                                 delete input; delete list; return 0; 
485                                         }
486                                         
487                                         processedLabels.insert(list->getLabel());
488                                         userLabels.erase(list->getLabel());
489                         }
490                         
491                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
492                                         string saveLabel = list->getLabel();
493                                         
494                                         delete list;
495                                         list = input->getListVector(lastLabel);
496                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
497                                         error = process(list);
498                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
499                                         
500                                         if (m->control_pressed) { 
501                                                 if (large) {  inRow.close(); m->mothurRemove(distFile);  }
502                                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
503                                                 delete input; delete list; return 0; 
504                                         }
505                                         
506                                         processedLabels.insert(list->getLabel());
507                                         userLabels.erase(list->getLabel());
508                                         
509                                         //restore real lastlabel to save below
510                                         list->setLabel(saveLabel);
511                         }
512                         
513                         lastLabel = list->getLabel();
514         
515                         delete list;
516                         list = input->getListVector();
517                 }
518                 
519                 //output error messages about any remaining user labels
520                 bool needToRun = false;
521                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
522                         m->mothurOut("Your file does not include the label " + (*it)); 
523                         if (processedLabels.count(lastLabel) != 1) {
524                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
525                                 needToRun = true;
526                         }else {
527                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
528                         }
529                 }
530                 
531                 //run last label if you need to
532                 if (needToRun == true)  {
533                         if (list != NULL) {     delete list;    }
534                         list = input->getListVector(lastLabel);
535                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
536                         error = process(list);
537                         delete list;
538                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
539                         
540                         if (m->control_pressed) { 
541                                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
542                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
543                                         delete input; delete list; return 0; 
544                         }
545                 }
546                 
547                 //close and remove formatted matrix file
548                 if (large) {
549                         inRow.close();
550                         m->mothurRemove(distFile);
551                 }
552                 
553                 delete input;  
554                 
555                 if (!weighted) { nameFileMap.clear(); }
556                 
557                                 
558                 if (fastafile != "") {
559                         //read fastafile
560                         fasta = new FastaMap();
561                         fasta->readFastaFile(fastafile);
562                         
563                         //if user gave a namesfile then use it
564                         if (namefile != "") {   readNamesFile();        }
565                         
566                         //output create and output the .rep.fasta files
567                         map<string, string>::iterator itNameFile;
568                         for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
569                                 processFastaNames(itNameFile->first, itNameFile->second);
570                         }
571                 }else {
572                         //output create and output the .rep.fasta files
573                         map<string, string>::iterator itNameFile;
574                         for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
575                                 processNames(itNameFile->first, itNameFile->second);
576                         }
577                 }
578                 
579                                 
580                 if (groupfile != "") { delete groupMap; }
581                 
582                 if (m->control_pressed) {  return 0; }
583                 
584                 //set fasta file as new current fastafile - use first one??
585                 string current = "";
586                 itTypes = outputTypes.find("fasta");
587                 if (itTypes != outputTypes.end()) {
588                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
589                 }
590                 
591                 itTypes = outputTypes.find("name");
592                 if (itTypes != outputTypes.end()) {
593                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
594                 }
595                 
596                 m->mothurOutEndLine();
597                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
598                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
599                 m->mothurOutEndLine();
600                 
601                 return 0;
602         }
603         catch(exception& e) {
604                 m->errorOut(e, "GetOTURepCommand", "execute");
605                 exit(1);
606         }
607 }
608
609 //**********************************************************************************************************************
610 void GetOTURepCommand::readNamesFile() {
611         try {
612                 ifstream in;
613                 vector<string> dupNames;
614                 m->openInputFile(namefile, in);
615                 
616                 string name, names, sequence;
617         
618                 while(!in.eof()){
619                         in >> name;                     //read from first column  A
620                         in >> names;            //read from second column  A,B,C,D
621                         
622                         dupNames.clear();
623                         
624                         //parse names into vector
625                         m->splitAtComma(names, dupNames);
626                         
627                         //store names in fasta map
628                         sequence = fasta->getSequence(name);
629                         for (int i = 0; i < dupNames.size(); i++) {
630                                 fasta->push_back(dupNames[i], sequence);
631                         }
632                 
633                         m->gobble(in);
634                 }
635                 in.close();
636
637         }
638         catch(exception& e) {
639                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
640                 exit(1);
641         }
642 }
643 //**********************************************************************************************************************
644 //read names file to find the weighted rep for each bin
645 void GetOTURepCommand::readNamesFile(bool w) {
646         try {
647                 ifstream in;
648                 vector<string> dupNames;
649                 m->openInputFile(namefile, in);
650                 
651                 string name, names, sequence;
652                 
653                 while(!in.eof()){
654                         in >> name;     m->gobble(in);          //read from first column  A
655                         in >> names;                                                    //read from second column  A,B,C,D
656                         
657                         dupNames.clear();
658                         
659                         //parse names into vector
660                         m->splitAtComma(names, dupNames);
661                         
662                         for (int i = 0; i < dupNames.size(); i++) {
663                                 nameFileMap[dupNames[i]] = name;
664                         }
665                         
666                         m->gobble(in);
667                 }
668                 in.close();
669                 
670         }
671         catch(exception& e) {
672                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
673                 exit(1);
674         }
675 }
676 //**********************************************************************************************************************
677 string GetOTURepCommand::findRep(vector<string> names) {
678         try{
679                 // if only 1 sequence in bin or processing the "unique" label, then 
680                 // the first sequence of the OTU is the representative one
681                 if ((names.size() == 1)) {
682                         return names[0];
683                 }else{
684                         vector<int> seqIndex(names.size());
685                         vector<float> max_dist(names.size());
686                         vector<float> total_dist(names.size());
687                         map<string, string>::iterator itNameFile;
688                         map<string, int>::iterator itNameIndex;
689
690                         //fill seqIndex and initialize sums
691                         for (size_t i = 0; i < names.size(); i++) {
692                                 if (weighted) {
693                                         seqIndex[i] = nameToIndex[names[i]];
694                                 }else { 
695                                         if (namefile == "") {
696                                                 itNameIndex = nameToIndex.find(names[i]);
697                                                 
698                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
699                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
700                                                         else {  seqIndex[i] = (seqVec.size()-1); }
701                                                 }else {
702                                                         seqIndex[i] = itNameIndex->second;
703                                                 }
704                                                 
705                                         }else {
706                                                 itNameFile = nameFileMap.find(names[i]);
707                                                 
708                                                 if (itNameFile == nameFileMap.end()) {
709                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
710                                                 }else{
711                                                         string name1 = itNameFile->first;
712                                                         string name2 = itNameFile->second;
713                                                         
714                                                         if (name1 == name2) { //then you are unique so add your real dists
715                                                                 seqIndex[i] = nameToIndex[names[i]];
716                                                         }else { //add dummy
717                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
718                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
719                                                         }
720                                                 }
721                                         }
722                                 }
723                                 max_dist[i] = 0.0;
724                                 total_dist[i] = 0.0;
725                         }
726                         
727                         // loop through all entries in seqIndex
728                         SeqMap::iterator it;
729                         SeqMap currMap;
730                         for (size_t i=0; i < seqIndex.size(); i++) {
731                                 if (m->control_pressed) {  return  "control"; }
732                         
733                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
734                                 else            {       currMap = getMap(seqIndex[i]);  }
735                                 
736                                 for (size_t j=0; j < seqIndex.size(); j++) {
737                                         it = currMap.find(seqIndex[j]);         
738                                         if (it != currMap.end()) {
739                                                 max_dist[i] = max(max_dist[i], it->second);
740                                                 max_dist[j] = max(max_dist[j], it->second);
741                                                 total_dist[i] += it->second;
742                                                 total_dist[j] += it->second;
743                                         }else{ //if you can't find the distance make it the cutoff
744                                                 max_dist[i] = max(max_dist[i], cutoff);
745                                                 max_dist[j] = max(max_dist[j], cutoff);
746                                                 total_dist[i] += cutoff;
747                                                 total_dist[j] += cutoff;
748                                         }
749                                 }
750                         }
751                         
752                         // sequence with the smallest maximum distance is the representative
753                         //if tie occurs pick sequence with smallest average distance
754                         float min = 10000;
755                         int minIndex;
756                         for (size_t i=0; i < max_dist.size(); i++) {
757                                 if (m->control_pressed) {  return  "control"; }
758                                 if (max_dist[i] < min) {
759                                         min = max_dist[i];
760                                         minIndex = i;
761                                 }else if (max_dist[i] == min) {
762                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
763                                         float newAverage = total_dist[i] / (float) total_dist.size();
764                                         
765                                         if (newAverage < currentAverage) {
766                                                 min = max_dist[i];
767                                                 minIndex = i;
768                                         }
769                                 }
770                         }
771                         
772                         return(names[minIndex]);
773                 }
774         }
775         catch(exception& e) {
776                 m->errorOut(e, "GetOTURepCommand", "FindRep");
777                 exit(1);
778         }
779 }
780
781 //**********************************************************************************************************************
782 int GetOTURepCommand::process(ListVector* processList) {
783         try{
784                 string name, sequence;
785                 string nameRep;
786
787                 //create output file
788                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
789                                 
790                 ofstream newNamesOutput;
791                 string outputNamesFile;
792                 map<string, ofstream*> filehandles;
793                 
794                 if (Groups.size() == 0) { //you don't want to use groups
795                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + getOutputFileNameTag("name");
796                         m->openOutputFile(outputNamesFile, newNamesOutput);
797                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
798                         outputNameFiles[outputNamesFile] = processList->getLabel();
799                 }else{ //you want to use groups
800                         ofstream* temp;
801                         for (int i=0; i<Groups.size(); i++) {
802                                 temp = new ofstream;
803                                 filehandles[Groups[i]] = temp;
804                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + "." + getOutputFileNameTag("name");
805                                 
806                                 m->openOutputFile(outputNamesFile, *(temp));
807                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
808                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
809                         }
810                 }
811                 
812                 //for each bin in the list vector
813                 for (int i = 0; i < processList->size(); i++) {
814                         if (m->control_pressed) { 
815                                 out.close();  
816                                 if (Groups.size() == 0) { //you don't want to use groups
817                                         newNamesOutput.close();
818                                 }else{
819                                         for (int j=0; j<Groups.size(); j++) {
820                                                 (*(filehandles[Groups[j]])).close();
821                                                 delete filehandles[Groups[j]];
822                                         }
823                                 }
824                                 return 0; 
825                         }
826                         
827                         string temp = processList->get(i);
828                         vector<string> namesInBin;
829                         m->splitAtComma(temp, namesInBin);
830                         
831                         if (Groups.size() == 0) {
832                                 nameRep = findRep(namesInBin);
833                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
834                         }else{
835                                 map<string, vector<string> > NamesInGroup;
836                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
837                                         NamesInGroup[Groups[j]].resize(0);
838                                 }
839                                 
840                                 for (int j=0; j<namesInBin.size(); j++) {
841                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
842                                         
843                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
844                                         
845                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
846                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
847                                         }
848                                 }
849                                 
850                                 //get rep for each group in otu
851                                 for (int j=0; j<Groups.size(); j++) {
852                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
853                                                 //get rep for each group
854                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
855                                                 
856                                                 //output group rep and other members of this group
857                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
858                                                 
859                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
860                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
861                                                 }
862                                                 //output last name
863                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
864                                         }
865                                 }
866                         }
867                 }
868                 
869                 if (Groups.size() == 0) { //you don't want to use groups
870                         newNamesOutput.close();
871                 }else{
872                         for (int i=0; i<Groups.size(); i++) {
873                                 (*(filehandles[Groups[i]])).close();
874                                 delete filehandles[Groups[i]];
875                         }
876                 }
877                 
878                 return 0;
879
880         }
881         catch(exception& e) {
882                 m->errorOut(e, "GetOTURepCommand", "process");
883                 exit(1);
884         }
885 }
886 //**********************************************************************************************************************
887 int GetOTURepCommand::processFastaNames(string filename, string label) {
888         try{
889
890                 //create output file
891                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
892                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + "." + getOutputFileNameTag("fasta");
893                 m->openOutputFile(outputFileName, out);
894                 vector<repStruct> reps;
895                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
896                 
897                 ofstream out2;
898                 string tempNameFile = filename + ".temp";
899                 m->openOutputFile(tempNameFile, out2);
900                 
901                 ifstream in;
902                 m->openInputFile(filename, in);
903                 
904                 int i = 0;
905                 while (!in.eof()) {
906                         string rep, binnames;
907                         in >> i >> rep >> binnames; m->gobble(in);
908                         out2 << rep << '\t' << binnames << endl;
909                         
910                         vector<string> names;
911                         m->splitAtComma(binnames, names);
912                         int binsize = names.size();
913                         
914                         //if you have a groupfile
915                         string group = "";
916                         if (groupfile != "") {
917                                 map<string, string> groups;
918                                 map<string, string>::iterator groupIt;
919                                 
920                                 //find the groups that are in this bin
921                                 for (size_t i = 0; i < names.size(); i++) {
922                                         string groupName = groupMap->getGroup(names[i]);
923                                         if (groupName == "not found") {  
924                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
925                                                 groupError = true;
926                                         } else {
927                                                 groups[groupName] = groupName;
928                                         }
929                                 }
930                                 
931                                 //turn the groups into a string
932                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
933                                         group += groupIt->first + "-";
934                                 }
935                                 //rip off last dash
936                                 group = group.substr(0, group.length()-1);
937                         }else{ group = ""; }
938
939                         
940                         //print out name and sequence for that bin
941                         string sequence = fasta->getSequence(rep);
942
943                         if (sequence != "not found") {
944                                 if (sorted == "") { //print them out
945                                         rep = rep + "\t" + toString(i+1);
946                                         rep = rep + "|" + toString(binsize);
947                                         if (groupfile != "") {
948                                                 rep = rep + "|" + group;
949                                         }
950                                         out << ">" << rep << endl;
951                                         out << sequence << endl;
952                                 }else { //save them
953                                         repStruct newRep(rep, i+1, binsize, group);
954                                         reps.push_back(newRep);
955                                 }
956                         }else { 
957                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
958                         }
959                 }
960                 
961                         
962                 if (sorted != "") { //then sort them and print them
963                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
964                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
965                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
966                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
967                         
968                         //print them
969                         for (int i = 0; i < reps.size(); i++) {
970                                 string sequence = fasta->getSequence(reps[i].name);
971                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
972                                 outputName = outputName + "|" + toString(reps[i].size);
973                                 if (groupfile != "") {
974                                         outputName = outputName + "|" + reps[i].group;
975                                 }
976                                 out << ">" << outputName << endl;
977                                 out << sequence << endl;
978                         }
979                 }
980                 
981                 in.close();
982                 out.close();
983                 out2.close();
984                 
985                 m->mothurRemove(filename);
986                 rename(tempNameFile.c_str(), filename.c_str());
987                 
988                 return 0;
989
990         }
991         catch(exception& e) {
992                 m->errorOut(e, "GetOTURepCommand", "processFastaNames");
993                 exit(1);
994         }
995 }
996 //**********************************************************************************************************************
997 int GetOTURepCommand::processNames(string filename, string label) {
998         try{
999                 
1000                 //create output file
1001                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
1002                 
1003                 ofstream out2;
1004                 string tempNameFile = filename + ".temp";
1005                 m->openOutputFile(tempNameFile, out2);
1006                 
1007                 ifstream in;
1008                 m->openInputFile(filename, in);
1009                 
1010                 int i = 0;
1011                 string rep, binnames;
1012                 while (!in.eof()) {
1013                         if (m->control_pressed) { break; }
1014                         in >> i >> rep >> binnames; m->gobble(in);
1015                         out2 << rep << '\t' << binnames << endl;
1016                 }
1017                 in.close();
1018                 out2.close();
1019                 
1020                 m->mothurRemove(filename);
1021                 rename(tempNameFile.c_str(), filename.c_str());
1022                 
1023                 return 0;
1024         }
1025         catch(exception& e) {
1026                 m->errorOut(e, "GetOTURepCommand", "processNames");
1027                 exit(1);
1028         }
1029 }
1030 //**********************************************************************************************************************
1031 SeqMap GetOTURepCommand::getMap(int row) {
1032         try {
1033                 SeqMap rowMap;
1034                 
1035                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
1036                 if (rowPositions[row] != -1){
1037                         //go to row in file
1038                         inRow.seekg(rowPositions[row]);
1039                         
1040                         int rowNum, numDists, colNum;
1041                         float dist;
1042                         
1043                         inRow >> rowNum >> numDists;
1044                         
1045                         for(int i = 0; i < numDists; i++) {
1046                                 inRow >> colNum >> dist;
1047                                 rowMap[colNum] = dist;
1048                                 
1049                         }
1050                 }
1051                 
1052                 return rowMap;
1053         }
1054         catch(exception& e) {
1055                 m->errorOut(e, "GetOTURepCommand", "getMap");
1056                 exit(1);
1057         }
1058 }
1059 //**********************************************************************************************************************
1060