]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
1.22.0
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){       
41         try {
42                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
44                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52                 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53                 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
57                 
58                 vector<string> myArray;
59                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
60                 return myArray;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "GetOTURepCommand", "setParameters");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){       
69         try {
70                 string helpString = "";
71                 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name, unless you have valid current files.\n";
72                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73                 helpString += "The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n";
74                 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75                 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76                 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77                 helpString += "The default value for label is all labels in your inputfile.\n";
78                 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
80                 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false.\n";
81                 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82                 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83                 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84                 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85                 helpString += "The group parameter allows you provide a group file.\n";
86                 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87                 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88                 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
90                 return helpString;
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "GetOTURepCommand", "getHelpString");
94                 exit(1);
95         }
96 }
97 //**********************************************************************************************************************
98 GetOTURepCommand::GetOTURepCommand(){   
99         try {
100                 abort = true; calledHelp = true; 
101                 setParameters();
102                 vector<string> tempOutNames;
103                 outputTypes["fasta"] = tempOutNames;
104                 outputTypes["name"] = tempOutNames;
105         }
106         catch(exception& e) {
107                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
108                 exit(1);
109         }
110 }
111 //**********************************************************************************************************************
112 GetOTURepCommand::GetOTURepCommand(string option)  {
113         try{
114                 abort = false; calledHelp = false;   
115                 allLines = 1;
116                                 
117                 //allow user to run help
118                 if (option == "help") { 
119                         help(); abort = true; calledHelp = true;
120                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
121                 } else {
122                         vector<string> myArray = setParameters();
123                         
124                         OptionParser parser(option);
125                         map<string, string> parameters = parser.getParameters();
126                         
127                         ValidParameters validParameter;
128                         map<string, string>::iterator it;
129                 
130                         //check to make sure all parameters are valid for command
131                         for (it = parameters.begin(); it != parameters.end(); it++) { 
132                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
133                         }
134                         
135                         //initialize outputTypes
136                         vector<string> tempOutNames;
137                         outputTypes["fasta"] = tempOutNames;
138                         outputTypes["name"] = tempOutNames;
139                         
140                         //if the user changes the input directory command factory will send this info to us in the output parameter 
141                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
142                         if (inputDir == "not found"){   inputDir = "";          }
143                         else {
144                                 string path;
145                                 it = parameters.find("list");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
151                                 }
152                                 
153                                 it = parameters.find("fasta");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
159                                 }
160                                 
161                                 it = parameters.find("phylip");
162                                 //user has given a template file
163                                 if(it != parameters.end()){ 
164                                         path = m->hasPath(it->second);
165                                         //if the user has not given a path then, add inputdir. else leave path alone.
166                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
167                                 }
168                                 
169                                 it = parameters.find("column");
170                                 //user has given a template file
171                                 if(it != parameters.end()){ 
172                                         path = m->hasPath(it->second);
173                                         //if the user has not given a path then, add inputdir. else leave path alone.
174                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
175                                 }
176                                 
177                                 it = parameters.find("name");
178                                 //user has given a template file
179                                 if(it != parameters.end()){ 
180                                         path = m->hasPath(it->second);
181                                         //if the user has not given a path then, add inputdir. else leave path alone.
182                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
183                                 }
184                                 
185                                 it = parameters.find("group");
186                                 //user has given a template file
187                                 if(it != parameters.end()){ 
188                                         path = m->hasPath(it->second);
189                                         //if the user has not given a path then, add inputdir. else leave path alone.
190                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
191                                 }
192                         }
193
194                         
195                         //if the user changes the output directory command factory will send this info to us in the output parameter 
196                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
197                         
198                         //check for required parameters
199                         fastafile = validParameter.validFile(parameters, "fasta", true);
200                         if (fastafile == "not found") {                                 
201                                 fastafile = m->getFastaFile(); 
202                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
203                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
204                         }
205                         else if (fastafile == "not open") { abort = true; }     
206                         else { m->setFastaFile(fastafile); }
207                 
208                         listfile = validParameter.validFile(parameters, "list", true);
209                         if (listfile == "not found") {                  
210                                 listfile = m->getListFile(); 
211                                 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
212                                 else {  m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
213                         }
214                         else if (listfile == "not open") { abort = true; }      
215                         else { m->setListFile(listfile); }
216                         
217                         phylipfile = validParameter.validFile(parameters, "phylip", true);
218                         if (phylipfile == "not found") { phylipfile = "";  }
219                         else if (phylipfile == "not open") { abort = true; }    
220                         else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile);   }
221                         
222                         columnfile = validParameter.validFile(parameters, "column", true);
223                         if (columnfile == "not found") { columnfile = ""; }
224                         else if (columnfile == "not open") { abort = true; }    
225                         else { distFile = columnfile; format = "column";  m->setColumnFile(columnfile); }
226                         
227                         namefile = validParameter.validFile(parameters, "name", true);
228                         if (namefile == "not open") { abort = true; }   
229                         else if (namefile == "not found") { namefile = ""; }
230                         else { m->setNameFile(namefile); }
231                         
232                         if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
233                                 //give priority to column, then phylip
234                                 columnfile = m->getColumnFile(); 
235                                 if (columnfile != "") {  distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
236                                 else { 
237                                         phylipfile = m->getPhylipFile(); 
238                                         if (phylipfile != "") {  distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
239                                         else { 
240                                                 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine(); 
241                                                 abort = true;
242                                         }
243                                 }
244                         }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
245                 
246                         if (columnfile != "") {  
247                                 if (namefile == "") {  
248                                         namefile = m->getNameFile(); 
249                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
250                                         else { 
251                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
252                                                 abort = true; 
253                                         }       
254                                 } 
255                         }
256
257                         //check for optional parameter and set defaults
258                         // ...at some point should added some additional type checking...
259                         label = validParameter.validFile(parameters, "label", false);                   
260                         if (label == "not found") { label = ""; allLines = 1;  }
261                         else { 
262                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
263                                 else { allLines = 1;  }
264                         }
265                         
266                         groupfile = validParameter.validFile(parameters, "group", true);
267                         if (groupfile == "not open") { groupfile = ""; abort = true; }
268                         else if (groupfile == "not found") { groupfile = ""; }
269                         else { m->setGroupFile(groupfile); }
270                         
271                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
272                         if (sorted == "none") { sorted=""; }
273                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
274                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
275                                 sorted = "";
276                         }
277                         
278                         if ((sorted == "group") && (groupfile == "")) {
279                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
280                                 sorted = "";
281                         }
282                         
283                         groups = validParameter.validFile(parameters, "groups", false);                 
284                         if (groups == "not found") { groups = ""; }
285                         else { 
286                                 if (groupfile == "") {
287                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
288                                         abort = true;
289                                 }else { 
290                                         m->splitAtDash(groups, Groups);
291                                 }
292                         }
293                         m->setGroups(Groups);
294                         
295                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
296                         large = m->isTrue(temp);
297                         
298                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {       temp = "f";    }
299                         weighted = m->isTrue(temp);
300                         
301                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
302                         
303                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
304                         convert(temp, precision); 
305                         
306                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
307                         convert(temp, cutoff); 
308                         cutoff += (5 / (precision * 10.0));
309                 }
310         }
311         catch(exception& e) {
312                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
313                 exit(1);
314         }
315 }
316
317 //**********************************************************************************************************************
318
319 int GetOTURepCommand::execute(){
320         try {
321         
322                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
323                 int error;
324                 list = NULL;
325                 
326                 if (!large) {
327                         //read distance files
328                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
329                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
330                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
331                         
332                         readMatrix->setCutoff(cutoff);
333         
334                         if(namefile != ""){     
335                                 nameMap = new NameAssignment(namefile);
336                                 nameMap->readMap();
337                         }else{  nameMap = NULL;         }
338                         
339                         readMatrix->read(nameMap);
340                         
341                         if (m->control_pressed) { delete readMatrix; return 0; }
342
343                         list = readMatrix->getListVector();
344
345                         SparseMatrix* matrix = readMatrix->getMatrix();
346                         
347                         // Create a data structure to quickly access the distance information.
348                         // It consists of a vector of distance maps, where each map contains
349                         // all distances of a certain sequence. Vector and maps are accessed
350                         // via the index of a sequence in the distance matrix
351                         seqVec = vector<SeqMap>(list->size()); 
352                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
353                                 if (m->control_pressed) { delete readMatrix; return 0; }
354                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
355                         }
356                         //add dummy map for unweighted calc
357                         SeqMap dummy;
358                         seqVec.push_back(dummy);
359                         
360                         delete matrix;
361                         delete readMatrix;
362                         delete nameMap;
363                         
364                         if (m->control_pressed) { return 0; }
365                 }else {
366                         //process file and set up indexes
367                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
368                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
369                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
370                         
371                         formatMatrix->setCutoff(cutoff);
372         
373                         if(namefile != ""){     
374                                 nameMap = new NameAssignment(namefile);
375                                 nameMap->readMap();
376                         }else{  nameMap = NULL;         }
377                         
378                         formatMatrix->read(nameMap);
379                         
380                         if (m->control_pressed) { delete formatMatrix;  return 0; }
381
382                         list = formatMatrix->getListVector();
383                         
384                         distFile = formatMatrix->getFormattedFileName();
385                         
386                         //positions in file where the distances for each sequence begin
387                         //rowPositions[1] = position in file where distance related to sequence 1 start.
388                         rowPositions = formatMatrix->getRowPositions();
389                         rowPositions.push_back(-1); //dummy row for unweighted calc
390                         
391                         delete formatMatrix;
392                         delete nameMap;
393                         
394                         //openfile for getMap to use
395                         m->openInputFile(distFile, inRow);
396                         
397                         if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
398                 }
399                 
400                 
401                 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
402                 if (list != NULL) {
403                         vector<string> names;
404                         string binnames;
405                         //map names to rows in sparsematrix
406                         for (int i = 0; i < list->size(); i++) {
407                                 names.clear();
408                                 binnames = list->get(i);
409                                 
410                                 m->splitAtComma(binnames, names);
411                                 
412                                 for (int j = 0; j < names.size(); j++) {
413                                         nameToIndex[names[j]] = i;
414                                 }
415                         }
416                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
417                 
418                                 
419                 if (m->control_pressed) { 
420                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
421                         return 0; 
422                 }
423                 
424                 if (groupfile != "") {
425                         //read in group map info.
426                         groupMap = new GroupMap(groupfile);
427                         int error = groupMap->readMap();
428                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
429                         
430                         if (Groups.size() != 0) {
431                                 SharedUtil* util = new SharedUtil();
432                                 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
433                                 util->setGroups(Groups, gNamesOfGroups, "getoturep");
434                                 groupMap->setNamesOfGroups(gNamesOfGroups);
435                                 delete util;
436                         }
437                 }
438                 
439                 //done with listvector from matrix
440                 if (list != NULL) { delete list; }
441                 
442                 input = new InputData(listfile, "list");
443                 list = input->getListVector();
444                 string lastLabel = list->getLabel();
445
446                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
447                 set<string> processedLabels;
448                 set<string> userLabels = labels;
449                 
450                 if (m->control_pressed) { 
451                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
452                         delete input; delete list; return 0; 
453                 }
454                 
455                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
456                 
457                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
458                         
459                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
460                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
461                                         error = process(list);
462                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
463                                         
464                                         if (m->control_pressed) { 
465                                                 if (large) {  inRow.close(); m->mothurRemove(distFile);  }
466                                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
467                                                 delete input; delete list; return 0; 
468                                         }
469                                         
470                                         processedLabels.insert(list->getLabel());
471                                         userLabels.erase(list->getLabel());
472                         }
473                         
474                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
475                                         string saveLabel = list->getLabel();
476                                         
477                                         delete list;
478                                         list = input->getListVector(lastLabel);
479                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
480                                         error = process(list);
481                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
482                                         
483                                         if (m->control_pressed) { 
484                                                 if (large) {  inRow.close(); m->mothurRemove(distFile);  }
485                                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
486                                                 delete input; delete list; return 0; 
487                                         }
488                                         
489                                         processedLabels.insert(list->getLabel());
490                                         userLabels.erase(list->getLabel());
491                                         
492                                         //restore real lastlabel to save below
493                                         list->setLabel(saveLabel);
494                         }
495                         
496                         lastLabel = list->getLabel();
497         
498                         delete list;
499                         list = input->getListVector();
500                 }
501                 
502                 //output error messages about any remaining user labels
503                 bool needToRun = false;
504                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
505                         m->mothurOut("Your file does not include the label " + (*it)); 
506                         if (processedLabels.count(lastLabel) != 1) {
507                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
508                                 needToRun = true;
509                         }else {
510                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
511                         }
512                 }
513                 
514                 //run last label if you need to
515                 if (needToRun == true)  {
516                         if (list != NULL) {     delete list;    }
517                         list = input->getListVector(lastLabel);
518                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
519                         error = process(list);
520                         delete list;
521                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
522                         
523                         if (m->control_pressed) { 
524                                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
525                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
526                                         delete input; delete list; return 0; 
527                         }
528                 }
529                 
530                 //close and remove formatted matrix file
531                 if (large) {
532                         inRow.close();
533                         m->mothurRemove(distFile);
534                 }
535                 
536                 delete input;  
537                 
538                 if (!weighted) { nameFileMap.clear(); }
539                 
540                 //read fastafile
541                 fasta = new FastaMap();
542                 fasta->readFastaFile(fastafile);
543                 
544                 //if user gave a namesfile then use it
545                 if (namefile != "") {   readNamesFile();        }
546                 
547                 //output create and output the .rep.fasta files
548                 map<string, string>::iterator itNameFile;
549                 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
550                         processNames(itNameFile->first, itNameFile->second);
551                 }
552                 
553                 delete fasta;
554                 if (groupfile != "") { delete groupMap;  }
555                 
556                 if (m->control_pressed) {  return 0; }
557                 
558                 //set fasta file as new current fastafile - use first one??
559                 string current = "";
560                 itTypes = outputTypes.find("fasta");
561                 if (itTypes != outputTypes.end()) {
562                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
563                 }
564                 
565                 itTypes = outputTypes.find("name");
566                 if (itTypes != outputTypes.end()) {
567                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
568                 }
569                 
570                 m->mothurOutEndLine();
571                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
572                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
573                 m->mothurOutEndLine();
574                 
575                 return 0;
576         }
577         catch(exception& e) {
578                 m->errorOut(e, "GetOTURepCommand", "execute");
579                 exit(1);
580         }
581 }
582
583 //**********************************************************************************************************************
584 void GetOTURepCommand::readNamesFile() {
585         try {
586                 ifstream in;
587                 vector<string> dupNames;
588                 m->openInputFile(namefile, in);
589                 
590                 string name, names, sequence;
591         
592                 while(!in.eof()){
593                         in >> name;                     //read from first column  A
594                         in >> names;            //read from second column  A,B,C,D
595                         
596                         dupNames.clear();
597                         
598                         //parse names into vector
599                         m->splitAtComma(names, dupNames);
600                         
601                         //store names in fasta map
602                         sequence = fasta->getSequence(name);
603                         for (int i = 0; i < dupNames.size(); i++) {
604                                 fasta->push_back(dupNames[i], sequence);
605                         }
606                 
607                         m->gobble(in);
608                 }
609                 in.close();
610
611         }
612         catch(exception& e) {
613                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
614                 exit(1);
615         }
616 }
617 //**********************************************************************************************************************
618 //read names file to find the weighted rep for each bin
619 void GetOTURepCommand::readNamesFile(bool w) {
620         try {
621                 ifstream in;
622                 vector<string> dupNames;
623                 m->openInputFile(namefile, in);
624                 
625                 string name, names, sequence;
626                 
627                 while(!in.eof()){
628                         in >> name;     m->gobble(in);          //read from first column  A
629                         in >> names;                                                    //read from second column  A,B,C,D
630                         
631                         dupNames.clear();
632                         
633                         //parse names into vector
634                         m->splitAtComma(names, dupNames);
635                         
636                         for (int i = 0; i < dupNames.size(); i++) {
637                                 nameFileMap[dupNames[i]] = name;
638                         }
639                         
640                         m->gobble(in);
641                 }
642                 in.close();
643                 
644         }
645         catch(exception& e) {
646                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
647                 exit(1);
648         }
649 }
650 //**********************************************************************************************************************
651 string GetOTURepCommand::findRep(vector<string> names) {
652         try{
653                 // if only 1 sequence in bin or processing the "unique" label, then 
654                 // the first sequence of the OTU is the representative one
655                 if ((names.size() == 1)) {
656                         return names[0];
657                 }else{
658                         vector<int> seqIndex(names.size());
659                         vector<float> max_dist(names.size());
660                         vector<float> total_dist(names.size());
661                         map<string, string>::iterator itNameFile;
662                         map<string, int>::iterator itNameIndex;
663
664                         //fill seqIndex and initialize sums
665                         for (size_t i = 0; i < names.size(); i++) {
666                                 if (weighted) {
667                                         seqIndex[i] = nameToIndex[names[i]];
668                                 }else { 
669                                         if (namefile == "") {
670                                                 itNameIndex = nameToIndex.find(names[i]);
671                                                 
672                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
673                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
674                                                         else {  seqIndex[i] = (seqVec.size()-1); }
675                                                 }else {
676                                                         seqIndex[i] = itNameIndex->second;
677                                                 }
678                                                 
679                                         }else {
680                                                 itNameFile = nameFileMap.find(names[i]);
681                                                 
682                                                 if (itNameFile == nameFileMap.end()) {
683                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
684                                                 }else{
685                                                         string name1 = itNameFile->first;
686                                                         string name2 = itNameFile->second;
687                                                         
688                                                         if (name1 == name2) { //then you are unique so add your real dists
689                                                                 seqIndex[i] = nameToIndex[names[i]];
690                                                         }else { //add dummy
691                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
692                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
693                                                         }
694                                                 }
695                                         }
696                                 }
697                                 max_dist[i] = 0.0;
698                                 total_dist[i] = 0.0;
699                         }
700                         
701                         // loop through all entries in seqIndex
702                         SeqMap::iterator it;
703                         SeqMap currMap;
704                         for (size_t i=0; i < seqIndex.size(); i++) {
705                                 if (m->control_pressed) {  return  "control"; }
706                         
707                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
708                                 else            {       currMap = getMap(seqIndex[i]);  }
709                                 
710                                 for (size_t j=0; j < seqIndex.size(); j++) {
711                                         it = currMap.find(seqIndex[j]);         
712                                         if (it != currMap.end()) {
713                                                 max_dist[i] = max(max_dist[i], it->second);
714                                                 max_dist[j] = max(max_dist[j], it->second);
715                                                 total_dist[i] += it->second;
716                                                 total_dist[j] += it->second;
717                                         }else{ //if you can't find the distance make it the cutoff
718                                                 max_dist[i] = max(max_dist[i], cutoff);
719                                                 max_dist[j] = max(max_dist[j], cutoff);
720                                                 total_dist[i] += cutoff;
721                                                 total_dist[j] += cutoff;
722                                         }
723                                 }
724                         }
725                         
726                         // sequence with the smallest maximum distance is the representative
727                         //if tie occurs pick sequence with smallest average distance
728                         float min = 10000;
729                         int minIndex;
730                         for (size_t i=0; i < max_dist.size(); i++) {
731                                 if (m->control_pressed) {  return  "control"; }
732                                 if (max_dist[i] < min) {
733                                         min = max_dist[i];
734                                         minIndex = i;
735                                 }else if (max_dist[i] == min) {
736                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
737                                         float newAverage = total_dist[i] / (float) total_dist.size();
738                                         
739                                         if (newAverage < currentAverage) {
740                                                 min = max_dist[i];
741                                                 minIndex = i;
742                                         }
743                                 }
744                         }
745                         
746                         return(names[minIndex]);
747                 }
748         }
749         catch(exception& e) {
750                 m->errorOut(e, "GetOTURepCommand", "FindRep");
751                 exit(1);
752         }
753 }
754
755 //**********************************************************************************************************************
756 int GetOTURepCommand::process(ListVector* processList) {
757         try{
758                 string name, sequence;
759                 string nameRep;
760
761                 //create output file
762                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
763                                 
764                 ofstream newNamesOutput;
765                 string outputNamesFile;
766                 map<string, ofstream*> filehandles;
767                 
768                 if (Groups.size() == 0) { //you don't want to use groups
769                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
770                         m->openOutputFile(outputNamesFile, newNamesOutput);
771                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
772                         outputNameFiles[outputNamesFile] = processList->getLabel();
773                 }else{ //you want to use groups
774                         ofstream* temp;
775                         for (int i=0; i<Groups.size(); i++) {
776                                 temp = new ofstream;
777                                 filehandles[Groups[i]] = temp;
778                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
779                                 
780                                 m->openOutputFile(outputNamesFile, *(temp));
781                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
782                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
783                         }
784                 }
785                 
786                 //for each bin in the list vector
787                 for (int i = 0; i < processList->size(); i++) {
788                         if (m->control_pressed) { 
789                                 out.close();  
790                                 if (Groups.size() == 0) { //you don't want to use groups
791                                         newNamesOutput.close();
792                                 }else{
793                                         for (int j=0; j<Groups.size(); j++) {
794                                                 (*(filehandles[Groups[j]])).close();
795                                                 delete filehandles[Groups[j]];
796                                         }
797                                 }
798                                 return 0; 
799                         }
800                         
801                         string temp = processList->get(i);
802                         vector<string> namesInBin;
803                         m->splitAtComma(temp, namesInBin);
804                         
805                         if (Groups.size() == 0) {
806                                 nameRep = findRep(namesInBin);
807                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
808                         }else{
809                                 map<string, vector<string> > NamesInGroup;
810                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
811                                         NamesInGroup[Groups[j]].resize(0);
812                                 }
813                                 
814                                 for (int j=0; j<namesInBin.size(); j++) {
815                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
816                                         
817                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
818                                         
819                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
820                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
821                                         }
822                                 }
823                                 
824                                 //get rep for each group in otu
825                                 for (int j=0; j<Groups.size(); j++) {
826                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
827                                                 //get rep for each group
828                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
829                                                 
830                                                 //output group rep and other members of this group
831                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
832                                                 
833                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
834                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
835                                                 }
836                                                 //output last name
837                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
838                                         }
839                                 }
840                         }
841                 }
842                 
843                 if (Groups.size() == 0) { //you don't want to use groups
844                         newNamesOutput.close();
845                 }else{
846                         for (int i=0; i<Groups.size(); i++) {
847                                 (*(filehandles[Groups[i]])).close();
848                                 delete filehandles[Groups[i]];
849                         }
850                 }
851                 
852                 return 0;
853
854         }
855         catch(exception& e) {
856                 m->errorOut(e, "GetOTURepCommand", "process");
857                 exit(1);
858         }
859 }
860 //**********************************************************************************************************************
861 int GetOTURepCommand::processNames(string filename, string label) {
862         try{
863
864                 //create output file
865                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
866                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
867                 m->openOutputFile(outputFileName, out);
868                 vector<repStruct> reps;
869                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
870                 
871                 ofstream out2;
872                 string tempNameFile = filename + ".temp";
873                 m->openOutputFile(tempNameFile, out2);
874                 
875                 ifstream in;
876                 m->openInputFile(filename, in);
877                 
878                 int i = 0;
879                 while (!in.eof()) {
880                         string rep, binnames;
881                         in >> i >> rep >> binnames; m->gobble(in);
882                         out2 << rep << '\t' << binnames << endl;
883                         
884                         vector<string> names;
885                         m->splitAtComma(binnames, names);
886                         int binsize = names.size();
887                         
888                         //if you have a groupfile
889                         string group = "";
890                         if (groupfile != "") {
891                                 map<string, string> groups;
892                                 map<string, string>::iterator groupIt;
893                                 
894                                 //find the groups that are in this bin
895                                 for (size_t i = 0; i < names.size(); i++) {
896                                         string groupName = groupMap->getGroup(names[i]);
897                                         if (groupName == "not found") {  
898                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
899                                                 groupError = true;
900                                         } else {
901                                                 groups[groupName] = groupName;
902                                         }
903                                 }
904                                 
905                                 //turn the groups into a string
906                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
907                                         group += groupIt->first + "-";
908                                 }
909                                 //rip off last dash
910                                 group = group.substr(0, group.length()-1);
911                         }else{ group = ""; }
912
913                         
914                         //print out name and sequence for that bin
915                         string sequence = fasta->getSequence(rep);
916
917                         if (sequence != "not found") {
918                                 if (sorted == "") { //print them out
919                                         rep = rep + "\t" + toString(i+1);
920                                         rep = rep + "|" + toString(binsize);
921                                         if (groupfile != "") {
922                                                 rep = rep + "|" + group;
923                                         }
924                                         out << ">" << rep << endl;
925                                         out << sequence << endl;
926                                 }else { //save them
927                                         repStruct newRep(rep, i+1, binsize, group);
928                                         reps.push_back(newRep);
929                                 }
930                         }else { 
931                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
932                         }
933                 }
934                 
935                         
936                 if (sorted != "") { //then sort them and print them
937                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
938                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
939                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
940                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
941                         
942                         //print them
943                         for (int i = 0; i < reps.size(); i++) {
944                                 string sequence = fasta->getSequence(reps[i].name);
945                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
946                                 outputName = outputName + "|" + toString(reps[i].size);
947                                 if (groupfile != "") {
948                                         outputName = outputName + "|" + reps[i].group;
949                                 }
950                                 out << ">" << outputName << endl;
951                                 out << sequence << endl;
952                         }
953                 }
954                 
955                 in.close();
956                 out.close();
957                 out2.close();
958                 
959                 m->mothurRemove(filename);
960                 rename(tempNameFile.c_str(), filename.c_str());
961                 
962                 return 0;
963
964         }
965         catch(exception& e) {
966                 m->errorOut(e, "GetOTURepCommand", "processNames");
967                 exit(1);
968         }
969 }
970 //**********************************************************************************************************************
971 SeqMap GetOTURepCommand::getMap(int row) {
972         try {
973                 SeqMap rowMap;
974                 
975                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
976                 if (rowPositions[row] != -1){
977                         //go to row in file
978                         inRow.seekg(rowPositions[row]);
979                         
980                         int rowNum, numDists, colNum;
981                         float dist;
982                         
983                         inRow >> rowNum >> numDists;
984                         
985                         for(int i = 0; i < numDists; i++) {
986                                 inRow >> colNum >> dist;
987                                 rowMap[colNum] = dist;
988                                 
989                         }
990                 }
991                 
992                 return rowMap;
993         }
994         catch(exception& e) {
995                 m->errorOut(e, "GetOTURepCommand", "getMap");
996                 exit(1);
997         }
998 }
999 //**********************************************************************************************************************
1000