]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
working on current change
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){       
41         try {
42                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
44                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52                 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53                 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
57                 
58                 vector<string> myArray;
59                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
60                 return myArray;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "GetOTURepCommand", "setParameters");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){       
69         try {
70                 string helpString = "";
71                 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name, unless you have valid current files.\n";
72                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73                 helpString += "The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n";
74                 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75                 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76                 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77                 helpString += "The default value for label is all labels in your inputfile.\n";
78                 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
80                 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false.\n";
81                 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82                 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83                 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84                 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85                 helpString += "The group parameter allows you provide a group file.\n";
86                 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87                 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88                 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
90                 return helpString;
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "GetOTURepCommand", "getHelpString");
94                 exit(1);
95         }
96 }
97 //**********************************************************************************************************************
98 GetOTURepCommand::GetOTURepCommand(){   
99         try {
100                 abort = true; calledHelp = true; 
101                 setParameters();
102                 vector<string> tempOutNames;
103                 outputTypes["fasta"] = tempOutNames;
104                 outputTypes["name"] = tempOutNames;
105         }
106         catch(exception& e) {
107                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
108                 exit(1);
109         }
110 }
111 //**********************************************************************************************************************
112 GetOTURepCommand::GetOTURepCommand(string option)  {
113         try{
114                 abort = false; calledHelp = false;   
115                 allLines = 1;
116                                 
117                 //allow user to run help
118                 if (option == "help") { 
119                         help(); abort = true; calledHelp = true;
120                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
121                 } else {
122                         vector<string> myArray = setParameters();
123                         
124                         OptionParser parser(option);
125                         map<string, string> parameters = parser.getParameters();
126                         
127                         ValidParameters validParameter;
128                         map<string, string>::iterator it;
129                 
130                         //check to make sure all parameters are valid for command
131                         for (it = parameters.begin(); it != parameters.end(); it++) { 
132                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
133                         }
134                         
135                         //initialize outputTypes
136                         vector<string> tempOutNames;
137                         outputTypes["fasta"] = tempOutNames;
138                         outputTypes["name"] = tempOutNames;
139                         
140                         //if the user changes the input directory command factory will send this info to us in the output parameter 
141                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
142                         if (inputDir == "not found"){   inputDir = "";          }
143                         else {
144                                 string path;
145                                 it = parameters.find("list");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
151                                 }
152                                 
153                                 it = parameters.find("fasta");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
159                                 }
160                                 
161                                 it = parameters.find("phylip");
162                                 //user has given a template file
163                                 if(it != parameters.end()){ 
164                                         path = m->hasPath(it->second);
165                                         //if the user has not given a path then, add inputdir. else leave path alone.
166                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
167                                 }
168                                 
169                                 it = parameters.find("column");
170                                 //user has given a template file
171                                 if(it != parameters.end()){ 
172                                         path = m->hasPath(it->second);
173                                         //if the user has not given a path then, add inputdir. else leave path alone.
174                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
175                                 }
176                                 
177                                 it = parameters.find("name");
178                                 //user has given a template file
179                                 if(it != parameters.end()){ 
180                                         path = m->hasPath(it->second);
181                                         //if the user has not given a path then, add inputdir. else leave path alone.
182                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
183                                 }
184                                 
185                                 it = parameters.find("group");
186                                 //user has given a template file
187                                 if(it != parameters.end()){ 
188                                         path = m->hasPath(it->second);
189                                         //if the user has not given a path then, add inputdir. else leave path alone.
190                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
191                                 }
192                         }
193
194                         
195                         //if the user changes the output directory command factory will send this info to us in the output parameter 
196                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
197                         
198                         //check for required parameters
199                         fastafile = validParameter.validFile(parameters, "fasta", true);
200                         if (fastafile == "not found") {                                 
201                                 fastafile = m->getFastaFile(); 
202                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
203                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
204                         }
205                         else if (fastafile == "not open") { abort = true; }     
206                         else { m->setFastaFile(fastafile); }
207                 
208                         listfile = validParameter.validFile(parameters, "list", true);
209                         if (listfile == "not found") {                  
210                                 listfile = m->getListFile(); 
211                                 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
212                                 else {  m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
213                         }
214                         else if (listfile == "not open") { abort = true; }      
215                         else { m->setListFile(listfile); }
216                         
217                         phylipfile = validParameter.validFile(parameters, "phylip", true);
218                         if (phylipfile == "not found") { phylipfile = "";  }
219                         else if (phylipfile == "not open") { abort = true; }    
220                         else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile);   }
221                         
222                         columnfile = validParameter.validFile(parameters, "column", true);
223                         if (columnfile == "not found") { columnfile = ""; }
224                         else if (columnfile == "not open") { abort = true; }    
225                         else { distFile = columnfile; format = "column";  m->setColumnFile(columnfile); }
226                         
227                         namefile = validParameter.validFile(parameters, "name", true);
228                         if (namefile == "not open") { abort = true; }   
229                         else if (namefile == "not found") { namefile = ""; }
230                         else { m->setNameFile(namefile); }
231                         
232                         if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
233                                 //give priority to column, then phylip
234                                 columnfile = m->getColumnFile(); 
235                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
236                                 else { 
237                                         phylipfile = m->getPhylipFile(); 
238                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
239                                         else { 
240                                                 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine(); 
241                                                 abort = true;
242                                         }
243                                 }
244                         }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
245                 
246                         if (columnfile != "") {  
247                                 if (namefile == "") {  
248                                         namefile = m->getNameFile(); 
249                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
250                                         else { 
251                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
252                                                 abort = true; 
253                                         }       
254                                 } 
255                         }
256
257                         //check for optional parameter and set defaults
258                         // ...at some point should added some additional type checking...
259                         label = validParameter.validFile(parameters, "label", false);                   
260                         if (label == "not found") { label = ""; allLines = 1;  }
261                         else { 
262                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
263                                 else { allLines = 1;  }
264                         }
265                         
266                         groupfile = validParameter.validFile(parameters, "group", true);
267                         if (groupfile == "not open") { groupfile = ""; abort = true; }
268                         else if (groupfile == "not found") { groupfile = ""; }
269                         else { m->setGroupFile(groupfile); }
270                         
271                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
272                         if (sorted == "none") { sorted=""; }
273                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
274                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
275                                 sorted = "";
276                         }
277                         
278                         if ((sorted == "group") && (groupfile == "")) {
279                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
280                                 sorted = "";
281                         }
282                         
283                         groups = validParameter.validFile(parameters, "groups", false);                 
284                         if (groups == "not found") { groups = ""; }
285                         else { 
286                                 if (groupfile == "") {
287                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
288                                         abort = true;
289                                 }else { 
290                                         m->splitAtDash(groups, Groups);
291                                 }
292                         }
293                         m->Groups = Groups;
294                         
295                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
296                         large = m->isTrue(temp);
297                         
298                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {       temp = "f";    }
299                         weighted = m->isTrue(temp);
300                         
301                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
302                         
303                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
304                         convert(temp, precision); 
305                         
306                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
307                         convert(temp, cutoff); 
308                         cutoff += (5 / (precision * 10.0));
309                 }
310         }
311         catch(exception& e) {
312                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
313                 exit(1);
314         }
315 }
316
317 //**********************************************************************************************************************
318
319 int GetOTURepCommand::execute(){
320         try {
321         
322                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
323                 int error;
324                 list = NULL;
325                 
326                 if (!large) {
327                         //read distance files
328                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
329                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
330                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
331                         
332                         readMatrix->setCutoff(cutoff);
333         
334                         if(namefile != ""){     
335                                 nameMap = new NameAssignment(namefile);
336                                 nameMap->readMap();
337                         }else{  nameMap = NULL;         }
338                         
339                         readMatrix->read(nameMap);
340                         
341                         if (m->control_pressed) { delete readMatrix; return 0; }
342
343                         list = readMatrix->getListVector();
344
345                         SparseMatrix* matrix = readMatrix->getMatrix();
346                         
347                         // Create a data structure to quickly access the distance information.
348                         // It consists of a vector of distance maps, where each map contains
349                         // all distances of a certain sequence. Vector and maps are accessed
350                         // via the index of a sequence in the distance matrix
351                         seqVec = vector<SeqMap>(list->size()); 
352                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
353                                 if (m->control_pressed) { delete readMatrix; return 0; }
354                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
355                         }
356                         //add dummy map for unweighted calc
357                         SeqMap dummy;
358                         seqVec.push_back(dummy);
359                         
360                         delete matrix;
361                         delete readMatrix;
362                         delete nameMap;
363                         
364                         if (m->control_pressed) { return 0; }
365                 }else {
366                         //process file and set up indexes
367                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
368                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
369                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
370                         
371                         formatMatrix->setCutoff(cutoff);
372         
373                         if(namefile != ""){     
374                                 nameMap = new NameAssignment(namefile);
375                                 nameMap->readMap();
376                         }else{  nameMap = NULL;         }
377                         
378                         formatMatrix->read(nameMap);
379                         
380                         if (m->control_pressed) { delete formatMatrix;  return 0; }
381
382                         list = formatMatrix->getListVector();
383                         
384                         distFile = formatMatrix->getFormattedFileName();
385                         
386                         //positions in file where the distances for each sequence begin
387                         //rowPositions[1] = position in file where distance related to sequence 1 start.
388                         rowPositions = formatMatrix->getRowPositions();
389                         rowPositions.push_back(-1); //dummy row for unweighted calc
390                         
391                         delete formatMatrix;
392                         delete nameMap;
393                         
394                         //openfile for getMap to use
395                         m->openInputFile(distFile, inRow);
396                         
397                         if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
398                 }
399                 
400                 
401                 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
402                 if (list != NULL) {
403                         vector<string> names;
404                         string binnames;
405                         //map names to rows in sparsematrix
406                         for (int i = 0; i < list->size(); i++) {
407                                 names.clear();
408                                 binnames = list->get(i);
409                                 
410                                 m->splitAtComma(binnames, names);
411                                 
412                                 for (int j = 0; j < names.size(); j++) {
413                                         nameToIndex[names[j]] = i;
414                                 }
415                         }
416                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
417                 
418                                 
419                 if (m->control_pressed) { 
420                         if (large) {  inRow.close(); remove(distFile.c_str());  }
421                         return 0; 
422                 }
423                 
424                 if (groupfile != "") {
425                         //read in group map info.
426                         groupMap = new GroupMap(groupfile);
427                         int error = groupMap->readMap();
428                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
429                         
430                         if (Groups.size() != 0) {
431                                 SharedUtil* util = new SharedUtil();
432                                 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
433                                 delete util;
434                         }
435                 }
436                 
437                 //done with listvector from matrix
438                 if (list != NULL) { delete list; }
439                 
440                 input = new InputData(listfile, "list");
441                 list = input->getListVector();
442                 string lastLabel = list->getLabel();
443
444                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
445                 set<string> processedLabels;
446                 set<string> userLabels = labels;
447                 
448                 if (m->control_pressed) { 
449                         if (large) {  inRow.close(); remove(distFile.c_str());  }
450                         delete input; delete list; return 0; 
451                 }
452                 
453                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
454                 
455                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
456                         
457                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
458                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
459                                         error = process(list);
460                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
461                                         
462                                         if (m->control_pressed) { 
463                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
464                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
465                                                 delete input; delete list; return 0; 
466                                         }
467                                         
468                                         processedLabels.insert(list->getLabel());
469                                         userLabels.erase(list->getLabel());
470                         }
471                         
472                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
473                                         string saveLabel = list->getLabel();
474                                         
475                                         delete list;
476                                         list = input->getListVector(lastLabel);
477                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
478                                         error = process(list);
479                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
480                                         
481                                         if (m->control_pressed) { 
482                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
483                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
484                                                 delete input; delete list; return 0; 
485                                         }
486                                         
487                                         processedLabels.insert(list->getLabel());
488                                         userLabels.erase(list->getLabel());
489                                         
490                                         //restore real lastlabel to save below
491                                         list->setLabel(saveLabel);
492                         }
493                         
494                         lastLabel = list->getLabel();
495         
496                         delete list;
497                         list = input->getListVector();
498                 }
499                 
500                 //output error messages about any remaining user labels
501                 bool needToRun = false;
502                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
503                         m->mothurOut("Your file does not include the label " + (*it)); 
504                         if (processedLabels.count(lastLabel) != 1) {
505                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
506                                 needToRun = true;
507                         }else {
508                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
509                         }
510                 }
511                 
512                 //run last label if you need to
513                 if (needToRun == true)  {
514                         if (list != NULL) {     delete list;    }
515                         list = input->getListVector(lastLabel);
516                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
517                         error = process(list);
518                         delete list;
519                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
520                         
521                         if (m->control_pressed) { 
522                                         if (large) {  inRow.close(); remove(distFile.c_str());  }
523                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
524                                         delete input; delete list; return 0; 
525                         }
526                 }
527                 
528                 //close and remove formatted matrix file
529                 if (large) {
530                         inRow.close();
531                         remove(distFile.c_str());
532                 }
533                 
534                 delete input;  
535                 
536                 if (!weighted) { nameFileMap.clear(); }
537                 
538                 //read fastafile
539                 fasta = new FastaMap();
540                 fasta->readFastaFile(fastafile);
541                 
542                 //if user gave a namesfile then use it
543                 if (namefile != "") {   readNamesFile();        }
544                 
545                 //output create and output the .rep.fasta files
546                 map<string, string>::iterator itNameFile;
547                 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
548                         processNames(itNameFile->first, itNameFile->second);
549                 }
550                 
551                 delete fasta;
552                 if (groupfile != "") { delete groupMap;  }
553                 
554                 if (m->control_pressed) {  return 0; }
555                 
556                 //set fasta file as new current fastafile - use first one??
557                 string current = "";
558                 itTypes = outputTypes.find("fasta");
559                 if (itTypes != outputTypes.end()) {
560                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
561                 }
562                 
563                 itTypes = outputTypes.find("name");
564                 if (itTypes != outputTypes.end()) {
565                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
566                 }
567                 
568                 m->mothurOutEndLine();
569                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
570                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
571                 m->mothurOutEndLine();
572                 
573                 return 0;
574         }
575         catch(exception& e) {
576                 m->errorOut(e, "GetOTURepCommand", "execute");
577                 exit(1);
578         }
579 }
580
581 //**********************************************************************************************************************
582 void GetOTURepCommand::readNamesFile() {
583         try {
584                 ifstream in;
585                 vector<string> dupNames;
586                 m->openInputFile(namefile, in);
587                 
588                 string name, names, sequence;
589         
590                 while(!in.eof()){
591                         in >> name;                     //read from first column  A
592                         in >> names;            //read from second column  A,B,C,D
593                         
594                         dupNames.clear();
595                         
596                         //parse names into vector
597                         m->splitAtComma(names, dupNames);
598                         
599                         //store names in fasta map
600                         sequence = fasta->getSequence(name);
601                         for (int i = 0; i < dupNames.size(); i++) {
602                                 fasta->push_back(dupNames[i], sequence);
603                         }
604                 
605                         m->gobble(in);
606                 }
607                 in.close();
608
609         }
610         catch(exception& e) {
611                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
612                 exit(1);
613         }
614 }
615 //**********************************************************************************************************************
616 //read names file to find the weighted rep for each bin
617 void GetOTURepCommand::readNamesFile(bool w) {
618         try {
619                 ifstream in;
620                 vector<string> dupNames;
621                 m->openInputFile(namefile, in);
622                 
623                 string name, names, sequence;
624                 
625                 while(!in.eof()){
626                         in >> name;     m->gobble(in);          //read from first column  A
627                         in >> names;                                                    //read from second column  A,B,C,D
628                         
629                         dupNames.clear();
630                         
631                         //parse names into vector
632                         m->splitAtComma(names, dupNames);
633                         
634                         for (int i = 0; i < dupNames.size(); i++) {
635                                 nameFileMap[dupNames[i]] = name;
636                         }
637                         
638                         m->gobble(in);
639                 }
640                 in.close();
641                 
642         }
643         catch(exception& e) {
644                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
645                 exit(1);
646         }
647 }
648 //**********************************************************************************************************************
649 string GetOTURepCommand::findRep(vector<string> names) {
650         try{
651                 // if only 1 sequence in bin or processing the "unique" label, then 
652                 // the first sequence of the OTU is the representative one
653                 if ((names.size() == 1)) {
654                         return names[0];
655                 }else{
656                         vector<int> seqIndex(names.size());
657                         vector<float> max_dist(names.size());
658                         vector<float> total_dist(names.size());
659                         map<string, string>::iterator itNameFile;
660                         map<string, int>::iterator itNameIndex;
661
662                         //fill seqIndex and initialize sums
663                         for (size_t i = 0; i < names.size(); i++) {
664                                 if (weighted) {
665                                         seqIndex[i] = nameToIndex[names[i]];
666                                 }else { 
667                                         if (namefile == "") {
668                                                 itNameIndex = nameToIndex.find(names[i]);
669                                                 
670                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
671                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
672                                                         else {  seqIndex[i] = (seqVec.size()-1); }
673                                                 }else {
674                                                         seqIndex[i] = itNameIndex->second;
675                                                 }
676                                                 
677                                         }else {
678                                                 itNameFile = nameFileMap.find(names[i]);
679                                                 
680                                                 if (itNameFile == nameFileMap.end()) {
681                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
682                                                 }else{
683                                                         string name1 = itNameFile->first;
684                                                         string name2 = itNameFile->second;
685                                                         
686                                                         if (name1 == name2) { //then you are unique so add your real dists
687                                                                 seqIndex[i] = nameToIndex[names[i]];
688                                                         }else { //add dummy
689                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
690                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
691                                                         }
692                                                 }
693                                         }
694                                 }
695                                 max_dist[i] = 0.0;
696                                 total_dist[i] = 0.0;
697                         }
698                         
699                         // loop through all entries in seqIndex
700                         SeqMap::iterator it;
701                         SeqMap currMap;
702                         for (size_t i=0; i < seqIndex.size(); i++) {
703                                 if (m->control_pressed) {  return  "control"; }
704                         
705                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
706                                 else            {       currMap = getMap(seqIndex[i]);  }
707                                 
708                                 for (size_t j=0; j < seqIndex.size(); j++) {
709                                         it = currMap.find(seqIndex[j]);         
710                                         if (it != currMap.end()) {
711                                                 max_dist[i] = max(max_dist[i], it->second);
712                                                 max_dist[j] = max(max_dist[j], it->second);
713                                                 total_dist[i] += it->second;
714                                                 total_dist[j] += it->second;
715                                         }else{ //if you can't find the distance make it the cutoff
716                                                 max_dist[i] = max(max_dist[i], cutoff);
717                                                 max_dist[j] = max(max_dist[j], cutoff);
718                                                 total_dist[i] += cutoff;
719                                                 total_dist[j] += cutoff;
720                                         }
721                                 }
722                         }
723                         
724                         // sequence with the smallest maximum distance is the representative
725                         //if tie occurs pick sequence with smallest average distance
726                         float min = 10000;
727                         int minIndex;
728                         for (size_t i=0; i < max_dist.size(); i++) {
729                                 if (m->control_pressed) {  return  "control"; }
730                                 if (max_dist[i] < min) {
731                                         min = max_dist[i];
732                                         minIndex = i;
733                                 }else if (max_dist[i] == min) {
734                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
735                                         float newAverage = total_dist[i] / (float) total_dist.size();
736                                         
737                                         if (newAverage < currentAverage) {
738                                                 min = max_dist[i];
739                                                 minIndex = i;
740                                         }
741                                 }
742                         }
743                         
744                         return(names[minIndex]);
745                 }
746         }
747         catch(exception& e) {
748                 m->errorOut(e, "GetOTURepCommand", "FindRep");
749                 exit(1);
750         }
751 }
752
753 //**********************************************************************************************************************
754 int GetOTURepCommand::process(ListVector* processList) {
755         try{
756                 string name, sequence;
757                 string nameRep;
758
759                 //create output file
760                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
761                                 
762                 ofstream newNamesOutput;
763                 string outputNamesFile;
764                 map<string, ofstream*> filehandles;
765                 
766                 if (Groups.size() == 0) { //you don't want to use groups
767                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
768                         m->openOutputFile(outputNamesFile, newNamesOutput);
769                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
770                         outputNameFiles[outputNamesFile] = processList->getLabel();
771                 }else{ //you want to use groups
772                         ofstream* temp;
773                         for (int i=0; i<Groups.size(); i++) {
774                                 temp = new ofstream;
775                                 filehandles[Groups[i]] = temp;
776                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
777                                 
778                                 m->openOutputFile(outputNamesFile, *(temp));
779                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
780                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
781                         }
782                 }
783                 
784                 //for each bin in the list vector
785                 for (int i = 0; i < processList->size(); i++) {
786                         if (m->control_pressed) { 
787                                 out.close();  
788                                 if (Groups.size() == 0) { //you don't want to use groups
789                                         newNamesOutput.close();
790                                 }else{
791                                         for (int j=0; j<Groups.size(); j++) {
792                                                 (*(filehandles[Groups[j]])).close();
793                                                 delete filehandles[Groups[j]];
794                                         }
795                                 }
796                                 return 0; 
797                         }
798                         
799                         string temp = processList->get(i);
800                         vector<string> namesInBin;
801                         m->splitAtComma(temp, namesInBin);
802                         
803                         if (Groups.size() == 0) {
804                                 nameRep = findRep(namesInBin);
805                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
806                         }else{
807                                 map<string, vector<string> > NamesInGroup;
808                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
809                                         NamesInGroup[Groups[j]].resize(0);
810                                 }
811                                 
812                                 for (int j=0; j<namesInBin.size(); j++) {
813                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
814                                         
815                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
816                                         
817                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
818                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
819                                         }
820                                 }
821                                 
822                                 //get rep for each group in otu
823                                 for (int j=0; j<Groups.size(); j++) {
824                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
825                                                 //get rep for each group
826                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
827                                                 
828                                                 //output group rep and other members of this group
829                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
830                                                 
831                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
832                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
833                                                 }
834                                                 //output last name
835                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
836                                         }
837                                 }
838                         }
839                 }
840                 
841                 if (Groups.size() == 0) { //you don't want to use groups
842                         newNamesOutput.close();
843                 }else{
844                         for (int i=0; i<Groups.size(); i++) {
845                                 (*(filehandles[Groups[i]])).close();
846                                 delete filehandles[Groups[i]];
847                         }
848                 }
849                 
850                 return 0;
851
852         }
853         catch(exception& e) {
854                 m->errorOut(e, "GetOTURepCommand", "process");
855                 exit(1);
856         }
857 }
858 //**********************************************************************************************************************
859 int GetOTURepCommand::processNames(string filename, string label) {
860         try{
861
862                 //create output file
863                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
864                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
865                 m->openOutputFile(outputFileName, out);
866                 vector<repStruct> reps;
867                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
868                 
869                 ofstream out2;
870                 string tempNameFile = filename + ".temp";
871                 m->openOutputFile(tempNameFile, out2);
872                 
873                 ifstream in;
874                 m->openInputFile(filename, in);
875                 
876                 int i = 0;
877                 while (!in.eof()) {
878                         string rep, binnames;
879                         in >> i >> rep >> binnames; m->gobble(in);
880                         out2 << rep << '\t' << binnames << endl;
881                         
882                         vector<string> names;
883                         m->splitAtComma(binnames, names);
884                         int binsize = names.size();
885                         
886                         //if you have a groupfile
887                         string group = "";
888                         if (groupfile != "") {
889                                 map<string, string> groups;
890                                 map<string, string>::iterator groupIt;
891                                 
892                                 //find the groups that are in this bin
893                                 for (size_t i = 0; i < names.size(); i++) {
894                                         string groupName = groupMap->getGroup(names[i]);
895                                         if (groupName == "not found") {  
896                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
897                                                 groupError = true;
898                                         } else {
899                                                 groups[groupName] = groupName;
900                                         }
901                                 }
902                                 
903                                 //turn the groups into a string
904                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
905                                         group += groupIt->first + "-";
906                                 }
907                                 //rip off last dash
908                                 group = group.substr(0, group.length()-1);
909                         }else{ group = ""; }
910
911                         
912                         //print out name and sequence for that bin
913                         string sequence = fasta->getSequence(rep);
914
915                         if (sequence != "not found") {
916                                 if (sorted == "") { //print them out
917                                         rep = rep + "\t" + toString(i+1);
918                                         rep = rep + "|" + toString(binsize);
919                                         if (groupfile != "") {
920                                                 rep = rep + "|" + group;
921                                         }
922                                         out << ">" << rep << endl;
923                                         out << sequence << endl;
924                                 }else { //save them
925                                         repStruct newRep(rep, i+1, binsize, group);
926                                         reps.push_back(newRep);
927                                 }
928                         }else { 
929                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
930                         }
931                 }
932                 
933                         
934                 if (sorted != "") { //then sort them and print them
935                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
936                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
937                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
938                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
939                         
940                         //print them
941                         for (int i = 0; i < reps.size(); i++) {
942                                 string sequence = fasta->getSequence(reps[i].name);
943                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
944                                 outputName = outputName + "|" + toString(reps[i].size);
945                                 if (groupfile != "") {
946                                         outputName = outputName + "|" + reps[i].group;
947                                 }
948                                 out << ">" << outputName << endl;
949                                 out << sequence << endl;
950                         }
951                 }
952                 
953                 in.close();
954                 out.close();
955                 out2.close();
956                 
957                 remove(filename.c_str());
958                 rename(tempNameFile.c_str(), filename.c_str());
959                 
960                 return 0;
961
962         }
963         catch(exception& e) {
964                 m->errorOut(e, "GetOTURepCommand", "processNames");
965                 exit(1);
966         }
967 }
968 //**********************************************************************************************************************
969 SeqMap GetOTURepCommand::getMap(int row) {
970         try {
971                 SeqMap rowMap;
972                 
973                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
974                 if (rowPositions[row] != -1){
975                         //go to row in file
976                         inRow.seekg(rowPositions[row]);
977                         
978                         int rowNum, numDists, colNum;
979                         float dist;
980                         
981                         inRow >> rowNum >> numDists;
982                         
983                         for(int i = 0; i < numDists; i++) {
984                                 inRow >> colNum >> dist;
985                                 rowMap[colNum] = dist;
986                                 
987                         }
988                 }
989                 
990                 return rowMap;
991         }
992         catch(exception& e) {
993                 m->errorOut(e, "GetOTURepCommand", "getMap");
994                 exit(1);
995         }
996 }
997 //**********************************************************************************************************************
998