]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
removed read.dist, read.otu, read.tree and globaldata. added current to defaults...
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){       
41         try {
42                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
44                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52                 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53                 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
57                 
58                 vector<string> myArray;
59                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
60                 return myArray;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "GetOTURepCommand", "setParameters");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){       
69         try {
70                 string helpString = "";
71                 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name, unless you have valid current files.\n";
72                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73                 helpString += "The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n";
74                 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75                 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76                 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77                 helpString += "The default value for label is all labels in your inputfile.\n";
78                 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
80                 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false.\n";
81                 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82                 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83                 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84                 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85                 helpString += "The group parameter allows you provide a group file.\n";
86                 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87                 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88                 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n";
90                 return helpString;
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "GetOTURepCommand", "getHelpString");
94                 exit(1);
95         }
96 }
97 //**********************************************************************************************************************
98 GetOTURepCommand::GetOTURepCommand(){   
99         try {
100                 abort = true; calledHelp = true; 
101                 setParameters();
102                 vector<string> tempOutNames;
103                 outputTypes["fasta"] = tempOutNames;
104                 outputTypes["name"] = tempOutNames;
105         }
106         catch(exception& e) {
107                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
108                 exit(1);
109         }
110 }
111 //**********************************************************************************************************************
112 GetOTURepCommand::GetOTURepCommand(string option)  {
113         try{
114                 abort = false; calledHelp = false;   
115                 allLines = 1;
116                                 
117                 //allow user to run help
118                 if (option == "help") { 
119                         help(); abort = true; calledHelp = true;
120                 } else {
121                         vector<string> myArray = setParameters();
122                         
123                         OptionParser parser(option);
124                         map<string, string> parameters = parser.getParameters();
125                         
126                         ValidParameters validParameter;
127                         map<string, string>::iterator it;
128                 
129                         //check to make sure all parameters are valid for command
130                         for (it = parameters.begin(); it != parameters.end(); it++) { 
131                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
132                         }
133                         
134                         //initialize outputTypes
135                         vector<string> tempOutNames;
136                         outputTypes["fasta"] = tempOutNames;
137                         outputTypes["name"] = tempOutNames;
138                         
139                         //if the user changes the input directory command factory will send this info to us in the output parameter 
140                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
141                         if (inputDir == "not found"){   inputDir = "";          }
142                         else {
143                                 string path;
144                                 it = parameters.find("list");
145                                 //user has given a template file
146                                 if(it != parameters.end()){ 
147                                         path = m->hasPath(it->second);
148                                         //if the user has not given a path then, add inputdir. else leave path alone.
149                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
150                                 }
151                                 
152                                 it = parameters.find("fasta");
153                                 //user has given a template file
154                                 if(it != parameters.end()){ 
155                                         path = m->hasPath(it->second);
156                                         //if the user has not given a path then, add inputdir. else leave path alone.
157                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
158                                 }
159                                 
160                                 it = parameters.find("phylip");
161                                 //user has given a template file
162                                 if(it != parameters.end()){ 
163                                         path = m->hasPath(it->second);
164                                         //if the user has not given a path then, add inputdir. else leave path alone.
165                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
166                                 }
167                                 
168                                 it = parameters.find("column");
169                                 //user has given a template file
170                                 if(it != parameters.end()){ 
171                                         path = m->hasPath(it->second);
172                                         //if the user has not given a path then, add inputdir. else leave path alone.
173                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
174                                 }
175                                 
176                                 it = parameters.find("name");
177                                 //user has given a template file
178                                 if(it != parameters.end()){ 
179                                         path = m->hasPath(it->second);
180                                         //if the user has not given a path then, add inputdir. else leave path alone.
181                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
182                                 }
183                                 
184                                 it = parameters.find("group");
185                                 //user has given a template file
186                                 if(it != parameters.end()){ 
187                                         path = m->hasPath(it->second);
188                                         //if the user has not given a path then, add inputdir. else leave path alone.
189                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
190                                 }
191                         }
192
193                         
194                         //if the user changes the output directory command factory will send this info to us in the output parameter 
195                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
196                         
197                         //check for required parameters
198                         fastafile = validParameter.validFile(parameters, "fasta", true);
199                         if (fastafile == "not found") {                                 
200                                 fastafile = m->getFastaFile(); 
201                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
202                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
203                         }
204                         else if (fastafile == "not open") { abort = true; }     
205                 
206                         listfile = validParameter.validFile(parameters, "list", true);
207                         if (listfile == "not found") {                  
208                                 listfile = m->getListFile(); 
209                                 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
210                                 else {  m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
211                         }
212                         else if (listfile == "not open") { abort = true; }      
213                         
214                         phylipfile = validParameter.validFile(parameters, "phylip", true);
215                         if (phylipfile == "not found") { phylipfile = "";  }
216                         else if (phylipfile == "not open") { abort = true; }    
217                         else { distFile = phylipfile; format = "phylip";   }
218                         
219                         columnfile = validParameter.validFile(parameters, "column", true);
220                         if (columnfile == "not found") { columnfile = ""; }
221                         else if (columnfile == "not open") { abort = true; }    
222                         else { distFile = columnfile; format = "column";   }
223                         
224                         namefile = validParameter.validFile(parameters, "name", true);
225                         if (namefile == "not open") { abort = true; }   
226                         else if (namefile == "not found") { namefile = ""; }
227                         
228                         if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
229                                 //give priority to column, then phylip
230                                 columnfile = m->getColumnFile(); 
231                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
232                                 else { 
233                                         phylipfile = m->getPhylipFile(); 
234                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
235                                         else { 
236                                                 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine(); 
237                                                 abort = true;
238                                         }
239                                 }
240                         }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
241                 
242                         if (columnfile != "") {  
243                                 if (namefile == "") {  
244                                         namefile = m->getNameFile(); 
245                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
246                                         else { 
247                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
248                                                 abort = true; 
249                                         }       
250                                 } 
251                         }
252
253                         //check for optional parameter and set defaults
254                         // ...at some point should added some additional type checking...
255                         label = validParameter.validFile(parameters, "label", false);                   
256                         if (label == "not found") { label = ""; allLines = 1;  }
257                         else { 
258                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
259                                 else { allLines = 1;  }
260                         }
261                         
262                         groupfile = validParameter.validFile(parameters, "group", true);
263                         if (groupfile == "not open") { groupfile = ""; abort = true; }
264                         else if (groupfile == "not found") { groupfile = ""; }
265                                                 
266                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
267                         if (sorted == "none") { sorted=""; }
268                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
269                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
270                                 sorted = "";
271                         }
272                         
273                         if ((sorted == "group") && (groupfile == "")) {
274                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
275                                 sorted = "";
276                         }
277                         
278                         groups = validParameter.validFile(parameters, "groups", false);                 
279                         if (groups == "not found") { groups = ""; }
280                         else { 
281                                 if (groupfile == "") {
282                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
283                                         abort = true;
284                                 }else { 
285                                         m->splitAtDash(groups, Groups);
286                                 }
287                         }
288                         m->Groups = Groups;
289                         
290                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
291                         large = m->isTrue(temp);
292                         
293                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {       temp = "f";    }
294                         weighted = m->isTrue(temp);
295                         
296                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
297                         
298                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
299                         convert(temp, precision); 
300                         
301                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
302                         convert(temp, cutoff); 
303                         cutoff += (5 / (precision * 10.0));
304                 }
305         }
306         catch(exception& e) {
307                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
308                 exit(1);
309         }
310 }
311
312 //**********************************************************************************************************************
313
314 int GetOTURepCommand::execute(){
315         try {
316         
317                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
318                 int error;
319                 list = NULL;
320                 
321                 if (!large) {
322                         //read distance files
323                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
324                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
325                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
326                         
327                         readMatrix->setCutoff(cutoff);
328         
329                         if(namefile != ""){     
330                                 nameMap = new NameAssignment(namefile);
331                                 nameMap->readMap();
332                         }else{  nameMap = NULL;         }
333                         
334                         readMatrix->read(nameMap);
335                         
336                         if (m->control_pressed) { delete readMatrix; return 0; }
337
338                         list = readMatrix->getListVector();
339
340                         SparseMatrix* matrix = readMatrix->getMatrix();
341                         
342                         // Create a data structure to quickly access the distance information.
343                         // It consists of a vector of distance maps, where each map contains
344                         // all distances of a certain sequence. Vector and maps are accessed
345                         // via the index of a sequence in the distance matrix
346                         seqVec = vector<SeqMap>(list->size()); 
347                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
348                                 if (m->control_pressed) { delete readMatrix; return 0; }
349                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
350                         }
351                         //add dummy map for unweighted calc
352                         SeqMap dummy;
353                         seqVec.push_back(dummy);
354                         
355                         delete matrix;
356                         delete readMatrix;
357                         delete nameMap;
358                         
359                         if (m->control_pressed) { return 0; }
360                 }else {
361                         //process file and set up indexes
362                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
363                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
364                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
365                         
366                         formatMatrix->setCutoff(cutoff);
367         
368                         if(namefile != ""){     
369                                 nameMap = new NameAssignment(namefile);
370                                 nameMap->readMap();
371                         }else{  nameMap = NULL;         }
372                         
373                         formatMatrix->read(nameMap);
374                         
375                         if (m->control_pressed) { delete formatMatrix;  return 0; }
376
377                         list = formatMatrix->getListVector();
378                         
379                         distFile = formatMatrix->getFormattedFileName();
380                         
381                         //positions in file where the distances for each sequence begin
382                         //rowPositions[1] = position in file where distance related to sequence 1 start.
383                         rowPositions = formatMatrix->getRowPositions();
384                         rowPositions.push_back(-1); //dummy row for unweighted calc
385                         
386                         delete formatMatrix;
387                         delete nameMap;
388                         
389                         //openfile for getMap to use
390                         m->openInputFile(distFile, inRow);
391                         
392                         if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
393                 }
394                 
395                 
396                 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
397                 if (list != NULL) {
398                         vector<string> names;
399                         string binnames;
400                         //map names to rows in sparsematrix
401                         for (int i = 0; i < list->size(); i++) {
402                                 names.clear();
403                                 binnames = list->get(i);
404                                 
405                                 m->splitAtComma(binnames, names);
406                                 
407                                 for (int j = 0; j < names.size(); j++) {
408                                         nameToIndex[names[j]] = i;
409                                 }
410                         }
411                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
412                 
413                                 
414                 if (m->control_pressed) { 
415                         if (large) {  inRow.close(); remove(distFile.c_str());  }
416                         return 0; 
417                 }
418                 
419                 if (groupfile != "") {
420                         //read in group map info.
421                         groupMap = new GroupMap(groupfile);
422                         int error = groupMap->readMap();
423                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
424                         
425                         if (Groups.size() != 0) {
426                                 SharedUtil* util = new SharedUtil();
427                                 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
428                                 delete util;
429                         }
430                 }
431                 
432                 //done with listvector from matrix
433                 if (list != NULL) { delete list; }
434                 
435                 input = new InputData(listfile, "list");
436                 list = input->getListVector();
437                 string lastLabel = list->getLabel();
438
439                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
440                 set<string> processedLabels;
441                 set<string> userLabels = labels;
442                 
443                 if (m->control_pressed) { 
444                         if (large) {  inRow.close(); remove(distFile.c_str());  }
445                         delete input; delete list; return 0; 
446                 }
447                 
448                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
449                 
450                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
451                         
452                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
453                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
454                                         error = process(list);
455                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
456                                         
457                                         if (m->control_pressed) { 
458                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
459                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
460                                                 delete input; delete list; return 0; 
461                                         }
462                                         
463                                         processedLabels.insert(list->getLabel());
464                                         userLabels.erase(list->getLabel());
465                         }
466                         
467                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
468                                         string saveLabel = list->getLabel();
469                                         
470                                         delete list;
471                                         list = input->getListVector(lastLabel);
472                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
473                                         error = process(list);
474                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
475                                         
476                                         if (m->control_pressed) { 
477                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
478                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
479                                                 delete input; delete list; return 0; 
480                                         }
481                                         
482                                         processedLabels.insert(list->getLabel());
483                                         userLabels.erase(list->getLabel());
484                                         
485                                         //restore real lastlabel to save below
486                                         list->setLabel(saveLabel);
487                         }
488                         
489                         lastLabel = list->getLabel();
490         
491                         delete list;
492                         list = input->getListVector();
493                 }
494                 
495                 //output error messages about any remaining user labels
496                 bool needToRun = false;
497                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
498                         m->mothurOut("Your file does not include the label " + (*it)); 
499                         if (processedLabels.count(lastLabel) != 1) {
500                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
501                                 needToRun = true;
502                         }else {
503                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
504                         }
505                 }
506                 
507                 //run last label if you need to
508                 if (needToRun == true)  {
509                         if (list != NULL) {     delete list;    }
510                         list = input->getListVector(lastLabel);
511                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
512                         error = process(list);
513                         delete list;
514                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
515                         
516                         if (m->control_pressed) { 
517                                         if (large) {  inRow.close(); remove(distFile.c_str());  }
518                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
519                                         delete input; delete list; return 0; 
520                         }
521                 }
522                 
523                 //close and remove formatted matrix file
524                 if (large) {
525                         inRow.close();
526                         remove(distFile.c_str());
527                 }
528                 
529                 delete input;  
530                 
531                 if (!weighted) { nameFileMap.clear(); }
532                 
533                 //read fastafile
534                 fasta = new FastaMap();
535                 fasta->readFastaFile(fastafile);
536                 
537                 //if user gave a namesfile then use it
538                 if (namefile != "") {   readNamesFile();        }
539                 
540                 //output create and output the .rep.fasta files
541                 map<string, string>::iterator itNameFile;
542                 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
543                         processNames(itNameFile->first, itNameFile->second);
544                 }
545                 
546                 delete fasta;
547                 if (groupfile != "") { delete groupMap;  }
548                 
549                 if (m->control_pressed) {  return 0; }
550                 
551                 //set fasta file as new current fastafile - use first one??
552                 string current = "";
553                 itTypes = outputTypes.find("fasta");
554                 if (itTypes != outputTypes.end()) {
555                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
556                 }
557                 
558                 itTypes = outputTypes.find("name");
559                 if (itTypes != outputTypes.end()) {
560                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
561                 }
562                 
563                 m->mothurOutEndLine();
564                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
565                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
566                 m->mothurOutEndLine();
567                 
568                 return 0;
569         }
570         catch(exception& e) {
571                 m->errorOut(e, "GetOTURepCommand", "execute");
572                 exit(1);
573         }
574 }
575
576 //**********************************************************************************************************************
577 void GetOTURepCommand::readNamesFile() {
578         try {
579                 ifstream in;
580                 vector<string> dupNames;
581                 m->openInputFile(namefile, in);
582                 
583                 string name, names, sequence;
584         
585                 while(!in.eof()){
586                         in >> name;                     //read from first column  A
587                         in >> names;            //read from second column  A,B,C,D
588                         
589                         dupNames.clear();
590                         
591                         //parse names into vector
592                         m->splitAtComma(names, dupNames);
593                         
594                         //store names in fasta map
595                         sequence = fasta->getSequence(name);
596                         for (int i = 0; i < dupNames.size(); i++) {
597                                 fasta->push_back(dupNames[i], sequence);
598                         }
599                 
600                         m->gobble(in);
601                 }
602                 in.close();
603
604         }
605         catch(exception& e) {
606                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
607                 exit(1);
608         }
609 }
610 //**********************************************************************************************************************
611 //read names file to find the weighted rep for each bin
612 void GetOTURepCommand::readNamesFile(bool w) {
613         try {
614                 ifstream in;
615                 vector<string> dupNames;
616                 m->openInputFile(namefile, in);
617                 
618                 string name, names, sequence;
619                 
620                 while(!in.eof()){
621                         in >> name;     m->gobble(in);          //read from first column  A
622                         in >> names;                                                    //read from second column  A,B,C,D
623                         
624                         dupNames.clear();
625                         
626                         //parse names into vector
627                         m->splitAtComma(names, dupNames);
628                         
629                         for (int i = 0; i < dupNames.size(); i++) {
630                                 nameFileMap[dupNames[i]] = name;
631                         }
632                         
633                         m->gobble(in);
634                 }
635                 in.close();
636                 
637         }
638         catch(exception& e) {
639                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
640                 exit(1);
641         }
642 }
643 //**********************************************************************************************************************
644 string GetOTURepCommand::findRep(vector<string> names) {
645         try{
646                 // if only 1 sequence in bin or processing the "unique" label, then 
647                 // the first sequence of the OTU is the representative one
648                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
649                         return names[0];
650                 }else{
651                         vector<int> seqIndex(names.size());
652                         vector<float> max_dist(names.size());
653                         vector<float> total_dist(names.size());
654                         map<string, string>::iterator itNameFile;
655                         map<string, int>::iterator itNameIndex;
656
657                         //fill seqIndex and initialize sums
658                         for (size_t i = 0; i < names.size(); i++) {
659                                 if (weighted) {
660                                         seqIndex[i] = nameToIndex[names[i]];
661                                 }else { 
662                                         if (namefile == "") {
663                                                 itNameIndex = nameToIndex.find(names[i]);
664                                                 
665                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
666                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
667                                                         else {  seqIndex[i] = (seqVec.size()-1); }
668                                                 }else {
669                                                         seqIndex[i] = itNameIndex->second;
670                                                 }
671                                                 
672                                         }else {
673                                                 itNameFile = nameFileMap.find(names[i]);
674                                                 
675                                                 if (itNameFile == nameFileMap.end()) {
676                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
677                                                 }else{
678                                                         string name1 = itNameFile->first;
679                                                         string name2 = itNameFile->second;
680                                                         
681                                                         if (name1 == name2) { //then you are unique so add your real dists
682                                                                 seqIndex[i] = nameToIndex[names[i]];
683                                                         }else { //add dummy
684                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
685                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
686                                                         }
687                                                 }
688                                         }
689                                 }
690                                 max_dist[i] = 0.0;
691                                 total_dist[i] = 0.0;
692                         }
693                         
694                         // loop through all entries in seqIndex
695                         SeqMap::iterator it;
696                         SeqMap currMap;
697                         for (size_t i=0; i < seqIndex.size(); i++) {
698                                 if (m->control_pressed) {  return  "control"; }
699                         
700                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
701                                 else            {       currMap = getMap(seqIndex[i]);  }
702                                 
703                                 for (size_t j=0; j < seqIndex.size(); j++) {
704                                         it = currMap.find(seqIndex[j]);         
705                                         if (it != currMap.end()) {
706                                                 max_dist[i] = max(max_dist[i], it->second);
707                                                 max_dist[j] = max(max_dist[j], it->second);
708                                                 total_dist[i] += it->second;
709                                                 total_dist[j] += it->second;
710                                         }else{ //if you can't find the distance make it the cutoff
711                                                 max_dist[i] = max(max_dist[i], cutoff);
712                                                 max_dist[j] = max(max_dist[j], cutoff);
713                                                 total_dist[i] += cutoff;
714                                                 total_dist[j] += cutoff;
715                                         }
716                                 }
717                         }
718                         
719                         // sequence with the smallest maximum distance is the representative
720                         //if tie occurs pick sequence with smallest average distance
721                         float min = 10000;
722                         int minIndex;
723                         for (size_t i=0; i < max_dist.size(); i++) {
724                                 if (m->control_pressed) {  return  "control"; }
725                                 if (max_dist[i] < min) {
726                                         min = max_dist[i];
727                                         minIndex = i;
728                                 }else if (max_dist[i] == min) {
729                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
730                                         float newAverage = total_dist[i] / (float) total_dist.size();
731                                         
732                                         if (newAverage < currentAverage) {
733                                                 min = max_dist[i];
734                                                 minIndex = i;
735                                         }
736                                 }
737                         }
738                         
739                         return(names[minIndex]);
740                 }
741         }
742         catch(exception& e) {
743                 m->errorOut(e, "GetOTURepCommand", "FindRep");
744                 exit(1);
745         }
746 }
747
748 //**********************************************************************************************************************
749 int GetOTURepCommand::process(ListVector* processList) {
750         try{
751                 string name, sequence;
752                 string nameRep;
753
754                 //create output file
755                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
756                                 
757                 ofstream newNamesOutput;
758                 string outputNamesFile;
759                 map<string, ofstream*> filehandles;
760                 
761                 if (Groups.size() == 0) { //you don't want to use groups
762                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
763                         m->openOutputFile(outputNamesFile, newNamesOutput);
764                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
765                         outputNameFiles[outputNamesFile] = processList->getLabel();
766                 }else{ //you want to use groups
767                         ofstream* temp;
768                         for (int i=0; i<Groups.size(); i++) {
769                                 temp = new ofstream;
770                                 filehandles[Groups[i]] = temp;
771                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
772                                 
773                                 m->openOutputFile(outputNamesFile, *(temp));
774                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
775                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
776                         }
777                 }
778                 
779                 //for each bin in the list vector
780                 for (int i = 0; i < processList->size(); i++) {
781                         if (m->control_pressed) { 
782                                 out.close();  
783                                 if (Groups.size() == 0) { //you don't want to use groups
784                                         newNamesOutput.close();
785                                 }else{
786                                         for (int j=0; j<Groups.size(); j++) {
787                                                 (*(filehandles[Groups[j]])).close();
788                                                 delete filehandles[Groups[j]];
789                                         }
790                                 }
791                                 return 0; 
792                         }
793                         
794                         string temp = processList->get(i);
795                         vector<string> namesInBin;
796                         m->splitAtComma(temp, namesInBin);
797                         
798                         if (Groups.size() == 0) {
799                                 nameRep = findRep(namesInBin);
800                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
801                         }else{
802                                 map<string, vector<string> > NamesInGroup;
803                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
804                                         NamesInGroup[Groups[j]].resize(0);
805                                 }
806                                 
807                                 for (int j=0; j<namesInBin.size(); j++) {
808                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
809                                         
810                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
811                                         
812                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
813                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
814                                         }
815                                 }
816                                 
817                                 //get rep for each group in otu
818                                 for (int j=0; j<Groups.size(); j++) {
819                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
820                                                 //get rep for each group
821                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
822                                                 
823                                                 //output group rep and other members of this group
824                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
825                                                 
826                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
827                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
828                                                 }
829                                                 //output last name
830                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
831                                         }
832                                 }
833                         }
834                 }
835                 
836                 if (Groups.size() == 0) { //you don't want to use groups
837                         newNamesOutput.close();
838                 }else{
839                         for (int i=0; i<Groups.size(); i++) {
840                                 (*(filehandles[Groups[i]])).close();
841                                 delete filehandles[Groups[i]];
842                         }
843                 }
844                 
845                 return 0;
846
847         }
848         catch(exception& e) {
849                 m->errorOut(e, "GetOTURepCommand", "process");
850                 exit(1);
851         }
852 }
853 //**********************************************************************************************************************
854 int GetOTURepCommand::processNames(string filename, string label) {
855         try{
856
857                 //create output file
858                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
859                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
860                 m->openOutputFile(outputFileName, out);
861                 vector<repStruct> reps;
862                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
863                 
864                 ofstream out2;
865                 string tempNameFile = filename + ".temp";
866                 m->openOutputFile(tempNameFile, out2);
867                 
868                 ifstream in;
869                 m->openInputFile(filename, in);
870                 
871                 int i = 0;
872                 while (!in.eof()) {
873                         string rep, binnames;
874                         in >> i >> rep >> binnames; m->gobble(in);
875                         out2 << rep << '\t' << binnames << endl;
876                         
877                         vector<string> names;
878                         m->splitAtComma(binnames, names);
879                         int binsize = names.size();
880                         
881                         //if you have a groupfile
882                         string group = "";
883                         if (groupfile != "") {
884                                 map<string, string> groups;
885                                 map<string, string>::iterator groupIt;
886                                 
887                                 //find the groups that are in this bin
888                                 for (size_t i = 0; i < names.size(); i++) {
889                                         string groupName = groupMap->getGroup(names[i]);
890                                         if (groupName == "not found") {  
891                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
892                                                 groupError = true;
893                                         } else {
894                                                 groups[groupName] = groupName;
895                                         }
896                                 }
897                                 
898                                 //turn the groups into a string
899                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
900                                         group += groupIt->first + "-";
901                                 }
902                                 //rip off last dash
903                                 group = group.substr(0, group.length()-1);
904                         }else{ group = ""; }
905
906                         
907                         //print out name and sequence for that bin
908                         string sequence = fasta->getSequence(rep);
909
910                         if (sequence != "not found") {
911                                 if (sorted == "") { //print them out
912                                         rep = rep + "\t" + toString(i+1);
913                                         rep = rep + "|" + toString(binsize);
914                                         if (groupfile != "") {
915                                                 rep = rep + "|" + group;
916                                         }
917                                         out << ">" << rep << endl;
918                                         out << sequence << endl;
919                                 }else { //save them
920                                         repStruct newRep(rep, i+1, binsize, group);
921                                         reps.push_back(newRep);
922                                 }
923                         }else { 
924                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
925                         }
926                 }
927                 
928                         
929                 if (sorted != "") { //then sort them and print them
930                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
931                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
932                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
933                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
934                         
935                         //print them
936                         for (int i = 0; i < reps.size(); i++) {
937                                 string sequence = fasta->getSequence(reps[i].name);
938                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
939                                 outputName = outputName + "|" + toString(reps[i].size);
940                                 if (groupfile != "") {
941                                         outputName = outputName + "|" + reps[i].group;
942                                 }
943                                 out << ">" << outputName << endl;
944                                 out << sequence << endl;
945                         }
946                 }
947                 
948                 in.close();
949                 out.close();
950                 out2.close();
951                 
952                 remove(filename.c_str());
953                 rename(tempNameFile.c_str(), filename.c_str());
954                 
955                 return 0;
956
957         }
958         catch(exception& e) {
959                 m->errorOut(e, "GetOTURepCommand", "processNames");
960                 exit(1);
961         }
962 }
963 //**********************************************************************************************************************
964 SeqMap GetOTURepCommand::getMap(int row) {
965         try {
966                 SeqMap rowMap;
967                 
968                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
969                 if (rowPositions[row] != -1){
970                         //go to row in file
971                         inRow.seekg(rowPositions[row]);
972                         
973                         int rowNum, numDists, colNum;
974                         float dist;
975                         
976                         inRow >> rowNum >> numDists;
977                         
978                         for(int i = 0; i < numDists; i++) {
979                                 inRow >> colNum >> dist;
980                                 rowMap[colNum] = dist;
981                                 
982                         }
983                 }
984                 
985                 return rowMap;
986         }
987         catch(exception& e) {
988                 m->errorOut(e, "GetOTURepCommand", "getMap");
989                 exit(1);
990         }
991 }
992 //**********************************************************************************************************************
993