]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added citation function to commands
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){       
41         try {
42                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
44                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52                 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53                 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
57                 
58                 vector<string> myArray;
59                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
60                 return myArray;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "GetOTURepCommand", "setParameters");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){       
69         try {
70                 string helpString = "";
71                 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name, unless you have valid current files.\n";
72                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73                 helpString += "The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n";
74                 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75                 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76                 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77                 helpString += "The default value for label is all labels in your inputfile.\n";
78                 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
80                 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false.\n";
81                 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82                 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83                 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84                 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85                 helpString += "The group parameter allows you provide a group file.\n";
86                 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87                 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88                 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
90                 return helpString;
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "GetOTURepCommand", "getHelpString");
94                 exit(1);
95         }
96 }
97 //**********************************************************************************************************************
98 GetOTURepCommand::GetOTURepCommand(){   
99         try {
100                 abort = true; calledHelp = true; 
101                 setParameters();
102                 vector<string> tempOutNames;
103                 outputTypes["fasta"] = tempOutNames;
104                 outputTypes["name"] = tempOutNames;
105         }
106         catch(exception& e) {
107                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
108                 exit(1);
109         }
110 }
111 //**********************************************************************************************************************
112 GetOTURepCommand::GetOTURepCommand(string option)  {
113         try{
114                 abort = false; calledHelp = false;   
115                 allLines = 1;
116                                 
117                 //allow user to run help
118                 if (option == "help") { 
119                         help(); abort = true; calledHelp = true;
120                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
121                 } else {
122                         vector<string> myArray = setParameters();
123                         
124                         OptionParser parser(option);
125                         map<string, string> parameters = parser.getParameters();
126                         
127                         ValidParameters validParameter;
128                         map<string, string>::iterator it;
129                 
130                         //check to make sure all parameters are valid for command
131                         for (it = parameters.begin(); it != parameters.end(); it++) { 
132                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
133                         }
134                         
135                         //initialize outputTypes
136                         vector<string> tempOutNames;
137                         outputTypes["fasta"] = tempOutNames;
138                         outputTypes["name"] = tempOutNames;
139                         
140                         //if the user changes the input directory command factory will send this info to us in the output parameter 
141                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
142                         if (inputDir == "not found"){   inputDir = "";          }
143                         else {
144                                 string path;
145                                 it = parameters.find("list");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
151                                 }
152                                 
153                                 it = parameters.find("fasta");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
159                                 }
160                                 
161                                 it = parameters.find("phylip");
162                                 //user has given a template file
163                                 if(it != parameters.end()){ 
164                                         path = m->hasPath(it->second);
165                                         //if the user has not given a path then, add inputdir. else leave path alone.
166                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
167                                 }
168                                 
169                                 it = parameters.find("column");
170                                 //user has given a template file
171                                 if(it != parameters.end()){ 
172                                         path = m->hasPath(it->second);
173                                         //if the user has not given a path then, add inputdir. else leave path alone.
174                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
175                                 }
176                                 
177                                 it = parameters.find("name");
178                                 //user has given a template file
179                                 if(it != parameters.end()){ 
180                                         path = m->hasPath(it->second);
181                                         //if the user has not given a path then, add inputdir. else leave path alone.
182                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
183                                 }
184                                 
185                                 it = parameters.find("group");
186                                 //user has given a template file
187                                 if(it != parameters.end()){ 
188                                         path = m->hasPath(it->second);
189                                         //if the user has not given a path then, add inputdir. else leave path alone.
190                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
191                                 }
192                         }
193
194                         
195                         //if the user changes the output directory command factory will send this info to us in the output parameter 
196                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
197                         
198                         //check for required parameters
199                         fastafile = validParameter.validFile(parameters, "fasta", true);
200                         if (fastafile == "not found") {                                 
201                                 fastafile = m->getFastaFile(); 
202                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
203                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
204                         }
205                         else if (fastafile == "not open") { abort = true; }     
206                 
207                         listfile = validParameter.validFile(parameters, "list", true);
208                         if (listfile == "not found") {                  
209                                 listfile = m->getListFile(); 
210                                 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
211                                 else {  m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
212                         }
213                         else if (listfile == "not open") { abort = true; }      
214                         
215                         phylipfile = validParameter.validFile(parameters, "phylip", true);
216                         if (phylipfile == "not found") { phylipfile = "";  }
217                         else if (phylipfile == "not open") { abort = true; }    
218                         else { distFile = phylipfile; format = "phylip";   }
219                         
220                         columnfile = validParameter.validFile(parameters, "column", true);
221                         if (columnfile == "not found") { columnfile = ""; }
222                         else if (columnfile == "not open") { abort = true; }    
223                         else { distFile = columnfile; format = "column";   }
224                         
225                         namefile = validParameter.validFile(parameters, "name", true);
226                         if (namefile == "not open") { abort = true; }   
227                         else if (namefile == "not found") { namefile = ""; }
228                         
229                         if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
230                                 //give priority to column, then phylip
231                                 columnfile = m->getColumnFile(); 
232                                 if (columnfile != "") {  m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
233                                 else { 
234                                         phylipfile = m->getPhylipFile(); 
235                                         if (phylipfile != "") {  m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
236                                         else { 
237                                                 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine(); 
238                                                 abort = true;
239                                         }
240                                 }
241                         }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
242                 
243                         if (columnfile != "") {  
244                                 if (namefile == "") {  
245                                         namefile = m->getNameFile(); 
246                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
247                                         else { 
248                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
249                                                 abort = true; 
250                                         }       
251                                 } 
252                         }
253
254                         //check for optional parameter and set defaults
255                         // ...at some point should added some additional type checking...
256                         label = validParameter.validFile(parameters, "label", false);                   
257                         if (label == "not found") { label = ""; allLines = 1;  }
258                         else { 
259                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
260                                 else { allLines = 1;  }
261                         }
262                         
263                         groupfile = validParameter.validFile(parameters, "group", true);
264                         if (groupfile == "not open") { groupfile = ""; abort = true; }
265                         else if (groupfile == "not found") { groupfile = ""; }
266                                                 
267                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
268                         if (sorted == "none") { sorted=""; }
269                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
270                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
271                                 sorted = "";
272                         }
273                         
274                         if ((sorted == "group") && (groupfile == "")) {
275                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
276                                 sorted = "";
277                         }
278                         
279                         groups = validParameter.validFile(parameters, "groups", false);                 
280                         if (groups == "not found") { groups = ""; }
281                         else { 
282                                 if (groupfile == "") {
283                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
284                                         abort = true;
285                                 }else { 
286                                         m->splitAtDash(groups, Groups);
287                                 }
288                         }
289                         m->Groups = Groups;
290                         
291                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
292                         large = m->isTrue(temp);
293                         
294                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {       temp = "f";    }
295                         weighted = m->isTrue(temp);
296                         
297                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
298                         
299                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
300                         convert(temp, precision); 
301                         
302                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
303                         convert(temp, cutoff); 
304                         cutoff += (5 / (precision * 10.0));
305                 }
306         }
307         catch(exception& e) {
308                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
309                 exit(1);
310         }
311 }
312
313 //**********************************************************************************************************************
314
315 int GetOTURepCommand::execute(){
316         try {
317         
318                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
319                 int error;
320                 list = NULL;
321                 
322                 if (!large) {
323                         //read distance files
324                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
325                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
326                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
327                         
328                         readMatrix->setCutoff(cutoff);
329         
330                         if(namefile != ""){     
331                                 nameMap = new NameAssignment(namefile);
332                                 nameMap->readMap();
333                         }else{  nameMap = NULL;         }
334                         
335                         readMatrix->read(nameMap);
336                         
337                         if (m->control_pressed) { delete readMatrix; return 0; }
338
339                         list = readMatrix->getListVector();
340
341                         SparseMatrix* matrix = readMatrix->getMatrix();
342                         
343                         // Create a data structure to quickly access the distance information.
344                         // It consists of a vector of distance maps, where each map contains
345                         // all distances of a certain sequence. Vector and maps are accessed
346                         // via the index of a sequence in the distance matrix
347                         seqVec = vector<SeqMap>(list->size()); 
348                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
349                                 if (m->control_pressed) { delete readMatrix; return 0; }
350                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
351                         }
352                         //add dummy map for unweighted calc
353                         SeqMap dummy;
354                         seqVec.push_back(dummy);
355                         
356                         delete matrix;
357                         delete readMatrix;
358                         delete nameMap;
359                         
360                         if (m->control_pressed) { return 0; }
361                 }else {
362                         //process file and set up indexes
363                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
364                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
365                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
366                         
367                         formatMatrix->setCutoff(cutoff);
368         
369                         if(namefile != ""){     
370                                 nameMap = new NameAssignment(namefile);
371                                 nameMap->readMap();
372                         }else{  nameMap = NULL;         }
373                         
374                         formatMatrix->read(nameMap);
375                         
376                         if (m->control_pressed) { delete formatMatrix;  return 0; }
377
378                         list = formatMatrix->getListVector();
379                         
380                         distFile = formatMatrix->getFormattedFileName();
381                         
382                         //positions in file where the distances for each sequence begin
383                         //rowPositions[1] = position in file where distance related to sequence 1 start.
384                         rowPositions = formatMatrix->getRowPositions();
385                         rowPositions.push_back(-1); //dummy row for unweighted calc
386                         
387                         delete formatMatrix;
388                         delete nameMap;
389                         
390                         //openfile for getMap to use
391                         m->openInputFile(distFile, inRow);
392                         
393                         if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
394                 }
395                 
396                 
397                 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
398                 if (list != NULL) {
399                         vector<string> names;
400                         string binnames;
401                         //map names to rows in sparsematrix
402                         for (int i = 0; i < list->size(); i++) {
403                                 names.clear();
404                                 binnames = list->get(i);
405                                 
406                                 m->splitAtComma(binnames, names);
407                                 
408                                 for (int j = 0; j < names.size(); j++) {
409                                         nameToIndex[names[j]] = i;
410                                 }
411                         }
412                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
413                 
414                                 
415                 if (m->control_pressed) { 
416                         if (large) {  inRow.close(); remove(distFile.c_str());  }
417                         return 0; 
418                 }
419                 
420                 if (groupfile != "") {
421                         //read in group map info.
422                         groupMap = new GroupMap(groupfile);
423                         int error = groupMap->readMap();
424                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
425                         
426                         if (Groups.size() != 0) {
427                                 SharedUtil* util = new SharedUtil();
428                                 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
429                                 delete util;
430                         }
431                 }
432                 
433                 //done with listvector from matrix
434                 if (list != NULL) { delete list; }
435                 
436                 input = new InputData(listfile, "list");
437                 list = input->getListVector();
438                 string lastLabel = list->getLabel();
439
440                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
441                 set<string> processedLabels;
442                 set<string> userLabels = labels;
443                 
444                 if (m->control_pressed) { 
445                         if (large) {  inRow.close(); remove(distFile.c_str());  }
446                         delete input; delete list; return 0; 
447                 }
448                 
449                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
450                 
451                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
452                         
453                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
454                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
455                                         error = process(list);
456                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
457                                         
458                                         if (m->control_pressed) { 
459                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
460                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
461                                                 delete input; delete list; return 0; 
462                                         }
463                                         
464                                         processedLabels.insert(list->getLabel());
465                                         userLabels.erase(list->getLabel());
466                         }
467                         
468                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
469                                         string saveLabel = list->getLabel();
470                                         
471                                         delete list;
472                                         list = input->getListVector(lastLabel);
473                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
474                                         error = process(list);
475                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
476                                         
477                                         if (m->control_pressed) { 
478                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
479                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
480                                                 delete input; delete list; return 0; 
481                                         }
482                                         
483                                         processedLabels.insert(list->getLabel());
484                                         userLabels.erase(list->getLabel());
485                                         
486                                         //restore real lastlabel to save below
487                                         list->setLabel(saveLabel);
488                         }
489                         
490                         lastLabel = list->getLabel();
491         
492                         delete list;
493                         list = input->getListVector();
494                 }
495                 
496                 //output error messages about any remaining user labels
497                 bool needToRun = false;
498                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
499                         m->mothurOut("Your file does not include the label " + (*it)); 
500                         if (processedLabels.count(lastLabel) != 1) {
501                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
502                                 needToRun = true;
503                         }else {
504                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
505                         }
506                 }
507                 
508                 //run last label if you need to
509                 if (needToRun == true)  {
510                         if (list != NULL) {     delete list;    }
511                         list = input->getListVector(lastLabel);
512                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
513                         error = process(list);
514                         delete list;
515                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
516                         
517                         if (m->control_pressed) { 
518                                         if (large) {  inRow.close(); remove(distFile.c_str());  }
519                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
520                                         delete input; delete list; return 0; 
521                         }
522                 }
523                 
524                 //close and remove formatted matrix file
525                 if (large) {
526                         inRow.close();
527                         remove(distFile.c_str());
528                 }
529                 
530                 delete input;  
531                 
532                 if (!weighted) { nameFileMap.clear(); }
533                 
534                 //read fastafile
535                 fasta = new FastaMap();
536                 fasta->readFastaFile(fastafile);
537                 
538                 //if user gave a namesfile then use it
539                 if (namefile != "") {   readNamesFile();        }
540                 
541                 //output create and output the .rep.fasta files
542                 map<string, string>::iterator itNameFile;
543                 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
544                         processNames(itNameFile->first, itNameFile->second);
545                 }
546                 
547                 delete fasta;
548                 if (groupfile != "") { delete groupMap;  }
549                 
550                 if (m->control_pressed) {  return 0; }
551                 
552                 //set fasta file as new current fastafile - use first one??
553                 string current = "";
554                 itTypes = outputTypes.find("fasta");
555                 if (itTypes != outputTypes.end()) {
556                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
557                 }
558                 
559                 itTypes = outputTypes.find("name");
560                 if (itTypes != outputTypes.end()) {
561                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
562                 }
563                 
564                 m->mothurOutEndLine();
565                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
566                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
567                 m->mothurOutEndLine();
568                 
569                 return 0;
570         }
571         catch(exception& e) {
572                 m->errorOut(e, "GetOTURepCommand", "execute");
573                 exit(1);
574         }
575 }
576
577 //**********************************************************************************************************************
578 void GetOTURepCommand::readNamesFile() {
579         try {
580                 ifstream in;
581                 vector<string> dupNames;
582                 m->openInputFile(namefile, in);
583                 
584                 string name, names, sequence;
585         
586                 while(!in.eof()){
587                         in >> name;                     //read from first column  A
588                         in >> names;            //read from second column  A,B,C,D
589                         
590                         dupNames.clear();
591                         
592                         //parse names into vector
593                         m->splitAtComma(names, dupNames);
594                         
595                         //store names in fasta map
596                         sequence = fasta->getSequence(name);
597                         for (int i = 0; i < dupNames.size(); i++) {
598                                 fasta->push_back(dupNames[i], sequence);
599                         }
600                 
601                         m->gobble(in);
602                 }
603                 in.close();
604
605         }
606         catch(exception& e) {
607                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
608                 exit(1);
609         }
610 }
611 //**********************************************************************************************************************
612 //read names file to find the weighted rep for each bin
613 void GetOTURepCommand::readNamesFile(bool w) {
614         try {
615                 ifstream in;
616                 vector<string> dupNames;
617                 m->openInputFile(namefile, in);
618                 
619                 string name, names, sequence;
620                 
621                 while(!in.eof()){
622                         in >> name;     m->gobble(in);          //read from first column  A
623                         in >> names;                                                    //read from second column  A,B,C,D
624                         
625                         dupNames.clear();
626                         
627                         //parse names into vector
628                         m->splitAtComma(names, dupNames);
629                         
630                         for (int i = 0; i < dupNames.size(); i++) {
631                                 nameFileMap[dupNames[i]] = name;
632                         }
633                         
634                         m->gobble(in);
635                 }
636                 in.close();
637                 
638         }
639         catch(exception& e) {
640                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
641                 exit(1);
642         }
643 }
644 //**********************************************************************************************************************
645 string GetOTURepCommand::findRep(vector<string> names) {
646         try{
647                 // if only 1 sequence in bin or processing the "unique" label, then 
648                 // the first sequence of the OTU is the representative one
649                 if ((names.size() == 1)) {
650                         return names[0];
651                 }else{
652                         vector<int> seqIndex(names.size());
653                         vector<float> max_dist(names.size());
654                         vector<float> total_dist(names.size());
655                         map<string, string>::iterator itNameFile;
656                         map<string, int>::iterator itNameIndex;
657
658                         //fill seqIndex and initialize sums
659                         for (size_t i = 0; i < names.size(); i++) {
660                                 if (weighted) {
661                                         seqIndex[i] = nameToIndex[names[i]];
662                                 }else { 
663                                         if (namefile == "") {
664                                                 itNameIndex = nameToIndex.find(names[i]);
665                                                 
666                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
667                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
668                                                         else {  seqIndex[i] = (seqVec.size()-1); }
669                                                 }else {
670                                                         seqIndex[i] = itNameIndex->second;
671                                                 }
672                                                 
673                                         }else {
674                                                 itNameFile = nameFileMap.find(names[i]);
675                                                 
676                                                 if (itNameFile == nameFileMap.end()) {
677                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
678                                                 }else{
679                                                         string name1 = itNameFile->first;
680                                                         string name2 = itNameFile->second;
681                                                         
682                                                         if (name1 == name2) { //then you are unique so add your real dists
683                                                                 seqIndex[i] = nameToIndex[names[i]];
684                                                         }else { //add dummy
685                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
686                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
687                                                         }
688                                                 }
689                                         }
690                                 }
691                                 max_dist[i] = 0.0;
692                                 total_dist[i] = 0.0;
693                         }
694                         
695                         // loop through all entries in seqIndex
696                         SeqMap::iterator it;
697                         SeqMap currMap;
698                         for (size_t i=0; i < seqIndex.size(); i++) {
699                                 if (m->control_pressed) {  return  "control"; }
700                         
701                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
702                                 else            {       currMap = getMap(seqIndex[i]);  }
703                                 
704                                 for (size_t j=0; j < seqIndex.size(); j++) {
705                                         it = currMap.find(seqIndex[j]);         
706                                         if (it != currMap.end()) {
707                                                 max_dist[i] = max(max_dist[i], it->second);
708                                                 max_dist[j] = max(max_dist[j], it->second);
709                                                 total_dist[i] += it->second;
710                                                 total_dist[j] += it->second;
711                                         }else{ //if you can't find the distance make it the cutoff
712                                                 max_dist[i] = max(max_dist[i], cutoff);
713                                                 max_dist[j] = max(max_dist[j], cutoff);
714                                                 total_dist[i] += cutoff;
715                                                 total_dist[j] += cutoff;
716                                         }
717                                 }
718                         }
719                         
720                         // sequence with the smallest maximum distance is the representative
721                         //if tie occurs pick sequence with smallest average distance
722                         float min = 10000;
723                         int minIndex;
724                         for (size_t i=0; i < max_dist.size(); i++) {
725                                 if (m->control_pressed) {  return  "control"; }
726                                 if (max_dist[i] < min) {
727                                         min = max_dist[i];
728                                         minIndex = i;
729                                 }else if (max_dist[i] == min) {
730                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
731                                         float newAverage = total_dist[i] / (float) total_dist.size();
732                                         
733                                         if (newAverage < currentAverage) {
734                                                 min = max_dist[i];
735                                                 minIndex = i;
736                                         }
737                                 }
738                         }
739                         
740                         return(names[minIndex]);
741                 }
742         }
743         catch(exception& e) {
744                 m->errorOut(e, "GetOTURepCommand", "FindRep");
745                 exit(1);
746         }
747 }
748
749 //**********************************************************************************************************************
750 int GetOTURepCommand::process(ListVector* processList) {
751         try{
752                 string name, sequence;
753                 string nameRep;
754
755                 //create output file
756                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
757                                 
758                 ofstream newNamesOutput;
759                 string outputNamesFile;
760                 map<string, ofstream*> filehandles;
761                 
762                 if (Groups.size() == 0) { //you don't want to use groups
763                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
764                         m->openOutputFile(outputNamesFile, newNamesOutput);
765                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
766                         outputNameFiles[outputNamesFile] = processList->getLabel();
767                 }else{ //you want to use groups
768                         ofstream* temp;
769                         for (int i=0; i<Groups.size(); i++) {
770                                 temp = new ofstream;
771                                 filehandles[Groups[i]] = temp;
772                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
773                                 
774                                 m->openOutputFile(outputNamesFile, *(temp));
775                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
776                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
777                         }
778                 }
779                 
780                 //for each bin in the list vector
781                 for (int i = 0; i < processList->size(); i++) {
782                         if (m->control_pressed) { 
783                                 out.close();  
784                                 if (Groups.size() == 0) { //you don't want to use groups
785                                         newNamesOutput.close();
786                                 }else{
787                                         for (int j=0; j<Groups.size(); j++) {
788                                                 (*(filehandles[Groups[j]])).close();
789                                                 delete filehandles[Groups[j]];
790                                         }
791                                 }
792                                 return 0; 
793                         }
794                         
795                         string temp = processList->get(i);
796                         vector<string> namesInBin;
797                         m->splitAtComma(temp, namesInBin);
798                         
799                         if (Groups.size() == 0) {
800                                 nameRep = findRep(namesInBin);
801                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
802                         }else{
803                                 map<string, vector<string> > NamesInGroup;
804                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
805                                         NamesInGroup[Groups[j]].resize(0);
806                                 }
807                                 
808                                 for (int j=0; j<namesInBin.size(); j++) {
809                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
810                                         
811                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
812                                         
813                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
814                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
815                                         }
816                                 }
817                                 
818                                 //get rep for each group in otu
819                                 for (int j=0; j<Groups.size(); j++) {
820                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
821                                                 //get rep for each group
822                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
823                                                 
824                                                 //output group rep and other members of this group
825                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
826                                                 
827                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
828                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
829                                                 }
830                                                 //output last name
831                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
832                                         }
833                                 }
834                         }
835                 }
836                 
837                 if (Groups.size() == 0) { //you don't want to use groups
838                         newNamesOutput.close();
839                 }else{
840                         for (int i=0; i<Groups.size(); i++) {
841                                 (*(filehandles[Groups[i]])).close();
842                                 delete filehandles[Groups[i]];
843                         }
844                 }
845                 
846                 return 0;
847
848         }
849         catch(exception& e) {
850                 m->errorOut(e, "GetOTURepCommand", "process");
851                 exit(1);
852         }
853 }
854 //**********************************************************************************************************************
855 int GetOTURepCommand::processNames(string filename, string label) {
856         try{
857
858                 //create output file
859                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
860                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
861                 m->openOutputFile(outputFileName, out);
862                 vector<repStruct> reps;
863                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
864                 
865                 ofstream out2;
866                 string tempNameFile = filename + ".temp";
867                 m->openOutputFile(tempNameFile, out2);
868                 
869                 ifstream in;
870                 m->openInputFile(filename, in);
871                 
872                 int i = 0;
873                 while (!in.eof()) {
874                         string rep, binnames;
875                         in >> i >> rep >> binnames; m->gobble(in);
876                         out2 << rep << '\t' << binnames << endl;
877                         
878                         vector<string> names;
879                         m->splitAtComma(binnames, names);
880                         int binsize = names.size();
881                         
882                         //if you have a groupfile
883                         string group = "";
884                         if (groupfile != "") {
885                                 map<string, string> groups;
886                                 map<string, string>::iterator groupIt;
887                                 
888                                 //find the groups that are in this bin
889                                 for (size_t i = 0; i < names.size(); i++) {
890                                         string groupName = groupMap->getGroup(names[i]);
891                                         if (groupName == "not found") {  
892                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
893                                                 groupError = true;
894                                         } else {
895                                                 groups[groupName] = groupName;
896                                         }
897                                 }
898                                 
899                                 //turn the groups into a string
900                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
901                                         group += groupIt->first + "-";
902                                 }
903                                 //rip off last dash
904                                 group = group.substr(0, group.length()-1);
905                         }else{ group = ""; }
906
907                         
908                         //print out name and sequence for that bin
909                         string sequence = fasta->getSequence(rep);
910
911                         if (sequence != "not found") {
912                                 if (sorted == "") { //print them out
913                                         rep = rep + "\t" + toString(i+1);
914                                         rep = rep + "|" + toString(binsize);
915                                         if (groupfile != "") {
916                                                 rep = rep + "|" + group;
917                                         }
918                                         out << ">" << rep << endl;
919                                         out << sequence << endl;
920                                 }else { //save them
921                                         repStruct newRep(rep, i+1, binsize, group);
922                                         reps.push_back(newRep);
923                                 }
924                         }else { 
925                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
926                         }
927                 }
928                 
929                         
930                 if (sorted != "") { //then sort them and print them
931                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
932                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
933                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
934                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
935                         
936                         //print them
937                         for (int i = 0; i < reps.size(); i++) {
938                                 string sequence = fasta->getSequence(reps[i].name);
939                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
940                                 outputName = outputName + "|" + toString(reps[i].size);
941                                 if (groupfile != "") {
942                                         outputName = outputName + "|" + reps[i].group;
943                                 }
944                                 out << ">" << outputName << endl;
945                                 out << sequence << endl;
946                         }
947                 }
948                 
949                 in.close();
950                 out.close();
951                 out2.close();
952                 
953                 remove(filename.c_str());
954                 rename(tempNameFile.c_str(), filename.c_str());
955                 
956                 return 0;
957
958         }
959         catch(exception& e) {
960                 m->errorOut(e, "GetOTURepCommand", "processNames");
961                 exit(1);
962         }
963 }
964 //**********************************************************************************************************************
965 SeqMap GetOTURepCommand::getMap(int row) {
966         try {
967                 SeqMap rowMap;
968                 
969                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
970                 if (rowPositions[row] != -1){
971                         //go to row in file
972                         inRow.seekg(rowPositions[row]);
973                         
974                         int rowNum, numDists, colNum;
975                         float dist;
976                         
977                         inRow >> rowNum >> numDists;
978                         
979                         for(int i = 0; i < numDists; i++) {
980                                 inRow >> colNum >> dist;
981                                 rowMap[colNum] = dist;
982                                 
983                         }
984                 }
985                 
986                 return rowMap;
987         }
988         catch(exception& e) {
989                 m->errorOut(e, "GetOTURepCommand", "getMap");
990                 exit(1);
991         }
992 }
993 //**********************************************************************************************************************
994