]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added sparseDistanceMatrix class. Modified cluster commands to use the new sparse...
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){       
41         try {
42                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
44                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
45                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
46                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
47                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
48                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
49                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
50                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
51                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
52                 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
53                 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
54                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
55                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
56                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
57                 
58                 vector<string> myArray;
59                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
60                 return myArray;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "GetOTURepCommand", "setParameters");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 string GetOTURepCommand::getHelpString(){       
69         try {
70                 string helpString = "";
71                 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
72                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
73                 helpString += "The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n";
74                 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
75                 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
76                 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
77                 helpString += "The default value for label is all labels in your inputfile.\n";
78                 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
79                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
80                 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false.\n";
81                 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
82                 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
83                 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
84                 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
85                 helpString += "The group parameter allows you provide a group file.\n";
86                 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
87                 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
88                 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
89                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
90                 return helpString;
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "GetOTURepCommand", "getHelpString");
94                 exit(1);
95         }
96 }
97 //**********************************************************************************************************************
98 string GetOTURepCommand::getOutputFileNameTag(string type, string inputName=""){        
99         try {
100         string outputFileName = "";
101                 map<string, vector<string> >::iterator it;
102         
103         //is this a type this command creates
104         it = outputTypes.find(type);
105         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
106         else {
107             if (type == "fasta")            {   outputFileName =  "rep.fasta";   }
108             else if (type == "name")        {   outputFileName =  "rep.names";   }
109             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
110         }
111         return outputFileName;
112         }
113         catch(exception& e) {
114                 m->errorOut(e, "GetOTURepCommand", "getOutputFileNameTag");
115                 exit(1);
116         }
117 }
118 //**********************************************************************************************************************
119 GetOTURepCommand::GetOTURepCommand(){   
120         try {
121                 abort = true; calledHelp = true; 
122                 setParameters();
123                 vector<string> tempOutNames;
124                 outputTypes["fasta"] = tempOutNames;
125                 outputTypes["name"] = tempOutNames;
126         }
127         catch(exception& e) {
128                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
129                 exit(1);
130         }
131 }
132 //**********************************************************************************************************************
133 GetOTURepCommand::GetOTURepCommand(string option)  {
134         try{
135                 abort = false; calledHelp = false;   
136                 allLines = 1;
137                                 
138                 //allow user to run help
139                 if (option == "help") { 
140                         help(); abort = true; calledHelp = true;
141                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
142                 } else {
143                         vector<string> myArray = setParameters();
144                         
145                         OptionParser parser(option);
146                         map<string, string> parameters = parser.getParameters();
147                         
148                         ValidParameters validParameter;
149                         map<string, string>::iterator it;
150                 
151                         //check to make sure all parameters are valid for command
152                         for (it = parameters.begin(); it != parameters.end(); it++) { 
153                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
154                         }
155                         
156                         //initialize outputTypes
157                         vector<string> tempOutNames;
158                         outputTypes["fasta"] = tempOutNames;
159                         outputTypes["name"] = tempOutNames;
160                         
161                         //if the user changes the input directory command factory will send this info to us in the output parameter 
162                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
163                         if (inputDir == "not found"){   inputDir = "";          }
164                         else {
165                                 string path;
166                                 it = parameters.find("list");
167                                 //user has given a template file
168                                 if(it != parameters.end()){ 
169                                         path = m->hasPath(it->second);
170                                         //if the user has not given a path then, add inputdir. else leave path alone.
171                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
172                                 }
173                                 
174                                 it = parameters.find("fasta");
175                                 //user has given a template file
176                                 if(it != parameters.end()){ 
177                                         path = m->hasPath(it->second);
178                                         //if the user has not given a path then, add inputdir. else leave path alone.
179                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
180                                 }
181                                 
182                                 it = parameters.find("phylip");
183                                 //user has given a template file
184                                 if(it != parameters.end()){ 
185                                         path = m->hasPath(it->second);
186                                         //if the user has not given a path then, add inputdir. else leave path alone.
187                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
188                                 }
189                                 
190                                 it = parameters.find("column");
191                                 //user has given a template file
192                                 if(it != parameters.end()){ 
193                                         path = m->hasPath(it->second);
194                                         //if the user has not given a path then, add inputdir. else leave path alone.
195                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
196                                 }
197                                 
198                                 it = parameters.find("name");
199                                 //user has given a template file
200                                 if(it != parameters.end()){ 
201                                         path = m->hasPath(it->second);
202                                         //if the user has not given a path then, add inputdir. else leave path alone.
203                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
204                                 }
205                                 
206                                 it = parameters.find("group");
207                                 //user has given a template file
208                                 if(it != parameters.end()){ 
209                                         path = m->hasPath(it->second);
210                                         //if the user has not given a path then, add inputdir. else leave path alone.
211                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
212                                 }
213                         }
214
215                         
216                         //if the user changes the output directory command factory will send this info to us in the output parameter 
217                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
218                         
219                         //check for required parameters
220                         fastafile = validParameter.validFile(parameters, "fasta", true);
221                         if (fastafile == "not found") { fastafile = ""; }
222                         else if (fastafile == "not open") { abort = true; }     
223                         else { m->setFastaFile(fastafile); }
224                 
225                         listfile = validParameter.validFile(parameters, "list", true);
226                         if (listfile == "not found") {                  
227                                 listfile = m->getListFile(); 
228                                 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
229                                 else {  m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
230                         }
231                         else if (listfile == "not open") { abort = true; }      
232                         else { m->setListFile(listfile); }
233                         
234                         phylipfile = validParameter.validFile(parameters, "phylip", true);
235                         if (phylipfile == "not found") { phylipfile = "";  }
236                         else if (phylipfile == "not open") { abort = true; }    
237                         else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile);   }
238                         
239                         columnfile = validParameter.validFile(parameters, "column", true);
240                         if (columnfile == "not found") { columnfile = ""; }
241                         else if (columnfile == "not open") { abort = true; }    
242                         else { distFile = columnfile; format = "column";  m->setColumnFile(columnfile); }
243                         
244                         namefile = validParameter.validFile(parameters, "name", true);
245                         if (namefile == "not open") { abort = true; }   
246                         else if (namefile == "not found") { namefile = ""; }
247                         else { m->setNameFile(namefile); }
248                         
249                         if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
250                                 //give priority to column, then phylip
251                                 columnfile = m->getColumnFile(); 
252                                 if (columnfile != "") {  distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
253                                 else { 
254                                         phylipfile = m->getPhylipFile(); 
255                                         if (phylipfile != "") {  distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
256                                         else { 
257                                                 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine(); 
258                                                 abort = true;
259                                         }
260                                 }
261                         }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
262                 
263                         if (columnfile != "") {  
264                                 if (namefile == "") {  
265                                         namefile = m->getNameFile(); 
266                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
267                                         else { 
268                                                 m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
269                                                 abort = true; 
270                                         }       
271                                 } 
272                         }
273
274                         //check for optional parameter and set defaults
275                         // ...at some point should added some additional type checking...
276                         label = validParameter.validFile(parameters, "label", false);                   
277                         if (label == "not found") { label = ""; allLines = 1;  }
278                         else { 
279                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
280                                 else { allLines = 1;  }
281                         }
282                         
283                         groupfile = validParameter.validFile(parameters, "group", true);
284                         if (groupfile == "not open") { groupfile = ""; abort = true; }
285                         else if (groupfile == "not found") { groupfile = ""; }
286                         else { m->setGroupFile(groupfile); }
287                         
288                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
289                         if (sorted == "none") { sorted=""; }
290                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
291                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
292                                 sorted = "";
293                         }
294                         
295                         if ((sorted == "group") && (groupfile == "")) {
296                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
297                                 sorted = "";
298                         }
299                         
300                         groups = validParameter.validFile(parameters, "groups", false);                 
301                         if (groups == "not found") { groups = ""; }
302                         else { 
303                                 if (groupfile == "") {
304                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
305                                         abort = true;
306                                 }else { 
307                                         m->splitAtDash(groups, Groups);
308                                 }
309                         }
310                         m->setGroups(Groups);
311                         
312                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
313                         large = m->isTrue(temp);
314                         
315                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {       temp = "f";    }
316                         weighted = m->isTrue(temp);
317                         
318                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
319                         
320                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
321                         m->mothurConvert(temp, precision); 
322                         
323                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
324                         m->mothurConvert(temp, cutoff); 
325                         cutoff += (5 / (precision * 10.0));
326                 }
327         }
328         catch(exception& e) {
329                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
330                 exit(1);
331         }
332 }
333
334 //**********************************************************************************************************************
335
336 int GetOTURepCommand::execute(){
337         try {
338         
339                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
340                 int error;
341                 list = NULL;
342                 
343                 if (!large) {
344                         //read distance files
345                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
346                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
347                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
348                         
349                         readMatrix->setCutoff(cutoff);
350         
351                         if(namefile != ""){     
352                                 nameMap = new NameAssignment(namefile);
353                                 nameMap->readMap();
354                         }else{  nameMap = NULL;         }
355                         
356                         readMatrix->read(nameMap);
357                         
358                         if (m->control_pressed) { delete readMatrix; return 0; }
359
360                         list = readMatrix->getListVector();
361
362                         SparseDistanceMatrix* matrix = readMatrix->getDMatrix();
363                         
364                         // Create a data structure to quickly access the distance information.
365                         // It consists of a vector of distance maps, where each map contains
366                         // all distances of a certain sequence. Vector and maps are accessed
367                         // via the index of a sequence in the distance matrix
368                         seqVec = vector<SeqMap>(list->size()); 
369             for (int i = 0; i < matrix->seqVec.size(); i++) {
370                 for (int j = 0; j < matrix->seqVec[i].size(); j++) {
371                     if (m->control_pressed) { delete readMatrix; return 0; }
372                     //already added everyone else in row
373                     if (i < matrix->seqVec[i][j].index) {  seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist;  }
374                 }
375                         }
376                         //add dummy map for unweighted calc
377                         SeqMap dummy;
378                         seqVec.push_back(dummy);
379                         
380                         delete matrix;
381                         delete readMatrix;
382                         delete nameMap;
383                         
384                         if (m->control_pressed) { return 0; }
385                 }else {
386                         //process file and set up indexes
387                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
388                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
389                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
390                         
391                         formatMatrix->setCutoff(cutoff);
392         
393                         if(namefile != ""){     
394                                 nameMap = new NameAssignment(namefile);
395                                 nameMap->readMap();
396                         }else{  nameMap = NULL;         }
397                         
398                         formatMatrix->read(nameMap);
399                         
400                         if (m->control_pressed) { delete formatMatrix;  return 0; }
401
402                         list = formatMatrix->getListVector();
403                         
404                         distFile = formatMatrix->getFormattedFileName();
405                         
406                         //positions in file where the distances for each sequence begin
407                         //rowPositions[1] = position in file where distance related to sequence 1 start.
408                         rowPositions = formatMatrix->getRowPositions();
409                         rowPositions.push_back(-1); //dummy row for unweighted calc
410                         
411                         delete formatMatrix;
412                         delete nameMap;
413                         
414                         //openfile for getMap to use
415                         m->openInputFile(distFile, inRow);
416                         
417                         if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
418                 }
419                 
420                 
421                 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
422                 if (list != NULL) {
423                         vector<string> names;
424                         string binnames;
425                         //map names to rows in sparsematrix
426                         for (int i = 0; i < list->size(); i++) {
427                                 names.clear();
428                                 binnames = list->get(i);
429                                 
430                                 m->splitAtComma(binnames, names);
431                                 
432                                 for (int j = 0; j < names.size(); j++) {
433                                         nameToIndex[names[j]] = i;
434                                 }
435                         }
436                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
437                 
438                                 
439                 if (m->control_pressed) { 
440                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
441                         return 0; 
442                 }
443                 
444                 if (groupfile != "") {
445                         //read in group map info.
446                         groupMap = new GroupMap(groupfile);
447                         int error = groupMap->readMap();
448                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
449                         
450                         if (Groups.size() != 0) {
451                                 SharedUtil* util = new SharedUtil();
452                                 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
453                                 util->setGroups(Groups, gNamesOfGroups, "getoturep");
454                                 groupMap->setNamesOfGroups(gNamesOfGroups);
455                                 delete util;
456                         }
457                 }
458                 
459                 //done with listvector from matrix
460                 if (list != NULL) { delete list; }
461                 
462                 input = new InputData(listfile, "list");
463                 list = input->getListVector();
464                 string lastLabel = list->getLabel();
465
466                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
467                 set<string> processedLabels;
468                 set<string> userLabels = labels;
469                 
470                 if (m->control_pressed) { 
471                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
472                         delete input; delete list; return 0; 
473                 }
474                 
475                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
476                 
477                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
478                         
479                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
480                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
481                                         error = process(list);
482                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
483                                         
484                                         if (m->control_pressed) { 
485                                                 if (large) {  inRow.close(); m->mothurRemove(distFile);  }
486                                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
487                                                 delete input; delete list; return 0; 
488                                         }
489                                         
490                                         processedLabels.insert(list->getLabel());
491                                         userLabels.erase(list->getLabel());
492                         }
493                         
494                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
495                                         string saveLabel = list->getLabel();
496                                         
497                                         delete list;
498                                         list = input->getListVector(lastLabel);
499                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
500                                         error = process(list);
501                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
502                                         
503                                         if (m->control_pressed) { 
504                                                 if (large) {  inRow.close(); m->mothurRemove(distFile);  }
505                                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
506                                                 delete input; delete list; return 0; 
507                                         }
508                                         
509                                         processedLabels.insert(list->getLabel());
510                                         userLabels.erase(list->getLabel());
511                                         
512                                         //restore real lastlabel to save below
513                                         list->setLabel(saveLabel);
514                         }
515                         
516                         lastLabel = list->getLabel();
517         
518                         delete list;
519                         list = input->getListVector();
520                 }
521                 
522                 //output error messages about any remaining user labels
523                 bool needToRun = false;
524                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
525                         m->mothurOut("Your file does not include the label " + (*it)); 
526                         if (processedLabels.count(lastLabel) != 1) {
527                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
528                                 needToRun = true;
529                         }else {
530                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
531                         }
532                 }
533                 
534                 //run last label if you need to
535                 if (needToRun == true)  {
536                         if (list != NULL) {     delete list;    }
537                         list = input->getListVector(lastLabel);
538                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
539                         error = process(list);
540                         delete list;
541                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
542                         
543                         if (m->control_pressed) { 
544                                         if (large) {  inRow.close(); m->mothurRemove(distFile);  }
545                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } outputTypes.clear();
546                                         delete input; delete list; return 0; 
547                         }
548                 }
549                 
550                 //close and remove formatted matrix file
551                 if (large) {
552                         inRow.close();
553                         m->mothurRemove(distFile);
554                 }
555                 
556                 delete input;  
557                 
558                 if (!weighted) { nameFileMap.clear(); }
559                 
560                                 
561                 if (fastafile != "") {
562                         //read fastafile
563                         fasta = new FastaMap();
564                         fasta->readFastaFile(fastafile);
565                         
566                         //if user gave a namesfile then use it
567                         if (namefile != "") {   readNamesFile();        }
568                         
569                         //output create and output the .rep.fasta files
570                         map<string, string>::iterator itNameFile;
571                         for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
572                                 processFastaNames(itNameFile->first, itNameFile->second);
573                         }
574                 }else {
575                         //output create and output the .rep.fasta files
576                         map<string, string>::iterator itNameFile;
577                         for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
578                                 processNames(itNameFile->first, itNameFile->second);
579                         }
580                 }
581                 
582                                 
583                 if (groupfile != "") { delete groupMap; }
584                 
585                 if (m->control_pressed) {  return 0; }
586                 
587                 //set fasta file as new current fastafile - use first one??
588                 string current = "";
589                 itTypes = outputTypes.find("fasta");
590                 if (itTypes != outputTypes.end()) {
591                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
592                 }
593                 
594                 itTypes = outputTypes.find("name");
595                 if (itTypes != outputTypes.end()) {
596                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
597                 }
598                 
599                 m->mothurOutEndLine();
600                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
601                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
602                 m->mothurOutEndLine();
603                 
604                 return 0;
605         }
606         catch(exception& e) {
607                 m->errorOut(e, "GetOTURepCommand", "execute");
608                 exit(1);
609         }
610 }
611
612 //**********************************************************************************************************************
613 void GetOTURepCommand::readNamesFile() {
614         try {
615                 ifstream in;
616                 vector<string> dupNames;
617                 m->openInputFile(namefile, in);
618                 
619                 string name, names, sequence;
620         
621                 while(!in.eof()){
622                         in >> name;                     //read from first column  A
623                         in >> names;            //read from second column  A,B,C,D
624                         
625                         dupNames.clear();
626                         
627                         //parse names into vector
628                         m->splitAtComma(names, dupNames);
629                         
630                         //store names in fasta map
631                         sequence = fasta->getSequence(name);
632                         for (int i = 0; i < dupNames.size(); i++) {
633                                 fasta->push_back(dupNames[i], sequence);
634                         }
635                 
636                         m->gobble(in);
637                 }
638                 in.close();
639
640         }
641         catch(exception& e) {
642                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
643                 exit(1);
644         }
645 }
646 //**********************************************************************************************************************
647 //read names file to find the weighted rep for each bin
648 void GetOTURepCommand::readNamesFile(bool w) {
649         try {
650                 ifstream in;
651                 vector<string> dupNames;
652                 m->openInputFile(namefile, in);
653                 
654                 string name, names, sequence;
655                 
656                 while(!in.eof()){
657                         in >> name;     m->gobble(in);          //read from first column  A
658                         in >> names;                                                    //read from second column  A,B,C,D
659                         
660                         dupNames.clear();
661                         
662                         //parse names into vector
663                         m->splitAtComma(names, dupNames);
664                         
665                         for (int i = 0; i < dupNames.size(); i++) {
666                                 nameFileMap[dupNames[i]] = name;
667                         }
668                         
669                         m->gobble(in);
670                 }
671                 in.close();
672                 
673         }
674         catch(exception& e) {
675                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
676                 exit(1);
677         }
678 }
679 //**********************************************************************************************************************
680 string GetOTURepCommand::findRep(vector<string> names) {
681         try{
682                 // if only 1 sequence in bin or processing the "unique" label, then 
683                 // the first sequence of the OTU is the representative one
684                 if ((names.size() == 1)) {
685                         return names[0];
686                 }else{
687                         vector<int> seqIndex(names.size());
688                         vector<float> max_dist(names.size());
689                         vector<float> total_dist(names.size());
690                         map<string, string>::iterator itNameFile;
691                         map<string, int>::iterator itNameIndex;
692
693                         //fill seqIndex and initialize sums
694                         for (size_t i = 0; i < names.size(); i++) {
695                                 if (weighted) {
696                                         seqIndex[i] = nameToIndex[names[i]];
697                                 }else { 
698                                         if (namefile == "") {
699                                                 itNameIndex = nameToIndex.find(names[i]);
700                                                 
701                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
702                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
703                                                         else {  seqIndex[i] = (seqVec.size()-1); }
704                                                 }else {
705                                                         seqIndex[i] = itNameIndex->second;
706                                                 }
707                                                 
708                                         }else {
709                                                 itNameFile = nameFileMap.find(names[i]);
710                                                 
711                                                 if (itNameFile == nameFileMap.end()) {
712                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
713                                                 }else{
714                                                         string name1 = itNameFile->first;
715                                                         string name2 = itNameFile->second;
716                                                         
717                                                         if (name1 == name2) { //then you are unique so add your real dists
718                                                                 seqIndex[i] = nameToIndex[names[i]];
719                                                         }else { //add dummy
720                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
721                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
722                                                         }
723                                                 }
724                                         }
725                                 }
726                                 max_dist[i] = 0.0;
727                                 total_dist[i] = 0.0;
728                         }
729                         
730                         // loop through all entries in seqIndex
731                         SeqMap::iterator it;
732                         SeqMap currMap;
733                         for (size_t i=0; i < seqIndex.size(); i++) {
734                                 if (m->control_pressed) {  return  "control"; }
735                         
736                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
737                                 else            {       currMap = getMap(seqIndex[i]);  }
738                                 
739                                 for (size_t j=0; j < seqIndex.size(); j++) {
740                                         it = currMap.find(seqIndex[j]);         
741                                         if (it != currMap.end()) {
742                                                 max_dist[i] = max(max_dist[i], it->second);
743                                                 max_dist[j] = max(max_dist[j], it->second);
744                                                 total_dist[i] += it->second;
745                                                 total_dist[j] += it->second;
746                                         }else{ //if you can't find the distance make it the cutoff
747                                                 max_dist[i] = max(max_dist[i], cutoff);
748                                                 max_dist[j] = max(max_dist[j], cutoff);
749                                                 total_dist[i] += cutoff;
750                                                 total_dist[j] += cutoff;
751                                         }
752                                 }
753                         }
754                         
755                         // sequence with the smallest maximum distance is the representative
756                         //if tie occurs pick sequence with smallest average distance
757                         float min = 10000;
758                         int minIndex;
759                         for (size_t i=0; i < max_dist.size(); i++) {
760                                 if (m->control_pressed) {  return  "control"; }
761                                 if (max_dist[i] < min) {
762                                         min = max_dist[i];
763                                         minIndex = i;
764                                 }else if (max_dist[i] == min) {
765                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
766                                         float newAverage = total_dist[i] / (float) total_dist.size();
767                                         
768                                         if (newAverage < currentAverage) {
769                                                 min = max_dist[i];
770                                                 minIndex = i;
771                                         }
772                                 }
773                         }
774                         
775                         return(names[minIndex]);
776                 }
777         }
778         catch(exception& e) {
779                 m->errorOut(e, "GetOTURepCommand", "FindRep");
780                 exit(1);
781         }
782 }
783
784 //**********************************************************************************************************************
785 int GetOTURepCommand::process(ListVector* processList) {
786         try{
787                 string name, sequence;
788                 string nameRep;
789
790                 //create output file
791                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
792                                 
793                 ofstream newNamesOutput;
794                 string outputNamesFile;
795                 map<string, ofstream*> filehandles;
796                 
797                 if (Groups.size() == 0) { //you don't want to use groups
798                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + getOutputFileNameTag("name");
799                         m->openOutputFile(outputNamesFile, newNamesOutput);
800                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
801                         outputNameFiles[outputNamesFile] = processList->getLabel();
802                 }else{ //you want to use groups
803                         ofstream* temp;
804                         for (int i=0; i<Groups.size(); i++) {
805                                 temp = new ofstream;
806                                 filehandles[Groups[i]] = temp;
807                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + "." + getOutputFileNameTag("name");
808                                 
809                                 m->openOutputFile(outputNamesFile, *(temp));
810                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
811                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
812                         }
813                 }
814                 
815                 //for each bin in the list vector
816                 for (int i = 0; i < processList->size(); i++) {
817                         if (m->control_pressed) { 
818                                 out.close();  
819                                 if (Groups.size() == 0) { //you don't want to use groups
820                                         newNamesOutput.close();
821                                 }else{
822                                         for (int j=0; j<Groups.size(); j++) {
823                                                 (*(filehandles[Groups[j]])).close();
824                                                 delete filehandles[Groups[j]];
825                                         }
826                                 }
827                                 return 0; 
828                         }
829                         
830                         string temp = processList->get(i);
831                         vector<string> namesInBin;
832                         m->splitAtComma(temp, namesInBin);
833                         
834                         if (Groups.size() == 0) {
835                                 nameRep = findRep(namesInBin);
836                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
837                         }else{
838                                 map<string, vector<string> > NamesInGroup;
839                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
840                                         NamesInGroup[Groups[j]].resize(0);
841                                 }
842                                 
843                                 for (int j=0; j<namesInBin.size(); j++) {
844                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
845                                         
846                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
847                                         
848                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
849                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
850                                         }
851                                 }
852                                 
853                                 //get rep for each group in otu
854                                 for (int j=0; j<Groups.size(); j++) {
855                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
856                                                 //get rep for each group
857                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
858                                                 
859                                                 //output group rep and other members of this group
860                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
861                                                 
862                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
863                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
864                                                 }
865                                                 //output last name
866                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
867                                         }
868                                 }
869                         }
870                 }
871                 
872                 if (Groups.size() == 0) { //you don't want to use groups
873                         newNamesOutput.close();
874                 }else{
875                         for (int i=0; i<Groups.size(); i++) {
876                                 (*(filehandles[Groups[i]])).close();
877                                 delete filehandles[Groups[i]];
878                         }
879                 }
880                 
881                 return 0;
882
883         }
884         catch(exception& e) {
885                 m->errorOut(e, "GetOTURepCommand", "process");
886                 exit(1);
887         }
888 }
889 //**********************************************************************************************************************
890 int GetOTURepCommand::processFastaNames(string filename, string label) {
891         try{
892
893                 //create output file
894                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
895                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + "." + getOutputFileNameTag("fasta");
896                 m->openOutputFile(outputFileName, out);
897                 vector<repStruct> reps;
898                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
899                 
900                 ofstream out2;
901                 string tempNameFile = filename + ".temp";
902                 m->openOutputFile(tempNameFile, out2);
903                 
904                 ifstream in;
905                 m->openInputFile(filename, in);
906                 
907                 int i = 0;
908                 while (!in.eof()) {
909                         string rep, binnames;
910                         in >> i >> rep >> binnames; m->gobble(in);
911                         out2 << rep << '\t' << binnames << endl;
912                         
913                         vector<string> names;
914                         m->splitAtComma(binnames, names);
915                         int binsize = names.size();
916                         
917                         //if you have a groupfile
918                         string group = "";
919                         if (groupfile != "") {
920                                 map<string, string> groups;
921                                 map<string, string>::iterator groupIt;
922                                 
923                                 //find the groups that are in this bin
924                                 for (size_t i = 0; i < names.size(); i++) {
925                                         string groupName = groupMap->getGroup(names[i]);
926                                         if (groupName == "not found") {  
927                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
928                                                 groupError = true;
929                                         } else {
930                                                 groups[groupName] = groupName;
931                                         }
932                                 }
933                                 
934                                 //turn the groups into a string
935                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
936                                         group += groupIt->first + "-";
937                                 }
938                                 //rip off last dash
939                                 group = group.substr(0, group.length()-1);
940                         }else{ group = ""; }
941
942                         
943                         //print out name and sequence for that bin
944                         string sequence = fasta->getSequence(rep);
945
946                         if (sequence != "not found") {
947                                 if (sorted == "") { //print them out
948                                         rep = rep + "\t" + toString(i+1);
949                                         rep = rep + "|" + toString(binsize);
950                                         if (groupfile != "") {
951                                                 rep = rep + "|" + group;
952                                         }
953                                         out << ">" << rep << endl;
954                                         out << sequence << endl;
955                                 }else { //save them
956                                         repStruct newRep(rep, i+1, binsize, group);
957                                         reps.push_back(newRep);
958                                 }
959                         }else { 
960                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
961                         }
962                 }
963                 
964                         
965                 if (sorted != "") { //then sort them and print them
966                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
967                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
968                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
969                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
970                         
971                         //print them
972                         for (int i = 0; i < reps.size(); i++) {
973                                 string sequence = fasta->getSequence(reps[i].name);
974                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
975                                 outputName = outputName + "|" + toString(reps[i].size);
976                                 if (groupfile != "") {
977                                         outputName = outputName + "|" + reps[i].group;
978                                 }
979                                 out << ">" << outputName << endl;
980                                 out << sequence << endl;
981                         }
982                 }
983                 
984                 in.close();
985                 out.close();
986                 out2.close();
987                 
988                 m->mothurRemove(filename);
989                 rename(tempNameFile.c_str(), filename.c_str());
990                 
991                 return 0;
992
993         }
994         catch(exception& e) {
995                 m->errorOut(e, "GetOTURepCommand", "processFastaNames");
996                 exit(1);
997         }
998 }
999 //**********************************************************************************************************************
1000 int GetOTURepCommand::processNames(string filename, string label) {
1001         try{
1002                 
1003                 //create output file
1004                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
1005                 
1006                 ofstream out2;
1007                 string tempNameFile = filename + ".temp";
1008                 m->openOutputFile(tempNameFile, out2);
1009                 
1010                 ifstream in;
1011                 m->openInputFile(filename, in);
1012                 
1013                 int i = 0;
1014                 string rep, binnames;
1015                 while (!in.eof()) {
1016                         if (m->control_pressed) { break; }
1017                         in >> i >> rep >> binnames; m->gobble(in);
1018                         out2 << rep << '\t' << binnames << endl;
1019                 }
1020                 in.close();
1021                 out2.close();
1022                 
1023                 m->mothurRemove(filename);
1024                 rename(tempNameFile.c_str(), filename.c_str());
1025                 
1026                 return 0;
1027         }
1028         catch(exception& e) {
1029                 m->errorOut(e, "GetOTURepCommand", "processNames");
1030                 exit(1);
1031         }
1032 }
1033 //**********************************************************************************************************************
1034 SeqMap GetOTURepCommand::getMap(int row) {
1035         try {
1036                 SeqMap rowMap;
1037                 
1038                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
1039                 if (rowPositions[row] != -1){
1040                         //go to row in file
1041                         inRow.seekg(rowPositions[row]);
1042                         
1043                         int rowNum, numDists, colNum;
1044                         float dist;
1045                         
1046                         inRow >> rowNum >> numDists;
1047                         
1048                         for(int i = 0; i < numDists; i++) {
1049                                 inRow >> colNum >> dist;
1050                                 rowMap[colNum] = dist;
1051                                 
1052                         }
1053                 }
1054                 
1055                 return rowMap;
1056         }
1057         catch(exception& e) {
1058                 m->errorOut(e, "GetOTURepCommand", "getMap");
1059                 exit(1);
1060         }
1061 }
1062 //**********************************************************************************************************************
1063