]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
get.oturep ifstream
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(){   
40         try {
41                 abort = true;
42                 //initialize outputTypes
43                 vector<string> tempOutNames;
44                 outputTypes["fasta"] = tempOutNames;
45                 outputTypes["name"] = tempOutNames;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 vector<string> GetOTURepCommand::getValidParameters(){  
54         try {
55                 string Array[] =  {"fasta","list","label","name", "group", "weighted","sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
56                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
57                 return myArray;
58         }
59         catch(exception& e) {
60                 m->errorOut(e, "GetOTURepCommand", "getValidParameters");
61                 exit(1);
62         }
63 }
64 //**********************************************************************************************************************
65 vector<string> GetOTURepCommand::getRequiredParameters(){       
66         try {
67                 string Array[] =  {"fasta","list"};
68                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
69                 return myArray;
70         }
71         catch(exception& e) {
72                 m->errorOut(e, "GetOTURepCommand", "getRequiredParameters");
73                 exit(1);
74         }
75 }
76 //**********************************************************************************************************************
77 vector<string> GetOTURepCommand::getRequiredFiles(){    
78         try {
79                 vector<string> myArray;
80                 return myArray;
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "GetOTURepCommand", "getRequiredFiles");
84                 exit(1);
85         }
86 }
87 //**********************************************************************************************************************
88 GetOTURepCommand::GetOTURepCommand(string option)  {
89         try{
90                 globaldata = GlobalData::getInstance();
91                 abort = false;
92                 allLines = 1;
93                 labels.clear();
94                                 
95                 //allow user to run help
96                 if (option == "help") { 
97                         help(); abort = true;
98                 } else {
99                         //valid paramters for this command
100                         string Array[] =  {"fasta","list","label","name","weighted", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
101                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
102                         
103                         OptionParser parser(option);
104                         map<string, string> parameters = parser.getParameters();
105                         
106                         ValidParameters validParameter;
107                         map<string, string>::iterator it;
108                 
109                         //check to make sure all parameters are valid for command
110                         for (it = parameters.begin(); it != parameters.end(); it++) { 
111                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
112                         }
113                         
114                         //initialize outputTypes
115                         vector<string> tempOutNames;
116                         outputTypes["fasta"] = tempOutNames;
117                         outputTypes["name"] = tempOutNames;
118                         
119                         //if the user changes the input directory command factory will send this info to us in the output parameter 
120                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
121                         if (inputDir == "not found"){   inputDir = "";          }
122                         else {
123                                 string path;
124                                 it = parameters.find("list");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
130                                 }
131                                 
132                                 it = parameters.find("fasta");
133                                 //user has given a template file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
138                                 }
139                                 
140                                 it = parameters.find("phylip");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
146                                 }
147                                 
148                                 it = parameters.find("column");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
154                                 }
155                                 
156                                 it = parameters.find("name");
157                                 //user has given a template file
158                                 if(it != parameters.end()){ 
159                                         path = m->hasPath(it->second);
160                                         //if the user has not given a path then, add inputdir. else leave path alone.
161                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
162                                 }
163                                 
164                                 it = parameters.find("group");
165                                 //user has given a template file
166                                 if(it != parameters.end()){ 
167                                         path = m->hasPath(it->second);
168                                         //if the user has not given a path then, add inputdir. else leave path alone.
169                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
170                                 }
171                         }
172
173                         
174                         //if the user changes the output directory command factory will send this info to us in the output parameter 
175                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
176                         
177                         //check for required parameters
178                         fastafile = validParameter.validFile(parameters, "fasta", true);
179                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
180                         else if (fastafile == "not open") { abort = true; }     
181                 
182                         listfile = validParameter.validFile(parameters, "list", true);
183                         if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
184                         else if (listfile == "not open") { abort = true; }      
185                         
186                         phylipfile = validParameter.validFile(parameters, "phylip", true);
187                         if (phylipfile == "not found") { phylipfile = "";  }
188                         else if (phylipfile == "not open") { abort = true; }    
189                         else { distFile = phylipfile; format = "phylip";   }
190                         
191                         columnfile = validParameter.validFile(parameters, "column", true);
192                         if (columnfile == "not found") { columnfile = ""; }
193                         else if (columnfile == "not open") { abort = true; }    
194                         else { distFile = columnfile; format = "column";   }
195                         
196                         namefile = validParameter.validFile(parameters, "name", true);
197                         if (namefile == "not open") { abort = true; }   
198                         else if (namefile == "not found") { namefile = ""; }
199                         
200                         if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
201                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
202                 
203                         if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
204
205                         //check for optional parameter and set defaults
206                         // ...at some point should added some additional type checking...
207                         label = validParameter.validFile(parameters, "label", false);                   
208                         if (label == "not found") { label = ""; allLines = 1;  }
209                         else { 
210                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
211                                 else { allLines = 1;  }
212                         }
213                         
214                         groupfile = validParameter.validFile(parameters, "group", true);
215                         if (groupfile == "not open") { groupfile = ""; abort = true; }
216                         else if (groupfile == "not found") { groupfile = ""; }
217                                                 
218                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
219                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
220                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
221                                 sorted = "";
222                         }
223                         
224                         if ((sorted == "group") && (groupfile == "")) {
225                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
226                                 sorted = "";
227                         }
228                         
229                         groups = validParameter.validFile(parameters, "groups", false);                 
230                         if (groups == "not found") { groups = ""; }
231                         else { 
232                                 if (groupfile == "") {
233                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
234                                         abort = true;
235                                 }else { 
236                                         m->splitAtDash(groups, Groups);
237                                 }
238                         }
239                         globaldata->Groups = Groups;
240                         
241                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
242                         large = m->isTrue(temp);
243                         
244                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {      if (namefile == "") { temp = "F"; } else { temp = "t"; }        }
245                         weighted = m->isTrue(temp);
246                         
247                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
248                         
249                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
250                         convert(temp, precision); 
251                         
252                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
253                         convert(temp, cutoff); 
254                         cutoff += (5 / (precision * 10.0));
255                 }
256         }
257         catch(exception& e) {
258                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
259                 exit(1);
260         }
261 }
262
263 //**********************************************************************************************************************
264
265 void GetOTURepCommand::help(){
266         try {
267                 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
268                 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
269                 m->mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
270                 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
271                 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
272                 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
273                 m->mothurOut("The default value for label is all labels in your inputfile.\n");
274                 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
275                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
276                 m->mothurOut("The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false with no namesfile and true when a name file is provided.\n");
277                 m->mothurOut("The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n");
278                 m->mothurOut("For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n");
279                 m->mothurOut("For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n");
280                 m->mothurOut("If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n");
281                 m->mothurOut("The group parameter allows you provide a group file.\n");
282                 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
283                 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
284                 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
285                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
286         }
287         catch(exception& e) {
288                 m->errorOut(e, "GetOTURepCommand", "help");
289                 exit(1);
290         }
291 }
292
293 //**********************************************************************************************************************
294
295 GetOTURepCommand::~GetOTURepCommand(){}
296
297 //**********************************************************************************************************************
298
299 int GetOTURepCommand::execute(){
300         try {
301         
302                 if (abort == true) { return 0; }
303                 int error;
304                 
305                 if (!large) {
306                         //read distance files
307                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
308                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
309                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
310                         
311                         readMatrix->setCutoff(cutoff);
312         
313                         if(namefile != ""){     
314                                 nameMap = new NameAssignment(namefile);
315                                 nameMap->readMap();
316                         }else{  nameMap = NULL;         }
317                         
318                         readMatrix->read(nameMap);
319                         
320                         if (m->control_pressed) { delete readMatrix; return 0; }
321
322                         //get matrix
323                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
324                         globaldata->gListVector = readMatrix->getListVector();
325
326                         SparseMatrix* matrix = readMatrix->getMatrix();
327                         
328                         // Create a data structure to quickly access the distance information.
329                         // It consists of a vector of distance maps, where each map contains
330                         // all distances of a certain sequence. Vector and maps are accessed
331                         // via the index of a sequence in the distance matrix
332                         seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
333                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
334                                 if (m->control_pressed) { delete readMatrix; return 0; }
335                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
336                         }
337                         //add dummy map for unweighted calc
338                         SeqMap dummy;
339                         seqVec.push_back(dummy);
340                         
341                         delete matrix;
342                         delete readMatrix;
343                         delete nameMap;
344                         
345                         if (m->control_pressed) { return 0; }
346                 }else {
347                         //process file and set up indexes
348                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
349                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
350                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
351                         
352                         formatMatrix->setCutoff(cutoff);
353         
354                         if(namefile != ""){     
355                                 nameMap = new NameAssignment(namefile);
356                                 nameMap->readMap();
357                         }else{  nameMap = NULL;         }
358                         
359                         formatMatrix->read(nameMap);
360                         
361                         if (m->control_pressed) { delete formatMatrix;  return 0; }
362
363                         //get matrix
364                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
365                         globaldata->gListVector = formatMatrix->getListVector();
366                         
367                         distFile = formatMatrix->getFormattedFileName();
368                         
369                         //positions in file where the distances for each sequence begin
370                         //rowPositions[1] = position in file where distance related to sequence 1 start.
371                         rowPositions = formatMatrix->getRowPositions();
372                         rowPositions.push_back(-1); //dummy row for unweighted calc
373                         
374                         delete formatMatrix;
375                         delete nameMap;
376                         
377                         //openfile for getMap to use
378                         m->openInputFile(distFile, inRow);
379                         
380                         if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
381                 }
382                 
383                 
384                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
385                 if (globaldata->gListVector != NULL) {
386                         vector<string> names;
387                         string binnames;
388                         //map names to rows in sparsematrix
389                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
390                                 names.clear();
391                                 binnames = globaldata->gListVector->get(i);
392                                 
393                                 m->splitAtComma(binnames, names);
394                                 
395                                 for (int j = 0; j < names.size(); j++) {
396                                         nameToIndex[names[j]] = i;
397                                 }
398                         }
399                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
400                 
401                                 
402                 if (m->control_pressed) { 
403                         if (large) {  inRow.close(); remove(distFile.c_str());  }
404                         return 0; 
405                 }
406                 
407                 if (groupfile != "") {
408                         //read in group map info.
409                         groupMap = new GroupMap(groupfile);
410                         int error = groupMap->readMap();
411                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
412                         
413                         if (Groups.size() != 0) {
414                                 SharedUtil* util = new SharedUtil();
415                                 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
416                                 delete util;
417                         }
418                 }
419                                                                 
420                 //set format to list so input can get listvector
421                 globaldata->setFormat("list");
422         
423                 //read list file
424                 read = new ReadOTUFile(listfile);
425                 read->read(&*globaldata); 
426                 
427                 input = globaldata->ginput;
428                 list = globaldata->gListVector;
429                 string lastLabel = list->getLabel();
430
431                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
432                 set<string> processedLabels;
433                 set<string> userLabels = labels;
434                 
435                 if (m->control_pressed) { 
436                         if (large) {  inRow.close(); remove(distFile.c_str());  }
437                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
438                 }
439                 
440                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
441                 
442                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
443                         
444                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
445                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
446                                         error = process(list);
447                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
448                                         
449                                         if (m->control_pressed) { 
450                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
451                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
452                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
453                                         }
454                                         
455                                         processedLabels.insert(list->getLabel());
456                                         userLabels.erase(list->getLabel());
457                         }
458                         
459                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
460                                         string saveLabel = list->getLabel();
461                                         
462                                         delete list;
463                                         list = input->getListVector(lastLabel);
464                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
465                                         error = process(list);
466                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
467                                         
468                                         if (m->control_pressed) { 
469                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
470                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
471                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
472                                         }
473                                         
474                                         processedLabels.insert(list->getLabel());
475                                         userLabels.erase(list->getLabel());
476                                         
477                                         //restore real lastlabel to save below
478                                         list->setLabel(saveLabel);
479                         }
480                         
481                         lastLabel = list->getLabel();
482         
483                         delete list;
484                         list = input->getListVector();
485                 }
486                 
487                 //output error messages about any remaining user labels
488                 bool needToRun = false;
489                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
490                         m->mothurOut("Your file does not include the label " + (*it)); 
491                         if (processedLabels.count(lastLabel) != 1) {
492                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
493                                 needToRun = true;
494                         }else {
495                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
496                         }
497                 }
498                 
499                 //run last label if you need to
500                 if (needToRun == true)  {
501                         if (list != NULL) {     delete list;    }
502                         list = input->getListVector(lastLabel);
503                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
504                         error = process(list);
505                         delete list;
506                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
507                         
508                         if (m->control_pressed) { 
509                                         if (large) {  inRow.close(); remove(distFile.c_str());  }
510                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
511                                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
512                         }
513                 }
514                 
515                 //close and remove formatted matrix file
516                 if (large) {
517                         inRow.close();
518                         remove(distFile.c_str());
519                 }
520                 
521                 globaldata->gListVector = NULL;
522                 delete input;  globaldata->ginput = NULL;
523                 delete read;
524                 
525                 if (!weighted) { nameFileMap.clear(); }
526                 
527                 //read fastafile
528                 fasta = new FastaMap();
529                 fasta->readFastaFile(fastafile);
530                 
531                 //if user gave a namesfile then use it
532                 if (namefile != "") {   readNamesFile();        }
533                 
534                 //output create and output the .rep.fasta files
535                 map<string, string>::iterator itNameFile;
536                 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
537                         processNames(itNameFile->first, itNameFile->second);
538                 }
539                 
540                 delete fasta;
541                 if (groupfile != "") {
542                         delete groupMap;  globaldata->gGroupmap = NULL;
543                 }
544                 
545                 if (m->control_pressed) {  return 0; }
546                 
547                 m->mothurOutEndLine();
548                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
549                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
550                 m->mothurOutEndLine();
551                 
552                 return 0;
553         }
554         catch(exception& e) {
555                 m->errorOut(e, "GetOTURepCommand", "execute");
556                 exit(1);
557         }
558 }
559
560 //**********************************************************************************************************************
561 void GetOTURepCommand::readNamesFile() {
562         try {
563                 ifstream in;
564                 vector<string> dupNames;
565                 m->openInputFile(namefile, in);
566                 
567                 string name, names, sequence;
568         
569                 while(!in.eof()){
570                         in >> name;                     //read from first column  A
571                         in >> names;            //read from second column  A,B,C,D
572                         
573                         dupNames.clear();
574                         
575                         //parse names into vector
576                         m->splitAtComma(names, dupNames);
577                         
578                         //store names in fasta map
579                         sequence = fasta->getSequence(name);
580                         for (int i = 0; i < dupNames.size(); i++) {
581                                 fasta->push_back(dupNames[i], sequence);
582                         }
583                 
584                         m->gobble(in);
585                 }
586                 in.close();
587
588         }
589         catch(exception& e) {
590                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
591                 exit(1);
592         }
593 }
594 //**********************************************************************************************************************
595 //read names file to find the weighted rep for each bin
596 void GetOTURepCommand::readNamesFile(bool w) {
597         try {
598                 ifstream in;
599                 vector<string> dupNames;
600                 m->openInputFile(namefile, in);
601                 
602                 string name, names, sequence;
603                 
604                 while(!in.eof()){
605                         in >> name;     m->gobble(in);          //read from first column  A
606                         in >> names;                                                    //read from second column  A,B,C,D
607                         
608                         dupNames.clear();
609                         
610                         //parse names into vector
611                         m->splitAtComma(names, dupNames);
612                         
613                         for (int i = 0; i < dupNames.size(); i++) {
614                                 nameFileMap[dupNames[i]] = name;
615                         }
616                         
617                         m->gobble(in);
618                 }
619                 in.close();
620                 
621         }
622         catch(exception& e) {
623                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
624                 exit(1);
625         }
626 }
627 //**********************************************************************************************************************
628 string GetOTURepCommand::findRep(vector<string> names) {
629         try{
630                 // if only 1 sequence in bin or processing the "unique" label, then 
631                 // the first sequence of the OTU is the representative one
632                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
633                         return names[0];
634                 }else{
635                         vector<int> seqIndex(names.size());
636                         vector<float> max_dist(names.size());
637                         vector<float> total_dist(names.size());
638                         map<string, string>::iterator itNameFile;
639                         map<string, int>::iterator itNameIndex;
640
641                         //fill seqIndex and initialize sums
642                         for (size_t i = 0; i < names.size(); i++) {
643                                 if (weighted) {
644                                         seqIndex[i] = nameToIndex[names[i]];
645                                 }else { 
646                                         if (namefile == "") {
647                                                 itNameIndex = nameToIndex.find(names[i]);
648                                                 
649                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
650                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
651                                                         else {  seqIndex[i] = (seqVec.size()-1); }
652                                                 }else {
653                                                         seqIndex[i] = itNameIndex->second;
654                                                 }
655                                                 
656                                         }else {
657                                                 itNameFile = nameFileMap.find(names[i]);
658                                                 
659                                                 if (itNameFile == nameFileMap.end()) {
660                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
661                                                 }else{
662                                                         string name1 = itNameFile->first;
663                                                         string name2 = itNameFile->second;
664                                                         
665                                                         if (name1 == name2) { //then you are unique so add your real dists
666                                                                 seqIndex[i] = nameToIndex[names[i]];
667                                                         }else { //add dummy
668                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
669                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
670                                                         }
671                                                 }
672                                         }
673                                 }
674                                 max_dist[i] = 0.0;
675                                 total_dist[i] = 0.0;
676                         }
677                         
678                         // loop through all entries in seqIndex
679                         SeqMap::iterator it;
680                         SeqMap currMap;
681                         for (size_t i=0; i < seqIndex.size(); i++) {
682                                 if (m->control_pressed) {  return  "control"; }
683                         
684                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
685                                 else            {       currMap = getMap(seqIndex[i]);  }
686                                 
687                                 for (size_t j=0; j < seqIndex.size(); j++) {
688                                         it = currMap.find(seqIndex[j]);         
689                                         if (it != currMap.end()) {
690                                                 max_dist[i] = max(max_dist[i], it->second);
691                                                 max_dist[j] = max(max_dist[j], it->second);
692                                                 total_dist[i] += it->second;
693                                                 total_dist[j] += it->second;
694                                         }else{ //if you can't find the distance make it the cutoff
695                                                 max_dist[i] = max(max_dist[i], cutoff);
696                                                 max_dist[j] = max(max_dist[j], cutoff);
697                                                 total_dist[i] += cutoff;
698                                                 total_dist[j] += cutoff;
699                                         }
700                                 }
701                         }
702                         
703                         // sequence with the smallest maximum distance is the representative
704                         //if tie occurs pick sequence with smallest average distance
705                         float min = 10000;
706                         int minIndex;
707                         for (size_t i=0; i < max_dist.size(); i++) {
708                                 if (m->control_pressed) {  return  "control"; }
709                                 if (max_dist[i] < min) {
710                                         min = max_dist[i];
711                                         minIndex = i;
712                                 }else if (max_dist[i] == min) {
713                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
714                                         float newAverage = total_dist[i] / (float) total_dist.size();
715                                         
716                                         if (newAverage < currentAverage) {
717                                                 min = max_dist[i];
718                                                 minIndex = i;
719                                         }
720                                 }
721                         }
722                         
723                         return(names[minIndex]);
724                 }
725         }
726         catch(exception& e) {
727                 m->errorOut(e, "GetOTURepCommand", "FindRep");
728                 exit(1);
729         }
730 }
731
732 //**********************************************************************************************************************
733 int GetOTURepCommand::process(ListVector* processList) {
734         try{
735                 string name, sequence;
736                 string nameRep;
737
738                 //create output file
739                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
740                                 
741                 ofstream newNamesOutput;
742                 string outputNamesFile;
743                 map<string, ofstream*> filehandles;
744                 
745                 if (Groups.size() == 0) { //you don't want to use groups
746                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
747                         m->openOutputFile(outputNamesFile, newNamesOutput);
748                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
749                         outputNameFiles[outputNamesFile] = processList->getLabel();
750                 }else{ //you want to use groups
751                         ofstream* temp;
752                         for (int i=0; i<Groups.size(); i++) {
753                                 temp = new ofstream;
754                                 filehandles[Groups[i]] = temp;
755                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
756                                 
757                                 m->openOutputFile(outputNamesFile, *(temp));
758                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
759                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
760                         }
761                 }
762                 
763                 //for each bin in the list vector
764                 for (int i = 0; i < processList->size(); i++) {
765                         if (m->control_pressed) { 
766                                 out.close();  
767                                 if (Groups.size() == 0) { //you don't want to use groups
768                                         newNamesOutput.close();
769                                 }else{
770                                         for (int j=0; j<Groups.size(); j++) {
771                                                 (*(filehandles[Groups[j]])).close();
772                                                 delete filehandles[Groups[j]];
773                                         }
774                                 }
775                                 return 0; 
776                         }
777                         
778                         string temp = processList->get(i);
779                         vector<string> namesInBin;
780                         m->splitAtComma(temp, namesInBin);
781                         
782                         if (Groups.size() == 0) {
783                                 nameRep = findRep(namesInBin);
784                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
785                         }else{
786                                 map<string, vector<string> > NamesInGroup;
787                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
788                                         NamesInGroup[Groups[j]].resize(0);
789                                 }
790                                 
791                                 for (int j=0; j<namesInBin.size(); j++) {
792                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
793                                         
794                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
795                                         
796                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
797                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
798                                         }
799                                 }
800                                 
801                                 //get rep for each group in otu
802                                 for (int j=0; j<Groups.size(); j++) {
803                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
804                                                 //get rep for each group
805                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
806                                                 
807                                                 //output group rep and other members of this group
808                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
809                                                 
810                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
811                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
812                                                 }
813                                                 //output last name
814                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
815                                         }
816                                 }
817                         }
818                 }
819                 
820                 if (Groups.size() == 0) { //you don't want to use groups
821                         newNamesOutput.close();
822                 }else{
823                         for (int i=0; i<Groups.size(); i++) {
824                                 (*(filehandles[Groups[i]])).close();
825                                 delete filehandles[Groups[i]];
826                         }
827                 }
828                 
829                 return 0;
830
831         }
832         catch(exception& e) {
833                 m->errorOut(e, "GetOTURepCommand", "process");
834                 exit(1);
835         }
836 }
837 //**********************************************************************************************************************
838 int GetOTURepCommand::processNames(string filename, string label) {
839         try{
840
841                 //create output file
842                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
843                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
844                 m->openOutputFile(outputFileName, out);
845                 vector<repStruct> reps;
846                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
847                 
848                 ofstream out2;
849                 string tempNameFile = filename + ".temp";
850                 m->openOutputFile(tempNameFile, out2);
851                 
852                 ifstream in;
853                 m->openInputFile(filename, in);
854                 
855                 int i = 0;
856                 while (!in.eof()) {
857                         string rep, binnames;
858                         in >> i >> rep >> binnames; m->gobble(in);
859                         out2 << rep << '\t' << binnames << endl;
860                         
861                         vector<string> names;
862                         m->splitAtComma(binnames, names);
863                         int binsize = names.size();
864                         
865                         //if you have a groupfile
866                         string group = "";
867                         if (groupfile != "") {
868                                 map<string, string> groups;
869                                 map<string, string>::iterator groupIt;
870                                 
871                                 //find the groups that are in this bin
872                                 for (size_t i = 0; i < names.size(); i++) {
873                                         string groupName = groupMap->getGroup(names[i]);
874                                         if (groupName == "not found") {  
875                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
876                                                 groupError = true;
877                                         } else {
878                                                 groups[groupName] = groupName;
879                                         }
880                                 }
881                                 
882                                 //turn the groups into a string
883                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
884                                         group += groupIt->first + "-";
885                                 }
886                                 //rip off last dash
887                                 group = group.substr(0, group.length()-1);
888                         }else{ group = ""; }
889
890                         
891                         //print out name and sequence for that bin
892                         string sequence = fasta->getSequence(rep);
893
894                         if (sequence != "not found") {
895                                 if (sorted == "") { //print them out
896                                         rep = rep + "\t" + toString(i+1);
897                                         rep = rep + "|" + toString(binsize);
898                                         if (groupfile != "") {
899                                                 rep = rep + "|" + group;
900                                         }
901                                         out << ">" << rep << endl;
902                                         out << sequence << endl;
903                                 }else { //save them
904                                         repStruct newRep(rep, i+1, binsize, group);
905                                         reps.push_back(newRep);
906                                 }
907                         }else { 
908                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
909                         }
910                 }
911                 
912                         
913                 if (sorted != "") { //then sort them and print them
914                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
915                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
916                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
917                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
918                         
919                         //print them
920                         for (int i = 0; i < reps.size(); i++) {
921                                 string sequence = fasta->getSequence(reps[i].name);
922                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
923                                 outputName = outputName + "|" + toString(reps[i].size);
924                                 if (groupfile != "") {
925                                         outputName = outputName + "|" + reps[i].group;
926                                 }
927                                 out << ">" << outputName << endl;
928                                 out << sequence << endl;
929                         }
930                 }
931                 
932                 in.close();
933                 out.close();
934                 out2.close();
935                 
936                 remove(filename.c_str());
937                 rename(tempNameFile.c_str(), filename.c_str());
938                 
939                 return 0;
940
941         }
942         catch(exception& e) {
943                 m->errorOut(e, "GetOTURepCommand", "processNames");
944                 exit(1);
945         }
946 }
947 //**********************************************************************************************************************
948 SeqMap GetOTURepCommand::getMap(int row) {
949         try {
950                 SeqMap rowMap;
951                 
952                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
953                 if (rowPositions[row] != -1){
954                         //go to row in file
955                         inRow.seekg(rowPositions[row]);
956                         
957                         int rowNum, numDists, colNum;
958                         float dist;
959                         
960                         inRow >> rowNum >> numDists;
961                         
962                         for(int i = 0; i < numDists; i++) {
963                                 inRow >> colNum >> dist;
964                                 rowMap[colNum] = dist;
965                                 
966                         }
967                 }
968                 
969                 return rowMap;
970         }
971         catch(exception& e) {
972                 m->errorOut(e, "GetOTURepCommand", "getMap");
973                 exit(1);
974         }
975 }
976 //**********************************************************************************************************************
977